Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 18 additions & 66 deletions bin/cuda/gpu_benchmark.cu
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#include "gpu_benchmark.h"

#include "kernels.h"

#include <cub/cub.cuh>
#include <cuda_runtime.h>

#include <chrono>
#include <iostream>
#include <optional>

// The macro wraps any CUDA API call
#define CUDA_CHECK(ans) \
Expand All @@ -26,69 +27,10 @@ float getElapsedTime(const cudaEvent_t &gpu_start, cudaEvent_t &gpu_stop) {
return gpu_elapsed_time / 1000.0f;
}

// Function to run the GPU benchmark with no time limit
void runBenchmark(long max_work) {
uint32_t n = 256 * 256;
uint64_t m = max_work * 16384 / n;

unsigned long long int *d_count;
curandState *d_state;
CUDA_CHECK(cudaMalloc((void **)&d_count, 256 * sizeof(unsigned long long int)));
CUDA_CHECK(cudaMalloc((void **)&d_state, n * sizeof(curandState)));
CUDA_CHECK(cudaMemset(d_count, 0, 256 * sizeof(unsigned long long int)));

// set up timing stuff
cudaEvent_t gpu_start, gpu_stop;
CUDA_CHECK(cudaEventCreate(&gpu_start));
CUDA_CHECK(cudaEventCreate(&gpu_stop));

// set kernel
dim3 gridSize = 256;
dim3 blockSize = 256;
setup_kernel<<<gridSize, blockSize>>>(d_state);

// monte carlo kernel
CUDA_CHECK(cudaEventRecord(gpu_start, 0));
monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
CUDA_CHECK(cudaDeviceSynchronize());

float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
CUDA_CHECK(cudaEventDestroy(gpu_start));
CUDA_CHECK(cudaEventDestroy(gpu_stop));

// Allocate device output array
unsigned long long int *d_out = nullptr;
CUDA_CHECK(cudaMalloc((void **)&d_out, sizeof(unsigned long long int)));

// Request and allocate temporary storage
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
CUDA_CHECK(cudaMalloc((void **)&d_temp_storage, temp_storage_bytes));

// Run
CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));

// copy results back to the host
unsigned long long int h_count = 0;
CUDA_CHECK(cudaMemcpy(&h_count, d_out, sizeof(unsigned long long int), cudaMemcpyDeviceToHost));

// display results and timings for gpu
float pi = h_count * 4.0 / (n * m);
std::cout << "Approximate pi calculated on GPU is: " << pi << " and calculation took " << gpu_elapsed_time << "s\n";
std::cout << "Benchmark completed!" << std::endl;

CUDA_CHECK(cudaFree(d_count));
CUDA_CHECK(cudaFree(d_state));
CUDA_CHECK(cudaFree(d_out));
CUDA_CHECK(cudaFree(d_temp_storage));
}

// Function to run the GPU benchmark for a specified time
void runBenchmarkTime(long max_work, int runtime_in_seconds) {

void runBenchmarkTime(long max_work, std::optional<int> runtime_in_seconds) {
uint32_t n = 256 * 256;
uint64_t m = max_work * 16384 / n;
uint64_t m = (max_work + n - 1) / n;

// allocate memory
unsigned long long int *d_count;
Expand All @@ -108,13 +50,21 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {

setup_kernel<<<gridSize, blockSize>>>(d_state);

// monte carlo kernel
CUDA_CHECK(cudaEventRecord(gpu_start, 0));

int iteration = 0;
// Run the workload loop until the specified runtime is reached
while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) {
if (!runtime_in_seconds.has_value()) {
monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
CUDA_CHECK(cudaDeviceSynchronize()); // Ensure the kernel has finished executing
CUDA_CHECK(cudaDeviceSynchronize());
iteration++;
} else {
// Run the workload loop until the specified runtime is reached
while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) {
monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
CUDA_CHECK(cudaDeviceSynchronize()); // Ensure the kernel has finished executing
iteration++;
}
}

float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
Expand Down Expand Up @@ -149,6 +99,8 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
CUDA_CHECK(cudaFree(d_temp_storage));
}

void runBenchmark(long max_work) { runBenchmarkTime(max_work, std::nullopt); }

int main(int argc, char *argv[]) {
// Check for the correct number of command line arguments
if (argc == 2) {
Expand Down
10 changes: 0 additions & 10 deletions bin/cuda/gpu_benchmark.h

This file was deleted.

83 changes: 18 additions & 65 deletions bin/hip/gpu_benchmark.hip
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <chrono>
#include <iostream>
#include <optional>

#define HIP_CHECK(expression) \
{ \
Expand All @@ -24,69 +25,10 @@ float getElapsedTime(const hipEvent_t &gpu_start, hipEvent_t &gpu_stop) {
return gpu_elapsed_time / 1000.0f;
}

// Function to run the GPU benchmark with no time limit
void runBenchmark(long max_work) {
uint32_t n = 256 * 256;
uint64_t m = max_work * 16384 / n;

unsigned long long int *d_count;
hiprandState *d_state;
HIP_CHECK(hipMalloc((void **)&d_count, 256 * sizeof(unsigned long long int)));
HIP_CHECK(hipMalloc((void **)&d_state, n * sizeof(hiprandState)));
HIP_CHECK(hipMemset(d_count, 0, 256 * sizeof(unsigned long long int)));

// set up timing stuff
hipEvent_t gpu_start, gpu_stop;
HIP_CHECK(hipEventCreate(&gpu_start));
HIP_CHECK(hipEventCreate(&gpu_stop));

// set kernel
dim3 gridSize = 256;
dim3 blockSize = 256;
setup_kernel<<<gridSize, blockSize>>>(d_state);

// monte carlo kernel
HIP_CHECK(hipEventRecord(gpu_start, 0));
monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
HIP_CHECK(hipDeviceSynchronize());

float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
HIP_CHECK(hipEventDestroy(gpu_start));
HIP_CHECK(hipEventDestroy(gpu_stop));

// Allocate device output array
unsigned long long int *d_out = nullptr;
HIP_CHECK(hipMalloc((void **)&d_out, sizeof(unsigned long long int)));

// Request and allocate temporary storage
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
HIP_CHECK(hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
HIP_CHECK(hipMalloc((void **)&d_temp_storage, temp_storage_bytes));

// Run
HIP_CHECK(hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));

// copy results back to the host
unsigned long long int h_count = 0;
HIP_CHECK(hipMemcpy(&h_count, d_out, sizeof(unsigned long long int), hipMemcpyDeviceToHost));

// display results and timings for gpu
float pi = h_count * 4.0 / (n * m);
std::cout << "Approximate pi calculated on GPU is: " << pi << " and calculation took " << gpu_elapsed_time << "s\n";
std::cout << "Benchmark completed!" << std::endl;

HIP_CHECK(hipFree(d_count));
HIP_CHECK(hipFree(d_state));
HIP_CHECK(hipFree(d_out));
HIP_CHECK(hipFree(d_temp_storage));
}

// Function to run the GPU benchmark for a specified time
void runBenchmarkTime(long max_work, int runtime_in_seconds) {

void runBenchmarkTime(long max_work, std::optional<int> runtime_in_seconds) {
uint32_t n = 256 * 256;
uint64_t m = max_work * 16384 / n;
uint64_t m = (max_work + n - 1) / n;

// allocate memory
unsigned long long int *d_count;
Expand All @@ -99,20 +41,27 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
hipEvent_t gpu_start, gpu_stop;
HIP_CHECK(hipEventCreate(&gpu_start));
HIP_CHECK(hipEventCreate(&gpu_stop));
HIP_CHECK(hipEventRecord(gpu_start, 0));

// set kernel
dim3 gridSize = 256;
dim3 blockSize = 256;

setup_kernel<<<gridSize, blockSize>>>(d_state);

HIP_CHECK(hipEventRecord(gpu_start, 0));
int iteration = 0;
// Run the workload loop until the specified runtime is reached
while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) {

if (!runtime_in_seconds.has_value()) {
monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
HIP_CHECK(hipDeviceSynchronize()); // Ensure the kernel has finished executing
HIP_CHECK(hipDeviceSynchronize());
iteration++;
} else {
// Run the workload loop until the specified runtime is reached
while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) {
monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
HIP_CHECK(hipDeviceSynchronize()); // Ensure the kernel has finished executing
iteration++;
}
}

float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
Expand Down Expand Up @@ -147,6 +96,10 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
HIP_CHECK(hipFree(d_temp_storage));
}

void runBenchmark(long max_work) {
runBenchmarkTime(max_work, std::nullopt);
}

int main(int argc, char *argv[]) {
// Check for the correct number of command line arguments
if (argc == 2) {
Expand Down
40 changes: 29 additions & 11 deletions bin/wfbench
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import json
import logging
import pandas as pd
import psutil
import shutil

from io import StringIO
from filelock import FileLock
Expand Down Expand Up @@ -230,11 +231,21 @@ class GPUBenchmark:

@staticmethod
def get_available_gpus():
proc = subprocess.Popen(["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv"], stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, _ = proc.communicate()
df = pd.read_csv(StringIO(stdout.decode("utf-8")), sep=" ")
return df[df["utilization.gpu"] <= 5].index.to_list()
if shutil.which("nvidia-smi") is not None:
proc = subprocess.Popen(["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv"], stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, _ = proc.communicate()
df = pd.read_csv(StringIO(stdout.decode("utf-8")), sep=" ")
return df[df["utilization.gpu"] <= 5].index.to_list()
elif shutil.which("amd-smi") is not None:
proc = subprocess.Popen(["amd-smi", "monitor", "-u", "--csv"], stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, _ = proc.communicate()
df = pd.read_csv(StringIO(stdout.decode("utf-8")), sep=",")
return df[df["gfx"] <= 5].index.to_list()
else:
log_error("No supported GPU monitoring tool found.")
return []

def __init__(self):
self.work = None
Expand All @@ -250,7 +261,7 @@ class GPUBenchmark:
log_debug(f"GPU benchmark instantiated for device {self.device}")

def set_work(self, work: int):
self.work = work
self.work = 1000000 * work

def set_time(self, duration: float):
self.duration = duration
Expand All @@ -261,13 +272,20 @@ class GPUBenchmark:

if self.duration is not None:
log_debug(f"Running GPU benchmark for {self.duration} seconds")
gpu_prog = [
f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work} {self.duration}"]
if shutil.which("nvidia-smi") is not None:
gpu_prog = [
f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work} {self.duration}"]
else:
gpu_prog = [
f"HIP_DEVICE_ORDER=PCI_BUS_ID HIP_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work} {self.duration}"]
else:
log_debug(f"Running GPU benchmark for {self.work} units of work")
gpu_prog = [
f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work}"]

if shutil.which("nvidia-smi") is not None:
gpu_prog = [
f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work}"]
else:
gpu_prog = [
f"HIP_DEVICE_ORDER=PCI_BUS_ID HIP_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work}"]
p = subprocess.Popen(gpu_prog, shell=True)
return ProcessHandle(p)

Expand Down
Loading