wfcommons · quantumsteve · May 11, 2026 · May 11, 2026 · May 12, 2026 · May 26, 2026
diff --git a/bin/cuda/gpu_benchmark.cu b/bin/cuda/gpu_benchmark.cu
@@ -1,10 +1,11 @@
-#include "gpu_benchmark.h"
-
 #include "kernels.h"
+
 #include <cub/cub.cuh>
+#include <cuda_runtime.h>
 
 #include <chrono>
 #include <iostream>
+#include <optional>
 
 // The macro wraps any CUDA API call
 #define CUDA_CHECK(ans)                                                                                                \
@@ -26,69 +27,10 @@ float getElapsedTime(const cudaEvent_t &gpu_start, cudaEvent_t &gpu_stop) {
   return gpu_elapsed_time / 1000.0f;
 }
 
-// Function to run the GPU benchmark with no time limit
-void runBenchmark(long max_work) {
-  uint32_t n = 256 * 256;
-  uint64_t m = max_work * 16384 / n;
-
-  unsigned long long int *d_count;
-  curandState *d_state;
-  CUDA_CHECK(cudaMalloc((void **)&d_count, 256 * sizeof(unsigned long long int)));
-  CUDA_CHECK(cudaMalloc((void **)&d_state, n * sizeof(curandState)));
-  CUDA_CHECK(cudaMemset(d_count, 0, 256 * sizeof(unsigned long long int)));
-
-  // set up timing stuff
-  cudaEvent_t gpu_start, gpu_stop;
-  CUDA_CHECK(cudaEventCreate(&gpu_start));
-  CUDA_CHECK(cudaEventCreate(&gpu_stop));
-
-  // set kernel
-  dim3 gridSize = 256;
-  dim3 blockSize = 256;
-  setup_kernel<<<gridSize, blockSize>>>(d_state);
-
-  // monte carlo kernel
-  CUDA_CHECK(cudaEventRecord(gpu_start, 0));
-  monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
-  CUDA_CHECK(cudaDeviceSynchronize());
-
-  float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
-  CUDA_CHECK(cudaEventDestroy(gpu_start));
-  CUDA_CHECK(cudaEventDestroy(gpu_stop));
-
-  // Allocate device output array
-  unsigned long long int *d_out = nullptr;
-  CUDA_CHECK(cudaMalloc((void **)&d_out, sizeof(unsigned long long int)));
-
-  // Request and allocate temporary storage
-  void *d_temp_storage = nullptr;
-  size_t temp_storage_bytes = 0;
-  CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
-  CUDA_CHECK(cudaMalloc((void **)&d_temp_storage, temp_storage_bytes));
-
-  // Run
-  CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
-
-  // copy results back to the host
-  unsigned long long int h_count = 0;
-  CUDA_CHECK(cudaMemcpy(&h_count, d_out, sizeof(unsigned long long int), cudaMemcpyDeviceToHost));
-
-  // display results and timings for gpu
-  float pi = h_count * 4.0 / (n * m);
-  std::cout << "Approximate pi calculated on GPU is: " << pi << " and calculation took " << gpu_elapsed_time << "s\n";
-  std::cout << "Benchmark completed!" << std::endl;
-
-  CUDA_CHECK(cudaFree(d_count));
-  CUDA_CHECK(cudaFree(d_state));
-  CUDA_CHECK(cudaFree(d_out));
-  CUDA_CHECK(cudaFree(d_temp_storage));
-}
-
 // Function to run the GPU benchmark for a specified time
-void runBenchmarkTime(long max_work, int runtime_in_seconds) {
-
+void runBenchmarkTime(long max_work, std::optional<int> runtime_in_seconds) {
   uint32_t n = 256 * 256;
-  uint64_t m = max_work * 16384 / n;
+  uint64_t m = (max_work + n - 1) / n;
 
   // allocate memory
   unsigned long long int *d_count;
@@ -108,13 +50,21 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
 
   setup_kernel<<<gridSize, blockSize>>>(d_state);
 
+  // monte carlo kernel
   CUDA_CHECK(cudaEventRecord(gpu_start, 0));
+
   int iteration = 0;
-  // Run the workload loop until the specified runtime is reached
-  while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) {
+  if (!runtime_in_seconds.has_value()) {
     monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
-    CUDA_CHECK(cudaDeviceSynchronize()); // Ensure the kernel has finished executing
+    CUDA_CHECK(cudaDeviceSynchronize());
     iteration++;
+  } else {
+    // Run the workload loop until the specified runtime is reached
+    while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) {
+      monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
+      CUDA_CHECK(cudaDeviceSynchronize()); // Ensure the kernel has finished executing
+      iteration++;
+    }
   }
 
   float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
@@ -149,6 +99,8 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
   CUDA_CHECK(cudaFree(d_temp_storage));
 }
 
+void runBenchmark(long max_work) { runBenchmarkTime(max_work, std::nullopt); }
+
 int main(int argc, char *argv[]) {
   // Check for the correct number of command line arguments
   if (argc == 2) {

diff --git a/bin/cuda/gpu_benchmark.h b/bin/cuda/gpu_benchmark.h
diff --git a/bin/hip/gpu_benchmark.hip b/bin/hip/gpu_benchmark.hip
@@ -3,6 +3,7 @@
 
 #include <chrono>
 #include <iostream>
+#include <optional>
 
 #define HIP_CHECK(expression)                  \
 {                                              \
@@ -24,69 +25,10 @@ float getElapsedTime(const hipEvent_t &gpu_start, hipEvent_t &gpu_stop) {
   return gpu_elapsed_time / 1000.0f;
 }
 
-// Function to run the GPU benchmark with no time limit
-void runBenchmark(long max_work) {
-  uint32_t n = 256 * 256;
-  uint64_t m = max_work * 16384 / n;
-
-  unsigned long long int *d_count;
-  hiprandState *d_state;
-  HIP_CHECK(hipMalloc((void **)&d_count, 256 * sizeof(unsigned long long int)));
-  HIP_CHECK(hipMalloc((void **)&d_state, n * sizeof(hiprandState)));
-  HIP_CHECK(hipMemset(d_count, 0, 256 * sizeof(unsigned long long int)));
-
-  // set up timing stuff
-  hipEvent_t gpu_start, gpu_stop;
-  HIP_CHECK(hipEventCreate(&gpu_start));
-  HIP_CHECK(hipEventCreate(&gpu_stop));
-
-  // set kernel
-  dim3 gridSize = 256;
-  dim3 blockSize = 256;
-  setup_kernel<<<gridSize, blockSize>>>(d_state);
-
-  // monte carlo kernel
-  HIP_CHECK(hipEventRecord(gpu_start, 0));
-  monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
-  HIP_CHECK(hipDeviceSynchronize());
-
-  float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
-  HIP_CHECK(hipEventDestroy(gpu_start));
-  HIP_CHECK(hipEventDestroy(gpu_stop));
-
-  // Allocate device output array
-  unsigned long long int *d_out = nullptr;
-  HIP_CHECK(hipMalloc((void **)&d_out, sizeof(unsigned long long int)));
-
-  // Request and allocate temporary storage
-  void *d_temp_storage = nullptr;
-  size_t temp_storage_bytes = 0;
-  HIP_CHECK(hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
-  HIP_CHECK(hipMalloc((void **)&d_temp_storage, temp_storage_bytes));
-
-  // Run
-  HIP_CHECK(hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256));
-
-  // copy results back to the host
-  unsigned long long int h_count = 0;
-  HIP_CHECK(hipMemcpy(&h_count, d_out, sizeof(unsigned long long int), hipMemcpyDeviceToHost));
-
-  // display results and timings for gpu
-  float pi = h_count * 4.0 / (n * m);
-  std::cout << "Approximate pi calculated on GPU is: " << pi << " and calculation took " << gpu_elapsed_time << "s\n";
-  std::cout << "Benchmark completed!" << std::endl;
-
-  HIP_CHECK(hipFree(d_count));
-  HIP_CHECK(hipFree(d_state));
-  HIP_CHECK(hipFree(d_out));
-  HIP_CHECK(hipFree(d_temp_storage));
-}
-
 // Function to run the GPU benchmark for a specified time
-void runBenchmarkTime(long max_work, int runtime_in_seconds) {
-
+void runBenchmarkTime(long max_work, std::optional<int> runtime_in_seconds) {
   uint32_t n = 256 * 256;
-  uint64_t m = max_work * 16384 / n;
+  uint64_t m = (max_work + n - 1) / n;
 
   // allocate memory
   unsigned long long int *d_count;
@@ -99,20 +41,27 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
   hipEvent_t gpu_start, gpu_stop;
   HIP_CHECK(hipEventCreate(&gpu_start));
   HIP_CHECK(hipEventCreate(&gpu_stop));
-  HIP_CHECK(hipEventRecord(gpu_start, 0));
 
   // set kernel
   dim3 gridSize = 256;
   dim3 blockSize = 256;
 
   setup_kernel<<<gridSize, blockSize>>>(d_state);
 
+  HIP_CHECK(hipEventRecord(gpu_start, 0));
   int iteration = 0;
-  // Run the workload loop until the specified runtime is reached
-  while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) {
+
+  if (!runtime_in_seconds.has_value()) {
     monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
-    HIP_CHECK(hipDeviceSynchronize()); // Ensure the kernel has finished executing
+    HIP_CHECK(hipDeviceSynchronize());
     iteration++;
+  } else {
+    // Run the workload loop until the specified runtime is reached
+    while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) {
+      monte_carlo_kernel<<<gridSize, blockSize>>>(d_state, d_count, m);
+      HIP_CHECK(hipDeviceSynchronize()); // Ensure the kernel has finished executing
+      iteration++;
+    }
   }
 
   float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop);
@@ -147,6 +96,10 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) {
   HIP_CHECK(hipFree(d_temp_storage));
 }
 
+void runBenchmark(long max_work) {
+  runBenchmarkTime(max_work, std::nullopt);
+}
+
 int main(int argc, char *argv[]) {
   // Check for the correct number of command line arguments
   if (argc == 2) {

diff --git a/bin/wfbench b/bin/wfbench
@@ -20,6 +20,7 @@ import json
 import logging
 import pandas as pd
 import psutil
+import shutil
 
 from io import StringIO
 from filelock import FileLock
@@ -230,11 +231,21 @@ class GPUBenchmark:
 
     @staticmethod
     def get_available_gpus():
-        proc = subprocess.Popen(["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv"], stdout=subprocess.PIPE,
-                                stderr=subprocess.PIPE)
-        stdout, _ = proc.communicate()
-        df = pd.read_csv(StringIO(stdout.decode("utf-8")), sep=" ")
-        return df[df["utilization.gpu"] <= 5].index.to_list()
+        if shutil.which("nvidia-smi") is not None:
+            proc = subprocess.Popen(["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv"], stdout=subprocess.PIPE,
+                                    stderr=subprocess.PIPE)
+            stdout, _ = proc.communicate()
+            df = pd.read_csv(StringIO(stdout.decode("utf-8")), sep=" ")
+            return df[df["utilization.gpu"] <= 5].index.to_list()
+        elif shutil.which("amd-smi") is not None:
+            proc = subprocess.Popen(["amd-smi", "monitor", "-u", "--csv"], stdout=subprocess.PIPE,
+                                    stderr=subprocess.PIPE)
+            stdout, _ = proc.communicate()
+            df = pd.read_csv(StringIO(stdout.decode("utf-8")), sep=",")
+            return df[df["gfx"] <= 5].index.to_list()
+        else:
+            log_error("No supported GPU monitoring tool found.")
+            return []
 
     def __init__(self):
         self.work = None
@@ -250,7 +261,7 @@ class GPUBenchmark:
         log_debug(f"GPU benchmark instantiated for device {self.device}")
 
     def set_work(self, work: int):
-        self.work = work
+        self.work = 1000000 * work
 
     def set_time(self, duration: float):
         self.duration = duration
@@ -261,13 +272,20 @@ class GPUBenchmark:
 
         if self.duration is not None:
             log_debug(f"Running GPU benchmark for {self.duration} seconds")
-            gpu_prog = [
-                f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work} {self.duration}"]
+            if shutil.which("nvidia-smi") is not None:
+                gpu_prog = [
+                    f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work} {self.duration}"]
+            else:
+                gpu_prog = [
+                    f"HIP_DEVICE_ORDER=PCI_BUS_ID HIP_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work} {self.duration}"]
         else:
             log_debug(f"Running GPU benchmark for {self.work} units of work")
-            gpu_prog = [
-                f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work}"]
-
+            if shutil.which("nvidia-smi") is not None:
+                gpu_prog = [
+                    f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work}"]
+            else:
+                gpu_prog = [
+                    f"HIP_DEVICE_ORDER=PCI_BUS_ID HIP_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work}"]
         p = subprocess.Popen(gpu_prog, shell=True)
         return ProcessHandle(p)