[REFACTOR][CUDA] Phase out cuda_common.h by tqchen · Pull Request #19770 · apache/tvm

tqchen · 2026-06-14T19:46:43Z

Summary

The CUDA runtime can rely on the shared tvm-ffi CUDA error helper instead of carrying a TVM-local common header. This keeps CUDA error handling aligned with the FFI CUDA support and removes the remaining reason for src/backend/cuda/runtime/cuda_common.h.

Main changes:

Move CUDA workspace thread-local state into cuda_device_api.cc.
Replace CUDA_CALL use sites with TVM_FFI_CHECK_CUDA_ERROR from tvm/ffi/extra/cuda/base.h.
Inline the remaining CUDA driver checks at their call sites and delete cuda_common.h.

The CUDA runtime can rely on the shared tvm-ffi CUDA error helper instead of carrying a TVM-local common header. This PR removes the legacy header by moving the CUDA workspace state into cuda_device_api.cc, replacing CUDA_CALL users with TVM_FFI_CHECK_CUDA_ERROR, and inlining the remaining CUDA driver checks at their call sites.

gemini-code-assist

Code Review

This pull request migrates the CUDA runtime and contrib modules to use the new FFI-based error checking (TVM_FFI_CHECK_CUDA_ERROR) and removes the legacy cuda_common.h header. The review feedback highlights several critical issues regarding exception safety and error handling: potential memory and resource leaks on exception paths in cuda_ipc_memory.cc, tensorrt_calibrator.h, and cublas_utils.cc; potential corruption of the thread-local stream environment in cuda_graph_builtin.cc if stream capture fails; and potential undefined behavior or crashes in cuda_device_api.cc and cuda_module.cc if cuGetErrorName returns a null pointer.

Important

The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.

gemini-code-assist · 2026-06-14T19:48:38Z

+  TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&d_src, CUDA_IPC_HANDLE_SIZE));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&d_dst, CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers));
+  TVM_FFI_CHECK_CUDA_ERROR(
+      cudaMemcpy(d_src, &local_handle, CUDA_IPC_HANDLE_SIZE, cudaMemcpyHostToDevice));
  NCCL_CALL(ncclAllGather(d_src, d_dst, CUDA_IPC_HANDLE_SIZE, ncclChar, ctx->global_comm,
                          /*stream=*/nullptr));
  std::vector<char> serial_handles(CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers, 0);
-  CUDA_CALL(cudaMemcpy(serial_handles.data(), d_dst,
-                       CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers, cudaMemcpyDefault));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpy(serial_handles.data(), d_dst,
+                                      CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers,
+                                      cudaMemcpyDefault));
  std::vector<cudaIpcMemHandle_t> handles(ctx->worker->num_workers);
  for (int i = 0; i < ctx->worker->num_workers; ++i) {
    memcpy(handles[i].reserved, &serial_handles[i * CUDA_IPC_HANDLE_SIZE], CUDA_IPC_HANDLE_SIZE);
  }
-  CUDA_CALL(cudaFree(d_src));
-  CUDA_CALL(cudaFree(d_dst));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaFree(d_src));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaFree(d_dst));


If cudaMalloc for d_dst, cudaMemcpy, or ncclAllGather throws an exception, the allocated GPU memory d_src and d_dst will be leaked. We should use a custom deleter with std::unique_ptr to ensure cudaFree is called on exception paths.

void *d_src = nullptr, *d_dst = nullptr; struct CUDAFreeDeleter { void operator()(void* ptr) const { if (ptr) cudaFree(ptr); } }; TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&d_src, CUDA_IPC_HANDLE_SIZE)); std::unique_ptr<void, CUDAFreeDeleter> src_guard(d_src); TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&d_dst, CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers)); std::unique_ptr<void, CUDAFreeDeleter> dst_guard(d_dst); TVM_FFI_CHECK_CUDA_ERROR( cudaMemcpy(d_src, &local_handle, CUDA_IPC_HANDLE_SIZE, cudaMemcpyHostToDevice)); NCCL_CALL(ncclAllGather(d_src, d_dst, CUDA_IPC_HANDLE_SIZE, ncclChar, ctx->global_comm, /*stream=*/nullptr)); std::vector<char> serial_handles(CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers, 0); TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpy(serial_handles.data(), d_dst, CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers, cudaMemcpyDefault)); std::vector<cudaIpcMemHandle_t> handles(ctx->worker->num_workers); for (int i = 0; i < ctx->worker->num_workers; ++i) { memcpy(handles[i].reserved, &serial_handles[i * CUDA_IPC_HANDLE_SIZE], CUDA_IPC_HANDLE_SIZE); } return handles;

gemini-code-assist · 2026-06-14T19:48:39Z

+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id_));
    TVM_FFI_CHECK_SAFE_CALL(
        TVMFFIEnvSetStream(kDLCUDA, device_id_, capture_stream_,
                           reinterpret_cast<TVMFFIStreamHandle*>(&prev_default_stream_)));
-    CUDA_CALL(cudaStreamBeginCapture(capture_stream_, cudaStreamCaptureModeGlobal));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaStreamBeginCapture(capture_stream_, cudaStreamCaptureModeGlobal));


If cudaStreamBeginCapture throws an exception, the constructor of CUDACaptureStream fails and the destructor is never called. This leaves the thread-local stream environment pointing to the destroyed capture_stream_ member, causing subsequent operations to crash or exhibit undefined behavior. We should catch any exception and restore the stream environment before rethrowing.

TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id_)); TVM_FFI_CHECK_SAFE_CALL( TVMFFIEnvSetStream(kDLCUDA, device_id_, capture_stream_, reinterpret_cast<TVMFFIStreamHandle*>(&prev_default_stream_))); try { TVM_FFI_CHECK_CUDA_ERROR(cudaStreamBeginCapture(capture_stream_, cudaStreamCaptureModeGlobal)); } catch (...) { TVMFFIEnvSetStream(kDLCUDA, device_id_, prev_default_stream_, nullptr); throw; }

gemini-code-assist · 2026-06-14T19:48:39Z

+        CUresult result = cuDeviceGetName(&name[0], name.size(), dev.device_id);
+        if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) {
+          const char* msg;
+          cuGetErrorName(result, &msg);
+          TVM_FFI_THROW(CUDAError) << "cuDeviceGetName failed with error: " << msg;
+        }


The cuGetErrorName function can return CUDA_ERROR_INVALID_VALUE if the error code is unrecognized, which sets msg to nullptr. Streaming a null pointer to TVM_FFI_THROW results in undefined behavior (and potential crashes). It is safer to check if msg is valid or use a fallback string.

CUresult result = cuDeviceGetName(&name[0], name.size(), dev.device_id); if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { const char* msg = nullptr; cuGetErrorName(result, &msg); TVM_FFI_THROW(CUDAError) << "cuDeviceGetName failed with error: " << (msg ? msg : "unknown error"); }

gemini-code-assist · 2026-06-14T19:48:39Z

+      CUresult result = cuModuleLoadData(&(module_[device_id]), code_.data());
+      if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) {
+        const char* msg;
+        cuGetErrorName(result, &msg);
+        TVM_FFI_THROW(CUDAError) << "cuModuleLoadData failed with error: " << msg;
+      }


The cuGetErrorName function can return CUDA_ERROR_INVALID_VALUE if the error code is unrecognized, which sets msg to nullptr. Streaming a null pointer to TVM_FFI_THROW results in undefined behavior (and potential crashes). It is safer to check if msg is valid or use a fallback string.

Suggested change

CUresult result = cuModuleLoadData(&(module_[device_id]), code_.data());

if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) {

const char* msg;

cuGetErrorName(result, &msg);

TVM_FFI_THROW(CUDAError) << "cuModuleLoadData failed with error: " << msg;

}

CUresult result = cuModuleLoadData(&(module_[device_id]), code_.data());

if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) {

const char* msg = nullptr;

cuGetErrorName(result, &msg);

TVM_FFI_THROW(CUDAError) << "cuModuleLoadData failed with error: " << (msg ? msg : "unknown error");

}

gemini-code-assist · 2026-06-14T19:48:39Z

    std::vector<float*> data_host(bindings.size(), nullptr);
    for (size_t i = 0; i < bindings.size(); ++i) {
      data_host[i] = new float[batch_size_ * binding_sizes[i]];
-      CUDA_CALL(cudaMemcpy(static_cast<void*>(data_host[i]), bindings[i],
-                           batch_size_ * binding_sizes[i] * sizeof(float), cudaMemcpyDeviceToHost));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpy(static_cast<void*>(data_host[i]), bindings[i],
+                                          batch_size_ * binding_sizes[i] * sizeof(float),
+                                          cudaMemcpyDeviceToHost));
    }


If cudaMemcpy throws an exception, any previously allocated data_host[i] in the loop will be leaked because they are not yet pushed to data_ and the destructor only cleans up data_. We should use a try-catch block to free them on failure.

std::vector<float*> data_host(bindings.size(), nullptr); try { for (size_t i = 0; i < bindings.size(); ++i) { data_host[i] = new float[batch_size_ * binding_sizes[i]]; TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpy(static_cast<void*>(data_host[i]), bindings[i], batch_size_ * binding_sizes[i] * sizeof(float), cudaMemcpyDeviceToHost)); } } catch (...) { for (float* ptr : data_host) { delete[] ptr; } throw; }

gemini-code-assist · 2026-06-14T19:48:39Z

 CuBlasLtThreadEntry::CuBlasLtThreadEntry() {
  CHECK_CUBLAS_ERROR(cublasLtCreate(&handle));
  CHECK_CUBLAS_ERROR(cublasLtMatmulPreferenceCreate(&matmul_pref_desc));
-  CUDA_CALL(cudaMalloc(&workspace_ptr, workspace_size));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&workspace_ptr, workspace_size));
 }


If cudaMalloc throws an exception, the constructor fails and the destructor is not called. This causes handle and matmul_pref_desc to be leaked. We should wrap the constructor body in a try-catch block to perform cleanup on failure.

CuBlasLtThreadEntry::CuBlasLtThreadEntry() { CHECK_CUBLAS_ERROR(cublasLtCreate(&handle)); try { CHECK_CUBLAS_ERROR(cublasLtMatmulPreferenceCreate(&matmul_pref_desc)); TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&workspace_ptr, workspace_size)); } catch (...) { if (matmul_pref_desc) { cublasLtMatmulPreferenceDestroy(matmul_pref_desc); } if (handle) { cublasLtDestroy(handle); } throw; } }

gemini-code-assist Bot reviewed Jun 14, 2026

View reviewed changes

tlopex approved these changes Jun 14, 2026

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[REFACTOR][CUDA] Phase out cuda_common.h#19770

[REFACTOR][CUDA] Phase out cuda_common.h#19770
tqchen wants to merge 1 commit into
apache:mainfrom
tqchen:tvm-cuda-common-simplification-phase-out-cuda-common-h

tqchen commented Jun 14, 2026

Uh oh!

gemini-code-assist Bot left a comment

Uh oh!

gemini-code-assist Bot Jun 14, 2026

Uh oh!

gemini-code-assist Bot Jun 14, 2026

Uh oh!

gemini-code-assist Bot Jun 14, 2026

Uh oh!

gemini-code-assist Bot Jun 14, 2026

Uh oh!

gemini-code-assist Bot Jun 14, 2026

Uh oh!

gemini-code-assist Bot Jun 14, 2026

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants

Conversation

tqchen commented Jun 14, 2026

Summary

Uh oh!

gemini-code-assist Bot left a comment

Choose a reason for hiding this comment

Code Review

Uh oh!

gemini-code-assist Bot Jun 14, 2026

Choose a reason for hiding this comment

Uh oh!

gemini-code-assist Bot Jun 14, 2026

Choose a reason for hiding this comment

Uh oh!

gemini-code-assist Bot Jun 14, 2026

Choose a reason for hiding this comment

Uh oh!

gemini-code-assist Bot Jun 14, 2026

Choose a reason for hiding this comment

Uh oh!

gemini-code-assist Bot Jun 14, 2026

Choose a reason for hiding this comment

Uh oh!

gemini-code-assist Bot Jun 14, 2026

Choose a reason for hiding this comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants