From 0f09209857c730907e308e590670de247428004f Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sun, 14 Jun 2026 19:46:17 +0000
Subject: [PATCH 1/2] [REFACTOR][CUDA] Phase out cuda_common.h

The CUDA runtime can rely on the shared tvm-ffi CUDA error helper instead of carrying a TVM-local common header. This PR removes the legacy header by moving the CUDA workspace state into cuda_device_api.cc, replacing CUDA_CALL users with TVM_FFI_CHECK_CUDA_ERROR, and inlining the remaining CUDA driver checks at their call sites.
---
 src/backend/cuda/runtime/cuda_common.h        |  69 ---------
 src/backend/cuda/runtime/cuda_device_api.cc   | 140 ++++++++++--------
 src/backend/cuda/runtime/cuda_module.cc       |  16 +-
 .../contrib/cublas/cublas_json_runtime.cc     |   4 +-
 .../extra/contrib/cublas/cublas_utils.cc      |   5 +-
 .../extra/contrib/cudnn/conv_backward.cc      |   4 +-
 .../extra/contrib/cudnn/conv_forward.cc       |   2 +-
 .../contrib/cudnn/cudnn_frontend/attention.cc |   4 +-
 .../extra/contrib/cudnn/cudnn_json_runtime.cc |   4 +-
 .../extra/contrib/cudnn/cudnn_utils.cc        |   2 +-
 src/runtime/extra/contrib/cudnn/cudnn_utils.h |   3 +-
 src/runtime/extra/contrib/cudnn/softmax.cc    |   2 +-
 src/runtime/extra/contrib/curand/curand.cc    |   2 +-
 .../cutlass/fp16_group_gemm_runner_sm100.cuh  |   3 +-
 .../cutlass/fp16_group_gemm_runner_sm90.cuh   |   3 +-
 ...fp8_groupwise_scaled_gemm_runner_sm100.cuh |   4 +-
 .../fp8_groupwise_scaled_gemm_runner_sm90.cuh |   4 +-
 ...oupwise_scaled_group_gemm_runner_sm100.cuh |   4 +-
 .../extra/contrib/cutlass/gemm_runner.cuh     |   3 +-
 .../extra/contrib/nvshmem/dist_gemm.cu        |   7 +-
 src/runtime/extra/contrib/nvshmem/init.cc     |   7 +-
 .../extra/contrib/nvshmem/memory_allocator.cc |   2 +-
 .../contrib/tensorrt/tensorrt_calibrator.h    |  19 ++-
 src/runtime/extra/contrib/thrust/thrust.cu    |   5 +-
 .../extra/disco/cuda_ipc/cuda_ipc_memory.cc   |  39 ++---
 src/runtime/extra/disco/nccl/nccl_context.h   |  17 ++-
 src/runtime/vm/cuda/cuda_graph_builtin.cc     |  18 +--
 27 files changed, 181 insertions(+), 211 deletions(-)
 delete mode 100644 src/backend/cuda/runtime/cuda_common.h
diff --git a/src/backend/cuda/runtime/cuda_common.h b/src/backend/cuda/runtime/cuda_common.h
deleted file mode 100644
index 183a2e870229..000000000000
--- a/src/backend/cuda/runtime/cuda_common.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file cuda_common.h
- * \brief Common utilities for CUDA
- */
-#ifndef TVM_RUNTIME_CUDA_CUDA_COMMON_H_
-#define TVM_RUNTIME_CUDA_CUDA_COMMON_H_
-
-#include <cuda_runtime.h>
-#include <tvm/ffi/function.h>
-
-#include <string>
-
-#include "../../../runtime/workspace_pool.h"
-
-namespace tvm {
-namespace runtime {
-
-#define CUDA_DRIVER_CALL(x)                                             \
-  {                                                                     \
-    CUresult result = x;                                                \
-    if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
-      const char* msg;                                                  \
-      cuGetErrorName(result, &msg);                                     \
-      TVM_FFI_THROW(CUDAError) << "" #x " failed with error: " << msg;  \
-    }                                                                   \
-  }
-
-#ifndef CUDA_CALL
-#define CUDA_CALL(func)                                               \
-  {                                                                   \
-    cudaError_t e = (func);                                           \
-    TVM_FFI_ICHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
-        << "CUDA: " << cudaGetErrorString(e);                         \
-  }
-#endif
-
-/*! \brief Thread local workspace */
-class CUDAThreadEntry {
- public:
-  /*! \brief thread local pool*/
-  WorkspacePool pool;
-  /*! \brief constructor */
-  CUDAThreadEntry();
-  // get the threadlocal workspace
-  static CUDAThreadEntry* ThreadLocal();
-};
-
-}  // namespace runtime
-}  // namespace tvm
-#endif  // TVM_RUNTIME_CUDA_CUDA_COMMON_H_
diff --git a/src/backend/cuda/runtime/cuda_device_api.cc b/src/backend/cuda/runtime/cuda_device_api.cc
index 969f40a081f4..68ae39de56bf 100644
--- a/src/backend/cuda/runtime/cuda_device_api.cc
+++ b/src/backend/cuda/runtime/cuda_device_api.cc
@@ -24,6 +24,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/device_api.h>
@@ -32,14 +33,26 @@
 
 #include <cstring>
 
-#include "cuda_common.h"
+#include "../../../runtime/workspace_pool.h"
 
 namespace tvm {
 namespace runtime {
 
+#ifndef CUDA_DRIVER_CALL
+#define CUDA_DRIVER_CALL(x)                                             \
+  {                                                                     \
+    CUresult result = x;                                                \
+    if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
+      const char* msg;                                                  \
+      cuGetErrorName(result, &msg);                                     \
+      TVM_FFI_THROW(CUDAError) << "" #x " failed with error: " << msg;  \
+    }                                                                   \
+  }
+#endif
+
 class CUDADeviceAPI final : public DeviceAPI {
  public:
-  void SetDevice(Device dev) final { CUDA_CALL(cudaSetDevice(dev.device_id)); }
+  void SetDevice(Device dev) final { TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id)); }
   void GetAttr(Device dev, DeviceAttrKind kind, ffi::Any* rv) final {
     int value = 0;
     switch (kind) {
@@ -50,23 +63,27 @@ class CUDADeviceAPI final : public DeviceAPI {
         break;
       }
       case kMaxThreadsPerBlock: {
-        CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, dev.device_id));
         break;
       }
       case kWarpSize: {
-        CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, dev.device_id));
         break;
       }
       case kMaxSharedMemoryPerBlock: {
-        CUDA_CALL(
+        TVM_FFI_CHECK_CUDA_ERROR(
             cudaDeviceGetAttribute(&value, cudaDevAttrMaxSharedMemoryPerBlock, dev.device_id));
         break;
       }
       case kComputeVersion: {
         std::ostringstream os;
-        CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMajor, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMajor, dev.device_id));
         os << value << ".";
-        CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMinor, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMinor, dev.device_id));
         os << value;
         *rv = os.str();
         return;
@@ -79,18 +96,23 @@ class CUDADeviceAPI final : public DeviceAPI {
         return;
       }
       case kMaxClockRate: {
-        CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrClockRate, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrClockRate, dev.device_id));
         break;
       }
       case kMultiProcessorCount: {
-        CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMultiProcessorCount, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrMultiProcessorCount, dev.device_id));
         break;
       }
       case kMaxThreadDimensions: {
         int dims[3];
-        CUDA_CALL(cudaDeviceGetAttribute(&dims[0], cudaDevAttrMaxBlockDimX, dev.device_id));
-        CUDA_CALL(cudaDeviceGetAttribute(&dims[1], cudaDevAttrMaxBlockDimY, dev.device_id));
-        CUDA_CALL(cudaDeviceGetAttribute(&dims[2], cudaDevAttrMaxBlockDimZ, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&dims[0], cudaDevAttrMaxBlockDimX, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&dims[1], cudaDevAttrMaxBlockDimY, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&dims[2], cudaDevAttrMaxBlockDimZ, dev.device_id));
 
         std::stringstream ss;  // use json string to return multiple int values;
         ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
@@ -98,7 +120,8 @@ class CUDADeviceAPI final : public DeviceAPI {
         return;
       }
       case kMaxRegistersPerBlock: {
-        CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxRegistersPerBlock, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&value, cudaDevAttrMaxRegistersPerBlock, dev.device_id));
         break;
       }
       case kGcnArch:
@@ -112,20 +135,21 @@ class CUDADeviceAPI final : public DeviceAPI {
       case kL2CacheSizeBytes: {
         // Get size of device l2 cache size in bytes.
         int l2_size = 0;
-        CUDA_CALL(cudaDeviceGetAttribute(&l2_size, cudaDevAttrL2CacheSize, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(
+            cudaDeviceGetAttribute(&l2_size, cudaDevAttrL2CacheSize, dev.device_id));
         *rv = l2_size;
         return;
       }
       case kTotalGlobalMemory: {
         cudaDeviceProp prop;
-        CUDA_CALL(cudaGetDeviceProperties(&prop, dev.device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaGetDeviceProperties(&prop, dev.device_id));
         int64_t total_global_memory = prop.totalGlobalMem;
         *rv = total_global_memory;
         return;
       }
       case kAvailableGlobalMemory: {
         size_t free_mem, total_mem;
-        CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaMemGetInfo(&free_mem, &total_mem));
         *rv = static_cast<int64_t>(free_mem);
         return;
       }
@@ -139,14 +163,14 @@ class CUDADeviceAPI final : public DeviceAPI {
     void* ret;
     if (dev.device_type == kDLCUDAHost) {
       VLOG(1) << "allocating " << nbytes << "bytes on host";
-      CUDA_CALL(cudaMallocHost(&ret, nbytes));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMallocHost(&ret, nbytes));
     } else {
-      CUDA_CALL(cudaSetDevice(dev.device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
       size_t free_mem, total_mem;
-      CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMemGetInfo(&free_mem, &total_mem));
       VLOG(1) << "allocating " << nbytes << " bytes on device, with " << free_mem
               << " bytes currently free out of " << total_mem << " bytes available";
-      CUDA_CALL(cudaMalloc(&ret, nbytes));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&ret, nbytes));
     }
     return ret;
   }
@@ -172,11 +196,11 @@ class CUDADeviceAPI final : public DeviceAPI {
 
     if (dev.device_type == kDLCUDAHost) {
       VLOG(1) << "freeing host memory";
-      CUDA_CALL(cudaFreeHost(ptr));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaFreeHost(ptr));
     } else {
-      CUDA_CALL(cudaSetDevice(dev.device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
       VLOG(1) << "freeing device memory";
-      CUDA_CALL(cudaFree(ptr));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaFree(ptr));
     }
   }
 
@@ -203,17 +227,17 @@ class CUDADeviceAPI final : public DeviceAPI {
     }
 
     if (dev_from.device_type == kDLCUDA && dev_to.device_type == kDLCUDA) {
-      CUDA_CALL(cudaSetDevice(dev_from.device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev_from.device_id));
       if (dev_from.device_id == dev_to.device_id) {
         GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
       } else {
         cudaMemcpyPeerAsync(to, dev_to.device_id, from, dev_from.device_id, size, cu_stream);
       }
     } else if (dev_from.device_type == kDLCUDA && dev_to.device_type == kDLCPU) {
-      CUDA_CALL(cudaSetDevice(dev_from.device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev_from.device_id));
       GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
     } else if (dev_from.device_type == kDLCPU && dev_to.device_type == kDLCUDA) {
-      CUDA_CALL(cudaSetDevice(dev_to.device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev_to.device_id));
       GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
     } else {
       TVM_FFI_THROW(InternalError) << "expect copy from/to GPU or between GPU";
@@ -222,40 +246,40 @@ class CUDADeviceAPI final : public DeviceAPI {
 
  public:
   TVMStreamHandle CreateStream(Device dev) {
-    CUDA_CALL(cudaSetDevice(dev.device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
     cudaStream_t retval;
-    CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking));
     return static_cast<TVMStreamHandle>(retval);
   }
 
   void FreeStream(Device dev, TVMStreamHandle stream) {
-    CUDA_CALL(cudaSetDevice(dev.device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
     cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
-    CUDA_CALL(cudaStreamDestroy(cu_stream));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaStreamDestroy(cu_stream));
   }
 
   void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst) {
-    CUDA_CALL(cudaSetDevice(dev.device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
     cudaStream_t src_stream = static_cast<cudaStream_t>(event_src);
     cudaStream_t dst_stream = static_cast<cudaStream_t>(event_dst);
     cudaEvent_t evt;
-    CUDA_CALL(cudaEventCreate(&evt));
-    CUDA_CALL(cudaEventRecord(evt, src_stream));
-    CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0));
-    CUDA_CALL(cudaEventDestroy(evt));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventCreate(&evt));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventRecord(evt, src_stream));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaStreamWaitEvent(dst_stream, evt, 0));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventDestroy(evt));
   }
 
   void StreamSync(Device dev, TVMStreamHandle stream) final {
-    CUDA_CALL(cudaSetDevice(dev.device_id));
-    CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
   }
 
   void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final {
-    return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size);
+    return ThreadLocalWorkspacePool()->AllocWorkspace(dev, size);
   }
 
   void FreeWorkspace(Device dev, void* data) final {
-    CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(dev, data);
+    ThreadLocalWorkspacePool()->FreeWorkspace(dev, data);
   }
 
   bool SupportsDevicePointerArithmeticsOnHost() final { return true; }
@@ -268,17 +292,17 @@ class CUDADeviceAPI final : public DeviceAPI {
   }
 
  private:
+  static WorkspacePool* ThreadLocalWorkspacePool();
+
   static void GPUCopy(const void* from, void* to, size_t size, cudaMemcpyKind kind,
                       cudaStream_t stream) {
-    CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpyAsync(to, from, size, kind, stream));
   }
 };
 
-CUDAThreadEntry::CUDAThreadEntry() : pool(kDLCUDA, CUDADeviceAPI::Global()) {}
-
-CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() {
-  static thread_local CUDAThreadEntry inst;
-  return &inst;
+WorkspacePool* CUDADeviceAPI::ThreadLocalWorkspacePool() {
+  static thread_local WorkspacePool pool(kDLCUDA, CUDADeviceAPI::Global());
+  return &pool;
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -301,28 +325,28 @@ class CUDATimerNode : public TimerNode {
     // This initial cudaEventRecord is sometimes pretty slow (~100us). Does
     // cudaEventRecord do some stream synchronization?
     int device_id;
-    CUDA_CALL(cudaGetDevice(&device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
     stream_ = TVMFFIEnvGetStream(kDLCUDA, device_id);
-    CUDA_CALL(cudaEventRecord(start_, static_cast<cudaStream_t>(stream_)));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventRecord(start_, static_cast<cudaStream_t>(stream_)));
   }
   virtual void Stop() {
     int device_id;
-    CUDA_CALL(cudaGetDevice(&device_id));
-    CUDA_CALL(cudaEventRecord(stop_, static_cast<cudaStream_t>(stream_)));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventRecord(stop_, static_cast<cudaStream_t>(stream_)));
   }
   virtual int64_t SyncAndGetElapsedNanos() {
-    CUDA_CALL(cudaEventSynchronize(stop_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventSynchronize(stop_));
     float milliseconds = 0;
-    CUDA_CALL(cudaEventElapsedTime(&milliseconds, start_, stop_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventElapsedTime(&milliseconds, start_, stop_));
     return milliseconds * 1e6;
   }
   virtual ~CUDATimerNode() {
-    CUDA_CALL(cudaEventDestroy(start_));
-    CUDA_CALL(cudaEventDestroy(stop_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventDestroy(start_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventDestroy(stop_));
   }
   CUDATimerNode() {
-    CUDA_CALL(cudaEventCreate(&start_));
-    CUDA_CALL(cudaEventCreate(&stop_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventCreate(&start_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaEventCreate(&stop_));
   }
   TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.cuda.CUDATimerNode", CUDATimerNode, TimerNode);
 
@@ -340,7 +364,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 TVM_RUNTIME_DLL ffi::String GetCudaFreeMemory() {
   size_t free_mem, total_mem;
-  CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMemGetInfo(&free_mem, &total_mem));
   std::stringstream ss;
   ss << "Current CUDA memory is " << free_mem << " bytes free out of " << total_mem
      << " bytes on device";
@@ -355,14 +379,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         // TODO(tvm-team): remove once confirms all dep such as flashinfer
         // migrated to TVMFFIEnvGetStream
         int device_id;
-        CUDA_CALL(cudaGetDevice(&device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
         return static_cast<void*>(TVMFFIEnvGetStream(kDLCUDA, device_id));
       });
 }
 
 TVM_RUNTIME_DLL int GetCudaDeviceCount() {
   int count;
-  CUDA_CALL(cudaGetDeviceCount(&count));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDeviceCount(&count));
   return count;
 }
 
diff --git a/src/backend/cuda/runtime/cuda_module.cc b/src/backend/cuda/runtime/cuda_module.cc
index 8ea734da6906..5a598c87cc6a 100644
--- a/src/backend/cuda/runtime/cuda_module.cc
+++ b/src/backend/cuda/runtime/cuda_module.cc
@@ -28,6 +28,7 @@
 #include <cuda_runtime.h>
 #include <tvm/ffi/cast.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/extra/module.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
@@ -41,11 +42,22 @@
 #include "../../../runtime/pack_args.h"
 #include "../../../runtime/thread_storage_scope.h"
 #include "../../../support/bytes_io.h"
-#include "cuda_common.h"
 
 namespace tvm {
 namespace runtime {
 
+#ifndef CUDA_DRIVER_CALL
+#define CUDA_DRIVER_CALL(x)                                             \
+  {                                                                     \
+    CUresult result = x;                                                \
+    if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
+      const char* msg;                                                  \
+      cuGetErrorName(result, &msg);                                     \
+      TVM_FFI_THROW(CUDAError) << "" #x " failed with error: " << msg;  \
+    }                                                                   \
+  }
+#endif
+
 // Maximum number of GPU supported in CUDAModule (file-local).
 static constexpr const int kMaxNumGPUs = 32;
 
@@ -204,7 +216,7 @@ class CUDAWrappedFunc {
   // invoke the function with void arguments
   void operator()(ffi::PackedArgs args, ffi::Any* rv, void** void_args) const {
     int device_id;
-    CUDA_CALL(cudaGetDevice(&device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
     ThreadWorkLoad wl = launch_param_config_.Extract(args);
 
     if (fcache_[device_id] == nullptr) {
diff --git a/src/runtime/extra/contrib/cublas/cublas_json_runtime.cc b/src/runtime/extra/contrib/cublas/cublas_json_runtime.cc
index 6520753e117b..34ba853505aa 100644
--- a/src/runtime/extra/contrib/cublas/cublas_json_runtime.cc
+++ b/src/runtime/extra/contrib/cublas/cublas_json_runtime.cc
@@ -24,6 +24,7 @@
 
 #include <tvm/ffi/cast.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/tensor.h>
@@ -32,7 +33,6 @@
 #include <string>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
 #include "../json/json_node.h"
 #include "../json/json_runtime.h"
 #include "cublas_utils.h"
@@ -89,7 +89,7 @@ class CublasJSONRuntime : public JSONRuntimeBase {
     }
 
     if (device_id == -1) {
-      CUDA_CALL(cudaGetDevice(&device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
     }
     auto* entry_ptr = tvm::contrib::CuBlasLtThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
     cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetStream(kDLCUDA, device_id));
diff --git a/src/runtime/extra/contrib/cublas/cublas_utils.cc b/src/runtime/extra/contrib/cublas/cublas_utils.cc
index b8c239f13a85..dcd115026cda 100644
--- a/src/runtime/extra/contrib/cublas/cublas_utils.cc
+++ b/src/runtime/extra/contrib/cublas/cublas_utils.cc
@@ -23,10 +23,9 @@
 #include "cublas_utils.h"
 
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 namespace tvm {
 namespace contrib {
 
@@ -51,7 +50,7 @@ CuBlasThreadEntry* CuBlasThreadEntry::ThreadLocal(DLDevice curr_device) {
 CuBlasLtThreadEntry::CuBlasLtThreadEntry() {
   CHECK_CUBLAS_ERROR(cublasLtCreate(&handle));
   CHECK_CUBLAS_ERROR(cublasLtMatmulPreferenceCreate(&matmul_pref_desc));
-  CUDA_CALL(cudaMalloc(&workspace_ptr, workspace_size));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&workspace_ptr, workspace_size));
 }
 
 CuBlasLtThreadEntry::~CuBlasLtThreadEntry() {
diff --git a/src/runtime/extra/contrib/cudnn/conv_backward.cc b/src/runtime/extra/contrib/cudnn/conv_backward.cc
index bfc65baaff93..ca711f4e784e 100644
--- a/src/runtime/extra/contrib/cudnn/conv_backward.cc
+++ b/src/runtime/extra/contrib/cudnn/conv_backward.cc
@@ -67,7 +67,7 @@ void BackwardDataFindAlgo(int format, int dims, int groups, const int pad[], con
                           const int dx_dim[], const std::string& data_dtype,
                           const std::string& conv_dtype, bool verbose, ffi::Any* ret) {
   int device_id;
-  CUDA_CALL(cudaGetDevice(&device_id));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
   CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
   const int full_dims = dims + 2;
   std::vector<int64_t> dy_dim_int64(full_dims);
@@ -146,7 +146,7 @@ void BackwardFilterFindAlgo(int format, int dims, int groups, const int pad[], c
                             const int dw_dim[], const std::string& data_dtype,
                             const std::string& conv_dtype, bool verbose, ffi::Any* ret) {
   int device_id;
-  CUDA_CALL(cudaGetDevice(&device_id));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
   CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
   const int full_dims = dims + 2;
   std::vector<int64_t> x_dim_int64(full_dims);
diff --git a/src/runtime/extra/contrib/cudnn/conv_forward.cc b/src/runtime/extra/contrib/cudnn/conv_forward.cc
index 6c6fd7eb4036..befbddd78812 100644
--- a/src/runtime/extra/contrib/cudnn/conv_forward.cc
+++ b/src/runtime/extra/contrib/cudnn/conv_forward.cc
@@ -112,7 +112,7 @@ void FindAlgo(int format, int dims, int groups, const int pad[], const int strid
               const std::string& data_dtype, const std::string& conv_dtype, bool verbose,
               ffi::Any* ret) {
   int device_id;
-  CUDA_CALL(cudaGetDevice(&device_id));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
   CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
   const int full_dims = dims + 2;
   std::vector<int64_t> x_dim_int64(full_dims);
diff --git a/src/runtime/extra/contrib/cudnn/cudnn_frontend/attention.cc b/src/runtime/extra/contrib/cudnn/cudnn_frontend/attention.cc
index f31a38bd472d..54d16b5a3966 100644
--- a/src/runtime/extra/contrib/cudnn/cudnn_frontend/attention.cc
+++ b/src/runtime/extra/contrib/cudnn/cudnn_frontend/attention.cc
@@ -24,10 +24,10 @@
 
 #include "./attention.h"
 
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/runtime/device_api.h>
 
-#include "../../../../../backend/cuda/runtime/cuda_common.h"
 #include "../cudnn_utils.h"
 
 namespace tvm {
@@ -99,7 +99,7 @@ void CuDNNSDPARunnerNode::Init(int64_t batch, int64_t seq_len, int64_t num_heads
   TVM_FFI_ICHECK(stats == nullptr);
   o->set_output(true).set_dim({batch, num_heads, seq_len, head_size_v}).set_stride(o_stride);
   int device_id;
-  CUDA_CALL(cudaGetDevice(&device_id));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
   CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
   CUDNN_FRONTEND_CALL(graph_->build(entry_ptr->handle, {cudnn_frontend::HeurMode_t::A}));
 }
diff --git a/src/runtime/extra/contrib/cudnn/cudnn_json_runtime.cc b/src/runtime/extra/contrib/cudnn/cudnn_json_runtime.cc
index a7cf1ec2b318..5b3756fd79b8 100644
--- a/src/runtime/extra/contrib/cudnn/cudnn_json_runtime.cc
+++ b/src/runtime/extra/contrib/cudnn/cudnn_json_runtime.cc
@@ -103,7 +103,7 @@ class cuDNNJSONRuntime : public JSONRuntimeBase {
 
   std::function<void()> GetConv2DExec(const JSONGraphNode& node) {
     int device_id;
-    CUDA_CALL(cudaGetDevice(&device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
     auto* entry_ptr = tvm::contrib::CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
     auto op_name = node.GetOpName();
 
@@ -164,7 +164,7 @@ class cuDNNJSONRuntime : public JSONRuntimeBase {
     int algo = best_algo.cast<int>();
     std::function<void()> op_exec = [=, this]() {
       int device_id;
-      CUDA_CALL(cudaGetDevice(&device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
       cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetStream(kDLCUDA, device_id));
       CUDNN_CALL(cudnnSetStream(entry_ptr->handle, stream));
 
diff --git a/src/runtime/extra/contrib/cudnn/cudnn_utils.cc b/src/runtime/extra/contrib/cudnn/cudnn_utils.cc
index 5793febf01ef..5c34d4a2b0a6 100644
--- a/src/runtime/extra/contrib/cudnn/cudnn_utils.cc
+++ b/src/runtime/extra/contrib/cudnn/cudnn_utils.cc
@@ -270,7 +270,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def("tvm.contrib.cudnn.exists", []() -> bool {
     int device_id;
-    CUDA_CALL(cudaGetDevice(&device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
     return CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id}, false)->exists();
   });
 }
diff --git a/src/runtime/extra/contrib/cudnn/cudnn_utils.h b/src/runtime/extra/contrib/cudnn/cudnn_utils.h
index 81849c0f0c31..cef82134ad90 100644
--- a/src/runtime/extra/contrib/cudnn/cudnn_utils.h
+++ b/src/runtime/extra/contrib/cudnn/cudnn_utils.h
@@ -26,12 +26,11 @@
 
 #include <cudnn.h>
 #include <tvm/ffi/error.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/runtime/device_api.h>
 
 #include <string>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 namespace tvm {
 namespace contrib {
 
diff --git a/src/runtime/extra/contrib/cudnn/softmax.cc b/src/runtime/extra/contrib/cudnn/softmax.cc
index d494fb334946..f0562dcba073 100644
--- a/src/runtime/extra/contrib/cudnn/softmax.cc
+++ b/src/runtime/extra/contrib/cudnn/softmax.cc
@@ -41,7 +41,7 @@ void softmax_impl(cudnnSoftmaxAlgorithm_t alg, ffi::PackedArgs args, ffi::Any* r
   if (axis < 0) axis += ndim;
   TVM_FFI_ICHECK(axis >= 0 && axis < ndim);
   int device_id;
-  CUDA_CALL(cudaGetDevice(&device_id));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
   CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id});
   entry_ptr->softmax_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(x->dtype);
 
diff --git a/src/runtime/extra/contrib/curand/curand.cc b/src/runtime/extra/contrib/curand/curand.cc
index 5f8ff0758373..b2fc7538657a 100644
--- a/src/runtime/extra/contrib/curand/curand.cc
+++ b/src/runtime/extra/contrib/curand/curand.cc
@@ -17,11 +17,11 @@
  * under the License.
  */
 #include <curand.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/base.h>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
 #include "./helper_cuda_kernels.h"
 
 namespace tvm {
diff --git a/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm100.cuh b/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm100.cuh
index c30b34d0f41a..0a3068c5d9c2 100644
--- a/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm100.cuh
+++ b/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm100.cuh
@@ -17,6 +17,7 @@
  * under the License.
  */
 
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/runtime/logging.h>
 
 #include <fstream>
@@ -25,8 +26,6 @@
 #include <variant>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 // clang-format off
 #include "cutlass/cutlass.h"
 
diff --git a/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh b/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh
index dcb26b9071d5..552b46b0a031 100644
--- a/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh
+++ b/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh
@@ -17,6 +17,7 @@
  * under the License.
  */
 
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/runtime/logging.h>
 
 #include <fstream>
@@ -25,8 +26,6 @@
 #include <variant>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 // clang-format off
 #include "cutlass/cutlass.h"
 
diff --git a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh
index 9c1962ad98d7..26eb35e34ee5 100644
--- a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh
+++ b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh
@@ -17,6 +17,8 @@
  * under the License.
  */
 
+#include <tvm/ffi/extra/cuda/base.h>
+
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -24,8 +26,6 @@
 #include <variant>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 // clang-format off
 #include "cutlass/cutlass.h"
 
diff --git a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh
index 1955e47759f7..9307e33d7f8e 100644
--- a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh
+++ b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh
@@ -17,6 +17,8 @@
  * under the License.
  */
 
+#include <tvm/ffi/extra/cuda/base.h>
+
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -24,8 +26,6 @@
 #include <variant>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 // clang-format off
 #include "cutlass/cutlass.h"
 
diff --git a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_runner_sm100.cuh b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_runner_sm100.cuh
index a516f31d3ce9..01e9ca7b8a20 100644
--- a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_runner_sm100.cuh
+++ b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_runner_sm100.cuh
@@ -17,14 +17,14 @@
  * under the License.
  */
 
+#include <tvm/ffi/extra/cuda/base.h>
+
 #include <fstream>
 #include <iostream>
 #include <sstream>
 #include <variant>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 // clang-format off
 #include "cutlass/cutlass.h"
 
diff --git a/src/runtime/extra/contrib/cutlass/gemm_runner.cuh b/src/runtime/extra/contrib/cutlass/gemm_runner.cuh
index 58e1c9fbd006..174699bcec42 100644
--- a/src/runtime/extra/contrib/cutlass/gemm_runner.cuh
+++ b/src/runtime/extra/contrib/cutlass/gemm_runner.cuh
@@ -17,6 +17,7 @@
  * under the License.
  */
 
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/runtime/logging.h>
 
 #include <fstream>
@@ -25,8 +26,6 @@
 #include <variant>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 // clang-format off
 #include "cutlass/cutlass.h"
 
diff --git a/src/runtime/extra/contrib/nvshmem/dist_gemm.cu b/src/runtime/extra/contrib/nvshmem/dist_gemm.cu
index d975e140384d..26cc55a44218 100644
--- a/src/runtime/extra/contrib/nvshmem/dist_gemm.cu
+++ b/src/runtime/extra/contrib/nvshmem/dist_gemm.cu
@@ -19,12 +19,11 @@
 #include <nvshmem.h>
 #include <nvshmemx.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/disco/disco_worker.h>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 namespace tvm {
 namespace runtime {
 
@@ -62,7 +61,7 @@ TVMStreamHandle stream_create() {
   DiscoWorker* worker = ThreadLocalDiscoWorker::Get()->worker;
   TVM_FFI_ICHECK(worker != nullptr) << "NVSHMEM stream creation failed: worker is not initialized";
   cudaStream_t retval;
-  CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking));
   return static_cast<TVMStreamHandle>(retval);
 }
 
@@ -100,7 +99,7 @@ void transfer_to_peers_reduce_scatter(Tensor semaphore, Tensor gemm_out, Tensor
                    stream);
     } else {
       int device_id;
-      CUDA_CALL(cudaGetDevice(&device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
       TVMStreamHandle main_stream = TVMFFIEnvGetStream(kDLCUDA, device_id);
       copy_to_peer(get_pointer(staging_buffer, ffi::Shape{my_rank, 0, 0}), to_rank,
                    get_pointer(gemm_out, ffi::Shape{to_rank * LOCAL_M, 0}), LOCAL_M * N * 2,
diff --git a/src/runtime/extra/contrib/nvshmem/init.cc b/src/runtime/extra/contrib/nvshmem/init.cc
index 698f3b68024f..aeb71bb6a19a 100644
--- a/src/runtime/extra/contrib/nvshmem/init.cc
+++ b/src/runtime/extra/contrib/nvshmem/init.cc
@@ -20,14 +20,13 @@
 #include <nvshmem.h>
 #include <nvshmemx.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/extra/json.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/disco/disco_worker.h>
 #include <tvm/runtime/logging.h>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
-
 namespace tvm {
 namespace runtime {
 
@@ -66,7 +65,7 @@ void InitNVSHMEM(ffi::Shape uid_64, int num_workers, int worker_id_start) {
   nvshmemx_set_attr_uniqueid_args(worker_id, num_workers, &uid, &attr);
   nvshmemx_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID, &attr);
   int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);
-  CUDA_CALL(cudaSetDevice(mype_node));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(mype_node));
   if (worker != nullptr) {
     if (worker->default_device.device_type == DLDeviceType::kDLCPU) {
       worker->default_device = Device{DLDeviceType::kDLCUDA, mype_node};
@@ -153,7 +152,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
       .def("runtime.nvshmem.cumodule_init", NVSHMEMXCumoduleInit)
       .def("runtime.disco.nvshmem.barrier_all_on_current_stream", []() {
         int device_id;
-        CUDA_CALL(cudaGetDevice(&device_id));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
         TVMStreamHandle stream = TVMFFIEnvGetStream(kDLCUDA, device_id);
         NVSHMEMBarrierAllOnStream(stream);
       });
diff --git a/src/runtime/extra/contrib/nvshmem/memory_allocator.cc b/src/runtime/extra/contrib/nvshmem/memory_allocator.cc
index ee9afd0eca3d..cb6e3520c8c1 100644
--- a/src/runtime/extra/contrib/nvshmem/memory_allocator.cc
+++ b/src/runtime/extra/contrib/nvshmem/memory_allocator.cc
@@ -18,13 +18,13 @@
  */
 #include <nvshmem.h>
 #include <nvshmemx.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/memory/memory_manager.h>
 
 #include <thread>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
 #include "../../../memory/pooled_allocator.h"
 #include "../../disco/utils.h"
 
diff --git a/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h b/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h
index 4d92afb234d1..408d50cc7e08 100755
--- a/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h
+++ b/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h
@@ -23,10 +23,11 @@
 #ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_CALIBRATOR_H_
 #define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_CALIBRATOR_H_
 
+#include <tvm/ffi/extra/cuda/base.h>
+
 #include <string>
 #include <vector>
 
-#include "../../../../backend/cuda/runtime/cuda_common.h"
 #include "NvInfer.h"
 
 namespace tvm {
@@ -46,7 +47,7 @@ class TensorRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2 {
     }
     // Free buffers
     for (size_t i = 0; i < buffers_.size(); ++i) {
-      CUDA_CALL(cudaFree(buffers_[i]));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaFree(buffers_[i]));
     }
   }
 
@@ -55,8 +56,9 @@ class TensorRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2 {
     std::vector<float*> data_host(bindings.size(), nullptr);
     for (size_t i = 0; i < bindings.size(); ++i) {
       data_host[i] = new float[batch_size_ * binding_sizes[i]];
-      CUDA_CALL(cudaMemcpy(static_cast<void*>(data_host[i]), bindings[i],
-                           batch_size_ * binding_sizes[i] * sizeof(float), cudaMemcpyDeviceToHost));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpy(static_cast<void*>(data_host[i]), bindings[i],
+                                          batch_size_ * binding_sizes[i] * sizeof(float),
+                                          cudaMemcpyDeviceToHost));
     }
     data_.push_back(data_host);
     data_sizes_.push_back(binding_sizes);
@@ -73,9 +75,10 @@ class TensorRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2 {
     TVM_FFI_ICHECK_EQ(input_names_.size(), nbBindings);
     for (size_t i = 0; i < input_names_.size(); ++i) {
       TVM_FFI_ICHECK_EQ(input_names_[i], names[i]);
-      CUDA_CALL(cudaMemcpy(buffers_[i], data_[num_batches_calibrated_][i],
-                           batch_size_ * data_sizes_[num_batches_calibrated_][i] * sizeof(float),
-                           cudaMemcpyHostToDevice));
+      TVM_FFI_CHECK_CUDA_ERROR(
+          cudaMemcpy(buffers_[i], data_[num_batches_calibrated_][i],
+                     batch_size_ * data_sizes_[num_batches_calibrated_][i] * sizeof(float),
+                     cudaMemcpyHostToDevice));
       bindings[i] = buffers_[i];
     }
     num_batches_calibrated_++;
@@ -120,7 +123,7 @@ class TensorRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2 {
     const int num_inputs = data_sizes_[0].size();
     buffers_.assign(num_inputs, nullptr);
     for (int i = 0; i < num_inputs; ++i) {
-      CUDA_CALL(cudaMalloc(&buffers_[i], data_sizes_[0][i] * sizeof(float)));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&buffers_[i], data_sizes_[0][i] * sizeof(float)));
     }
   }
 };
diff --git a/src/runtime/extra/contrib/thrust/thrust.cu b/src/runtime/extra/contrib/thrust/thrust.cu
index b3613232311a..f534e1fc765f 100644
--- a/src/runtime/extra/contrib/thrust/thrust.cu
+++ b/src/runtime/extra/contrib/thrust/thrust.cu
@@ -33,6 +33,7 @@
 #include <thrust/sort.h>
 #include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/logging.h>
@@ -41,8 +42,6 @@
 #include <functional>
 #include <memory>
 #include <vector>
-
-#include "../../../../backend/cuda/runtime/cuda_common.h"
 namespace tvm {
 namespace contrib {
 
@@ -95,7 +94,7 @@ class WorkspaceMemoryResource : public thrust::mr::memory_resource<void*> {
 
 auto get_thrust_exec_policy(WorkspaceMemoryResource* memory_resouce) {
   int device_id;
-  CUDA_CALL(cudaGetDevice(&device_id));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
   cudaStream_t stream = static_cast<cudaStream_t>(TVMFFIEnvGetStream(kDLCUDA, device_id));
   return thrust::cuda::par_nosync(memory_resouce).on(stream);
 }
diff --git a/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc b/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc
index 109ce6ed37a6..426557b7b7ad 100644
--- a/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc
+++ b/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc
@@ -18,13 +18,13 @@
  */
 
 #include <cuda_runtime.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/disco/cuda_ipc_memory.h>
 #include <tvm/runtime/memory/memory_manager.h>
 
 #include "../../../../../3rdparty/tensorrt_llm/custom_allreduce_kernels.h"
-#include "../../../../backend/cuda/runtime/cuda_common.h"
 #include "../../../memory/pooled_allocator.h"
 #include "../nccl/nccl_context.h"
 
@@ -45,20 +45,22 @@ using tvm::runtime::memory::Buffer;
 std::vector<cudaIpcMemHandle_t> AllGatherIPCHandles(nccl::CCLThreadLocalContext* ctx,
                                                     cudaIpcMemHandle_t local_handle) {
   void *d_src, *d_dst;
-  CUDA_CALL(cudaMalloc(&d_src, CUDA_IPC_HANDLE_SIZE));
-  CUDA_CALL(cudaMalloc(&d_dst, CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers));
-  CUDA_CALL(cudaMemcpy(d_src, &local_handle, CUDA_IPC_HANDLE_SIZE, cudaMemcpyHostToDevice));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&d_src, CUDA_IPC_HANDLE_SIZE));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&d_dst, CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers));
+  TVM_FFI_CHECK_CUDA_ERROR(
+      cudaMemcpy(d_src, &local_handle, CUDA_IPC_HANDLE_SIZE, cudaMemcpyHostToDevice));
   NCCL_CALL(ncclAllGather(d_src, d_dst, CUDA_IPC_HANDLE_SIZE, ncclChar, ctx->global_comm,
                           /*stream=*/nullptr));
   std::vector<char> serial_handles(CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers, 0);
-  CUDA_CALL(cudaMemcpy(serial_handles.data(), d_dst,
-                       CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers, cudaMemcpyDefault));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpy(serial_handles.data(), d_dst,
+                                      CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers,
+                                      cudaMemcpyDefault));
   std::vector<cudaIpcMemHandle_t> handles(ctx->worker->num_workers);
   for (int i = 0; i < ctx->worker->num_workers; ++i) {
     memcpy(handles[i].reserved, &serial_handles[i * CUDA_IPC_HANDLE_SIZE], CUDA_IPC_HANDLE_SIZE);
   }
-  CUDA_CALL(cudaFree(d_src));
-  CUDA_CALL(cudaFree(d_dst));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaFree(d_src));
+  TVM_FFI_CHECK_CUDA_ERROR(cudaFree(d_dst));
   return handles;
 }
 
@@ -115,7 +117,7 @@ class CUDAIPCMemoryAllocator final : public memory::PooledAllocator {
 
   void DeviceFreeDataSpace(Device dev, void* ptr) final {
     TVM_FFI_ICHECK(dev.device_type == kDLCUDA);
-    CUDA_CALL(cudaSetDevice(dev.device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
     nccl::CCLThreadLocalContext* ctx = nccl::CCLThreadLocalContext::Get();
     auto it = ipc_memory_map_.find(ptr);
     TVM_FFI_ICHECK(it != ipc_memory_map_.end());
@@ -146,20 +148,20 @@ class CUDAIPCMemoryAllocator final : public memory::PooledAllocator {
     // Alloc local buffer
     TVM_FFI_ICHECK(dev.device_type == kDLCUDA);
     void* ptr;
-    CUDA_CALL(cudaSetDevice(dev.device_id));
-    CUDA_CALL(cudaMalloc(&ptr, size));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&ptr, size));
     // Reset allocated memory to zero when required.
     // We explicitly synchronize after memset, to make sure memset finishes
     // before using all-gather to exchange IPC handles.
     // This is important to ensure the memory reset get ordered
     // before any other peers read the memory.
     if (reset_memory_to_zero) {
-      CUDA_CALL(cudaMemset(ptr, 0, size));
-      CUDA_CALL(cudaDeviceSynchronize());
+      TVM_FFI_CHECK_CUDA_ERROR(cudaMemset(ptr, 0, size));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaDeviceSynchronize());
     }
     // Create ipc handle
     cudaIpcMemHandle_t local_handle;
-    CUDA_CALL(cudaIpcGetMemHandle(&local_handle, ptr));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaIpcGetMemHandle(&local_handle, ptr));
     // All-gather IPC handles.
     nccl::CCLThreadLocalContext* ctx = nccl::CCLThreadLocalContext::Get();
     std::vector<cudaIpcMemHandle_t> handles = AllGatherIPCHandles(ctx, local_handle);
@@ -170,8 +172,9 @@ class CUDAIPCMemoryAllocator final : public memory::PooledAllocator {
         comm_ptrs[node_id] = ptr;
       } else {
         uint8_t* foreign_buffer;
-        CUDA_CALL(cudaIpcOpenMemHandle(reinterpret_cast<void**>(&foreign_buffer), handles[node_id],
-                                       cudaIpcMemLazyEnablePeerAccess));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaIpcOpenMemHandle(reinterpret_cast<void**>(&foreign_buffer),
+                                                      handles[node_id],
+                                                      cudaIpcMemLazyEnablePeerAccess));
         comm_ptrs[node_id] = foreign_buffer;
       }
     }
@@ -183,10 +186,10 @@ class CUDAIPCMemoryAllocator final : public memory::PooledAllocator {
     for (int i = 0; i < static_cast<int>(comm_ptrs.size()); ++i) {
       if (i != worker_id) {
         // Free ipc handle.
-        CUDA_CALL(cudaIpcCloseMemHandle(comm_ptrs[i]));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaIpcCloseMemHandle(comm_ptrs[i]));
       } else {
         // Free local buffer.
-        CUDA_CALL(cudaFree(comm_ptrs[i]));
+        TVM_FFI_CHECK_CUDA_ERROR(cudaFree(comm_ptrs[i]));
       }
     }
   }
diff --git a/src/runtime/extra/disco/nccl/nccl_context.h b/src/runtime/extra/disco/nccl/nccl_context.h
index 3747fec43487..7a99be0897c0 100644
--- a/src/runtime/extra/disco/nccl/nccl_context.h
+++ b/src/runtime/extra/disco/nccl/nccl_context.h
@@ -35,8 +35,7 @@
 #endif
 #if TVM_NCCL_RCCL_SWITCH == 0
 #include <nccl.h>
-
-#include "../../../../backend/cuda/runtime/cuda_common.h"
+#include <tvm/ffi/extra/cuda/base.h>
 #else
 #include <rccl/rccl.h>
 
@@ -62,10 +61,16 @@ namespace nccl {
 
 using deviceStream_t = cudaStream_t;
 const constexpr DLDeviceType TVM_DISCO_DEVICE_TYPE = DLDeviceType::kDLCUDA;
-inline void SetDevice(int device_id) { CUDA_CALL(cudaSetDevice(device_id)); }
-inline void StreamSynchronize(deviceStream_t stream) { CUDA_CALL(cudaStreamSynchronize(stream)); }
-inline void StreamCreate(deviceStream_t* stream) { CUDA_CALL(cudaStreamCreate(stream)); }
-inline void StreamDestroy(deviceStream_t stream) { CUDA_CALL(cudaStreamDestroy(stream)); }
+inline void SetDevice(int device_id) { TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(device_id)); }
+inline void StreamSynchronize(deviceStream_t stream) {
+  TVM_FFI_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
+}
+inline void StreamCreate(deviceStream_t* stream) {
+  TVM_FFI_CHECK_CUDA_ERROR(cudaStreamCreate(stream));
+}
+inline void StreamDestroy(deviceStream_t stream) {
+  TVM_FFI_CHECK_CUDA_ERROR(cudaStreamDestroy(stream));
+}
 
 #else
 
diff --git a/src/runtime/vm/cuda/cuda_graph_builtin.cc b/src/runtime/vm/cuda/cuda_graph_builtin.cc
index 58348fb695d8..b51b287d81d9 100644
--- a/src/runtime/vm/cuda/cuda_graph_builtin.cc
+++ b/src/runtime/vm/cuda/cuda_graph_builtin.cc
@@ -24,11 +24,11 @@
 
 #include <tvm/ffi/container/array.h>
 #include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/extra/cuda/base.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/vm/vm.h>
 
-#include "../../../backend/cuda/runtime/cuda_common.h"
 #include "../../../support/utils.h"
 namespace tvm {
 namespace runtime {
@@ -85,7 +85,7 @@ struct CUDAGraphCapturedState {
 
   ~CUDAGraphCapturedState() {
     if (exec) {
-      CUDA_CALL(cudaGraphExecDestroy(exec));
+      TVM_FFI_CHECK_CUDA_ERROR(cudaGraphExecDestroy(exec));
     }
   }
 
@@ -100,7 +100,7 @@ struct CUDAGraphCapturedState {
 
 class ScopedCUDAStream {
  public:
-  ScopedCUDAStream() { CUDA_CALL(cudaStreamCreate(&stream_)); }
+  ScopedCUDAStream() { TVM_FFI_CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); }
   ~ScopedCUDAStream() { cudaStreamDestroy(stream_); }
   ScopedCUDAStream(const ScopedCUDAStream&) = delete;
   ScopedCUDAStream(ScopedCUDAStream&&) = delete;
@@ -116,11 +116,11 @@ class ScopedCUDAStream {
 class CUDACaptureStream {
  public:
   explicit CUDACaptureStream(cudaGraph_t* graph) : output_graph_(graph) {
-    CUDA_CALL(cudaGetDevice(&device_id_));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id_));
     TVM_FFI_CHECK_SAFE_CALL(
         TVMFFIEnvSetStream(kDLCUDA, device_id_, capture_stream_,
                            reinterpret_cast<TVMFFIStreamHandle*>(&prev_default_stream_)));
-    CUDA_CALL(cudaStreamBeginCapture(capture_stream_, cudaStreamCaptureModeGlobal));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaStreamBeginCapture(capture_stream_, cudaStreamCaptureModeGlobal));
   }
   ~CUDACaptureStream() noexcept(false) {
     cudaStreamEndCapture(capture_stream_, output_graph_);
@@ -158,8 +158,8 @@ class CUDAGraphExtensionNode : public VMExtensionNode {
       // Launch CUDA graph
       const auto& [states, exec] = it->second;
       int device_id;
-      CUDA_CALL(cudaGetDevice(&device_id));
-      CUDA_CALL(
+      TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id));
+      TVM_FFI_CHECK_CUDA_ERROR(
           cudaGraphLaunch(exec, static_cast<cudaStream_t>(TVMFFIEnvGetStream(kDLCUDA, device_id))));
       return states;
     }
@@ -190,8 +190,8 @@ class CUDAGraphExtensionNode : public VMExtensionNode {
 
     CUDAGraphCapturedState entry;
     entry.states = capture_func_rv.cast<ffi::ObjectRef>();
-    CUDA_CALL(cudaGraphInstantiate(&entry.exec, graph, NULL, NULL, 0));
-    CUDA_CALL(cudaGraphDestroy(graph));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGraphInstantiate(&entry.exec, graph, NULL, NULL, 0));
+    TVM_FFI_CHECK_CUDA_ERROR(cudaGraphDestroy(graph));
 
     ffi::ObjectRef states = entry.states;
 

From a717665b05067e9e6e3cc888e0acc8c73a40cace Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Mon, 15 Jun 2026 11:46:49 +0000
Subject: [PATCH 2/2] [TESTS][CUDA] Relax OOM error match

---
 tests/python/relax/test_vm_builtin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/relax/test_vm_builtin.py b/tests/python/relax/test_vm_builtin.py
index f46d61810402..677bf0b4a347 100644
--- a/tests/python/relax/test_vm_builtin.py
+++ b/tests/python/relax/test_vm_builtin.py
@@ -77,7 +77,7 @@ def main():
     built = tvm.compile(Module, target=target)
     vm = relax.VirtualMachine(built, dev)
 
-    with pytest.raises(Exception, match="CUDA: out of memory"):
+    with pytest.raises(Exception, match="CUDA.*out of memory"):
         vm["main"]()