From 0f09209857c730907e308e590670de247428004f Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 14 Jun 2026 19:46:17 +0000 Subject: [PATCH 1/2] [REFACTOR][CUDA] Phase out cuda_common.h The CUDA runtime can rely on the shared tvm-ffi CUDA error helper instead of carrying a TVM-local common header. This PR removes the legacy header by moving the CUDA workspace state into cuda_device_api.cc, replacing CUDA_CALL users with TVM_FFI_CHECK_CUDA_ERROR, and inlining the remaining CUDA driver checks at their call sites. --- src/backend/cuda/runtime/cuda_common.h | 69 --------- src/backend/cuda/runtime/cuda_device_api.cc | 140 ++++++++++-------- src/backend/cuda/runtime/cuda_module.cc | 16 +- .../contrib/cublas/cublas_json_runtime.cc | 4 +- .../extra/contrib/cublas/cublas_utils.cc | 5 +- .../extra/contrib/cudnn/conv_backward.cc | 4 +- .../extra/contrib/cudnn/conv_forward.cc | 2 +- .../contrib/cudnn/cudnn_frontend/attention.cc | 4 +- .../extra/contrib/cudnn/cudnn_json_runtime.cc | 4 +- .../extra/contrib/cudnn/cudnn_utils.cc | 2 +- src/runtime/extra/contrib/cudnn/cudnn_utils.h | 3 +- src/runtime/extra/contrib/cudnn/softmax.cc | 2 +- src/runtime/extra/contrib/curand/curand.cc | 2 +- .../cutlass/fp16_group_gemm_runner_sm100.cuh | 3 +- .../cutlass/fp16_group_gemm_runner_sm90.cuh | 3 +- ...fp8_groupwise_scaled_gemm_runner_sm100.cuh | 4 +- .../fp8_groupwise_scaled_gemm_runner_sm90.cuh | 4 +- ...oupwise_scaled_group_gemm_runner_sm100.cuh | 4 +- .../extra/contrib/cutlass/gemm_runner.cuh | 3 +- .../extra/contrib/nvshmem/dist_gemm.cu | 7 +- src/runtime/extra/contrib/nvshmem/init.cc | 7 +- .../extra/contrib/nvshmem/memory_allocator.cc | 2 +- .../contrib/tensorrt/tensorrt_calibrator.h | 19 ++- src/runtime/extra/contrib/thrust/thrust.cu | 5 +- .../extra/disco/cuda_ipc/cuda_ipc_memory.cc | 39 ++--- src/runtime/extra/disco/nccl/nccl_context.h | 17 ++- src/runtime/vm/cuda/cuda_graph_builtin.cc | 18 +-- 27 files changed, 181 insertions(+), 211 deletions(-) delete mode 100644 src/backend/cuda/runtime/cuda_common.h diff --git a/src/backend/cuda/runtime/cuda_common.h b/src/backend/cuda/runtime/cuda_common.h deleted file mode 100644 index 183a2e870229..000000000000 --- a/src/backend/cuda/runtime/cuda_common.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file cuda_common.h - * \brief Common utilities for CUDA - */ -#ifndef TVM_RUNTIME_CUDA_CUDA_COMMON_H_ -#define TVM_RUNTIME_CUDA_CUDA_COMMON_H_ - -#include -#include - -#include - -#include "../../../runtime/workspace_pool.h" - -namespace tvm { -namespace runtime { - -#define CUDA_DRIVER_CALL(x) \ - { \ - CUresult result = x; \ - if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \ - const char* msg; \ - cuGetErrorName(result, &msg); \ - TVM_FFI_THROW(CUDAError) << "" #x " failed with error: " << msg; \ - } \ - } - -#ifndef CUDA_CALL -#define CUDA_CALL(func) \ - { \ - cudaError_t e = (func); \ - TVM_FFI_ICHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ - << "CUDA: " << cudaGetErrorString(e); \ - } -#endif - -/*! \brief Thread local workspace */ -class CUDAThreadEntry { - public: - /*! \brief thread local pool*/ - WorkspacePool pool; - /*! \brief constructor */ - CUDAThreadEntry(); - // get the threadlocal workspace - static CUDAThreadEntry* ThreadLocal(); -}; - -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_CUDA_CUDA_COMMON_H_ diff --git a/src/backend/cuda/runtime/cuda_device_api.cc b/src/backend/cuda/runtime/cuda_device_api.cc index 969f40a081f4..68ae39de56bf 100644 --- a/src/backend/cuda/runtime/cuda_device_api.cc +++ b/src/backend/cuda/runtime/cuda_device_api.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -32,14 +33,26 @@ #include -#include "cuda_common.h" +#include "../../../runtime/workspace_pool.h" namespace tvm { namespace runtime { +#ifndef CUDA_DRIVER_CALL +#define CUDA_DRIVER_CALL(x) \ + { \ + CUresult result = x; \ + if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \ + const char* msg; \ + cuGetErrorName(result, &msg); \ + TVM_FFI_THROW(CUDAError) << "" #x " failed with error: " << msg; \ + } \ + } +#endif + class CUDADeviceAPI final : public DeviceAPI { public: - void SetDevice(Device dev) final { CUDA_CALL(cudaSetDevice(dev.device_id)); } + void SetDevice(Device dev) final { TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id)); } void GetAttr(Device dev, DeviceAttrKind kind, ffi::Any* rv) final { int value = 0; switch (kind) { @@ -50,23 +63,27 @@ class CUDADeviceAPI final : public DeviceAPI { break; } case kMaxThreadsPerBlock: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaDeviceGetAttribute(&value, cudaDevAttrMaxThreadsPerBlock, dev.device_id)); break; } case kWarpSize: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, dev.device_id)); break; } case kMaxSharedMemoryPerBlock: { - CUDA_CALL( + TVM_FFI_CHECK_CUDA_ERROR( cudaDeviceGetAttribute(&value, cudaDevAttrMaxSharedMemoryPerBlock, dev.device_id)); break; } case kComputeVersion: { std::ostringstream os; - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMajor, dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMajor, dev.device_id)); os << value << "."; - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMinor, dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaDeviceGetAttribute(&value, cudaDevAttrComputeCapabilityMinor, dev.device_id)); os << value; *rv = os.str(); return; @@ -79,18 +96,23 @@ class CUDADeviceAPI final : public DeviceAPI { return; } case kMaxClockRate: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrClockRate, dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaDeviceGetAttribute(&value, cudaDevAttrClockRate, dev.device_id)); break; } case kMultiProcessorCount: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMultiProcessorCount, dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaDeviceGetAttribute(&value, cudaDevAttrMultiProcessorCount, dev.device_id)); break; } case kMaxThreadDimensions: { int dims[3]; - CUDA_CALL(cudaDeviceGetAttribute(&dims[0], cudaDevAttrMaxBlockDimX, dev.device_id)); - CUDA_CALL(cudaDeviceGetAttribute(&dims[1], cudaDevAttrMaxBlockDimY, dev.device_id)); - CUDA_CALL(cudaDeviceGetAttribute(&dims[2], cudaDevAttrMaxBlockDimZ, dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaDeviceGetAttribute(&dims[0], cudaDevAttrMaxBlockDimX, dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaDeviceGetAttribute(&dims[1], cudaDevAttrMaxBlockDimY, dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaDeviceGetAttribute(&dims[2], cudaDevAttrMaxBlockDimZ, dev.device_id)); std::stringstream ss; // use json string to return multiple int values; ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]"; @@ -98,7 +120,8 @@ class CUDADeviceAPI final : public DeviceAPI { return; } case kMaxRegistersPerBlock: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxRegistersPerBlock, dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaDeviceGetAttribute(&value, cudaDevAttrMaxRegistersPerBlock, dev.device_id)); break; } case kGcnArch: @@ -112,20 +135,21 @@ class CUDADeviceAPI final : public DeviceAPI { case kL2CacheSizeBytes: { // Get size of device l2 cache size in bytes. int l2_size = 0; - CUDA_CALL(cudaDeviceGetAttribute(&l2_size, cudaDevAttrL2CacheSize, dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaDeviceGetAttribute(&l2_size, cudaDevAttrL2CacheSize, dev.device_id)); *rv = l2_size; return; } case kTotalGlobalMemory: { cudaDeviceProp prop; - CUDA_CALL(cudaGetDeviceProperties(&prop, dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDeviceProperties(&prop, dev.device_id)); int64_t total_global_memory = prop.totalGlobalMem; *rv = total_global_memory; return; } case kAvailableGlobalMemory: { size_t free_mem, total_mem; - CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem)); + TVM_FFI_CHECK_CUDA_ERROR(cudaMemGetInfo(&free_mem, &total_mem)); *rv = static_cast(free_mem); return; } @@ -139,14 +163,14 @@ class CUDADeviceAPI final : public DeviceAPI { void* ret; if (dev.device_type == kDLCUDAHost) { VLOG(1) << "allocating " << nbytes << "bytes on host"; - CUDA_CALL(cudaMallocHost(&ret, nbytes)); + TVM_FFI_CHECK_CUDA_ERROR(cudaMallocHost(&ret, nbytes)); } else { - CUDA_CALL(cudaSetDevice(dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id)); size_t free_mem, total_mem; - CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem)); + TVM_FFI_CHECK_CUDA_ERROR(cudaMemGetInfo(&free_mem, &total_mem)); VLOG(1) << "allocating " << nbytes << " bytes on device, with " << free_mem << " bytes currently free out of " << total_mem << " bytes available"; - CUDA_CALL(cudaMalloc(&ret, nbytes)); + TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&ret, nbytes)); } return ret; } @@ -172,11 +196,11 @@ class CUDADeviceAPI final : public DeviceAPI { if (dev.device_type == kDLCUDAHost) { VLOG(1) << "freeing host memory"; - CUDA_CALL(cudaFreeHost(ptr)); + TVM_FFI_CHECK_CUDA_ERROR(cudaFreeHost(ptr)); } else { - CUDA_CALL(cudaSetDevice(dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id)); VLOG(1) << "freeing device memory"; - CUDA_CALL(cudaFree(ptr)); + TVM_FFI_CHECK_CUDA_ERROR(cudaFree(ptr)); } } @@ -203,17 +227,17 @@ class CUDADeviceAPI final : public DeviceAPI { } if (dev_from.device_type == kDLCUDA && dev_to.device_type == kDLCUDA) { - CUDA_CALL(cudaSetDevice(dev_from.device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev_from.device_id)); if (dev_from.device_id == dev_to.device_id) { GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream); } else { cudaMemcpyPeerAsync(to, dev_to.device_id, from, dev_from.device_id, size, cu_stream); } } else if (dev_from.device_type == kDLCUDA && dev_to.device_type == kDLCPU) { - CUDA_CALL(cudaSetDevice(dev_from.device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev_from.device_id)); GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream); } else if (dev_from.device_type == kDLCPU && dev_to.device_type == kDLCUDA) { - CUDA_CALL(cudaSetDevice(dev_to.device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev_to.device_id)); GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream); } else { TVM_FFI_THROW(InternalError) << "expect copy from/to GPU or between GPU"; @@ -222,40 +246,40 @@ class CUDADeviceAPI final : public DeviceAPI { public: TVMStreamHandle CreateStream(Device dev) { - CUDA_CALL(cudaSetDevice(dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id)); cudaStream_t retval; - CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking)); + TVM_FFI_CHECK_CUDA_ERROR(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking)); return static_cast(retval); } void FreeStream(Device dev, TVMStreamHandle stream) { - CUDA_CALL(cudaSetDevice(dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id)); cudaStream_t cu_stream = static_cast(stream); - CUDA_CALL(cudaStreamDestroy(cu_stream)); + TVM_FFI_CHECK_CUDA_ERROR(cudaStreamDestroy(cu_stream)); } void SyncStreamFromTo(Device dev, TVMStreamHandle event_src, TVMStreamHandle event_dst) { - CUDA_CALL(cudaSetDevice(dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id)); cudaStream_t src_stream = static_cast(event_src); cudaStream_t dst_stream = static_cast(event_dst); cudaEvent_t evt; - CUDA_CALL(cudaEventCreate(&evt)); - CUDA_CALL(cudaEventRecord(evt, src_stream)); - CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0)); - CUDA_CALL(cudaEventDestroy(evt)); + TVM_FFI_CHECK_CUDA_ERROR(cudaEventCreate(&evt)); + TVM_FFI_CHECK_CUDA_ERROR(cudaEventRecord(evt, src_stream)); + TVM_FFI_CHECK_CUDA_ERROR(cudaStreamWaitEvent(dst_stream, evt, 0)); + TVM_FFI_CHECK_CUDA_ERROR(cudaEventDestroy(evt)); } void StreamSync(Device dev, TVMStreamHandle stream) final { - CUDA_CALL(cudaSetDevice(dev.device_id)); - CUDA_CALL(cudaStreamSynchronize(static_cast(stream))); + TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaStreamSynchronize(static_cast(stream))); } void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final { - return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(dev, size); + return ThreadLocalWorkspacePool()->AllocWorkspace(dev, size); } void FreeWorkspace(Device dev, void* data) final { - CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(dev, data); + ThreadLocalWorkspacePool()->FreeWorkspace(dev, data); } bool SupportsDevicePointerArithmeticsOnHost() final { return true; } @@ -268,17 +292,17 @@ class CUDADeviceAPI final : public DeviceAPI { } private: + static WorkspacePool* ThreadLocalWorkspacePool(); + static void GPUCopy(const void* from, void* to, size_t size, cudaMemcpyKind kind, cudaStream_t stream) { - CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream)); + TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpyAsync(to, from, size, kind, stream)); } }; -CUDAThreadEntry::CUDAThreadEntry() : pool(kDLCUDA, CUDADeviceAPI::Global()) {} - -CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() { - static thread_local CUDAThreadEntry inst; - return &inst; +WorkspacePool* CUDADeviceAPI::ThreadLocalWorkspacePool() { + static thread_local WorkspacePool pool(kDLCUDA, CUDADeviceAPI::Global()); + return &pool; } TVM_FFI_STATIC_INIT_BLOCK() { @@ -301,28 +325,28 @@ class CUDATimerNode : public TimerNode { // This initial cudaEventRecord is sometimes pretty slow (~100us). Does // cudaEventRecord do some stream synchronization? int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); stream_ = TVMFFIEnvGetStream(kDLCUDA, device_id); - CUDA_CALL(cudaEventRecord(start_, static_cast(stream_))); + TVM_FFI_CHECK_CUDA_ERROR(cudaEventRecord(start_, static_cast(stream_))); } virtual void Stop() { int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); - CUDA_CALL(cudaEventRecord(stop_, static_cast(stream_))); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaEventRecord(stop_, static_cast(stream_))); } virtual int64_t SyncAndGetElapsedNanos() { - CUDA_CALL(cudaEventSynchronize(stop_)); + TVM_FFI_CHECK_CUDA_ERROR(cudaEventSynchronize(stop_)); float milliseconds = 0; - CUDA_CALL(cudaEventElapsedTime(&milliseconds, start_, stop_)); + TVM_FFI_CHECK_CUDA_ERROR(cudaEventElapsedTime(&milliseconds, start_, stop_)); return milliseconds * 1e6; } virtual ~CUDATimerNode() { - CUDA_CALL(cudaEventDestroy(start_)); - CUDA_CALL(cudaEventDestroy(stop_)); + TVM_FFI_CHECK_CUDA_ERROR(cudaEventDestroy(start_)); + TVM_FFI_CHECK_CUDA_ERROR(cudaEventDestroy(stop_)); } CUDATimerNode() { - CUDA_CALL(cudaEventCreate(&start_)); - CUDA_CALL(cudaEventCreate(&stop_)); + TVM_FFI_CHECK_CUDA_ERROR(cudaEventCreate(&start_)); + TVM_FFI_CHECK_CUDA_ERROR(cudaEventCreate(&stop_)); } TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.cuda.CUDATimerNode", CUDATimerNode, TimerNode); @@ -340,7 +364,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { TVM_RUNTIME_DLL ffi::String GetCudaFreeMemory() { size_t free_mem, total_mem; - CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem)); + TVM_FFI_CHECK_CUDA_ERROR(cudaMemGetInfo(&free_mem, &total_mem)); std::stringstream ss; ss << "Current CUDA memory is " << free_mem << " bytes free out of " << total_mem << " bytes on device"; @@ -355,14 +379,14 @@ TVM_FFI_STATIC_INIT_BLOCK() { // TODO(tvm-team): remove once confirms all dep such as flashinfer // migrated to TVMFFIEnvGetStream int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); return static_cast(TVMFFIEnvGetStream(kDLCUDA, device_id)); }); } TVM_RUNTIME_DLL int GetCudaDeviceCount() { int count; - CUDA_CALL(cudaGetDeviceCount(&count)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDeviceCount(&count)); return count; } diff --git a/src/backend/cuda/runtime/cuda_module.cc b/src/backend/cuda/runtime/cuda_module.cc index 8ea734da6906..5a598c87cc6a 100644 --- a/src/backend/cuda/runtime/cuda_module.cc +++ b/src/backend/cuda/runtime/cuda_module.cc @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -41,11 +42,22 @@ #include "../../../runtime/pack_args.h" #include "../../../runtime/thread_storage_scope.h" #include "../../../support/bytes_io.h" -#include "cuda_common.h" namespace tvm { namespace runtime { +#ifndef CUDA_DRIVER_CALL +#define CUDA_DRIVER_CALL(x) \ + { \ + CUresult result = x; \ + if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \ + const char* msg; \ + cuGetErrorName(result, &msg); \ + TVM_FFI_THROW(CUDAError) << "" #x " failed with error: " << msg; \ + } \ + } +#endif + // Maximum number of GPU supported in CUDAModule (file-local). static constexpr const int kMaxNumGPUs = 32; @@ -204,7 +216,7 @@ class CUDAWrappedFunc { // invoke the function with void arguments void operator()(ffi::PackedArgs args, ffi::Any* rv, void** void_args) const { int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); ThreadWorkLoad wl = launch_param_config_.Extract(args); if (fcache_[device_id] == nullptr) { diff --git a/src/runtime/extra/contrib/cublas/cublas_json_runtime.cc b/src/runtime/extra/contrib/cublas/cublas_json_runtime.cc index 6520753e117b..34ba853505aa 100644 --- a/src/runtime/extra/contrib/cublas/cublas_json_runtime.cc +++ b/src/runtime/extra/contrib/cublas/cublas_json_runtime.cc @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" #include "../json/json_node.h" #include "../json/json_runtime.h" #include "cublas_utils.h" @@ -89,7 +89,7 @@ class CublasJSONRuntime : public JSONRuntimeBase { } if (device_id == -1) { - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); } auto* entry_ptr = tvm::contrib::CuBlasLtThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id}); cudaStream_t stream = static_cast(TVMFFIEnvGetStream(kDLCUDA, device_id)); diff --git a/src/runtime/extra/contrib/cublas/cublas_utils.cc b/src/runtime/extra/contrib/cublas/cublas_utils.cc index b8c239f13a85..dcd115026cda 100644 --- a/src/runtime/extra/contrib/cublas/cublas_utils.cc +++ b/src/runtime/extra/contrib/cublas/cublas_utils.cc @@ -23,10 +23,9 @@ #include "cublas_utils.h" #include +#include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" - namespace tvm { namespace contrib { @@ -51,7 +50,7 @@ CuBlasThreadEntry* CuBlasThreadEntry::ThreadLocal(DLDevice curr_device) { CuBlasLtThreadEntry::CuBlasLtThreadEntry() { CHECK_CUBLAS_ERROR(cublasLtCreate(&handle)); CHECK_CUBLAS_ERROR(cublasLtMatmulPreferenceCreate(&matmul_pref_desc)); - CUDA_CALL(cudaMalloc(&workspace_ptr, workspace_size)); + TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&workspace_ptr, workspace_size)); } CuBlasLtThreadEntry::~CuBlasLtThreadEntry() { diff --git a/src/runtime/extra/contrib/cudnn/conv_backward.cc b/src/runtime/extra/contrib/cudnn/conv_backward.cc index bfc65baaff93..ca711f4e784e 100644 --- a/src/runtime/extra/contrib/cudnn/conv_backward.cc +++ b/src/runtime/extra/contrib/cudnn/conv_backward.cc @@ -67,7 +67,7 @@ void BackwardDataFindAlgo(int format, int dims, int groups, const int pad[], con const int dx_dim[], const std::string& data_dtype, const std::string& conv_dtype, bool verbose, ffi::Any* ret) { int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id}); const int full_dims = dims + 2; std::vector dy_dim_int64(full_dims); @@ -146,7 +146,7 @@ void BackwardFilterFindAlgo(int format, int dims, int groups, const int pad[], c const int dw_dim[], const std::string& data_dtype, const std::string& conv_dtype, bool verbose, ffi::Any* ret) { int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id}); const int full_dims = dims + 2; std::vector x_dim_int64(full_dims); diff --git a/src/runtime/extra/contrib/cudnn/conv_forward.cc b/src/runtime/extra/contrib/cudnn/conv_forward.cc index 6c6fd7eb4036..befbddd78812 100644 --- a/src/runtime/extra/contrib/cudnn/conv_forward.cc +++ b/src/runtime/extra/contrib/cudnn/conv_forward.cc @@ -112,7 +112,7 @@ void FindAlgo(int format, int dims, int groups, const int pad[], const int strid const std::string& data_dtype, const std::string& conv_dtype, bool verbose, ffi::Any* ret) { int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id}); const int full_dims = dims + 2; std::vector x_dim_int64(full_dims); diff --git a/src/runtime/extra/contrib/cudnn/cudnn_frontend/attention.cc b/src/runtime/extra/contrib/cudnn/cudnn_frontend/attention.cc index f31a38bd472d..54d16b5a3966 100644 --- a/src/runtime/extra/contrib/cudnn/cudnn_frontend/attention.cc +++ b/src/runtime/extra/contrib/cudnn/cudnn_frontend/attention.cc @@ -24,10 +24,10 @@ #include "./attention.h" +#include #include #include -#include "../../../../../backend/cuda/runtime/cuda_common.h" #include "../cudnn_utils.h" namespace tvm { @@ -99,7 +99,7 @@ void CuDNNSDPARunnerNode::Init(int64_t batch, int64_t seq_len, int64_t num_heads TVM_FFI_ICHECK(stats == nullptr); o->set_output(true).set_dim({batch, num_heads, seq_len, head_size_v}).set_stride(o_stride); int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id}); CUDNN_FRONTEND_CALL(graph_->build(entry_ptr->handle, {cudnn_frontend::HeurMode_t::A})); } diff --git a/src/runtime/extra/contrib/cudnn/cudnn_json_runtime.cc b/src/runtime/extra/contrib/cudnn/cudnn_json_runtime.cc index a7cf1ec2b318..5b3756fd79b8 100644 --- a/src/runtime/extra/contrib/cudnn/cudnn_json_runtime.cc +++ b/src/runtime/extra/contrib/cudnn/cudnn_json_runtime.cc @@ -103,7 +103,7 @@ class cuDNNJSONRuntime : public JSONRuntimeBase { std::function GetConv2DExec(const JSONGraphNode& node) { int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); auto* entry_ptr = tvm::contrib::CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id}); auto op_name = node.GetOpName(); @@ -164,7 +164,7 @@ class cuDNNJSONRuntime : public JSONRuntimeBase { int algo = best_algo.cast(); std::function op_exec = [=, this]() { int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); cudaStream_t stream = static_cast(TVMFFIEnvGetStream(kDLCUDA, device_id)); CUDNN_CALL(cudnnSetStream(entry_ptr->handle, stream)); diff --git a/src/runtime/extra/contrib/cudnn/cudnn_utils.cc b/src/runtime/extra/contrib/cudnn/cudnn_utils.cc index 5793febf01ef..5c34d4a2b0a6 100644 --- a/src/runtime/extra/contrib/cudnn/cudnn_utils.cc +++ b/src/runtime/extra/contrib/cudnn/cudnn_utils.cc @@ -270,7 +270,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; refl::GlobalDef().def("tvm.contrib.cudnn.exists", []() -> bool { int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); return CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id}, false)->exists(); }); } diff --git a/src/runtime/extra/contrib/cudnn/cudnn_utils.h b/src/runtime/extra/contrib/cudnn/cudnn_utils.h index 81849c0f0c31..cef82134ad90 100644 --- a/src/runtime/extra/contrib/cudnn/cudnn_utils.h +++ b/src/runtime/extra/contrib/cudnn/cudnn_utils.h @@ -26,12 +26,11 @@ #include #include +#include #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" - namespace tvm { namespace contrib { diff --git a/src/runtime/extra/contrib/cudnn/softmax.cc b/src/runtime/extra/contrib/cudnn/softmax.cc index d494fb334946..f0562dcba073 100644 --- a/src/runtime/extra/contrib/cudnn/softmax.cc +++ b/src/runtime/extra/contrib/cudnn/softmax.cc @@ -41,7 +41,7 @@ void softmax_impl(cudnnSoftmaxAlgorithm_t alg, ffi::PackedArgs args, ffi::Any* r if (axis < 0) axis += ndim; TVM_FFI_ICHECK(axis >= 0 && axis < ndim); int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal(DLDevice{kDLCUDA, device_id}); entry_ptr->softmax_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(x->dtype); diff --git a/src/runtime/extra/contrib/curand/curand.cc b/src/runtime/extra/contrib/curand/curand.cc index 5f8ff0758373..b2fc7538657a 100644 --- a/src/runtime/extra/contrib/curand/curand.cc +++ b/src/runtime/extra/contrib/curand/curand.cc @@ -17,11 +17,11 @@ * under the License. */ #include +#include #include #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" #include "./helper_cuda_kernels.h" namespace tvm { diff --git a/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm100.cuh b/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm100.cuh index c30b34d0f41a..0a3068c5d9c2 100644 --- a/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm100.cuh +++ b/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm100.cuh @@ -17,6 +17,7 @@ * under the License. */ +#include #include #include @@ -25,8 +26,6 @@ #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" - // clang-format off #include "cutlass/cutlass.h" diff --git a/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh b/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh index dcb26b9071d5..552b46b0a031 100644 --- a/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh +++ b/src/runtime/extra/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh @@ -17,6 +17,7 @@ * under the License. */ +#include #include #include @@ -25,8 +26,6 @@ #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" - // clang-format off #include "cutlass/cutlass.h" diff --git a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh index 9c1962ad98d7..26eb35e34ee5 100644 --- a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh +++ b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm100.cuh @@ -17,6 +17,8 @@ * under the License. */ +#include + #include #include #include @@ -24,8 +26,6 @@ #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" - // clang-format off #include "cutlass/cutlass.h" diff --git a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh index 1955e47759f7..9307e33d7f8e 100644 --- a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh +++ b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm_runner_sm90.cuh @@ -17,6 +17,8 @@ * under the License. */ +#include + #include #include #include @@ -24,8 +26,6 @@ #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" - // clang-format off #include "cutlass/cutlass.h" diff --git a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_runner_sm100.cuh b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_runner_sm100.cuh index a516f31d3ce9..01e9ca7b8a20 100644 --- a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_runner_sm100.cuh +++ b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_runner_sm100.cuh @@ -17,14 +17,14 @@ * under the License. */ +#include + #include #include #include #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" - // clang-format off #include "cutlass/cutlass.h" diff --git a/src/runtime/extra/contrib/cutlass/gemm_runner.cuh b/src/runtime/extra/contrib/cutlass/gemm_runner.cuh index 58e1c9fbd006..174699bcec42 100644 --- a/src/runtime/extra/contrib/cutlass/gemm_runner.cuh +++ b/src/runtime/extra/contrib/cutlass/gemm_runner.cuh @@ -17,6 +17,7 @@ * under the License. */ +#include #include #include @@ -25,8 +26,6 @@ #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" - // clang-format off #include "cutlass/cutlass.h" diff --git a/src/runtime/extra/contrib/nvshmem/dist_gemm.cu b/src/runtime/extra/contrib/nvshmem/dist_gemm.cu index d975e140384d..26cc55a44218 100644 --- a/src/runtime/extra/contrib/nvshmem/dist_gemm.cu +++ b/src/runtime/extra/contrib/nvshmem/dist_gemm.cu @@ -19,12 +19,11 @@ #include #include #include +#include #include #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" - namespace tvm { namespace runtime { @@ -62,7 +61,7 @@ TVMStreamHandle stream_create() { DiscoWorker* worker = ThreadLocalDiscoWorker::Get()->worker; TVM_FFI_ICHECK(worker != nullptr) << "NVSHMEM stream creation failed: worker is not initialized"; cudaStream_t retval; - CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking)); + TVM_FFI_CHECK_CUDA_ERROR(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking)); return static_cast(retval); } @@ -100,7 +99,7 @@ void transfer_to_peers_reduce_scatter(Tensor semaphore, Tensor gemm_out, Tensor stream); } else { int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); TVMStreamHandle main_stream = TVMFFIEnvGetStream(kDLCUDA, device_id); copy_to_peer(get_pointer(staging_buffer, ffi::Shape{my_rank, 0, 0}), to_rank, get_pointer(gemm_out, ffi::Shape{to_rank * LOCAL_M, 0}), LOCAL_M * N * 2, diff --git a/src/runtime/extra/contrib/nvshmem/init.cc b/src/runtime/extra/contrib/nvshmem/init.cc index 698f3b68024f..aeb71bb6a19a 100644 --- a/src/runtime/extra/contrib/nvshmem/init.cc +++ b/src/runtime/extra/contrib/nvshmem/init.cc @@ -20,14 +20,13 @@ #include #include #include +#include #include #include #include #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" - namespace tvm { namespace runtime { @@ -66,7 +65,7 @@ void InitNVSHMEM(ffi::Shape uid_64, int num_workers, int worker_id_start) { nvshmemx_set_attr_uniqueid_args(worker_id, num_workers, &uid, &attr); nvshmemx_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID, &attr); int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); - CUDA_CALL(cudaSetDevice(mype_node)); + TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(mype_node)); if (worker != nullptr) { if (worker->default_device.device_type == DLDeviceType::kDLCPU) { worker->default_device = Device{DLDeviceType::kDLCUDA, mype_node}; @@ -153,7 +152,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { .def("runtime.nvshmem.cumodule_init", NVSHMEMXCumoduleInit) .def("runtime.disco.nvshmem.barrier_all_on_current_stream", []() { int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); TVMStreamHandle stream = TVMFFIEnvGetStream(kDLCUDA, device_id); NVSHMEMBarrierAllOnStream(stream); }); diff --git a/src/runtime/extra/contrib/nvshmem/memory_allocator.cc b/src/runtime/extra/contrib/nvshmem/memory_allocator.cc index ee9afd0eca3d..cb6e3520c8c1 100644 --- a/src/runtime/extra/contrib/nvshmem/memory_allocator.cc +++ b/src/runtime/extra/contrib/nvshmem/memory_allocator.cc @@ -18,13 +18,13 @@ */ #include #include +#include #include #include #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" #include "../../../memory/pooled_allocator.h" #include "../../disco/utils.h" diff --git a/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h b/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h index 4d92afb234d1..408d50cc7e08 100755 --- a/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h +++ b/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h @@ -23,10 +23,11 @@ #ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_CALIBRATOR_H_ #define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_CALIBRATOR_H_ +#include + #include #include -#include "../../../../backend/cuda/runtime/cuda_common.h" #include "NvInfer.h" namespace tvm { @@ -46,7 +47,7 @@ class TensorRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2 { } // Free buffers for (size_t i = 0; i < buffers_.size(); ++i) { - CUDA_CALL(cudaFree(buffers_[i])); + TVM_FFI_CHECK_CUDA_ERROR(cudaFree(buffers_[i])); } } @@ -55,8 +56,9 @@ class TensorRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2 { std::vector data_host(bindings.size(), nullptr); for (size_t i = 0; i < bindings.size(); ++i) { data_host[i] = new float[batch_size_ * binding_sizes[i]]; - CUDA_CALL(cudaMemcpy(static_cast(data_host[i]), bindings[i], - batch_size_ * binding_sizes[i] * sizeof(float), cudaMemcpyDeviceToHost)); + TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpy(static_cast(data_host[i]), bindings[i], + batch_size_ * binding_sizes[i] * sizeof(float), + cudaMemcpyDeviceToHost)); } data_.push_back(data_host); data_sizes_.push_back(binding_sizes); @@ -73,9 +75,10 @@ class TensorRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2 { TVM_FFI_ICHECK_EQ(input_names_.size(), nbBindings); for (size_t i = 0; i < input_names_.size(); ++i) { TVM_FFI_ICHECK_EQ(input_names_[i], names[i]); - CUDA_CALL(cudaMemcpy(buffers_[i], data_[num_batches_calibrated_][i], - batch_size_ * data_sizes_[num_batches_calibrated_][i] * sizeof(float), - cudaMemcpyHostToDevice)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaMemcpy(buffers_[i], data_[num_batches_calibrated_][i], + batch_size_ * data_sizes_[num_batches_calibrated_][i] * sizeof(float), + cudaMemcpyHostToDevice)); bindings[i] = buffers_[i]; } num_batches_calibrated_++; @@ -120,7 +123,7 @@ class TensorRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2 { const int num_inputs = data_sizes_[0].size(); buffers_.assign(num_inputs, nullptr); for (int i = 0; i < num_inputs; ++i) { - CUDA_CALL(cudaMalloc(&buffers_[i], data_sizes_[0][i] * sizeof(float))); + TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&buffers_[i], data_sizes_[0][i] * sizeof(float))); } } }; diff --git a/src/runtime/extra/contrib/thrust/thrust.cu b/src/runtime/extra/contrib/thrust/thrust.cu index b3613232311a..f534e1fc765f 100644 --- a/src/runtime/extra/contrib/thrust/thrust.cu +++ b/src/runtime/extra/contrib/thrust/thrust.cu @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -41,8 +42,6 @@ #include #include #include - -#include "../../../../backend/cuda/runtime/cuda_common.h" namespace tvm { namespace contrib { @@ -95,7 +94,7 @@ class WorkspaceMemoryResource : public thrust::mr::memory_resource { auto get_thrust_exec_policy(WorkspaceMemoryResource* memory_resouce) { int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); cudaStream_t stream = static_cast(TVMFFIEnvGetStream(kDLCUDA, device_id)); return thrust::cuda::par_nosync(memory_resouce).on(stream); } diff --git a/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc b/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc index 109ce6ed37a6..426557b7b7ad 100644 --- a/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc +++ b/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc @@ -18,13 +18,13 @@ */ #include +#include #include #include #include #include #include "../../../../../3rdparty/tensorrt_llm/custom_allreduce_kernels.h" -#include "../../../../backend/cuda/runtime/cuda_common.h" #include "../../../memory/pooled_allocator.h" #include "../nccl/nccl_context.h" @@ -45,20 +45,22 @@ using tvm::runtime::memory::Buffer; std::vector AllGatherIPCHandles(nccl::CCLThreadLocalContext* ctx, cudaIpcMemHandle_t local_handle) { void *d_src, *d_dst; - CUDA_CALL(cudaMalloc(&d_src, CUDA_IPC_HANDLE_SIZE)); - CUDA_CALL(cudaMalloc(&d_dst, CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers)); - CUDA_CALL(cudaMemcpy(d_src, &local_handle, CUDA_IPC_HANDLE_SIZE, cudaMemcpyHostToDevice)); + TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&d_src, CUDA_IPC_HANDLE_SIZE)); + TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&d_dst, CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers)); + TVM_FFI_CHECK_CUDA_ERROR( + cudaMemcpy(d_src, &local_handle, CUDA_IPC_HANDLE_SIZE, cudaMemcpyHostToDevice)); NCCL_CALL(ncclAllGather(d_src, d_dst, CUDA_IPC_HANDLE_SIZE, ncclChar, ctx->global_comm, /*stream=*/nullptr)); std::vector serial_handles(CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers, 0); - CUDA_CALL(cudaMemcpy(serial_handles.data(), d_dst, - CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers, cudaMemcpyDefault)); + TVM_FFI_CHECK_CUDA_ERROR(cudaMemcpy(serial_handles.data(), d_dst, + CUDA_IPC_HANDLE_SIZE * ctx->worker->num_workers, + cudaMemcpyDefault)); std::vector handles(ctx->worker->num_workers); for (int i = 0; i < ctx->worker->num_workers; ++i) { memcpy(handles[i].reserved, &serial_handles[i * CUDA_IPC_HANDLE_SIZE], CUDA_IPC_HANDLE_SIZE); } - CUDA_CALL(cudaFree(d_src)); - CUDA_CALL(cudaFree(d_dst)); + TVM_FFI_CHECK_CUDA_ERROR(cudaFree(d_src)); + TVM_FFI_CHECK_CUDA_ERROR(cudaFree(d_dst)); return handles; } @@ -115,7 +117,7 @@ class CUDAIPCMemoryAllocator final : public memory::PooledAllocator { void DeviceFreeDataSpace(Device dev, void* ptr) final { TVM_FFI_ICHECK(dev.device_type == kDLCUDA); - CUDA_CALL(cudaSetDevice(dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id)); nccl::CCLThreadLocalContext* ctx = nccl::CCLThreadLocalContext::Get(); auto it = ipc_memory_map_.find(ptr); TVM_FFI_ICHECK(it != ipc_memory_map_.end()); @@ -146,20 +148,20 @@ class CUDAIPCMemoryAllocator final : public memory::PooledAllocator { // Alloc local buffer TVM_FFI_ICHECK(dev.device_type == kDLCUDA); void* ptr; - CUDA_CALL(cudaSetDevice(dev.device_id)); - CUDA_CALL(cudaMalloc(&ptr, size)); + TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(dev.device_id)); + TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&ptr, size)); // Reset allocated memory to zero when required. // We explicitly synchronize after memset, to make sure memset finishes // before using all-gather to exchange IPC handles. // This is important to ensure the memory reset get ordered // before any other peers read the memory. if (reset_memory_to_zero) { - CUDA_CALL(cudaMemset(ptr, 0, size)); - CUDA_CALL(cudaDeviceSynchronize()); + TVM_FFI_CHECK_CUDA_ERROR(cudaMemset(ptr, 0, size)); + TVM_FFI_CHECK_CUDA_ERROR(cudaDeviceSynchronize()); } // Create ipc handle cudaIpcMemHandle_t local_handle; - CUDA_CALL(cudaIpcGetMemHandle(&local_handle, ptr)); + TVM_FFI_CHECK_CUDA_ERROR(cudaIpcGetMemHandle(&local_handle, ptr)); // All-gather IPC handles. nccl::CCLThreadLocalContext* ctx = nccl::CCLThreadLocalContext::Get(); std::vector handles = AllGatherIPCHandles(ctx, local_handle); @@ -170,8 +172,9 @@ class CUDAIPCMemoryAllocator final : public memory::PooledAllocator { comm_ptrs[node_id] = ptr; } else { uint8_t* foreign_buffer; - CUDA_CALL(cudaIpcOpenMemHandle(reinterpret_cast(&foreign_buffer), handles[node_id], - cudaIpcMemLazyEnablePeerAccess)); + TVM_FFI_CHECK_CUDA_ERROR(cudaIpcOpenMemHandle(reinterpret_cast(&foreign_buffer), + handles[node_id], + cudaIpcMemLazyEnablePeerAccess)); comm_ptrs[node_id] = foreign_buffer; } } @@ -183,10 +186,10 @@ class CUDAIPCMemoryAllocator final : public memory::PooledAllocator { for (int i = 0; i < static_cast(comm_ptrs.size()); ++i) { if (i != worker_id) { // Free ipc handle. - CUDA_CALL(cudaIpcCloseMemHandle(comm_ptrs[i])); + TVM_FFI_CHECK_CUDA_ERROR(cudaIpcCloseMemHandle(comm_ptrs[i])); } else { // Free local buffer. - CUDA_CALL(cudaFree(comm_ptrs[i])); + TVM_FFI_CHECK_CUDA_ERROR(cudaFree(comm_ptrs[i])); } } } diff --git a/src/runtime/extra/disco/nccl/nccl_context.h b/src/runtime/extra/disco/nccl/nccl_context.h index 3747fec43487..7a99be0897c0 100644 --- a/src/runtime/extra/disco/nccl/nccl_context.h +++ b/src/runtime/extra/disco/nccl/nccl_context.h @@ -35,8 +35,7 @@ #endif #if TVM_NCCL_RCCL_SWITCH == 0 #include - -#include "../../../../backend/cuda/runtime/cuda_common.h" +#include #else #include @@ -62,10 +61,16 @@ namespace nccl { using deviceStream_t = cudaStream_t; const constexpr DLDeviceType TVM_DISCO_DEVICE_TYPE = DLDeviceType::kDLCUDA; -inline void SetDevice(int device_id) { CUDA_CALL(cudaSetDevice(device_id)); } -inline void StreamSynchronize(deviceStream_t stream) { CUDA_CALL(cudaStreamSynchronize(stream)); } -inline void StreamCreate(deviceStream_t* stream) { CUDA_CALL(cudaStreamCreate(stream)); } -inline void StreamDestroy(deviceStream_t stream) { CUDA_CALL(cudaStreamDestroy(stream)); } +inline void SetDevice(int device_id) { TVM_FFI_CHECK_CUDA_ERROR(cudaSetDevice(device_id)); } +inline void StreamSynchronize(deviceStream_t stream) { + TVM_FFI_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream)); +} +inline void StreamCreate(deviceStream_t* stream) { + TVM_FFI_CHECK_CUDA_ERROR(cudaStreamCreate(stream)); +} +inline void StreamDestroy(deviceStream_t stream) { + TVM_FFI_CHECK_CUDA_ERROR(cudaStreamDestroy(stream)); +} #else diff --git a/src/runtime/vm/cuda/cuda_graph_builtin.cc b/src/runtime/vm/cuda/cuda_graph_builtin.cc index 58348fb695d8..b51b287d81d9 100644 --- a/src/runtime/vm/cuda/cuda_graph_builtin.cc +++ b/src/runtime/vm/cuda/cuda_graph_builtin.cc @@ -24,11 +24,11 @@ #include #include +#include #include #include #include -#include "../../../backend/cuda/runtime/cuda_common.h" #include "../../../support/utils.h" namespace tvm { namespace runtime { @@ -85,7 +85,7 @@ struct CUDAGraphCapturedState { ~CUDAGraphCapturedState() { if (exec) { - CUDA_CALL(cudaGraphExecDestroy(exec)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGraphExecDestroy(exec)); } } @@ -100,7 +100,7 @@ struct CUDAGraphCapturedState { class ScopedCUDAStream { public: - ScopedCUDAStream() { CUDA_CALL(cudaStreamCreate(&stream_)); } + ScopedCUDAStream() { TVM_FFI_CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); } ~ScopedCUDAStream() { cudaStreamDestroy(stream_); } ScopedCUDAStream(const ScopedCUDAStream&) = delete; ScopedCUDAStream(ScopedCUDAStream&&) = delete; @@ -116,11 +116,11 @@ class ScopedCUDAStream { class CUDACaptureStream { public: explicit CUDACaptureStream(cudaGraph_t* graph) : output_graph_(graph) { - CUDA_CALL(cudaGetDevice(&device_id_)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id_)); TVM_FFI_CHECK_SAFE_CALL( TVMFFIEnvSetStream(kDLCUDA, device_id_, capture_stream_, reinterpret_cast(&prev_default_stream_))); - CUDA_CALL(cudaStreamBeginCapture(capture_stream_, cudaStreamCaptureModeGlobal)); + TVM_FFI_CHECK_CUDA_ERROR(cudaStreamBeginCapture(capture_stream_, cudaStreamCaptureModeGlobal)); } ~CUDACaptureStream() noexcept(false) { cudaStreamEndCapture(capture_stream_, output_graph_); @@ -158,8 +158,8 @@ class CUDAGraphExtensionNode : public VMExtensionNode { // Launch CUDA graph const auto& [states, exec] = it->second; int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); - CUDA_CALL( + TVM_FFI_CHECK_CUDA_ERROR(cudaGetDevice(&device_id)); + TVM_FFI_CHECK_CUDA_ERROR( cudaGraphLaunch(exec, static_cast(TVMFFIEnvGetStream(kDLCUDA, device_id)))); return states; } @@ -190,8 +190,8 @@ class CUDAGraphExtensionNode : public VMExtensionNode { CUDAGraphCapturedState entry; entry.states = capture_func_rv.cast(); - CUDA_CALL(cudaGraphInstantiate(&entry.exec, graph, NULL, NULL, 0)); - CUDA_CALL(cudaGraphDestroy(graph)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGraphInstantiate(&entry.exec, graph, NULL, NULL, 0)); + TVM_FFI_CHECK_CUDA_ERROR(cudaGraphDestroy(graph)); ffi::ObjectRef states = entry.states; From a717665b05067e9e6e3cc888e0acc8c73a40cace Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 15 Jun 2026 11:46:49 +0000 Subject: [PATCH 2/2] [TESTS][CUDA] Relax OOM error match --- tests/python/relax/test_vm_builtin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/relax/test_vm_builtin.py b/tests/python/relax/test_vm_builtin.py index f46d61810402..677bf0b4a347 100644 --- a/tests/python/relax/test_vm_builtin.py +++ b/tests/python/relax/test_vm_builtin.py @@ -77,7 +77,7 @@ def main(): built = tvm.compile(Module, target=target) vm = relax.VirtualMachine(built, dev) - with pytest.raises(Exception, match="CUDA: out of memory"): + with pytest.raises(Exception, match="CUDA.*out of memory"): vm["main"]()