From d0fd5c8dc95adb92a51e26b35c657b05475b13d0 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 22 Jun 2026 18:23:39 -0600 Subject: [PATCH] Proof of concept for the support of AMD GPUs --- .github/workflows/build-hip.yml | 66 +++ .gitignore | 1 + CMakeLists.txt | 64 ++- cmake/cpp-highs.cmake | 2 +- highs/CMakeLists.txt | 114 ++-- highs/pdlp/cupdlp/cuda/CMakeLists.txt | 3 +- highs/pdlp/hipdlp/gpu_backend.hpp | 220 +++++++ highs/pdlp/hipdlp/pdhg.cc | 799 +++++++++++++------------- highs/pdlp/hipdlp/pdhg.cu | 256 +++++---- highs/pdlp/hipdlp/pdhg.hpp | 78 +-- highs/pdlp/hipdlp/pdhg_kernels.hpp | 24 +- 11 files changed, 983 insertions(+), 644 deletions(-) create mode 100644 .github/workflows/build-hip.yml create mode 100644 highs/pdlp/hipdlp/gpu_backend.hpp diff --git a/.github/workflows/build-hip.yml b/.github/workflows/build-hip.yml new file mode 100644 index 00000000000..1106d30f7d8 --- /dev/null +++ b/.github/workflows/build-hip.yml @@ -0,0 +1,66 @@ +name: build-hip + +# Compile-only CI for the AMD GPU (HIP/ROCm) backend of HiPDLP. +# +# This job runs inside an official ROCm container and only *compiles and links* +# the HIP backend (-DHIPDLP_HIP=ON). It does NOT execute any GPU code: the +# GitHub-hosted runners have no AMD GPU. Its purpose is to catch HIP build +# regressions (wrong hip* API names, desynchronised #ifdef guards, signature +# mismatches) that are invisible to the CPU/CUDA builds. +# +# Running the GPU code (numerical validation) requires a self-hosted runner +# with an AMD GPU and is intentionally out of scope here. + +on: + push: + paths: + - 'highs/pdlp/hipdlp/**' + - 'CMakeLists.txt' + - 'highs/CMakeLists.txt' + - '.github/workflows/build-hip.yml' + pull_request: + paths: + - 'highs/pdlp/hipdlp/**' + - 'CMakeLists.txt' + - 'highs/CMakeLists.txt' + - '.github/workflows/build-hip.yml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + hip-compile: + runs-on: ubuntu-latest + container: + # 24.04 + ROCm 7.x; the "-complete" image bundles hipBLAS/hipSPARSE. + image: rocm/dev-ubuntu-24.04:7.2.4-complete + + steps: + - uses: actions/checkout@v6 + + - name: Install build dependencies + shell: bash + run: | + apt-get update + apt-get install -y --no-install-recommends \ + cmake make git ca-certificates zlib1g-dev + cmake --version + hipcc --version || /opt/rocm/bin/hipcc --version + + - name: Configure CMake (HIP backend) + shell: bash + run: | + export PATH=/opt/rocm/bin:$PATH + export CMAKE_PREFIX_PATH=/opt/rocm + cmake -S . -B build_hip \ + -DHIPDLP_HIP=ON \ + -DBUILD_TESTING=OFF \ + -DBUILD_EXAMPLES=OFF \ + -DCMAKE_BUILD_TYPE=Release + + - name: Build (compile + link, no GPU execution) + shell: bash + run: | + export PATH=/opt/rocm/bin:$PATH + cmake --build build_hip --target highs --parallel diff --git a/.gitignore b/.gitignore index 20926b5870c..f6980c15fe9 100644 --- a/.gitignore +++ b/.gitignore @@ -220,6 +220,7 @@ $RECYCLE.BIN/ dist/ build/ build_release*/ +install/ eggs/ parts/ var/ diff --git a/CMakeLists.txt b/CMakeLists.txt index f28998adb01..ae69707ce47 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,11 +150,18 @@ message(STATUS "Use FindCUDAConf: ${CUPDLP_FIND_CUDA}") option(HIGHS_GPU_LIB "Build highs GPU as a single lib" OFF) message(STATUS "Build GPU with delayed loading: ${HIGHS_GPU_LIB}") +option(HIPDLP_HIP "Build HiPDLP with AMD HIP (ROCm)" OFF) +message(STATUS "Build HiPDLP with HIP: ${HIPDLP_HIP}") + # For now if (HIGHS_GPU_LIB) set(CUPDLP_GPU ON) endif() +if (HIPDLP_HIP) + set(HIPDLP_GPU ON) +endif() + if(CUPDLP_GPU AND CMAKE_VERSION VERSION_LESS "3.25.0") message("CUPDLP FindCUDAConf requires CMake version minumum 3.24. Please use a higher version of CMake.") endif() @@ -182,30 +189,59 @@ if (HIPO AND NOT FAST_BUILD) message(ERROR "HIPO is only available with FAST_BUILD=ON.") endif() -if (CUPDLP_GPU) +if (CUPDLP_GPU OR HIPDLP_GPU) if (WIN32) set(BUILD_SHARED_LIBS ON) endif() - set (CUPDLP_CPU OFF) - message(NOTICE "Set build cuPDLP with CUDA") + set(CUPDLP_CPU OFF) + + if (HIPDLP_HIP) + message(NOTICE "Set build HiPDLP with HIP (AMD ROCm)") + + # ROCm/HIP requires CMake 3.21+ + if(CMAKE_VERSION VERSION_LESS "3.21.0") + message(FATAL_ERROR "HIP support requires CMake >= 3.21") + endif() + + # Allow the user to point to a non-standard ROCm install + if(DEFINED ENV{ROCM_PATH}) + set(CMAKE_PREFIX_PATH "$ENV{ROCM_PATH}" ${CMAKE_PREFIX_PATH}) + elseif(EXISTS "/opt/rocm") + set(CMAKE_PREFIX_PATH "/opt/rocm" ${CMAKE_PREFIX_PATH}) + endif() + + enable_language(HIP) + find_package(hipblas REQUIRED) + find_package(hipsparse REQUIRED) + + set(GPU_LIBRARY roc::hipblas roc::hipsparse) + set(GPU_COMPILE_DEFINITIONS USE_HIP) - if (CUPDLP_FIND_CUDA) - # With FindCUDAConf.cmake - # Need to have the CUDA_HOME environment variable set. - include(FindCUDAConf) else() - # Without FindCUDAConf.cmake - enable_language(CUDA) - find_package(CUDAToolkit REQUIRED) + message(NOTICE "Set build cuPDLP with CUDA") + + if (CUPDLP_FIND_CUDA) + # With FindCUDAConf.cmake + # Need to have the CUDA_HOME environment variable set. + include(FindCUDAConf) + else() + # Without FindCUDAConf.cmake + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + + set(CUDA_LIBRARY-NOTFOUND, OFF) + set(CUDA_LIBRARY CUDA::cudart CUDA::cublas CUDA::cusparse) + endif() - set(CUDA_LIBRARY-NOTFOUND, OFF) - set(CUDA_LIBRARY CUDA::cudart CUDA::cublas CUDA::cusparse) + set(GPU_LIBRARY ${CUDA_LIBRARY}) + set(GPU_COMPILE_DEFINITIONS "") endif() else() - set (CUPDLP_CPU ON) - set(CUDA_LIBRARY-NOTFOUND true) + set(CUPDLP_CPU ON) + set(CUDA_LIBRARY-NOTFOUND true) + set(HIPDLP_GPU OFF) endif() # Option to build static diff --git a/cmake/cpp-highs.cmake b/cmake/cpp-highs.cmake index ddbb7fa0887..0138285bc11 100644 --- a/cmake/cpp-highs.cmake +++ b/cmake/cpp-highs.cmake @@ -70,7 +70,7 @@ if (NOT HIGHS_COVERAGE) APPEND FILE "${HIGHS_BINARY_DIR}/highs-targets.cmake") endif() -if (CUPDLP_GPU AND NOT HIGHS_GPU_LIB) +if (CUPDLP_GPU AND NOT HIGHS_GPU_LIB AND NOT HIPDLP_HIP) install(TARGETS cudalin EXPORT ${lower}-targets INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} diff --git a/highs/CMakeLists.txt b/highs/CMakeLists.txt index fbdd7d34e65..06e0515e0ce 100644 --- a/highs/CMakeLists.txt +++ b/highs/CMakeLists.txt @@ -194,69 +194,89 @@ else() target_sources(highs PRIVATE ${sources} ${headers} ${win_version_file}) - # Optional Cuda - if (CUPDLP_GPU) - - target_include_directories(highs PUBLIC "$") - - if (HIGHS_GPU_LIB) - # like in cuda CMakeLists - enable_language(CXX CUDA) - - # Remove /MP flag from CUDA targets (nvcc doesn't support it) - if(MSVC) - string(REPLACE "/MP" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") - string(REPLACE "/bigobj" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") - - foreach(flag_var - CMAKE_CUDA_FLAGS_DEBUG - CMAKE_CUDA_FLAGS_RELEASE - CMAKE_CUDA_FLAGS_RELWITHDEBINFO - CMAKE_CUDA_FLAGS_MINSIZEREL) - string(REPLACE "/MP" "" ${flag_var} "${${flag_var}}") - string(REPLACE "/bigobj" "" ${flag_var} "${${flag_var}}") - endforeach() - endif() + # Optional GPU (CUDA or HIP) + if (CUPDLP_GPU OR HIPDLP_GPU) - target_sources(highs PRIVATE - pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu - pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh - pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh - pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu - pdlp/cupdlp/cupdlp_cs.c - pdlp/hipdlp/pdhg.cu) + if (HIPDLP_HIP) + # HIP backend — pdhg.cu is compiled as HIP source. + # HIPDLP_GPU activates GPU code in HiPDLP headers. + # CUPDLP_CPU keeps cuPDLP-C in CPU mode so it doesn't pull CUDA headers. + target_compile_definitions(highs PRIVATE HIPDLP_GPU USE_HIP CUPDLP_CPU) - set_target_properties(highs PROPERTIES - CUDA_SEPARABLE_COMPILATION ON - #CUDA_ARCHITECTURES "60;70;75;80;86" ) - #Deleted 60;70 due to errors - #nvcc fatal : Unsupported gpu architecture 'compute_60' - #nvcc fatal : Unsupported gpu architecture 'compute_70' - CUDA_ARCHITECTURES "75;80;86" ) + set_source_files_properties(pdlp/hipdlp/pdhg.cu PROPERTIES LANGUAGE HIP) - # set_target_properties(cudalin PROPERTIES CUDA_ARCHITECTURES native) + target_sources(highs PRIVATE pdlp/hipdlp/pdhg.cu) - target_include_directories(highs PUBLIC "$") + set_target_properties(highs PROPERTIES + HIP_SEPARABLE_COMPILATION ON) if (WIN32) - target_link_libraries(highs PRIVATE ${CUDA_LIBRARY}) # linking happens here - else() # linux - target_link_libraries(highs PRIVATE ${CUDA_LIBRARY} m) + target_link_libraries(highs PRIVATE ${GPU_LIBRARY}) + else() + target_link_libraries(highs PRIVATE ${GPU_LIBRARY} m) endif() else() + # CUDA backend + target_include_directories(highs PUBLIC "$") + + if (HIGHS_GPU_LIB) + # like in cuda CMakeLists + enable_language(CXX CUDA) + + # Remove /MP flag from CUDA targets (nvcc doesn't support it) + if(MSVC) + string(REPLACE "/MP" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") + string(REPLACE "/bigobj" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") + + foreach(flag_var + CMAKE_CUDA_FLAGS_DEBUG + CMAKE_CUDA_FLAGS_RELEASE + CMAKE_CUDA_FLAGS_RELWITHDEBINFO + CMAKE_CUDA_FLAGS_MINSIZEREL) + string(REPLACE "/MP" "" ${flag_var} "${${flag_var}}") + string(REPLACE "/bigobj" "" ${flag_var} "${${flag_var}}") + endforeach() + endif() + + target_sources(highs PRIVATE + pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu + pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh + pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh + pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu + pdlp/cupdlp/cupdlp_cs.c + pdlp/hipdlp/pdhg.cu) + + set_target_properties(highs PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + #CUDA_ARCHITECTURES "60;70;75;80;86" ) + #Deleted 60;70 due to errors + #nvcc fatal : Unsupported gpu architecture 'compute_60' + #nvcc fatal : Unsupported gpu architecture 'compute_70' + CUDA_ARCHITECTURES "75;80;86" ) + + target_include_directories(highs PUBLIC "$") + + if (WIN32) + target_link_libraries(highs PRIVATE ${GPU_LIBRARY}) + else() + target_link_libraries(highs PRIVATE ${GPU_LIBRARY} m) + endif() + + else() set(CUPDLP_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/highs/pdlp/cupdlp/") add_subdirectory(pdlp/cupdlp/cuda) if (WIN32) - target_link_libraries(highs PRIVATE cudalin ${CUDA_LIBRARY}) - else() - target_link_libraries(highs PRIVATE cudalin ${CUDA_LIBRARY} m) - endif() + target_link_libraries(highs PRIVATE cudalin ${GPU_LIBRARY}) + else() + target_link_libraries(highs PRIVATE cudalin ${GPU_LIBRARY} m) + endif() - set_target_properties(highs PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + set_target_properties(highs PROPERTIES CUDA_SEPARABLE_COMPILATION ON) endif() endif() + endif() target_include_directories(highs PRIVATE ${include_dirs}) diff --git a/highs/pdlp/cupdlp/cuda/CMakeLists.txt b/highs/pdlp/cupdlp/cuda/CMakeLists.txt index 2cde626eefc..955ed8b0e6d 100644 --- a/highs/pdlp/cupdlp/cuda/CMakeLists.txt +++ b/highs/pdlp/cupdlp/cuda/CMakeLists.txt @@ -17,7 +17,7 @@ add_library(cudalin SHARED # latest has # set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -set_target_properties(cudalin PROPERTIES +set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON #CUDA_ARCHITECTURES "60;70;75;80;86" ) #Deleted 60;70 due to errors @@ -28,6 +28,7 @@ CUDA_ARCHITECTURES "75;80;86" ) # set_target_properties(cudalin PROPERTIES CUDA_ARCHITECTURES native) + target_include_directories(cudalin PUBLIC "$") if (WIN32) diff --git a/highs/pdlp/hipdlp/gpu_backend.hpp b/highs/pdlp/hipdlp/gpu_backend.hpp new file mode 100644 index 00000000000..a83020d031b --- /dev/null +++ b/highs/pdlp/hipdlp/gpu_backend.hpp @@ -0,0 +1,220 @@ +#pragma once + +// GPU portability layer for HiPDLP. +// Provides unified gpu* types and macros that map to either the CUDA or HIP +// runtime depending on whether USE_HIP is defined at compile time. + +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) + +#ifdef USE_HIP +#include +#include +#include + +using gpuStream_t = hipStream_t; +using gpuSparseHandle_t = hipsparseHandle_t; +using gpuBlasHandle_t = hipblasHandle_t; +using gpuSpMatDescr_t = hipsparseSpMatDescr_t; +using gpuDnVecDescr_t = hipsparseDnVecDescr_t; +using gpuError_t = hipError_t; +using gpuBlasStatus_t = hipblasStatus_t; +using gpuSparseStatus_t = hipsparseStatus_t; + +#define gpuMalloc(ptr, size) hipMalloc(ptr, size) +#define gpuFree(ptr) hipFree(ptr) +#define gpuMemcpy(dst, src, size, kind) hipMemcpy(dst, src, size, kind) +#define gpuMemcpyAsync(dst, src, size, kind, stream) \ + hipMemcpyAsync(dst, src, size, kind, stream) +#define gpuMemset(ptr, val, size) hipMemset(ptr, val, size) +#define gpuMemsetAsync(ptr, val, size, stream) \ + hipMemsetAsync(ptr, val, size, stream) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice + +#define gpuStreamCreate(s) hipStreamCreate(s) +#define gpuStreamDestroy(s) hipStreamDestroy(s) +#define gpuStreamSynchronize(s) hipStreamSynchronize(s) +#define gpuDeviceSynchronize() hipDeviceSynchronize() +#define gpuGetLastError() hipGetLastError() +#define gpuGetErrorString(e) hipGetErrorString(e) +#define gpuSuccess hipSuccess + +#define gpuBlasCreate(h) hipblasCreate(h) +#define gpuBlasDestroy(h) hipblasDestroy(h) +#define gpuBlasSetStream(h, s) hipblasSetStream(h, s) +#define gpuBlasDdot(h,n,x,ix,y,iy,r) hipblasDdot(h,n,x,ix,y,iy,r) +#define gpuBlasDnrm2(h,n,x,ix,r) hipblasDnrm2(h,n,x,ix,r) +#define gpuBlasDaxpy(h,n,a,x,ix,y,iy) hipblasDaxpy(h,n,a,x,ix,y,iy) +#define gpuBlasDscal(h,n,a,x,ix) hipblasDscal(h,n,a,x,ix) +#define GPU_BLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS + +#define gpuSparseCreate(h) hipsparseCreate(h) +#define gpuSparseDestroy(h) hipsparseDestroy(h) +#define gpuSparseSetStream(h, s) hipsparseSetStream(h, s) +#define gpuSparseCreateCsr(mat, r, c, nnz, rp, ci, cv, rpt, cit, vt, dt) \ + hipsparseCreateCsr(mat, r, c, nnz, rp, ci, cv, rpt, cit, vt, dt) +#define gpuSparseDestroySpMat(m) hipsparseDestroySpMat(m) +#define gpuSparseCreateDnVec(v, s, d, t) \ + hipsparseCreateDnVec(v, s, d, t) +#define gpuSparseDestroyDnVec(v) hipsparseDestroyDnVec(v) +#define gpuSparseSpMV_bufferSize(h, op, alpha, mat, x, beta, y, dt, alg, buf) \ + hipsparseSpMV_bufferSize(h, op, alpha, mat, x, beta, y, dt, alg, buf) +#define gpuSparseSpMV(h, op, alpha, mat, x, beta, y, dt, alg, buf) \ + hipsparseSpMV(h, op, alpha, mat, x, beta, y, dt, alg, buf) +#define GPU_SPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define gpuSparseGetErrorString(s) hipsparseGetErrorString(s) + +#define GPU_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define GPU_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE +#define GPU_INDEX_32I HIPSPARSE_INDEX_32I +#define GPU_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO +#define GPU_R_64F HIP_R_64F +#define GPU_SPMV_ALG_DEFAULT HIPSPARSE_SPMV_ALG_DEFAULT +#define GPU_SPMV_CSR_ALG2 HIPSPARSE_SPMV_CSR_ALG2 + +// Device info +#define gpuGetDeviceCount(n) hipGetDeviceCount(n) +#define gpuGetDeviceProperties(p, d) hipGetDeviceProperties(p, d) +using gpuDeviceProp_t = hipDeviceProp_t; + +// Sparse vector update +#define gpuSparseSetDnVecValues(v, p) hipsparseDnVecSetValues(v, p) + +#define gpuSparseSpMV_preprocess(h, op, a, mat, x, b, y, dt, alg, buf) \ + hipsparseSpMV_preprocess(h, op, a, mat, x, b, y, dt, alg, buf) + +// Graph API +using gpuGraph_t = hipGraph_t; +using gpuGraphExec_t = hipGraphExec_t; +#define gpuStreamBeginCapture(s, m) hipStreamBeginCapture(s, m) +#define gpuStreamEndCapture(s, g) hipStreamEndCapture(s, g) +#define gpuStreamCaptureModeGlobal hipStreamCaptureModeGlobal +#define gpuGraphInstantiate(ge, g, n, e, f) hipGraphInstantiate(ge, g, n, e, f) +#define gpuGraphDestroy(g) hipGraphDestroy(g) +#define gpuGraphLaunch(ge, s) hipGraphLaunch(ge, s) +#define gpuGraphExecDestroy(ge) hipGraphExecDestroy(ge) + +#else +#include +#include +#include + +using gpuStream_t = cudaStream_t; +using gpuSparseHandle_t = cusparseHandle_t; +using gpuBlasHandle_t = cublasHandle_t; +using gpuSpMatDescr_t = cusparseSpMatDescr_t; +using gpuDnVecDescr_t = cusparseDnVecDescr_t; +using gpuError_t = cudaError_t; +using gpuBlasStatus_t = cublasStatus_t; +using gpuSparseStatus_t = cusparseStatus_t; + +#define gpuMalloc(ptr, size) cudaMalloc(ptr, size) +#define gpuFree(ptr) cudaFree(ptr) +#define gpuMemcpy(dst, src, size, kind) cudaMemcpy(dst, src, size, kind) +#define gpuMemcpyAsync(dst, src, size, kind, stream) \ + cudaMemcpyAsync(dst, src, size, kind, stream) +#define gpuMemset(ptr, val, size) cudaMemset(ptr, val, size) +#define gpuMemsetAsync(ptr, val, size, stream) \ + cudaMemsetAsync(ptr, val, size, stream) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice + +#define gpuStreamCreate(s) cudaStreamCreate(s) +#define gpuStreamDestroy(s) cudaStreamDestroy(s) +#define gpuStreamSynchronize(s) cudaStreamSynchronize(s) +#define gpuDeviceSynchronize() cudaDeviceSynchronize() +#define gpuGetLastError() cudaGetLastError() +#define gpuGetErrorString(e) cudaGetErrorString(e) +#define gpuSuccess cudaSuccess + +#define gpuBlasCreate(h) cublasCreate(h) +#define gpuBlasDestroy(h) cublasDestroy(h) +#define gpuBlasSetStream(h, s) cublasSetStream(h, s) +#define gpuBlasDdot(h,n,x,ix,y,iy,r) cublasDdot(h,n,x,ix,y,iy,r) +#define gpuBlasDnrm2(h,n,x,ix,r) cublasDnrm2(h,n,x,ix,r) +#define gpuBlasDaxpy(h,n,a,x,ix,y,iy) cublasDaxpy(h,n,a,x,ix,y,iy) +#define gpuBlasDscal(h,n,a,x,ix) cublasDscal(h,n,a,x,ix) +#define GPU_BLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS + +#define gpuSparseCreate(h) cusparseCreate(h) +#define gpuSparseDestroy(h) cusparseDestroy(h) +#define gpuSparseSetStream(h, s) cusparseSetStream(h, s) +#define gpuSparseCreateCsr(mat, r, c, nnz, rp, ci, cv, rpt, cit, vt, dt) \ + cusparseCreateCsr(mat, r, c, nnz, rp, ci, cv, rpt, cit, vt, dt) +#define gpuSparseDestroySpMat(m) cusparseDestroySpMat(m) +#define gpuSparseCreateDnVec(v, s, d, t) \ + cusparseCreateDnVec(v, s, d, t) +#define gpuSparseDestroyDnVec(v) cusparseDestroyDnVec(v) +#define gpuSparseSpMV_bufferSize(h, op, alpha, mat, x, beta, y, dt, alg, buf) \ + cusparseSpMV_bufferSize(h, op, alpha, mat, x, beta, y, dt, alg, buf) +#define gpuSparseSpMV(h, op, alpha, mat, x, beta, y, dt, alg, buf) \ + cusparseSpMV(h, op, alpha, mat, x, beta, y, dt, alg, buf) +#define GPU_SPARSE_STATUS_SUCCESS CUSPARSE_STATUS_SUCCESS +#define gpuSparseGetErrorString(s) cusparseGetErrorString(s) + +#define GPU_OPERATION_NON_TRANSPOSE CUSPARSE_OPERATION_NON_TRANSPOSE +#define GPU_OPERATION_TRANSPOSE CUSPARSE_OPERATION_TRANSPOSE +#define GPU_INDEX_32I CUSPARSE_INDEX_32I +#define GPU_INDEX_BASE_ZERO CUSPARSE_INDEX_BASE_ZERO +#define GPU_R_64F CUDA_R_64F +#define GPU_SPMV_ALG_DEFAULT CUSPARSE_SPMV_ALG_DEFAULT +#define GPU_SPMV_CSR_ALG2 CUSPARSE_SPMV_CSR_ALG2 + +// Device info +#define gpuGetDeviceCount(n) cudaGetDeviceCount(n) +#define gpuGetDeviceProperties(p, d) cudaGetDeviceProperties(p, d) +using gpuDeviceProp_t = cudaDeviceProp; + +// Sparse vector update +#define gpuSparseSetDnVecValues(v, p) cusparseDnVecSetValues(v, p) + +// cuSPARSE preprocess for SpMV (no-op equivalent in hipSPARSE) +#define gpuSparseSpMV_preprocess(h, op, a, mat, x, b, y, dt, alg, buf) \ + cusparseSpMV_preprocess(h, op, a, mat, x, b, y, dt, alg, buf) + +// Graph API +using gpuGraph_t = cudaGraph_t; +using gpuGraphExec_t = cudaGraphExec_t; +#define gpuStreamBeginCapture(s, m) cudaStreamBeginCapture(s, m) +#define gpuStreamEndCapture(s, g) cudaStreamEndCapture(s, g) +#define gpuStreamCaptureModeGlobal cudaStreamCaptureModeGlobal +#define gpuGraphInstantiate(ge, g, n, e, f) cudaGraphInstantiate(ge, g, n, e, f) +#define gpuGraphDestroy(g) cudaGraphDestroy(g) +#define gpuGraphLaunch(ge, s) cudaGraphLaunch(ge, s) +#define gpuGraphExecDestroy(ge) cudaGraphExecDestroy(ge) + +#endif // USE_HIP + +#define GPU_CHECK(call) \ + do { \ + gpuError_t err = (call); \ + if (err != gpuSuccess) { \ + fprintf(stderr, "GPU Error at %s:%d: %s\n", __FILE__, __LINE__, \ + gpuGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define GPU_BLAS_CHECK(call) \ + do { \ + gpuBlasStatus_t status = (call); \ + if (status != GPU_BLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "GPU BLAS Error at %s:%d: %d\n", __FILE__, __LINE__, \ + (int)status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define GPU_SPARSE_CHECK(call) \ + do { \ + gpuSparseStatus_t status = (call); \ + if (status != GPU_SPARSE_STATUS_SUCCESS) { \ + fprintf(stderr, "GPU Sparse Error at %s:%d: %s\n", __FILE__, \ + __LINE__, gpuSparseGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#endif // CUPDLP_GPU or HIPDLP_GPU diff --git a/highs/pdlp/hipdlp/pdhg.cc b/highs/pdlp/hipdlp/pdhg.cc index 3e08c8d752d..9546f39ecaf 100644 --- a/highs/pdlp/hipdlp/pdhg.cc +++ b/highs/pdlp/hipdlp/pdhg.cc @@ -15,10 +15,8 @@ #include #include -#ifdef CUPDLP_GPU -#include -#include -#include +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) +#include "gpu_backend.hpp" #endif #include "defs.hpp" @@ -497,7 +495,7 @@ void PDLPSolver::solve(std::vector& x, std::vector& y) { #endif // 1. Initialization and setup -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) setupGpu(); #endif initializeStepSizes(); @@ -521,27 +519,27 @@ void PDLPSolver::solve(std::vector& x, std::vector& y) { halpern_iteration_ = 0; } -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) if (!params_.use_halpern_restart) { - CUDA_CHECK(cudaMemset(d_x_sum_, 0, lp_.num_col_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_y_sum_, 0, lp_.num_row_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_x_avg_, 0, lp_.num_col_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_y_avg_, 0, lp_.num_row_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_x_sum_, 0, lp_.num_col_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_y_sum_, 0, lp_.num_row_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_x_avg_, 0, lp_.num_col_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_y_avg_, 0, lp_.num_row_ * sizeof(double))); } sum_weights_gpu_ = 0.0; - CUDA_CHECK(cudaMemcpy(d_x_current_, x_current_.data(), - lp_.num_col_ * sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_y_current_, y_current_.data(), - lp_.num_row_ * sizeof(double), cudaMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_x_current_, x_current_.data(), + lp_.num_col_ * sizeof(double), gpuMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_y_current_, y_current_.data(), + lp_.num_row_ * sizeof(double), gpuMemcpyHostToDevice)); if (params_.use_halpern_restart) { - CUDA_CHECK(cudaMemcpy(d_x_anchor_, d_x_current_, + GPU_CHECK(gpuMemcpy(d_x_anchor_, d_x_current_, lp_.num_col_ * sizeof(double), - cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaMemcpy(d_y_anchor_, d_y_current_, + gpuMemcpyDeviceToDevice)); + GPU_CHECK(gpuMemcpy(d_y_anchor_, d_y_current_, lp_.num_row_ * sizeof(double), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); } linalgGpuAx(d_x_current_, d_ax_current_); @@ -567,9 +565,9 @@ void PDLPSolver::solve(std::vector& x, std::vector& y) { return; } -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) bool graph_created = false; - cudaGraphExec_t graphExec = nullptr; + gpuGraphExec_t graphExec = nullptr; #endif // 2. Main cuPDLPx-style Loop @@ -582,15 +580,11 @@ void PDLPSolver::solve(std::vector& x, std::vector& y) { } // -- Step 1 (Major, isolated for restart-FPE check) -- -#ifdef CUPDLP_GPU - CUDA_CHECK(cudaMemcpyAsync(d_primal_step_size_, &stepsize_.primal_step, - sizeof(double), cudaMemcpyHostToDevice, - gpu_stream_)); - CUDA_CHECK(cudaMemcpyAsync(d_dual_step_size_, &stepsize_.dual_step, - sizeof(double), cudaMemcpyHostToDevice, - gpu_stream_)); - CUDA_CHECK(cudaMemcpyAsync(d_halpern_iteration_, &halpern_iteration_, - sizeof(int), cudaMemcpyHostToDevice, +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) + GpuStepParams h_step_params{stepsize_.primal_step, stepsize_.dual_step, + halpern_iteration_}; + GPU_CHECK(gpuMemcpyAsync(d_step_params_, &h_step_params, + sizeof(GpuStepParams), gpuMemcpyHostToDevice, gpu_stream_)); performHalpernPdhgStepGpu(true, 1); #else @@ -598,7 +592,7 @@ void PDLPSolver::solve(std::vector& x, std::vector& y) { #endif if (do_restart) { -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) fpe_ = computeFixedPointErrorGpu(); #else fpe_ = computeFixedPointError(); @@ -607,24 +601,24 @@ void PDLPSolver::solve(std::vector& x, std::vector& y) { do_restart = false; } // -- Steps 2 to PDHG_CHECK_INTERVAL - 1 (Minor) -- -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) if (!graph_created) { - CUDA_CHECK( - cudaStreamBeginCapture(gpu_stream_, cudaStreamCaptureModeGlobal)); + GPU_CHECK( + gpuStreamBeginCapture(gpu_stream_, gpuStreamCaptureModeGlobal)); for (int i = 2; i <= PDHG_CHECK_INTERVAL - 1; i++) { performHalpernPdhgStepGpu(false, i); } performHalpernPdhgStepGpu(true, PDHG_CHECK_INTERVAL); - cudaGraph_t graph; - CUDA_CHECK(cudaStreamEndCapture(gpu_stream_, &graph)); - CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - CUDA_CHECK(cudaGraphDestroy(graph)); + gpuGraph_t graph; + GPU_CHECK(gpuStreamEndCapture(gpu_stream_, &graph)); + GPU_CHECK(gpuGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + GPU_CHECK(gpuGraphDestroy(graph)); graph_created = true; } - CUDA_CHECK(cudaGraphLaunch(graphExec, gpu_stream_)); + GPU_CHECK(gpuGraphLaunch(graphExec, gpu_stream_)); #else for (int i = 2; i <= PDHG_CHECK_INTERVAL - 1; i++) { performHalpernPdhgStep(false, i); @@ -633,7 +627,7 @@ void PDLPSolver::solve(std::vector& x, std::vector& y) { #endif // Compute Error for Restart Check -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) fpe_ = computeFixedPointErrorGpu(); #else fpe_ = computeFixedPointError(); @@ -664,19 +658,19 @@ void PDLPSolver::solve(std::vector& x, std::vector& y) { if (params_.step_size_strategy == StepSizeStrategy::PID) { updatePrimalWeightAtRestart(results_); } -#ifdef CUPDLP_GPU - CUDA_CHECK(cudaMemcpy(d_x_anchor_, d_pdhg_primal_, +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) + GPU_CHECK(gpuMemcpy(d_x_anchor_, d_pdhg_primal_, lp_.num_col_ * sizeof(double), - cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaMemcpy(d_y_anchor_, d_pdhg_dual_, + gpuMemcpyDeviceToDevice)); + GPU_CHECK(gpuMemcpy(d_y_anchor_, d_pdhg_dual_, lp_.num_row_ * sizeof(double), - cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaMemcpy(d_x_current_, d_pdhg_primal_, + gpuMemcpyDeviceToDevice)); + GPU_CHECK(gpuMemcpy(d_x_current_, d_pdhg_primal_, lp_.num_col_ * sizeof(double), - cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaMemcpy(d_y_current_, d_pdhg_dual_, + gpuMemcpyDeviceToDevice)); + GPU_CHECK(gpuMemcpy(d_y_current_, d_pdhg_dual_, lp_.num_row_ * sizeof(double), - cudaMemcpyDeviceToDevice)); + gpuMemcpyDeviceToDevice)); #else x_anchor_ = x_next_; y_anchor_ = y_next_; @@ -691,9 +685,9 @@ void PDLPSolver::solve(std::vector& x, std::vector& y) { } } -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) if (graphExec) { - CUDA_CHECK(cudaGraphExecDestroy(graphExec)); + GPU_CHECK(gpuGraphExecDestroy(graphExec)); } #endif @@ -738,23 +732,23 @@ double PDLPSolver::computeFixedPointError() { return std::sqrt(std::max(0.0, movement + interaction)); } -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) double PDLPSolver::computeFixedPointErrorGpu() { double alpha_minus_one = -1.0; // 1. delta_x = x_next_ - reflected_x_ // (Assuming d_pdhg_primal_ maps to x_next_ and d_x_next_ is used as // reflected_x_ in your minor/major steps) - CUDA_CHECK(cudaMemcpy(d_delta_x_, d_pdhg_primal_, + GPU_CHECK(gpuMemcpy(d_delta_x_, d_pdhg_primal_, a_num_cols_ * sizeof(double), - cudaMemcpyDeviceToDevice)); - CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_cols_, &alpha_minus_one, + gpuMemcpyDeviceToDevice)); + GPU_BLAS_CHECK(gpuBlasDaxpy(cublas_handle_, a_num_cols_, &alpha_minus_one, d_x_next_, 1, d_delta_x_, 1)); // 2. delta_y = y_next_ - reflected_y_ - CUDA_CHECK(cudaMemcpy(d_delta_y_, d_pdhg_dual_, a_num_rows_ * sizeof(double), - cudaMemcpyDeviceToDevice)); - CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_rows_, &alpha_minus_one, + GPU_CHECK(gpuMemcpy(d_delta_y_, d_pdhg_dual_, a_num_rows_ * sizeof(double), + gpuMemcpyDeviceToDevice)); + GPU_BLAS_CHECK(gpuBlasDaxpy(cublas_handle_, a_num_rows_, &alpha_minus_one, d_y_next_, 1, d_delta_y_, 1)); // 3. AT_delta_y = A^T * delta_y @@ -763,11 +757,11 @@ double PDLPSolver::computeFixedPointErrorGpu() { // 4. Compute norms and cross term double primal_norm = 0.0, dual_norm = 0.0, cross_term = 0.0; - CUBLAS_CHECK( - cublasDnrm2(cublas_handle_, a_num_cols_, d_delta_x_, 1, &primal_norm)); - CUBLAS_CHECK( - cublasDnrm2(cublas_handle_, a_num_rows_, d_delta_y_, 1, &dual_norm)); - CUBLAS_CHECK(cublasDdot(cublas_handle_, a_num_cols_, d_delta_x_, 1, + GPU_BLAS_CHECK( + gpuBlasDnrm2(cublas_handle_, a_num_cols_, d_delta_x_, 1, &primal_norm)); + GPU_BLAS_CHECK( + gpuBlasDnrm2(cublas_handle_, a_num_rows_, d_delta_y_, 1, &dual_norm)); + GPU_BLAS_CHECK(gpuBlasDdot(cublas_handle_, a_num_cols_, d_delta_x_, 1, d_AT_delta_y_, 1, &cross_term)); double primal_norm_sq = primal_norm * primal_norm; @@ -785,7 +779,7 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector& output_x, std::vector& output_y, TerminationStatus& status) { // 1. Compute Average Iterate (GPU or CPU) -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) computeAverageIterateGpu(); #else computeAverageIterate(Ax_avg_, ATy_avg_); @@ -797,7 +791,7 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector& output_x, bool average_converged = false; // 2. Check Convergence (GPU or CPU) -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) // Only use Halpern PDHG slack/iterates if we have performed at least one // iteration if (params_.use_halpern_restart && d_pdhg_primal_ != nullptr && iter > 0) { @@ -843,7 +837,7 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector& output_x, bool prefer_avg = average_converged; // Copy result to output vectors -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) double* src_x = prefer_avg ? d_x_avg_ : (params_.use_halpern_restart ? d_pdhg_primal_ : d_x_current_); @@ -853,16 +847,16 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector& output_x, double* src_sp = prefer_avg ? d_dSlackPosAvg_ : d_dSlackPos_; double* src_sn = prefer_avg ? d_dSlackNegAvg_ : d_dSlackNeg_; - CUDA_CHECK(cudaMemcpy(output_x.data(), src_x, lp_.num_col_ * sizeof(double), - cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(output_y.data(), src_y, lp_.num_row_ * sizeof(double), - cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(dSlackPos_.data(), src_sp, + GPU_CHECK(gpuMemcpy(output_x.data(), src_x, lp_.num_col_ * sizeof(double), + gpuMemcpyDeviceToHost)); + GPU_CHECK(gpuMemcpy(output_y.data(), src_y, lp_.num_row_ * sizeof(double), + gpuMemcpyDeviceToHost)); + GPU_CHECK(gpuMemcpy(dSlackPos_.data(), src_sp, lp_.num_col_ * sizeof(double), - cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(dSlackNeg_.data(), src_sn, + gpuMemcpyDeviceToHost)); + GPU_CHECK(gpuMemcpy(dSlackNeg_.data(), src_sn, lp_.num_col_ * sizeof(double), - cudaMemcpyDeviceToHost)); + gpuMemcpyDeviceToHost)); #else if (prefer_avg) { output_x = x_avg_; @@ -1028,7 +1022,7 @@ void PDLPSolver::accumulateAverages(size_t iter) { HighsInt inner_iter = iter - restart_scheme_.getLastRestartIter(); double dMeanStepSize = std::sqrt(stepsize_.primal_step * stepsize_.dual_step); -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) if (params_.use_halpern_restart) { // If Halpern, we average the 'current' blended iterate launchKernelUpdateAverages_wrapper(d_x_sum_, d_y_sum_, d_x_current_, @@ -1061,20 +1055,12 @@ void PDLPSolver::prepareNextIteration() { // Just ensure caches are correct if not done in PerformHalpernStep return; } else { - // Standard PDHG: x_current becomes x_next -#ifdef CUPDLP_GPU - CUDA_CHECK(cudaMemcpy(d_x_current_, d_x_next_, - lp_.num_col_ * sizeof(double), - cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaMemcpy(d_y_current_, d_y_next_, - lp_.num_row_ * sizeof(double), - cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaMemcpy(d_ax_current_, d_ax_next_, - lp_.num_row_ * sizeof(double), - cudaMemcpyDeviceToDevice)); - CUDA_CHECK(cudaMemcpy(d_aty_current_, d_aty_next_, - lp_.num_col_ * sizeof(double), - cudaMemcpyDeviceToDevice)); + // Standard PDHG: swap device pointers instead of copying device memory. +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) + std::swap(d_x_current_, d_x_next_); + std::swap(d_y_current_, d_y_next_); + std::swap(d_ax_current_, d_ax_next_); + std::swap(d_aty_current_, d_aty_next_); #else x_current_ = x_next_; y_current_ = y_next_; @@ -1083,7 +1069,7 @@ void PDLPSolver::prepareNextIteration() { #endif } } -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) void PDLPSolver::performHalpernPdhgStepGpu(bool is_major, int k_offset) { linalgGpuATy(d_y_current_, d_aty_current_); @@ -1126,7 +1112,7 @@ void PDLPSolver::performHalpernPdhgStepGpu(bool is_major, int k_offset) { #endif void PDLPSolver::solveReturn(const TerminationStatus term_code) { -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) cleanupGpu(); #endif results_.term_code = term_code; @@ -1669,7 +1655,7 @@ double PDLPSolver::powerMethod() { return op_norm_sq; } -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) double PDLPSolver::powerMethodGpu() { if (a_num_rows_ == 0 || a_num_cols_ == 0) return 1.0; @@ -1686,9 +1672,9 @@ double PDLPSolver::powerMethodGpu() { size_t buffer_size_at = 0; size_t buffer_size_a = 0; - CUDA_CHECK(cudaMalloc(&d_eigenvector, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_next_eigenvector, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_dual_product, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_eigenvector, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_next_eigenvector, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_dual_product, a_num_cols_ * sizeof(double))); std::vector eigenvector_h(a_num_rows_); std::mt19937 engine_fixed_seed(12345); @@ -1697,84 +1683,84 @@ double PDLPSolver::powerMethodGpu() { eigenvector_h[i] = distribution(engine_fixed_seed); } - CUDA_CHECK(cudaMemcpy(d_eigenvector, eigenvector_h.data(), - a_num_rows_ * sizeof(double), cudaMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_eigenvector, eigenvector_h.data(), + a_num_rows_ * sizeof(double), gpuMemcpyHostToDevice)); - cusparseDnVecDescr_t vecEigen = nullptr; - cusparseDnVecDescr_t vecNextEigen = nullptr; - cusparseDnVecDescr_t vecDual = nullptr; - CUSPARSE_CHECK( - cusparseCreateDnVec(&vecEigen, a_num_rows_, d_eigenvector, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&vecNextEigen, a_num_rows_, - d_next_eigenvector, CUDA_R_64F)); - CUSPARSE_CHECK( - cusparseCreateDnVec(&vecDual, a_num_cols_, d_dual_product, CUDA_R_64F)); + gpuDnVecDescr_t vecEigen = nullptr; + gpuDnVecDescr_t vecNextEigen = nullptr; + gpuDnVecDescr_t vecDual = nullptr; + GPU_SPARSE_CHECK( + gpuSparseCreateDnVec(&vecEigen, a_num_rows_, d_eigenvector, GPU_R_64F)); + GPU_SPARSE_CHECK(gpuSparseCreateDnVec(&vecNextEigen, a_num_rows_, + d_next_eigenvector, GPU_R_64F)); + GPU_SPARSE_CHECK( + gpuSparseCreateDnVec(&vecDual, a_num_cols_, d_dual_product, GPU_R_64F)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, mat_a_T_csr_, - vecEigen, &zero, vecDual, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, + GPU_SPARSE_CHECK(gpuSparseSpMV_bufferSize( + cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &one, mat_a_T_csr_, + vecEigen, &zero, vecDual, GPU_R_64F, GPU_SPMV_CSR_ALG2, &buffer_size_at)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, mat_a_csr_, - vecDual, &zero, vecNextEigen, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, + GPU_SPARSE_CHECK(gpuSparseSpMV_bufferSize( + cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &one, mat_a_csr_, + vecDual, &zero, vecNextEigen, GPU_R_64F, GPU_SPMV_CSR_ALG2, &buffer_size_a)); - CUDA_CHECK(cudaMalloc(&d_buffer_at, buffer_size_at)); - CUDA_CHECK(cudaMalloc(&d_buffer_a, buffer_size_a)); + GPU_CHECK(gpuMalloc(&d_buffer_at, buffer_size_at)); + GPU_CHECK(gpuMalloc(&d_buffer_a, buffer_size_a)); double sigma_max_sq = 1.0; for (int iter = 0; iter < max_iter; ++iter) { - CUDA_CHECK(cudaMemcpy(d_next_eigenvector, d_eigenvector, - a_num_rows_ * sizeof(double), - cudaMemcpyDeviceToDevice)); + GPU_CHECK(gpuMemcpy(d_next_eigenvector, d_eigenvector, + a_num_rows_ * sizeof(double), + gpuMemcpyDeviceToDevice)); double eigenvector_norm = 0.0; - CUBLAS_CHECK(cublasDnrm2(cublas_handle_, a_num_rows_, d_next_eigenvector, 1, - &eigenvector_norm)); + GPU_BLAS_CHECK(gpuBlasDnrm2(cublas_handle_, a_num_rows_, d_next_eigenvector, + 1, &eigenvector_norm)); if (!(eigenvector_norm > 0.0) || !std::isfinite(eigenvector_norm)) break; double inv_eigenvector_norm = 1.0 / eigenvector_norm; - CUBLAS_CHECK(cublasDscal(cublas_handle_, a_num_rows_, &inv_eigenvector_norm, - d_next_eigenvector, 1)); + GPU_BLAS_CHECK(gpuBlasDscal(cublas_handle_, a_num_rows_, + &inv_eigenvector_norm, d_next_eigenvector, 1)); - CUSPARSE_CHECK(cusparseDnVecSetValues(vecNextEigen, d_next_eigenvector)); - CUSPARSE_CHECK(cusparseDnVecSetValues(vecDual, d_dual_product)); + GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vecNextEigen, d_next_eigenvector)); + GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vecDual, d_dual_product)); - CUSPARSE_CHECK( - cusparseSpMV(cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, - mat_a_T_csr_, vecNextEigen, &zero, vecDual, CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, d_buffer_at)); + GPU_SPARSE_CHECK( + gpuSparseSpMV(cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &one, + mat_a_T_csr_, vecNextEigen, &zero, vecDual, GPU_R_64F, + GPU_SPMV_CSR_ALG2, d_buffer_at)); - CUSPARSE_CHECK(cusparseDnVecSetValues(vecEigen, d_eigenvector)); - CUSPARSE_CHECK( - cusparseSpMV(cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, - mat_a_csr_, vecDual, &zero, vecEigen, CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, d_buffer_a)); + GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vecEigen, d_eigenvector)); + GPU_SPARSE_CHECK( + gpuSparseSpMV(cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &one, + mat_a_csr_, vecDual, &zero, vecEigen, GPU_R_64F, + GPU_SPMV_CSR_ALG2, d_buffer_a)); - CUBLAS_CHECK(cublasDdot(cublas_handle_, a_num_rows_, d_next_eigenvector, 1, - d_eigenvector, 1, &sigma_max_sq)); + GPU_BLAS_CHECK(gpuBlasDdot(cublas_handle_, a_num_rows_, d_next_eigenvector, + 1, d_eigenvector, 1, &sigma_max_sq)); double neg_sigma_sq = -sigma_max_sq; - CUBLAS_CHECK(cublasDscal(cublas_handle_, a_num_rows_, &neg_sigma_sq, - d_next_eigenvector, 1)); - CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_rows_, &one, d_eigenvector, - 1, d_next_eigenvector, 1)); + GPU_BLAS_CHECK(gpuBlasDscal(cublas_handle_, a_num_rows_, &neg_sigma_sq, + d_next_eigenvector, 1)); + GPU_BLAS_CHECK(gpuBlasDaxpy(cublas_handle_, a_num_rows_, &one, + d_eigenvector, 1, d_next_eigenvector, 1)); double residual_norm = 0.0; - CUBLAS_CHECK(cublasDnrm2(cublas_handle_, a_num_rows_, d_next_eigenvector, 1, - &residual_norm)); + GPU_BLAS_CHECK(gpuBlasDnrm2(cublas_handle_, a_num_rows_, d_next_eigenvector, + 1, &residual_norm)); if (residual_norm < tolerance) break; } - CUDA_CHECK(cudaFree(d_buffer_at)); - CUDA_CHECK(cudaFree(d_buffer_a)); - CUSPARSE_CHECK(cusparseDestroyDnVec(vecEigen)); - CUSPARSE_CHECK(cusparseDestroyDnVec(vecNextEigen)); - CUSPARSE_CHECK(cusparseDestroyDnVec(vecDual)); - CUDA_CHECK(cudaFree(d_eigenvector)); - CUDA_CHECK(cudaFree(d_next_eigenvector)); - CUDA_CHECK(cudaFree(d_dual_product)); + GPU_CHECK(gpuFree(d_buffer_at)); + GPU_CHECK(gpuFree(d_buffer_a)); + GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vecEigen)); + GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vecNextEigen)); + GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vecDual)); + GPU_CHECK(gpuFree(d_eigenvector)); + GPU_CHECK(gpuFree(d_next_eigenvector)); + GPU_CHECK(gpuFree(d_dual_product)); return sigma_max_sq; } @@ -1786,22 +1772,22 @@ void PDLPSolver::setup(const HighsOptions& options, HighsTimer& timer) { highs_timer_p_ = &timer; highsLogUser(options.log_options, HighsLogType::kInfo, "Using HiPDLP first order PDLP solver on a %s\n", -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) "GPU" #else "CPU: performance may be disappointing!" #endif ); -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) HighsInt n_devices = 0; - cudaGetDeviceCount(&n_devices); + gpuGetDeviceCount(&n_devices); if (n_devices != 1) highsLogUser( options.log_options, HighsLogType::kInfo, "Number of CUDA-enabled devices is %d: device 0 will be used\n", n_devices); - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, 0); + gpuDeviceProp_t prop; + gpuGetDeviceProperties(&prop, 0); highsLogUser(options.log_options, HighsLogType::kInfo, "Cuda device: %s\n", prop.name); highsLogUser(options.log_options, HighsLogType::kInfo, @@ -1954,7 +1940,7 @@ void PDLPSolver::initializeStepSizes() { params_.step_size_strategy = StepSizeStrategy::FIXED; } -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) double op_norm_sq = powerMethodGpu(); #else double op_norm_sq = powerMethod(); @@ -1981,7 +1967,7 @@ void PDLPSolver::updatePrimalWeightAtRestart(const SolverResults& results) { double primal_dist = 0.0; double dual_dist = 0.0; -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) primal_dist = computeDiffNormCuBLAS(d_pdhg_primal_, d_x_anchor_, a_num_cols_); dual_dist = computeDiffNormCuBLAS(d_pdhg_dual_, d_y_anchor_, a_num_rows_); #else @@ -2110,20 +2096,20 @@ void PDLPSolver::updateIteratesFixed() { hipdlpTimerStop(kHipdlpClockMatrixTransposeMultiply); #endif -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) // Add this check before the memcpy if (d_x_next_ == nullptr) { std::cerr << "Error1: d_x_next_ is null!" << std::endl; return; } launchKernelUpdateX(stepsize_.primal_step); - CUDA_CHECK(cudaDeviceSynchronize()); + GPU_CHECK(gpuDeviceSynchronize()); linalgGpuAx(d_x_next_, d_ax_next_); - CUDA_CHECK(cudaDeviceSynchronize()); + GPU_CHECK(gpuDeviceSynchronize()); launchKernelUpdateY(stepsize_.dual_step); - CUDA_CHECK(cudaDeviceSynchronize()); + GPU_CHECK(gpuDeviceSynchronize()); linalgGpuATy(d_y_next_, d_aty_next_); - CUDA_CHECK(cudaDeviceSynchronize()); + GPU_CHECK(gpuDeviceSynchronize()); // Add this check before the memcpy if (d_x_next_ == nullptr) { @@ -2172,7 +2158,7 @@ void PDLPSolver::updateIteratesAdaptive() { double primal_step_update = dStepSizeUpdate / std::sqrt(stepsize_.beta); double dual_step_update = dStepSizeUpdate * std::sqrt(stepsize_.beta); -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) launchKernelUpdateX_wrapper(d_x_next_, // Output (Trial) d_x_current_, // Input (Base) d_aty_current_, // Input (Base aTy) @@ -2391,15 +2377,15 @@ void PDLPSolver::closeDebugLog() { // ============================================================================= // SECTION 5: GPU Part // ============================================================================= -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) void PDLPSolver::setupGpu() { - CUDA_CHECK(cudaStreamCreate(&gpu_stream_)); + GPU_CHECK(gpuStreamCreate(&gpu_stream_)); // 1. initialize cuSPARSE - CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); - CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CUSPARSE_CHECK(cusparseSetStream(cusparse_handle_, gpu_stream_)); - CUBLAS_CHECK(cublasSetStream(cublas_handle_, gpu_stream_)); + GPU_SPARSE_CHECK(gpuSparseCreate(&cusparse_handle_)); + GPU_BLAS_CHECK(gpuBlasCreate(&cublas_handle_)); + GPU_SPARSE_CHECK(gpuSparseSetStream(cusparse_handle_, gpu_stream_)); + GPU_BLAS_CHECK(gpuBlasSetStream(cublas_handle_, gpu_stream_)); // 2. Get matrix data from lp_ (CSC) a_num_rows_ = lp_.num_row_; a_num_cols_ = lp_.num_col_; @@ -2413,197 +2399,198 @@ void PDLPSolver::setupGpu() { const std::vector& h_a_val = lp_csr.value_; // 3. Allocate and copy A's CSR data to GPU - CUDA_CHECK( - cudaMalloc((void**)&d_a_row_ptr_, (a_num_rows_ + 1) * sizeof(HighsInt))); - CUDA_CHECK(cudaMalloc((void**)&d_a_col_ind_, a_nnz_ * sizeof(HighsInt))); - CUDA_CHECK(cudaMalloc((void**)&d_a_val_, a_nnz_ * sizeof(double))); - CUDA_CHECK(cudaMemcpy(d_a_row_ptr_, h_a_row_ptr.data(), + GPU_CHECK( + gpuMalloc((void**)&d_a_row_ptr_, (a_num_rows_ + 1) * sizeof(HighsInt))); + GPU_CHECK(gpuMalloc((void**)&d_a_col_ind_, a_nnz_ * sizeof(HighsInt))); + GPU_CHECK(gpuMalloc((void**)&d_a_val_, a_nnz_ * sizeof(double))); + GPU_CHECK(gpuMemcpy(d_a_row_ptr_, h_a_row_ptr.data(), (a_num_rows_ + 1) * sizeof(HighsInt), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_a_col_ind_, h_a_col_ind.data(), - a_nnz_ * sizeof(HighsInt), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_a_val_, h_a_val.data(), a_nnz_ * sizeof(double), - cudaMemcpyHostToDevice)); + gpuMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_a_col_ind_, h_a_col_ind.data(), + a_nnz_ * sizeof(HighsInt), gpuMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_a_val_, h_a_val.data(), a_nnz_ * sizeof(double), + gpuMemcpyHostToDevice)); - CUSPARSE_CHECK(cusparseCreateCsr(&mat_a_csr_, a_num_rows_, a_num_cols_, + GPU_SPARSE_CHECK(gpuSparseCreateCsr(&mat_a_csr_, a_num_rows_, a_num_cols_, a_nnz_, d_a_row_ptr_, d_a_col_ind_, d_a_val_, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); + GPU_INDEX_32I, GPU_INDEX_32I, + GPU_INDEX_BASE_ZERO, GPU_R_64F)); // 4. Create matrix AT in CSR format = A in CSC const std::vector& h_at_row_ptr = lp_.a_matrix_.start_; const std::vector& h_at_col_ind = lp_.a_matrix_.index_; const std::vector& h_at_val = lp_.a_matrix_.value_; - CUDA_CHECK(cudaMalloc((void**)&d_at_row_ptr_, + GPU_CHECK(gpuMalloc((void**)&d_at_row_ptr_, (a_num_cols_ + 1) * sizeof(HighsInt))); // Fixed! - CUDA_CHECK(cudaMalloc((void**)&d_at_col_ind_, a_nnz_ * sizeof(HighsInt))); - CUDA_CHECK(cudaMalloc((void**)&d_at_val_, a_nnz_ * sizeof(double))); + GPU_CHECK(gpuMalloc((void**)&d_at_col_ind_, a_nnz_ * sizeof(HighsInt))); + GPU_CHECK(gpuMalloc((void**)&d_at_val_, a_nnz_ * sizeof(double))); - CUDA_CHECK(cudaMemcpy(d_at_row_ptr_, h_at_row_ptr.data(), + GPU_CHECK(gpuMemcpy(d_at_row_ptr_, h_at_row_ptr.data(), (a_num_cols_ + 1) * sizeof(HighsInt), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_at_col_ind_, h_at_col_ind.data(), - a_nnz_ * sizeof(HighsInt), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_at_val_, h_at_val.data(), a_nnz_ * sizeof(double), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc(&d_halpern_iteration_, sizeof(int))); - CUDA_CHECK(cudaMemset(d_halpern_iteration_, 0, sizeof(int))); - CUDA_CHECK(cudaMalloc(&d_primal_step_size_, sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_dual_step_size_, sizeof(double))); + gpuMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_at_col_ind_, h_at_col_ind.data(), + a_nnz_ * sizeof(HighsInt), gpuMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_at_val_, h_at_val.data(), a_nnz_ * sizeof(double), + gpuMemcpyHostToDevice)); + GPU_CHECK(gpuMalloc(&d_step_params_, sizeof(GpuStepParams))); + GPU_CHECK(gpuMemset(d_step_params_, 0, sizeof(GpuStepParams))); + d_primal_step_size_ = &d_step_params_->primal_step; + d_dual_step_size_ = &d_step_params_->dual_step; + d_halpern_iteration_ = &d_step_params_->halpern_iteration; // Create AT descriptor with swapped dimensions - CUSPARSE_CHECK(cusparseCreateCsr( + GPU_SPARSE_CHECK(gpuSparseCreateCsr( &mat_a_T_csr_, a_num_cols_, a_num_rows_, a_nnz_, // Dimensions swapped! - d_at_row_ptr_, d_at_col_ind_, d_at_val_, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); - - CUDA_CHECK(cudaMalloc(&d_col_cost_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_col_lower_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_col_upper_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_row_lower_, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_is_equality_row_, a_num_rows_ * sizeof(bool))); - CUDA_CHECK(cudaMalloc(&d_x_current_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_y_current_, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_x_avg_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_y_avg_, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_x_next_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_y_next_, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_x_at_last_restart_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_y_at_last_restart_, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_x_anchor_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_y_anchor_, a_num_rows_ * sizeof(double))); - CUDA_CHECK( - cudaMalloc(&d_x_temp_diff_norm_result_, a_num_cols_ * sizeof(double))); - CUDA_CHECK( - cudaMalloc(&d_y_temp_diff_norm_result_, a_num_rows_ * sizeof(double))); + d_at_row_ptr_, d_at_col_ind_, d_at_val_, GPU_INDEX_32I, + GPU_INDEX_32I, GPU_INDEX_BASE_ZERO, GPU_R_64F)); + + GPU_CHECK(gpuMalloc(&d_col_cost_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_col_lower_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_col_upper_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_row_lower_, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_is_equality_row_, a_num_rows_ * sizeof(bool))); + GPU_CHECK(gpuMalloc(&d_x_current_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_y_current_, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_x_avg_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_y_avg_, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_x_next_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_y_next_, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_x_at_last_restart_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_y_at_last_restart_, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_x_anchor_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_y_anchor_, a_num_rows_ * sizeof(double))); + GPU_CHECK( + gpuMalloc(&d_x_temp_diff_norm_result_, a_num_cols_ * sizeof(double))); + GPU_CHECK( + gpuMalloc(&d_y_temp_diff_norm_result_, a_num_rows_ * sizeof(double))); if (params_.use_halpern_restart) { - CUDA_CHECK(cudaMalloc(&d_pdhg_primal_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_pdhg_dual_, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_pdhg_primal_, 0, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_pdhg_dual_, 0, a_num_rows_ * sizeof(double))); - } - CUDA_CHECK(cudaMalloc(&d_delta_x_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_delta_y_, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_AT_delta_y_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_ax_current_, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_aty_current_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_ax_next_, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_aty_next_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_ax_avg_, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_aty_avg_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_x_sum_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_y_sum_, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_convergence_results_, 4 * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_dSlackPos_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_dSlackNeg_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_dSlackPosAvg_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_dSlackNegAvg_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_col_scale_, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_row_scale_, a_num_rows_ * sizeof(double))); - - CUSPARSE_CHECK( - cusparseCreateDnVec(&vec_x_desc_, a_num_cols_, d_x_current_, CUDA_R_64F)); - CUSPARSE_CHECK( - cusparseCreateDnVec(&vec_y_desc_, a_num_rows_, d_y_current_, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&vec_ax_desc_, a_num_rows_, d_ax_current_, - CUDA_R_64F)); - CUSPARSE_CHECK(cusparseCreateDnVec(&vec_aty_desc_, a_num_cols_, - d_aty_current_, CUDA_R_64F)); - - CUDA_CHECK(cudaMemcpy(d_col_cost_, lp_.col_cost_.data(), - a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_col_lower_, lp_.col_lower_.data(), - a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_col_upper_, lp_.col_upper_.data(), - a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_row_lower_, lp_.row_lower_.data(), - a_num_rows_ * sizeof(double), cudaMemcpyHostToDevice)); + GPU_CHECK(gpuMalloc(&d_pdhg_primal_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_pdhg_dual_, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_pdhg_primal_, 0, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_pdhg_dual_, 0, a_num_rows_ * sizeof(double))); + } + GPU_CHECK(gpuMalloc(&d_delta_x_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_delta_y_, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_AT_delta_y_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_ax_current_, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_aty_current_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_ax_next_, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_aty_next_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_ax_avg_, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_aty_avg_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_x_sum_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_y_sum_, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_convergence_results_, 4 * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_dSlackPos_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_dSlackNeg_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_dSlackPosAvg_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_dSlackNegAvg_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_col_scale_, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_row_scale_, a_num_rows_ * sizeof(double))); + + GPU_SPARSE_CHECK( + gpuSparseCreateDnVec(&vec_x_desc_, a_num_cols_, d_x_current_, GPU_R_64F)); + GPU_SPARSE_CHECK( + gpuSparseCreateDnVec(&vec_y_desc_, a_num_rows_, d_y_current_, GPU_R_64F)); + GPU_SPARSE_CHECK(gpuSparseCreateDnVec(&vec_ax_desc_, a_num_rows_, d_ax_current_, + GPU_R_64F)); + GPU_SPARSE_CHECK(gpuSparseCreateDnVec(&vec_aty_desc_, a_num_cols_, + d_aty_current_, GPU_R_64F)); + + GPU_CHECK(gpuMemcpy(d_col_cost_, lp_.col_cost_.data(), + a_num_cols_ * sizeof(double), gpuMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_col_lower_, lp_.col_lower_.data(), + a_num_cols_ * sizeof(double), gpuMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_col_upper_, lp_.col_upper_.data(), + a_num_cols_ * sizeof(double), gpuMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_row_lower_, lp_.row_lower_.data(), + a_num_rows_ * sizeof(double), gpuMemcpyHostToDevice)); std::vector temp_equality(a_num_rows_); for (HighsInt i = 0; i < a_num_rows_; ++i) { temp_equality[i] = is_equality_row_[i] ? 1 : 0; } // Copy to device - CUDA_CHECK(cudaMemcpy(d_is_equality_row_, temp_equality.data(), - a_num_rows_ * sizeof(uint8_t), cudaMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_is_equality_row_, temp_equality.data(), + a_num_rows_ * sizeof(uint8_t), gpuMemcpyHostToDevice)); // 6. Preallocate buffer for cuSPARSE SpMV // Buffer for ax double alpha = 1.0; double beta = 0.0; - cusparseDnVecDescr_t vec_x, vec_ax; - CUSPARSE_CHECK( - cusparseCreateDnVec(&vec_x, a_num_cols_, d_x_current_, CUDA_R_64F)); - CUSPARSE_CHECK( - cusparseCreateDnVec(&vec_ax, a_num_rows_, d_ax_current_, CUDA_R_64F)); - - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_csr_, - vec_x, &beta, vec_ax, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, + gpuDnVecDescr_t vec_x, vec_ax; + GPU_SPARSE_CHECK( + gpuSparseCreateDnVec(&vec_x, a_num_cols_, d_x_current_, GPU_R_64F)); + GPU_SPARSE_CHECK( + gpuSparseCreateDnVec(&vec_ax, a_num_rows_, d_ax_current_, GPU_R_64F)); + + GPU_SPARSE_CHECK(gpuSparseSpMV_bufferSize( + cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, mat_a_csr_, + vec_x, &beta, vec_ax, GPU_R_64F, GPU_SPMV_ALG_DEFAULT, &spmv_buffer_size_ax_)); - CUDA_CHECK(cudaMalloc(&d_spmv_buffer_ax_, spmv_buffer_size_ax_)); + GPU_CHECK(gpuMalloc(&d_spmv_buffer_ax_, spmv_buffer_size_ax_)); - CUSPARSE_CHECK(cusparseSpMV_preprocess( - cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_csr_, - vec_x_desc_, &beta, vec_ax_desc_, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, + GPU_SPARSE_CHECK(gpuSparseSpMV_preprocess( + cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, mat_a_csr_, + vec_x_desc_, &beta, vec_ax_desc_, GPU_R_64F, GPU_SPMV_ALG_DEFAULT, d_spmv_buffer_ax_)); - CUSPARSE_CHECK(cusparseDestroyDnVec(vec_x)); - CUSPARSE_CHECK(cusparseDestroyDnVec(vec_ax)); + GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_x)); + GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_ax)); // Buffer for aTy - cusparseDnVecDescr_t vec_y, vec_aty; - CUSPARSE_CHECK( - cusparseCreateDnVec(&vec_y, a_num_rows_, d_y_current_, CUDA_R_64F)); - CUSPARSE_CHECK( - cusparseCreateDnVec(&vec_aty, a_num_cols_, d_aty_current_, CUDA_R_64F)); - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_T_csr_, - vec_y, &beta, vec_aty, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, + gpuDnVecDescr_t vec_y, vec_aty; + GPU_SPARSE_CHECK( + gpuSparseCreateDnVec(&vec_y, a_num_rows_, d_y_current_, GPU_R_64F)); + GPU_SPARSE_CHECK( + gpuSparseCreateDnVec(&vec_aty, a_num_cols_, d_aty_current_, GPU_R_64F)); + GPU_SPARSE_CHECK(gpuSparseSpMV_bufferSize( + cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, mat_a_T_csr_, + vec_y, &beta, vec_aty, GPU_R_64F, GPU_SPMV_ALG_DEFAULT, &spmv_buffer_size_aty_)); - CUDA_CHECK(cudaMalloc(&d_spmv_buffer_aty_, spmv_buffer_size_aty_)); + GPU_CHECK(gpuMalloc(&d_spmv_buffer_aty_, spmv_buffer_size_aty_)); - CUSPARSE_CHECK(cusparseSpMV_preprocess( - cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_T_csr_, - vec_y_desc_, &beta, vec_aty_desc_, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, + GPU_SPARSE_CHECK(gpuSparseSpMV_preprocess( + cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, mat_a_T_csr_, + vec_y_desc_, &beta, vec_aty_desc_, GPU_R_64F, GPU_SPMV_ALG_DEFAULT, d_spmv_buffer_aty_)); - CUSPARSE_CHECK(cusparseDestroyDnVec(vec_y)); - CUSPARSE_CHECK(cusparseDestroyDnVec(vec_aty)); - - CUDA_CHECK(cudaMemset(d_x_current_, 0, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_y_current_, 0, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_x_avg_, 0, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_y_avg_, 0, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_x_next_, 0, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_y_next_, 0, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_ax_current_, 0, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_ax_next_, 0, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_ax_avg_, 0, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_aty_avg_, 0, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_x_sum_, 0, a_num_cols_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_y_sum_, 0, a_num_rows_ * sizeof(double))); - CUDA_CHECK(cudaMemset(d_aty_current_, 0, a_num_cols_ * sizeof(double))); + GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_y)); + GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_aty)); + + GPU_CHECK(gpuMemset(d_x_current_, 0, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_y_current_, 0, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_x_avg_, 0, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_y_avg_, 0, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_x_next_, 0, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_y_next_, 0, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_ax_current_, 0, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_ax_next_, 0, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_ax_avg_, 0, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_aty_avg_, 0, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_x_sum_, 0, a_num_cols_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_y_sum_, 0, a_num_rows_ * sizeof(double))); + GPU_CHECK(gpuMemset(d_aty_current_, 0, a_num_cols_ * sizeof(double))); sum_weights_gpu_ = 0.0; if (scaling_.isScaled()) { - CUDA_CHECK(cudaMemcpy(d_col_scale_, scaling_.getColScaling().data(), + GPU_CHECK(gpuMemcpy(d_col_scale_, scaling_.getColScaling().data(), a_num_cols_ * sizeof(double), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_row_scale_, scaling_.getRowScaling().data(), + gpuMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_row_scale_, scaling_.getRowScaling().data(), a_num_rows_ * sizeof(double), - cudaMemcpyHostToDevice)); + gpuMemcpyHostToDevice)); } else { - cudaFree(d_col_scale_); + gpuFree(d_col_scale_); d_col_scale_ = nullptr; - cudaFree(d_row_scale_); + gpuFree(d_row_scale_); d_row_scale_ = nullptr; } size_t max_size = std::max(a_num_cols_, a_num_rows_); - CUDA_CHECK(cudaMalloc(&d_buffer_, max_size * sizeof(double))); - CUDA_CHECK(cudaMalloc(&d_buffer2_, max_size * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_buffer_, max_size * sizeof(double))); + GPU_CHECK(gpuMalloc(&d_buffer2_, max_size * sizeof(double))); highsLogDev(params_.log_options_, HighsLogType::kInfo, "GPU setup complete. Matrix A (CSR) and A^T (CSR) transferred " @@ -2611,63 +2598,61 @@ void PDLPSolver::setupGpu() { } void PDLPSolver::cleanupGpu() { - if (gpu_stream_) CUDA_CHECK(cudaStreamDestroy(gpu_stream_)); - if (cusparse_handle_) CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (mat_a_csr_) CUSPARSE_CHECK(cusparseDestroySpMat(mat_a_csr_)); - if (mat_a_T_csr_) CUSPARSE_CHECK(cusparseDestroySpMat(mat_a_T_csr_)); - CUDA_CHECK(cudaFree(d_a_row_ptr_)); - CUDA_CHECK(cudaFree(d_a_col_ind_)); - CUDA_CHECK(cudaFree(d_a_val_)); - CUDA_CHECK(cudaFree(d_at_row_ptr_)); - CUDA_CHECK(cudaFree(d_at_col_ind_)); - CUDA_CHECK(cudaFree(d_at_val_)); - CUDA_CHECK(cudaFree(d_col_cost_)); - CUDA_CHECK(cudaFree(d_col_lower_)); - CUDA_CHECK(cudaFree(d_col_upper_)); - CUDA_CHECK(cudaFree(d_row_lower_)); - CUDA_CHECK(cudaFree(d_is_equality_row_)); - CUDA_CHECK(cudaFree(d_x_at_last_restart_)); - CUDA_CHECK(cudaFree(d_y_at_last_restart_)); - CUDA_CHECK(cudaFree(d_halpern_iteration_)); - CUDA_CHECK(cudaFree(d_primal_step_size_)); - CUDA_CHECK(cudaFree(d_dual_step_size_)); - if (d_x_anchor_) CUDA_CHECK(cudaFree(d_x_anchor_)); - if (d_y_anchor_) CUDA_CHECK(cudaFree(d_y_anchor_)); - if (d_pdhg_primal_) CUDA_CHECK(cudaFree(d_pdhg_primal_)); - if (d_pdhg_dual_) CUDA_CHECK(cudaFree(d_pdhg_dual_)); - CUDA_CHECK(cudaFree(d_delta_x_)); - CUDA_CHECK(cudaFree(d_delta_y_)); - CUDA_CHECK(cudaFree(d_AT_delta_y_)); - CUDA_CHECK(cudaFree(d_x_temp_diff_norm_result_)); - CUDA_CHECK(cudaFree(d_y_temp_diff_norm_result_)); - CUDA_CHECK(cudaFree(d_x_current_)); - CUDA_CHECK(cudaFree(d_y_current_)); - CUDA_CHECK(cudaFree(d_x_next_)); - CUDA_CHECK(cudaFree(d_y_next_)); - CUDA_CHECK(cudaFree(d_ax_current_)); - CUDA_CHECK(cudaFree(d_aty_current_)); - CUDA_CHECK(cudaFree(d_ax_avg_)); - CUDA_CHECK(cudaFree(d_aty_avg_)); - CUDA_CHECK(cudaFree(d_ax_next_)); - CUDA_CHECK(cudaFree(d_aty_next_)); - CUDA_CHECK(cudaFree(d_x_sum_)); - CUDA_CHECK(cudaFree(d_y_sum_)); - CUDA_CHECK(cudaFree(d_spmv_buffer_ax_)); - CUDA_CHECK(cudaFree(d_spmv_buffer_aty_)); - CUDA_CHECK(cudaFree(d_convergence_results_)); - CUDA_CHECK(cudaFree(d_dSlackPos_)); - CUDA_CHECK(cudaFree(d_dSlackNeg_)); - CUDA_CHECK(cudaFree(d_dSlackPosAvg_)); - CUDA_CHECK(cudaFree(d_dSlackNegAvg_)); - if (d_col_scale_) CUDA_CHECK(cudaFree(d_col_scale_)); - if (d_row_scale_) CUDA_CHECK(cudaFree(d_row_scale_)); - CUDA_CHECK(cudaFree(d_buffer_)); - CUDA_CHECK(cudaFree(d_buffer2_)); - if (vec_x_desc_) CUSPARSE_CHECK(cusparseDestroyDnVec(vec_x_desc_)); - if (vec_y_desc_) CUSPARSE_CHECK(cusparseDestroyDnVec(vec_y_desc_)); - if (vec_ax_desc_) CUSPARSE_CHECK(cusparseDestroyDnVec(vec_ax_desc_)); - if (vec_aty_desc_) CUSPARSE_CHECK(cusparseDestroyDnVec(vec_aty_desc_)); + if (gpu_stream_) GPU_CHECK(gpuStreamDestroy(gpu_stream_)); + if (cusparse_handle_) GPU_SPARSE_CHECK(gpuSparseDestroy(cusparse_handle_)); + if (cublas_handle_) GPU_BLAS_CHECK(gpuBlasDestroy(cublas_handle_)); + if (mat_a_csr_) GPU_SPARSE_CHECK(gpuSparseDestroySpMat(mat_a_csr_)); + if (mat_a_T_csr_) GPU_SPARSE_CHECK(gpuSparseDestroySpMat(mat_a_T_csr_)); + GPU_CHECK(gpuFree(d_a_row_ptr_)); + GPU_CHECK(gpuFree(d_a_col_ind_)); + GPU_CHECK(gpuFree(d_a_val_)); + GPU_CHECK(gpuFree(d_at_row_ptr_)); + GPU_CHECK(gpuFree(d_at_col_ind_)); + GPU_CHECK(gpuFree(d_at_val_)); + GPU_CHECK(gpuFree(d_col_cost_)); + GPU_CHECK(gpuFree(d_col_lower_)); + GPU_CHECK(gpuFree(d_col_upper_)); + GPU_CHECK(gpuFree(d_row_lower_)); + GPU_CHECK(gpuFree(d_is_equality_row_)); + GPU_CHECK(gpuFree(d_x_at_last_restart_)); + GPU_CHECK(gpuFree(d_y_at_last_restart_)); + GPU_CHECK(gpuFree(d_step_params_)); + if (d_x_anchor_) GPU_CHECK(gpuFree(d_x_anchor_)); + if (d_y_anchor_) GPU_CHECK(gpuFree(d_y_anchor_)); + if (d_pdhg_primal_) GPU_CHECK(gpuFree(d_pdhg_primal_)); + if (d_pdhg_dual_) GPU_CHECK(gpuFree(d_pdhg_dual_)); + GPU_CHECK(gpuFree(d_delta_x_)); + GPU_CHECK(gpuFree(d_delta_y_)); + GPU_CHECK(gpuFree(d_AT_delta_y_)); + GPU_CHECK(gpuFree(d_x_temp_diff_norm_result_)); + GPU_CHECK(gpuFree(d_y_temp_diff_norm_result_)); + GPU_CHECK(gpuFree(d_x_current_)); + GPU_CHECK(gpuFree(d_y_current_)); + GPU_CHECK(gpuFree(d_x_next_)); + GPU_CHECK(gpuFree(d_y_next_)); + GPU_CHECK(gpuFree(d_ax_current_)); + GPU_CHECK(gpuFree(d_aty_current_)); + GPU_CHECK(gpuFree(d_ax_avg_)); + GPU_CHECK(gpuFree(d_aty_avg_)); + GPU_CHECK(gpuFree(d_ax_next_)); + GPU_CHECK(gpuFree(d_aty_next_)); + GPU_CHECK(gpuFree(d_x_sum_)); + GPU_CHECK(gpuFree(d_y_sum_)); + GPU_CHECK(gpuFree(d_spmv_buffer_ax_)); + GPU_CHECK(gpuFree(d_spmv_buffer_aty_)); + GPU_CHECK(gpuFree(d_convergence_results_)); + GPU_CHECK(gpuFree(d_dSlackPos_)); + GPU_CHECK(gpuFree(d_dSlackNeg_)); + GPU_CHECK(gpuFree(d_dSlackPosAvg_)); + GPU_CHECK(gpuFree(d_dSlackNegAvg_)); + if (d_col_scale_) GPU_CHECK(gpuFree(d_col_scale_)); + if (d_row_scale_) GPU_CHECK(gpuFree(d_row_scale_)); + GPU_CHECK(gpuFree(d_buffer_)); + GPU_CHECK(gpuFree(d_buffer2_)); + if (vec_x_desc_) GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_x_desc_)); + if (vec_y_desc_) GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_y_desc_)); + if (vec_ax_desc_) GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_ax_desc_)); + if (vec_aty_desc_) GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_aty_desc_)); } void PDLPSolver::linalgGpuAx(const double* d_x_in, double* d_ax_out) { @@ -2675,20 +2660,20 @@ void PDLPSolver::linalgGpuAx(const double* d_x_in, double* d_ax_out) { double alpha = 1.0; double beta = 0.0; - CUSPARSE_CHECK(cusparseDnVecSetValues(vec_x_desc_, (void*)d_x_in)); - CUSPARSE_CHECK(cusparseDnVecSetValues(vec_ax_desc_, (void*)d_ax_out)); + GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vec_x_desc_, (void*)d_x_in)); + GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vec_ax_desc_, (void*)d_ax_out)); if (spmv_buffer_size_ax_ == 0) { - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_csr_, - vec_x_desc_, &beta, vec_ax_desc_, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2, + GPU_SPARSE_CHECK(gpuSparseSpMV_bufferSize( + cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, mat_a_csr_, + vec_x_desc_, &beta, vec_ax_desc_, GPU_R_64F, GPU_SPMV_ALG_DEFAULT, &spmv_buffer_size_ax_)); - CUDA_CHECK(cudaMalloc(&d_spmv_buffer_ax_, spmv_buffer_size_ax_)); + GPU_CHECK(gpuMalloc(&d_spmv_buffer_ax_, spmv_buffer_size_ax_)); } - CUSPARSE_CHECK( - cusparseSpMV(cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - mat_a_csr_, vec_x_desc_, &beta, vec_ax_desc_, CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, d_spmv_buffer_ax_)); + GPU_SPARSE_CHECK( + gpuSparseSpMV(cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, + mat_a_csr_, vec_x_desc_, &beta, vec_ax_desc_, GPU_R_64F, + GPU_SPMV_ALG_DEFAULT, d_spmv_buffer_ax_)); } void PDLPSolver::linalgGpuATy(const double* d_y_in, double* d_aty_out) { @@ -2696,46 +2681,46 @@ void PDLPSolver::linalgGpuATy(const double* d_y_in, double* d_aty_out) { double alpha = 1.0; double beta = 0.0; - CUSPARSE_CHECK(cusparseDnVecSetValues(vec_y_desc_, (void*)d_y_in)); - CUSPARSE_CHECK(cusparseDnVecSetValues(vec_aty_desc_, (void*)d_aty_out)); + GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vec_y_desc_, (void*)d_y_in)); + GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vec_aty_desc_, (void*)d_aty_out)); if (spmv_buffer_size_aty_ == 0) { - CUSPARSE_CHECK(cusparseSpMV_bufferSize( - cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - mat_a_T_csr_, vec_y_desc_, &beta, vec_aty_desc_, CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, &spmv_buffer_size_aty_)); - CUDA_CHECK(cudaMalloc(&d_spmv_buffer_aty_, spmv_buffer_size_aty_)); - } - CUSPARSE_CHECK( - cusparseSpMV(cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - mat_a_T_csr_, vec_y_desc_, &beta, vec_aty_desc_, CUDA_R_64F, - CUSPARSE_SPMV_CSR_ALG2, d_spmv_buffer_aty_)); + GPU_SPARSE_CHECK(gpuSparseSpMV_bufferSize( + cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, + mat_a_T_csr_, vec_y_desc_, &beta, vec_aty_desc_, GPU_R_64F, + GPU_SPMV_ALG_DEFAULT, &spmv_buffer_size_aty_)); + GPU_CHECK(gpuMalloc(&d_spmv_buffer_aty_, spmv_buffer_size_aty_)); + } + GPU_SPARSE_CHECK( + gpuSparseSpMV(cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, + mat_a_T_csr_, vec_y_desc_, &beta, vec_aty_desc_, GPU_R_64F, + GPU_SPMV_ALG_DEFAULT, d_spmv_buffer_aty_)); } void PDLPSolver::launchKernelUpdateX(double primal_step) { launchKernelUpdateX_wrapper(d_x_next_, d_x_current_, d_aty_current_, d_col_cost_, d_col_lower_, d_col_upper_, primal_step, a_num_cols_, gpu_stream_); - CUDA_CHECK(cudaGetLastError()); + GPU_CHECK(gpuGetLastError()); } void PDLPSolver::launchKernelUpdateY(double dual_step) { launchKernelUpdateY_wrapper(d_y_next_, d_y_current_, d_ax_current_, d_ax_next_, d_row_lower_, d_is_equality_row_, dual_step, a_num_rows_, gpu_stream_); - CUDA_CHECK(cudaGetLastError()); + GPU_CHECK(gpuGetLastError()); } void PDLPSolver::launchKernelUpdateAverages(double weight) { launchKernelUpdateAverages_wrapper(d_x_sum_, d_y_sum_, d_x_next_, d_y_next_, weight, a_num_cols_, a_num_rows_, gpu_stream_); - CUDA_CHECK(cudaGetLastError()); + GPU_CHECK(gpuGetLastError()); } void PDLPSolver::launchKernelScaleVector(double* d_out, const double* d_in, double Scale, int n) { launchKernelScaleVector_wrapper(d_out, d_in, Scale, n, gpu_stream_); - CUDA_CHECK(cudaGetLastError()); + GPU_CHECK(gpuGetLastError()); } bool PDLPSolver::checkConvergenceGpu(const HighsInt iter, const double* d_x, @@ -2754,8 +2739,8 @@ bool PDLPSolver::checkConvergenceGpu(const HighsInt iter, const double* d_x, // copy 4 doubles back to CPU double h_results[4]; - CUDA_CHECK(cudaMemcpy(h_results, d_convergence_results_, 4 * sizeof(double), - cudaMemcpyDeviceToHost)); + GPU_CHECK(gpuMemcpy(h_results, d_convergence_results_, 4 * sizeof(double), + gpuMemcpyDeviceToHost)); double primal_feas_sq = h_results[0]; double dual_feas_sq = h_results[1]; @@ -2840,8 +2825,8 @@ void PDLPSolver::computeAverageIterateGpu() { #if PDLP_DEBUG_LOG // copy x_avg to host - CUDA_CHECK(cudaMemcpy(x_avg_.data(), d_x_avg_, a_num_cols_ * sizeof(double), - cudaMemcpyDeviceToHost)); + GPU_CHECK(gpuMemcpy(x_avg_.data(), d_x_avg_, a_num_cols_ * sizeof(double), + gpuMemcpyDeviceToHost)); debug_pdlp_data_.x_average_norm = linalg::vectorNormSquared(x_avg_); #endif } @@ -2871,21 +2856,21 @@ double PDLPSolver::computeNonlinearityGpu(const double* d_x_new, const double* d_aty_new, const double* d_aty_old) { // 1. Compute delta_x = x_new - x_old - CUDA_CHECK(cudaMemcpy(d_buffer_, d_x_new, a_num_cols_ * sizeof(double), - cudaMemcpyDeviceToDevice)); + GPU_CHECK(gpuMemcpy(d_buffer_, d_x_new, a_num_cols_ * sizeof(double), + gpuMemcpyDeviceToDevice)); double alpha = -1.0; - CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_cols_, &alpha, d_x_old, 1, + GPU_BLAS_CHECK(gpuBlasDaxpy(cublas_handle_, a_num_cols_, &alpha, d_x_old, 1, d_buffer_, 1)); // 2. Compute delta_aty = aty_new - aty_old - CUDA_CHECK(cudaMemcpy(d_buffer2_, d_aty_new, a_num_cols_ * sizeof(double), - cudaMemcpyDeviceToDevice)); - CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_cols_, &alpha, d_aty_old, 1, + GPU_CHECK(gpuMemcpy(d_buffer2_, d_aty_new, a_num_cols_ * sizeof(double), + gpuMemcpyDeviceToDevice)); + GPU_BLAS_CHECK(gpuBlasDaxpy(cublas_handle_, a_num_cols_, &alpha, d_aty_old, 1, d_buffer2_, 1)); // 3. Compute Dot product: delta_x' * delta_aty double result; - CUBLAS_CHECK(cublasDdot(cublas_handle_, a_num_cols_, d_buffer_, 1, d_buffer2_, + GPU_BLAS_CHECK(gpuBlasDdot(cublas_handle_, a_num_cols_, d_buffer_, 1, d_buffer2_, 1, &result)); return result; @@ -2894,16 +2879,16 @@ double PDLPSolver::computeNonlinearityGpu(const double* d_x_new, double PDLPSolver::computeDiffNormCuBLAS(const double* d_a, const double* d_b, HighsInt n) { // 1. Copy a to buffer: buffer = a - CUDA_CHECK( - cudaMemcpy(d_buffer_, d_a, n * sizeof(double), cudaMemcpyDeviceToDevice)); + GPU_CHECK( + gpuMemcpy(d_buffer_, d_a, n * sizeof(double), gpuMemcpyDeviceToDevice)); // 2. buffer = buffer - b (using cuBLAS axpy) double alpha = -1.0; - CUBLAS_CHECK(cublasDaxpy(cublas_handle_, n, &alpha, d_b, 1, d_buffer_, 1)); + GPU_BLAS_CHECK(gpuBlasDaxpy(cublas_handle_, n, &alpha, d_b, 1, d_buffer_, 1)); // 3. result = ||buffer||_2 (using cuBLAS Norm2) double norm; - CUBLAS_CHECK(cublasDnrm2(cublas_handle_, n, d_buffer_, 1, &norm)); + GPU_BLAS_CHECK(gpuBlasDnrm2(cublas_handle_, n, d_buffer_, 1, &norm)); return norm; } diff --git a/highs/pdlp/hipdlp/pdhg.cu b/highs/pdlp/hipdlp/pdhg.cu index 6209faa3794..7362542ff7c 100644 --- a/highs/pdlp/hipdlp/pdhg.cu +++ b/highs/pdlp/hipdlp/pdhg.cu @@ -1,8 +1,8 @@ -#include -#include -#include +#include "HConfig.h" +#include "gpu_backend.hpp" +#include "pdhg_kernels.hpp" +#include #include -#include #ifdef HIGHSINT64 typedef int64_t HighsInt; @@ -14,8 +14,8 @@ typedef unsigned int HighsUInt; #define HIGHSINT_FORMAT "d" #endif -// Define Infinity for GPU -#define GPU_INF 1e20 +// Define Infinity for GPU +#define GPU_INF 1e20 // Buffer Indices for the reduction array #define IDX_PRIMAL_FEAS 0 @@ -23,17 +23,21 @@ typedef unsigned int HighsUInt; #define IDX_PRIMAL_OBJ 2 #define IDX_DUAL_OBJ 3 -// Add to pdhg.cu -#define FULL_WARP_REDUCE(val) { \ - val += __shfl_down_sync(0xFFFFFFFF, val, 16); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 8); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 4); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 2); \ - val += __shfl_down_sync(0xFFFFFFFF, val, 1); \ +// warpSize is a built-in device variable that returns the actual hardware +// warp/wavefront width (32 on CUDA and RDNA, 64 on GCN/CDNA). +__device__ __forceinline__ double gpuReduce(double val) { + for (int offset = warpSize / 2; offset > 0; offset >>= 1) { +#ifdef USE_HIP + val += __shfl_down(val, offset); +#else + val += __shfl_down_sync(0xffffffff, val, offset); +#endif + } + return val; } // Utility for robust 1D kernel launches -#define CUDA_GRID_STRIDE_LOOP(i, n) \ +#define GPU_GRID_STRIDE_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) @@ -58,26 +62,27 @@ __device__ __forceinline__ double fma_rn(double a, double b, double c) { // === KERNEL 1: Update X (Primal Step) === __global__ void kernelUpdateX( - double* d_x_new, const double* d_x_old, const double* d_aty, - const double* d_cost, const double* d_lower, const double* d_upper, - double primal_step, int n_cols) + double* __restrict__ d_x_new, const double* __restrict__ d_x_old, + const double* __restrict__ d_aty, const double* __restrict__ d_cost, + const double* __restrict__ d_lower, const double* __restrict__ d_upper, + double primal_step, int n_cols) { - CUDA_GRID_STRIDE_LOOP(i, n_cols) { + GPU_GRID_STRIDE_LOOP(i, n_cols) { double x_updated = fma_rn(primal_step, d_aty[i] - d_cost[i], d_x_old[i]); - - // 3. Project to bounds [l, u] + + // Project to bounds [l, u] d_x_new[i] = fmin(fmax(x_updated, d_lower[i]), d_upper[i]); } } // === KERNEL 2: Update Y (Dual Step) === __global__ void kernelUpdateY( - double* d_y_new, const double* d_y_old, - const double* d_ax_old, const double* d_ax_new, - const double* d_rhs, const bool* d_is_equality, + double* __restrict__ d_y_new, const double* __restrict__ d_y_old, + const double* __restrict__ d_ax_old, const double* __restrict__ d_ax_new, + const double* __restrict__ d_rhs, const bool* __restrict__ d_is_equality, double dual_step, int n_rows) { - CUDA_GRID_STRIDE_LOOP(j, n_rows) { + GPU_GRID_STRIDE_LOOP(j, n_rows) { double residual = fma_rn(-2.0, d_ax_new[j], d_rhs[j] + d_ax_old[j]); double dual_update = fma_rn(dual_step, residual, d_y_old[j]); d_y_new[j] = d_is_equality[j] ? dual_update : fmax(0.0, dual_update); @@ -88,46 +93,46 @@ __global__ void kernelUpdateY( // x_sum = x_sum + weight * x_next. // y_sum = y_sum + weight * y_next __global__ void kernelUpdateAverages( - double* d_x_sum, double* d_y_sum, - const double* d_x_next, const double* d_y_next, + double* __restrict__ d_x_sum, double* __restrict__ d_y_sum, + const double* __restrict__ d_x_next, const double* __restrict__ d_y_next, double weight, int n_cols, int n_rows) { // Update x_sum - CUDA_GRID_STRIDE_LOOP(i, n_cols) { + GPU_GRID_STRIDE_LOOP(i, n_cols) { d_x_sum[i] = fma_rn(weight, d_x_next[i], d_x_sum[i]); } - + // Update y_sum - CUDA_GRID_STRIDE_LOOP(j, n_rows) { + GPU_GRID_STRIDE_LOOP(j, n_rows) { d_y_sum[j] = fma_rn(weight, d_y_next[j], d_y_sum[j]); } } __global__ void kernelScaleVector( - double* d_out, const double* d_in, + double* __restrict__ d_out, const double* __restrict__ d_in, double scale, int n) { - CUDA_GRID_STRIDE_LOOP(i, n) { + GPU_GRID_STRIDE_LOOP(i, n) { d_out[i] = d_in[i] * scale; } } // === KERNEL 4: Primal Convergence Check (Row-wise) === __global__ void kernelCheckPrimal( - double* d_results, - const double* d_ax, const double* d_y, - const double* d_row_lower, const double* d_row_scale, - const bool* d_is_equality, int n_rows){ + double* __restrict__ d_results, + const double* __restrict__ d_ax, const double* __restrict__ d_y, + const double* __restrict__ d_row_lower, const double* d_row_scale, + const bool* __restrict__ d_is_equality, int n_rows){ double local_feas_sq = 0.0; double local_dual_obj = 0.0; - CUDA_GRID_STRIDE_LOOP(i,n_rows){ + GPU_GRID_STRIDE_LOOP(i,n_rows){ double val_y = d_y[i]; double val_b = d_row_lower[i]; double val_ax = d_ax[i]; - if (abs(val_b) < GPU_INF){ - local_dual_obj += val_b * val_y; + if (abs(val_b) < GPU_INF){ + local_dual_obj = fma_rn(val_b, val_y, local_dual_obj); } double residual = val_ax - val_b; @@ -140,46 +145,58 @@ __global__ void kernelCheckPrimal( residual *= d_row_scale[i]; } - local_feas_sq += residual * residual; + local_feas_sq = fma_rn(residual, residual, local_feas_sq); } - // 1. wrap reduce - FULL_WARP_REDUCE(local_feas_sq); - FULL_WARP_REDUCE(local_dual_obj); + // Warp-level reduction + local_feas_sq = gpuReduce(local_feas_sq); + local_dual_obj = gpuReduce(local_dual_obj); + + // Block-level reduction: one atomicAdd per block to global memory instead of + // one per warp, cutting contention by ~blockDim.x/warpSize. + __shared__ double s_feas[2]; + if (threadIdx.x == 0) { s_feas[0] = 0.0; s_feas[1] = 0.0; } + __syncthreads(); - // 2. Atomic Add only from the first thread of each warp - if ((threadIdx.x & 31) == 0){ - atomicAdd(&d_results[IDX_PRIMAL_FEAS], local_feas_sq); - atomicAdd(&d_results[IDX_DUAL_OBJ], local_dual_obj); + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicAdd(&s_feas[0], local_feas_sq); + atomicAdd(&s_feas[1], local_dual_obj); + } + __syncthreads(); + + if (threadIdx.x == 0) { + atomicAdd(&d_results[IDX_PRIMAL_FEAS], s_feas[0]); + atomicAdd(&d_results[IDX_DUAL_OBJ], s_feas[1]); } } // === KERNEL 5: Dual Convergence Check (Column-wise) === +// d_slack_pos doubles as input (raw Halpern slack) and output (s_pos). __global__ void kernelCheckDual( - double* d_results, // [1]: D.Feas, [2]: P.Obj, [3]: D.Obj - double* d_slack_pos, // Input (raw halpern slack) / Output (s_pos) - double* d_slack_neg, // Output - const double* d_aty, - const double* d_x, - const double* d_cost, // c - const double* d_col_lower, // l - const double* d_col_upper, // u - const double* d_col_scale, // Can be nullptr + double* __restrict__ d_results, + double* __restrict__ d_slack_pos, // Input (raw halpern slack) / Output (s_pos) + double* __restrict__ d_slack_neg, // Output + const double* __restrict__ d_aty, + const double* __restrict__ d_x, + const double* __restrict__ d_cost, // c + const double* __restrict__ d_col_lower, // l + const double* __restrict__ d_col_upper, // u + const double* d_col_scale, // Can be nullptr int n_cols, - bool use_halpern_slack) + bool use_halpern_slack) { double local_dual_feas_sq = 0.0; double local_primal_obj = 0.0; double local_dual_obj_part = 0.0; - CUDA_GRID_STRIDE_LOOP(i, n_cols){ + GPU_GRID_STRIDE_LOOP(i, n_cols){ double val_x = d_x[i]; double val_c = d_cost[i]; double val_aty = d_aty[i]; double val_l = d_col_lower[i]; double val_u = d_col_upper[i]; - local_primal_obj += val_c * val_x; + local_primal_obj = fma_rn(val_c, val_x, local_primal_obj); double dual_residual = val_c - val_aty; double s_pos = 0.0; @@ -187,7 +204,7 @@ __global__ void kernelCheckDual( if (use_halpern_slack) { // Read the raw Halpern slack previously saved in d_slack_pos by primal major step - double raw_slack = d_slack_pos[i]; + double raw_slack = d_slack_pos[i]; s_pos = fmax(0.0, raw_slack); s_neg = fmax(0.0, -raw_slack); } else { @@ -206,19 +223,34 @@ __global__ void kernelCheckDual( eff_dual_residual *= d_col_scale[i]; } - local_dual_feas_sq += eff_dual_residual * eff_dual_residual; + local_dual_feas_sq = fma_rn(eff_dual_residual, eff_dual_residual, local_dual_feas_sq); + + if (val_l > -GPU_INF) local_dual_obj_part = fma_rn(val_l, s_pos, local_dual_obj_part); + if (val_u < GPU_INF) local_dual_obj_part = fma_rn(-val_u, s_neg, local_dual_obj_part); + } + + // Warp-level reduction + local_dual_feas_sq = gpuReduce(local_dual_feas_sq); + local_primal_obj = gpuReduce(local_primal_obj); + local_dual_obj_part = gpuReduce(local_dual_obj_part); - double obj_term = 0.0; - if (val_l > -GPU_INF) obj_term += val_l * s_pos; - if (val_u < GPU_INF) obj_term -= val_u * s_neg; + // Block-level reduction via shared memory before global atomicAdd. + __shared__ double s_dual[3]; + if (threadIdx.x == 0) { s_dual[0] = 0.0; s_dual[1] = 0.0; s_dual[2] = 0.0; } + __syncthreads(); - local_dual_obj_part += obj_term; + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicAdd(&s_dual[0], local_dual_feas_sq); + atomicAdd(&s_dual[1], local_primal_obj); + atomicAdd(&s_dual[2], local_dual_obj_part); } + __syncthreads(); - // Atomic accumulation - atomicAdd(&d_results[IDX_DUAL_FEAS], local_dual_feas_sq); - atomicAdd(&d_results[IDX_PRIMAL_OBJ], local_primal_obj); - atomicAdd(&d_results[IDX_DUAL_OBJ], local_dual_obj_part); + if (threadIdx.x == 0) { + atomicAdd(&d_results[IDX_DUAL_FEAS], s_dual[0]); + atomicAdd(&d_results[IDX_PRIMAL_OBJ], s_dual[1]); + atomicAdd(&d_results[IDX_DUAL_OBJ], s_dual[2]); + } } // ============================================================================ @@ -236,7 +268,7 @@ __global__ void kernelHalpernPrimalMinor( const double* __restrict__ step_size_ptr, int n) { const double step_size = *step_size_ptr; - CUDA_GRID_STRIDE_LOOP(i, n) { + GPU_GRID_STRIDE_LOOP(i, n) { double temp = fma_rn(-step_size, objective[i] - dual_product[i], current_primal[i]); double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i])); @@ -258,12 +290,13 @@ __global__ void kernelHalpernPrimalMajor( double* __restrict__ dual_slack) { const double step_size = *step_size_ptr; - CUDA_GRID_STRIDE_LOOP(i, n) { + const double inv_step_size = 1.0 / step_size; + GPU_GRID_STRIDE_LOOP(i, n) { double temp = fma_rn(-step_size, objective[i] - dual_product[i], current_primal[i]); double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i])); pdhg_primal[i] = temp_proj; - dual_slack[i] = (temp_proj - temp) / step_size; + dual_slack[i] = (temp_proj - temp) * inv_step_size; reflected_primal[i] = fma_rn(2.0, temp_proj, -current_primal[i]); } } @@ -277,7 +310,7 @@ __global__ void kernelHalpernDualMinor( const double* __restrict__ step_size_ptr, int n) { const double step_size = *step_size_ptr; - CUDA_GRID_STRIDE_LOOP(i, n) { + GPU_GRID_STRIDE_LOOP(i, n) { double dual_update = fma_rn(step_size, rhs[i] - primal_product[i], current_dual[i]); double pdhg_dual_i = is_equality[i] ? dual_update @@ -296,7 +329,7 @@ __global__ void kernelHalpernDualMajor( const double* __restrict__ step_size_ptr, int n) { const double step_size = *step_size_ptr; - CUDA_GRID_STRIDE_LOOP(i, n) { + GPU_GRID_STRIDE_LOOP(i, n) { double dual_update = fma_rn(step_size, rhs[i] - primal_product[i], current_dual[i]); double pdhg_dual_i = is_equality[i] ? dual_update @@ -316,12 +349,15 @@ __global__ void kernelHalpernBlend( double reflection_coeff, // ρ, typically 1.0 HighsInt n) { - const int current_k = (*halpern_iteration) + k_offset; - const double weight = static_cast(current_k) / (current_k + 1.0); - CUDA_GRID_STRIDE_LOOP(i, n) { + const int current_k = (*halpern_iteration) + k_offset; + const double weight = static_cast(current_k) / (current_k + 1.0); + const double one_minus_weight = 1.0 - weight; + const double one_minus_rho = 1.0 - reflection_coeff; + GPU_GRID_STRIDE_LOOP(i, n) { double blended = fma_rn(reflection_coeff, reflected[i], - (1.0 - reflection_coeff) * current[i]); - current[i] = fma_rn(weight, blended, (1.0 - weight) * initial[i]); + fma_rn(one_minus_rho, current[i], 0.0)); + current[i] = fma_rn(weight, blended, + fma_rn(one_minus_weight, initial[i], 0.0)); } } @@ -330,63 +366,63 @@ extern "C" { void launchKernelUpdateX_wrapper( double* d_x_new, const double* d_x_old, const double* d_aty, const double* d_cost, const double* d_lower, const double* d_upper, - double primal_step, int n_cols, cudaStream_t stream) + double primal_step, int n_cols, gpuStream_t stream) { const int block_size = 256; dim3 config = GetLaunchConfig(n_cols, block_size); - + kernelUpdateX<<>>( d_x_new, d_x_old, d_aty, d_cost, d_lower, d_upper, primal_step, n_cols); - - cudaGetLastError(); + + gpuGetLastError(); } void launchKernelUpdateY_wrapper( double* d_y_new, const double* d_y_old, - const double* d_ax_old, const double* d_ax_new, + const double* d_ax_old, const double* d_ax_new, const double* d_rhs, const bool* d_is_equality, - double dual_step, int n_rows, cudaStream_t stream) + double dual_step, int n_rows, gpuStream_t stream) { const int block_size = 256; dim3 config = GetLaunchConfig(n_rows, block_size); - + kernelUpdateY<<>>( d_y_new, d_y_old, d_ax_old, d_ax_new, d_rhs, d_is_equality, dual_step, n_rows); - - cudaGetLastError(); + + gpuGetLastError(); } void launchKernelUpdateAverages_wrapper( double* d_x_sum, double* d_y_sum, const double* d_x_next, const double* d_y_next, - double weight, int n_cols, int n_rows, cudaStream_t stream) + double weight, int n_cols, int n_rows, gpuStream_t stream) { const int block_size = 256; dim3 config_x = GetLaunchConfig(n_cols, block_size); - dim3 config_y = GetLaunchConfig(n_rows, block_size); + dim3 config_y = GetLaunchConfig(n_rows, block_size); kernelUpdateAverages<< config_y.x ? config_x.x : config_y.x, block_size, 0, stream>>>( d_x_sum, d_y_sum, d_x_next, d_y_next, weight, n_cols, n_rows); - cudaGetLastError(); + gpuGetLastError(); } void launchKernelScaleVector_wrapper( - double* d_out, const double* d_in, - double scale, int n, cudaStream_t stream) + double* d_out, const double* d_in, + double scale, int n, gpuStream_t stream) { const int block_size = 256; dim3 config = GetLaunchConfig(n, block_size); - + kernelScaleVector<<>>( d_out, d_in, scale, n); - - cudaGetLastError(); + + gpuGetLastError(); } void launchCheckConvergenceKernels_wrapper( @@ -400,10 +436,10 @@ void launchCheckConvergenceKernels_wrapper( const double* d_col_scale, const double* d_row_scale, int n_cols, int n_rows, bool use_halpern_slack, - cudaStream_t stream) + gpuStream_t stream) { // 1. Zero out results - cudaMemsetAsync(d_results, 0, 4 * sizeof(double), stream); + gpuMemsetAsync(d_results, 0, 4 * sizeof(double), stream); int block_size = 256; @@ -416,26 +452,26 @@ void launchCheckConvergenceKernels_wrapper( // 3. Launch Dual Kernel dim3 grid_cols = GetLaunchConfig(n_cols, block_size); kernelCheckDual<<>>( - d_results, d_slack_pos, d_slack_neg, d_aty, d_x, + d_results, d_slack_pos, d_slack_neg, d_aty, d_x, d_col_cost, d_col_lower, d_col_upper, d_col_scale, n_cols, use_halpern_slack ); - cudaGetLastError(); + gpuGetLastError(); } void launchKernelHalpernPrimalMinor_wrapper( const double* d_current_primal, double* d_reflected_primal, const double* d_dual_product, const double* d_objective, const double* d_var_lb, const double* d_var_ub, - const double* d_step_size, int n, cudaStream_t stream) + const double* d_step_size, int n, gpuStream_t stream) { const int block_size = 256; dim3 config = GetLaunchConfig(n, block_size); kernelHalpernPrimalMinor<<>>( d_current_primal, d_reflected_primal, d_dual_product, d_objective, d_var_lb, d_var_ub, d_step_size, n); - cudaGetLastError(); + gpuGetLastError(); } void launchKernelHalpernPrimalMajor_wrapper( @@ -443,7 +479,7 @@ void launchKernelHalpernPrimalMajor_wrapper( double* d_reflected_primal, const double* d_dual_product, const double* d_objective, const double* d_var_lb, const double* d_var_ub, const double* d_step_size, int n, - double* d_dual_slack, cudaStream_t stream) + double* d_dual_slack, gpuStream_t stream) { const int block_size = 256; dim3 config = GetLaunchConfig(n, block_size); @@ -451,47 +487,47 @@ void launchKernelHalpernPrimalMajor_wrapper( d_current_primal, d_pdhg_primal, d_reflected_primal, d_dual_product, d_objective, d_var_lb, d_var_ub, d_step_size, n, d_dual_slack); - cudaGetLastError(); + gpuGetLastError(); } void launchKernelHalpernDualMinor_wrapper( const double* d_current_dual, double* d_reflected_dual, const double* d_primal_product, const double* d_rhs, const bool* d_is_equality, const double* d_step_size, int n, - cudaStream_t stream) + gpuStream_t stream) { const int block_size = 256; dim3 config = GetLaunchConfig(n, block_size); kernelHalpernDualMinor<<>>( d_current_dual, d_reflected_dual, d_primal_product, d_rhs, d_is_equality, d_step_size, n); - cudaGetLastError(); + gpuGetLastError(); } void launchKernelHalpernDualMajor_wrapper( const double* d_current_dual, double* d_pdhg_dual, double* d_reflected_dual, const double* d_primal_product, const double* d_rhs, const bool* d_is_equality, - const double* d_step_size, int n, cudaStream_t stream) + const double* d_step_size, int n, gpuStream_t stream) { const int block_size = 256; dim3 config = GetLaunchConfig(n, block_size); kernelHalpernDualMajor<<>>( d_current_dual, d_pdhg_dual, d_reflected_dual, d_primal_product, d_rhs, d_is_equality, d_step_size, n); - cudaGetLastError(); + gpuGetLastError(); } void launchKernelHalpernBlend_wrapper( double* d_current, const double* d_reflected, const double* d_initial, const int* d_halpern_iteration, int k_offset, - double reflection_coeff, int n, cudaStream_t stream) + double reflection_coeff, int n, gpuStream_t stream) { const int block_size = 256; dim3 config = GetLaunchConfig(n, block_size); kernelHalpernBlend<<>>( d_current, d_reflected, d_initial, d_halpern_iteration, k_offset, reflection_coeff, n); - cudaGetLastError(); + gpuGetLastError(); } -} // extern "C" \ No newline at end of file +} // extern "C" diff --git a/highs/pdlp/hipdlp/pdhg.hpp b/highs/pdlp/hipdlp/pdhg.hpp index aa3fb4a38f6..4a2e5b1ef7d 100644 --- a/highs/pdlp/hipdlp/pdhg.hpp +++ b/highs/pdlp/hipdlp/pdhg.hpp @@ -11,10 +11,9 @@ #ifndef PDHG_HPP #define PDHG_HPP -#ifdef CUPDLP_GPU -#include -#include -#include +#include "HConfig.h" +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) +#include "gpu_backend.hpp" #endif #define PDLP_PROFILE (0) @@ -36,42 +35,8 @@ const bool kUseCupdlpx = true; const bool kTempSetting = true; -// --- GPU Macros (Defined at file scope for visibility) --- -#ifdef CUPDLP_GPU -#include -#include -#include - -#define CUDA_CHECK(call) \ - do { \ - cudaError_t err = (call); \ - if (err != cudaSuccess) { \ - fprintf(stderr, "CUDA Error at %s:%d: %s\n", __FILE__, __LINE__, \ - cudaGetErrorString(err)); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -#define CUSPARSE_CHECK(call) \ - do { \ - cusparseStatus_t status = (call); \ - if (status != CUSPARSE_STATUS_SUCCESS) { \ - fprintf(stderr, "cuSPARSE Error at %s:%d: %s\n", __FILE__, __LINE__, \ - cusparseGetErrorString(status)); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -#define CUBLAS_CHECK(call) \ - do { \ - cublasStatus_t status = call; \ - if (status != CUBLAS_STATUS_SUCCESS) { \ - fprintf(stderr, "cuBLAS Error at %s:%d: %d\n", __FILE__, __LINE__, \ - status); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) -#endif +// GPU error-checking macros are defined in gpu_backend.hpp as +// GPU_CHECK / GPU_BLAS_CHECK / GPU_SPARSE_CHECK. // Forward declarations struct StepSizeConfig; @@ -126,7 +91,7 @@ class PDLPSolver { // --- Convergence & Math Helpers --- double powerMethod(); -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) double powerMethodGpu(); #endif void initializeStepSizes(); @@ -263,28 +228,37 @@ class PDLPSolver { void hipdlpTimerStop(const HighsInt hipdlp_clock); #endif -#ifdef CUPDLP_GPU +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) HighsInt a_num_rows_ = 0; HighsInt a_num_cols_ = 0; HighsInt a_nnz_ = 0; double sum_weights_gpu_ = 0.0; // --- GPU Specifics --- - cudaStream_t gpu_stream_ = nullptr; - cusparseHandle_t cusparse_handle_ = nullptr; - cublasHandle_t cublas_handle_ = nullptr; + gpuStream_t gpu_stream_ = nullptr; + gpuSparseHandle_t cusparse_handle_ = nullptr; + gpuBlasHandle_t cublas_handle_ = nullptr; // Matrix descriptors - cusparseSpMatDescr_t mat_a_csr_ = nullptr; - cusparseSpMatDescr_t mat_a_T_csr_ = nullptr; + gpuSpMatDescr_t mat_a_csr_ = nullptr; + gpuSpMatDescr_t mat_a_T_csr_ = nullptr; // Device Pointers HighsInt *d_a_row_ptr_ = nullptr, *d_a_col_ind_ = nullptr; double* d_a_val_ = nullptr; HighsInt *d_at_row_ptr_ = nullptr, *d_at_col_ind_ = nullptr; double* d_at_val_ = nullptr; - int* d_halpern_iteration_ = nullptr; - double* d_primal_step_size_ = nullptr; - double* d_dual_step_size_ = nullptr; + // Packed into a single device allocation to avoid 3 separate H→D transfers + // per iteration. Layout: [primal_step, dual_step, halpern_iteration (as int)]. + struct GpuStepParams { + double primal_step; + double dual_step; + int halpern_iteration; + }; + GpuStepParams* d_step_params_ = nullptr; + // Convenience device pointers aliasing into d_step_params_ + int* d_halpern_iteration_ = nullptr; + double* d_primal_step_size_ = nullptr; + double* d_dual_step_size_ = nullptr; // GPU Vectors (Device memory) double *d_x_current_ = nullptr, *d_y_current_ = nullptr; @@ -323,8 +297,8 @@ class PDLPSolver { size_t spmv_buffer_size_aty_ = 0; // Vector Descriptors - cusparseDnVecDescr_t vec_x_desc_ = nullptr, vec_y_desc_ = nullptr; - cusparseDnVecDescr_t vec_ax_desc_ = nullptr, vec_aty_desc_ = nullptr; + gpuDnVecDescr_t vec_x_desc_ = nullptr, vec_y_desc_ = nullptr; + gpuDnVecDescr_t vec_ax_desc_ = nullptr, vec_aty_desc_ = nullptr; // GPU Methods void setupGpu(); diff --git a/highs/pdlp/hipdlp/pdhg_kernels.hpp b/highs/pdlp/hipdlp/pdhg_kernels.hpp index 57a6f411318..eec1214d710 100644 --- a/highs/pdlp/hipdlp/pdhg_kernels.hpp +++ b/highs/pdlp/hipdlp/pdhg_kernels.hpp @@ -1,7 +1,7 @@ #pragma once -#ifdef CUPDLP_GPU -#include +#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU) +#include "gpu_backend.hpp" #ifdef __cplusplus extern "C" { @@ -11,22 +11,22 @@ void launchKernelUpdateX_wrapper(double* d_x_new, const double* d_x_old, const double* d_aty, const double* d_cost, const double* d_lower, const double* d_upper, double primal_step, int n_cols, - cudaStream_t stream); + gpuStream_t stream); void launchKernelUpdateY_wrapper(double* d_y_new, const double* d_y_old, const double* d_ax_old, const double* d_ax_new, const double* d_row_lower, const bool* d_is_equality, double dual_step, - int n_rows, cudaStream_t stream); + int n_rows, gpuStream_t stream); void launchKernelUpdateAverages_wrapper(double* d_x_sum, double* d_y_sum, const double* d_x_current, const double* d_y_current, double weight, int n_cols, int n_rows, - cudaStream_t stream); + gpuStream_t stream); void launchKernelScaleVector_wrapper(double* d_out, const double* d_in, - double scale, int n, cudaStream_t stream); + double scale, int n, gpuStream_t stream); void launchCheckConvergenceKernels_wrapper( double* d_results, double* d_slack_pos, double* d_slack_neg, @@ -35,20 +35,20 @@ void launchCheckConvergenceKernels_wrapper( const double* d_col_lower, const double* d_col_upper, const bool* d_is_equality, const double* d_col_scale, const double* d_row_scale, int n_cols, int n_rows, bool use_halpern_slack, - cudaStream_t stream); + gpuStream_t stream); void launchKernelHalpernPrimalMinor_wrapper( const double* d_current_primal, double* d_reflected_primal, const double* d_dual_product, const double* d_objective, const double* d_var_lb, const double* d_var_ub, const double* d_step_size, - int n, cudaStream_t stream); + int n, gpuStream_t stream); void launchKernelHalpernPrimalMajor_wrapper( const double* d_current_primal, double* d_pdhg_primal, double* d_reflected_primal, const double* d_dual_product, const double* d_objective, const double* d_var_lb, const double* d_var_ub, const double* d_step_size, int n, double* d_dual_slack, - cudaStream_t stream); + gpuStream_t stream); void launchKernelHalpernDualMinor_wrapper(const double* d_current_dual, double* d_reflected_dual, @@ -56,20 +56,20 @@ void launchKernelHalpernDualMinor_wrapper(const double* d_current_dual, const double* d_rhs, const bool* d_is_equality, const double* d_step_size, int n, - cudaStream_t stream); + gpuStream_t stream); void launchKernelHalpernDualMajor_wrapper( const double* d_current_dual, double* d_pdhg_dual, double* d_reflected_dual, const double* d_primal_product, const double* d_rhs, const bool* d_is_equality, const double* d_step_size, int n, - cudaStream_t stream); + gpuStream_t stream); void launchKernelHalpernBlend_wrapper(double* d_current, const double* d_reflected, const double* d_initial, const int* d_halpern_iteration, int k_offset, double reflection_coeff, - int n, cudaStream_t stream); + int n, gpuStream_t stream); #ifdef __cplusplus }