From d0fd5c8dc95adb92a51e26b35c657b05475b13d0 Mon Sep 17 00:00:00 2001
From: Alexis Montoison <Alexis.Montoison@amd.com>
Date: Mon, 22 Jun 2026 18:23:39 -0600
Subject: [PATCH] Proof of concept for the support of AMD GPUs

---
 .github/workflows/build-hip.yml       |  66 +++
 .gitignore                            |   1 +
 CMakeLists.txt                        |  64 ++-
 cmake/cpp-highs.cmake                 |   2 +-
 highs/CMakeLists.txt                  | 114 ++--
 highs/pdlp/cupdlp/cuda/CMakeLists.txt |   3 +-
 highs/pdlp/hipdlp/gpu_backend.hpp     | 220 +++++++
 highs/pdlp/hipdlp/pdhg.cc             | 799 +++++++++++++-------------
 highs/pdlp/hipdlp/pdhg.cu             | 256 +++++----
 highs/pdlp/hipdlp/pdhg.hpp            |  78 +--
 highs/pdlp/hipdlp/pdhg_kernels.hpp    |  24 +-
 11 files changed, 983 insertions(+), 644 deletions(-)
 create mode 100644 .github/workflows/build-hip.yml
 create mode 100644 highs/pdlp/hipdlp/gpu_backend.hpp

diff --git a/.github/workflows/build-hip.yml b/.github/workflows/build-hip.yml
new file mode 100644
index 00000000000..1106d30f7d8
--- /dev/null
+++ b/.github/workflows/build-hip.yml
@@ -0,0 +1,66 @@
+name: build-hip
+
+# Compile-only CI for the AMD GPU (HIP/ROCm) backend of HiPDLP.
+#
+# This job runs inside an official ROCm container and only *compiles and links*
+# the HIP backend (-DHIPDLP_HIP=ON). It does NOT execute any GPU code: the
+# GitHub-hosted runners have no AMD GPU. Its purpose is to catch HIP build
+# regressions (wrong hip* API names, desynchronised #ifdef guards, signature
+# mismatches) that are invisible to the CPU/CUDA builds.
+#
+# Running the GPU code (numerical validation) requires a self-hosted runner
+# with an AMD GPU and is intentionally out of scope here.
+
+on:
+  push:
+    paths:
+      - 'highs/pdlp/hipdlp/**'
+      - 'CMakeLists.txt'
+      - 'highs/CMakeLists.txt'
+      - '.github/workflows/build-hip.yml'
+  pull_request:
+    paths:
+      - 'highs/pdlp/hipdlp/**'
+      - 'CMakeLists.txt'
+      - 'highs/CMakeLists.txt'
+      - '.github/workflows/build-hip.yml'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  hip-compile:
+    runs-on: ubuntu-latest
+    container:
+      # 24.04 + ROCm 7.x; the "-complete" image bundles hipBLAS/hipSPARSE.
+      image: rocm/dev-ubuntu-24.04:7.2.4-complete
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Install build dependencies
+        shell: bash
+        run: |
+          apt-get update
+          apt-get install -y --no-install-recommends \
+            cmake make git ca-certificates zlib1g-dev
+          cmake --version
+          hipcc --version || /opt/rocm/bin/hipcc --version
+
+      - name: Configure CMake (HIP backend)
+        shell: bash
+        run: |
+          export PATH=/opt/rocm/bin:$PATH
+          export CMAKE_PREFIX_PATH=/opt/rocm
+          cmake -S . -B build_hip \
+            -DHIPDLP_HIP=ON \
+            -DBUILD_TESTING=OFF \
+            -DBUILD_EXAMPLES=OFF \
+            -DCMAKE_BUILD_TYPE=Release
+
+      - name: Build (compile + link, no GPU execution)
+        shell: bash
+        run: |
+          export PATH=/opt/rocm/bin:$PATH
+          cmake --build build_hip --target highs --parallel
diff --git a/.gitignore b/.gitignore
index 20926b5870c..f6980c15fe9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -220,6 +220,7 @@ $RECYCLE.BIN/
 dist/
 build/
 build_release*/
+install/
 eggs/
 parts/
 var/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f28998adb01..ae69707ce47 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -150,11 +150,18 @@ message(STATUS "Use FindCUDAConf: ${CUPDLP_FIND_CUDA}")
 option(HIGHS_GPU_LIB "Build highs GPU as a single lib" OFF)
 message(STATUS "Build GPU with delayed loading: ${HIGHS_GPU_LIB}")
 
+option(HIPDLP_HIP "Build HiPDLP with AMD HIP (ROCm)" OFF)
+message(STATUS "Build HiPDLP with HIP: ${HIPDLP_HIP}")
+
 # For now
 if (HIGHS_GPU_LIB)
   set(CUPDLP_GPU ON)
 endif()
 
+if (HIPDLP_HIP)
+  set(HIPDLP_GPU ON)
+endif()
+
 if(CUPDLP_GPU AND CMAKE_VERSION VERSION_LESS "3.25.0")
     message("CUPDLP FindCUDAConf requires CMake version minumum 3.24. Please use a higher version of CMake.")
 endif()
@@ -182,30 +189,59 @@ if (HIPO AND NOT FAST_BUILD)
   message(ERROR "HIPO is only available with FAST_BUILD=ON.")
 endif()
 
-if (CUPDLP_GPU)
+if (CUPDLP_GPU OR HIPDLP_GPU)
   if (WIN32)
     set(BUILD_SHARED_LIBS ON)
   endif()
 
-  set (CUPDLP_CPU OFF)
-  message(NOTICE "Set build cuPDLP with CUDA")
+  set(CUPDLP_CPU OFF)
+
+  if (HIPDLP_HIP)
+    message(NOTICE "Set build HiPDLP with HIP (AMD ROCm)")
+
+    # ROCm/HIP requires CMake 3.21+
+    if(CMAKE_VERSION VERSION_LESS "3.21.0")
+      message(FATAL_ERROR "HIP support requires CMake >= 3.21")
+    endif()
+
+    # Allow the user to point to a non-standard ROCm install
+    if(DEFINED ENV{ROCM_PATH})
+      set(CMAKE_PREFIX_PATH "$ENV{ROCM_PATH}" ${CMAKE_PREFIX_PATH})
+    elseif(EXISTS "/opt/rocm")
+      set(CMAKE_PREFIX_PATH "/opt/rocm" ${CMAKE_PREFIX_PATH})
+    endif()
+
+    enable_language(HIP)
+    find_package(hipblas REQUIRED)
+    find_package(hipsparse REQUIRED)
+
+    set(GPU_LIBRARY roc::hipblas roc::hipsparse)
+    set(GPU_COMPILE_DEFINITIONS USE_HIP)
 
-  if (CUPDLP_FIND_CUDA)
-    # With FindCUDAConf.cmake
-    # Need to have the CUDA_HOME environment variable set.
-    include(FindCUDAConf)
   else()
-    # Without FindCUDAConf.cmake
-    enable_language(CUDA)
-    find_package(CUDAToolkit REQUIRED)
+    message(NOTICE "Set build cuPDLP with CUDA")
+
+    if (CUPDLP_FIND_CUDA)
+      # With FindCUDAConf.cmake
+      # Need to have the CUDA_HOME environment variable set.
+      include(FindCUDAConf)
+    else()
+      # Without FindCUDAConf.cmake
+      enable_language(CUDA)
+      find_package(CUDAToolkit REQUIRED)
+
+      set(CUDA_LIBRARY-NOTFOUND, OFF)
+      set(CUDA_LIBRARY CUDA::cudart CUDA::cublas CUDA::cusparse)
+    endif()
 
-    set(CUDA_LIBRARY-NOTFOUND, OFF)
-    set(CUDA_LIBRARY CUDA::cudart CUDA::cublas CUDA::cusparse)
+    set(GPU_LIBRARY ${CUDA_LIBRARY})
+    set(GPU_COMPILE_DEFINITIONS "")
   endif()
 
 else()
-  set (CUPDLP_CPU ON)
-    set(CUDA_LIBRARY-NOTFOUND true)
+  set(CUPDLP_CPU ON)
+  set(CUDA_LIBRARY-NOTFOUND true)
+  set(HIPDLP_GPU OFF)
 endif()
 
 # Option to build static
diff --git a/cmake/cpp-highs.cmake b/cmake/cpp-highs.cmake
index ddbb7fa0887..0138285bc11 100644
--- a/cmake/cpp-highs.cmake
+++ b/cmake/cpp-highs.cmake
@@ -70,7 +70,7 @@ if (NOT HIGHS_COVERAGE)
     APPEND FILE "${HIGHS_BINARY_DIR}/highs-targets.cmake")
 endif()
 
-if (CUPDLP_GPU AND NOT HIGHS_GPU_LIB)
+if (CUPDLP_GPU AND NOT HIGHS_GPU_LIB AND NOT HIPDLP_HIP)
   install(TARGETS cudalin
       EXPORT ${lower}-targets
       INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
diff --git a/highs/CMakeLists.txt b/highs/CMakeLists.txt
index fbdd7d34e65..06e0515e0ce 100644
--- a/highs/CMakeLists.txt
+++ b/highs/CMakeLists.txt
@@ -194,69 +194,89 @@ else()
 
   target_sources(highs PRIVATE ${sources} ${headers} ${win_version_file})
 
-  # Optional Cuda
-  if (CUPDLP_GPU)
-
-    target_include_directories(highs PUBLIC "$<BUILD_INTERFACE:${CMAKE_CUDA_PATH}/include>")
-
-    if (HIGHS_GPU_LIB)
-      # like in cuda CMakeLists
-      enable_language(CXX CUDA)
-
-      # Remove /MP flag from CUDA targets (nvcc doesn't support it)
-      if(MSVC)
-        string(REPLACE "/MP" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
-        string(REPLACE "/bigobj" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
-
-        foreach(flag_var
-          CMAKE_CUDA_FLAGS_DEBUG
-          CMAKE_CUDA_FLAGS_RELEASE
-          CMAKE_CUDA_FLAGS_RELWITHDEBINFO
-          CMAKE_CUDA_FLAGS_MINSIZEREL)
-          string(REPLACE "/MP" "" ${flag_var} "${${flag_var}}")
-          string(REPLACE "/bigobj" "" ${flag_var} "${${flag_var}}")
-        endforeach()
-      endif()
+  # Optional GPU (CUDA or HIP)
+  if (CUPDLP_GPU OR HIPDLP_GPU)
 
-      target_sources(highs PRIVATE
-        pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
-        pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
-        pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
-        pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu
-        pdlp/cupdlp/cupdlp_cs.c
-        pdlp/hipdlp/pdhg.cu)
+    if (HIPDLP_HIP)
+      # HIP backend — pdhg.cu is compiled as HIP source.
+      # HIPDLP_GPU activates GPU code in HiPDLP headers.
+      # CUPDLP_CPU keeps cuPDLP-C in CPU mode so it doesn't pull CUDA headers.
+      target_compile_definitions(highs PRIVATE HIPDLP_GPU USE_HIP CUPDLP_CPU)
 
-      set_target_properties(highs PROPERTIES
-        CUDA_SEPARABLE_COMPILATION ON
-        #CUDA_ARCHITECTURES "60;70;75;80;86" )
-        #Deleted 60;70 due to errors
-        #nvcc fatal   : Unsupported gpu architecture 'compute_60'
-        #nvcc fatal   : Unsupported gpu architecture 'compute_70'
-        CUDA_ARCHITECTURES "75;80;86" )
+      set_source_files_properties(pdlp/hipdlp/pdhg.cu PROPERTIES LANGUAGE HIP)
 
-      # set_target_properties(cudalin PROPERTIES CUDA_ARCHITECTURES native)
+      target_sources(highs PRIVATE pdlp/hipdlp/pdhg.cu)
 
-      target_include_directories(highs PUBLIC "$<BUILD_INTERFACE:${CMAKE_CUDA_PATH}/include>")
+      set_target_properties(highs PROPERTIES
+        HIP_SEPARABLE_COMPILATION ON)
 
       if (WIN32)
-        target_link_libraries(highs PRIVATE ${CUDA_LIBRARY})  # linking happens here
-      else() # linux
-        target_link_libraries(highs PRIVATE ${CUDA_LIBRARY} m)
+        target_link_libraries(highs PRIVATE ${GPU_LIBRARY})
+      else()
+        target_link_libraries(highs PRIVATE ${GPU_LIBRARY} m)
       endif()
 
     else()
+      # CUDA backend
+      target_include_directories(highs PUBLIC "$<BUILD_INTERFACE:${CMAKE_CUDA_PATH}/include>")
+
+      if (HIGHS_GPU_LIB)
+        # like in cuda CMakeLists
+        enable_language(CXX CUDA)
+
+        # Remove /MP flag from CUDA targets (nvcc doesn't support it)
+        if(MSVC)
+          string(REPLACE "/MP" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
+          string(REPLACE "/bigobj" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
+
+          foreach(flag_var
+            CMAKE_CUDA_FLAGS_DEBUG
+            CMAKE_CUDA_FLAGS_RELEASE
+            CMAKE_CUDA_FLAGS_RELWITHDEBINFO
+            CMAKE_CUDA_FLAGS_MINSIZEREL)
+            string(REPLACE "/MP" "" ${flag_var} "${${flag_var}}")
+            string(REPLACE "/bigobj" "" ${flag_var} "${${flag_var}}")
+          endforeach()
+        endif()
+
+        target_sources(highs PRIVATE
+          pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cu
+          pdlp/cupdlp/cuda/cupdlp_cuda_kernels.cuh
+          pdlp/cupdlp/cuda/cupdlp_cudalinalg.cuh
+          pdlp/cupdlp/cuda/cupdlp_cudalinalg.cu
+          pdlp/cupdlp/cupdlp_cs.c
+          pdlp/hipdlp/pdhg.cu)
+
+        set_target_properties(highs PROPERTIES
+          CUDA_SEPARABLE_COMPILATION ON
+          #CUDA_ARCHITECTURES "60;70;75;80;86" )
+          #Deleted 60;70 due to errors
+          #nvcc fatal   : Unsupported gpu architecture 'compute_60'
+          #nvcc fatal   : Unsupported gpu architecture 'compute_70'
+          CUDA_ARCHITECTURES "75;80;86" )
+
+        target_include_directories(highs PUBLIC "$<BUILD_INTERFACE:${CMAKE_CUDA_PATH}/include>")
+
+        if (WIN32)
+          target_link_libraries(highs PRIVATE ${GPU_LIBRARY})
+        else()
+          target_link_libraries(highs PRIVATE ${GPU_LIBRARY} m)
+        endif()
+
+      else()
         set(CUPDLP_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/highs/pdlp/cupdlp/")
         add_subdirectory(pdlp/cupdlp/cuda)
 
         if (WIN32)
-            target_link_libraries(highs PRIVATE cudalin ${CUDA_LIBRARY})
-          else()
-            target_link_libraries(highs PRIVATE cudalin ${CUDA_LIBRARY} m)
-          endif()
+          target_link_libraries(highs PRIVATE cudalin ${GPU_LIBRARY})
+        else()
+          target_link_libraries(highs PRIVATE cudalin ${GPU_LIBRARY} m)
+        endif()
 
-          set_target_properties(highs PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+        set_target_properties(highs PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
       endif()
     endif()
+  endif()
 
   target_include_directories(highs PRIVATE ${include_dirs})
 
diff --git a/highs/pdlp/cupdlp/cuda/CMakeLists.txt b/highs/pdlp/cupdlp/cuda/CMakeLists.txt
index 2cde626eefc..955ed8b0e6d 100644
--- a/highs/pdlp/cupdlp/cuda/CMakeLists.txt
+++ b/highs/pdlp/cupdlp/cuda/CMakeLists.txt
@@ -17,7 +17,7 @@ add_library(cudalin SHARED
 # latest has
 # set_target_properties(cudalin PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
-set_target_properties(cudalin PROPERTIES 
+set_target_properties(cudalin PROPERTIES
 CUDA_SEPARABLE_COMPILATION ON
 #CUDA_ARCHITECTURES "60;70;75;80;86" )
 #Deleted 60;70 due to errors
@@ -28,6 +28,7 @@ CUDA_ARCHITECTURES "75;80;86" )
 
 # set_target_properties(cudalin PROPERTIES CUDA_ARCHITECTURES native)
 
+
 target_include_directories(cudalin PUBLIC "$<BUILD_INTERFACE:${CMAKE_CUDA_PATH}/include>")
 
 if (WIN32)
diff --git a/highs/pdlp/hipdlp/gpu_backend.hpp b/highs/pdlp/hipdlp/gpu_backend.hpp
new file mode 100644
index 00000000000..a83020d031b
--- /dev/null
+++ b/highs/pdlp/hipdlp/gpu_backend.hpp
@@ -0,0 +1,220 @@
+#pragma once
+
+// GPU portability layer for HiPDLP.
+// Provides unified gpu* types and macros that map to either the CUDA or HIP
+// runtime depending on whether USE_HIP is defined at compile time.
+
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
+
+#ifdef USE_HIP
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hipsparse/hipsparse.h>
+
+using gpuStream_t        = hipStream_t;
+using gpuSparseHandle_t  = hipsparseHandle_t;
+using gpuBlasHandle_t    = hipblasHandle_t;
+using gpuSpMatDescr_t    = hipsparseSpMatDescr_t;
+using gpuDnVecDescr_t    = hipsparseDnVecDescr_t;
+using gpuError_t         = hipError_t;
+using gpuBlasStatus_t    = hipblasStatus_t;
+using gpuSparseStatus_t  = hipsparseStatus_t;
+
+#define gpuMalloc(ptr, size)            hipMalloc(ptr, size)
+#define gpuFree(ptr)                    hipFree(ptr)
+#define gpuMemcpy(dst, src, size, kind) hipMemcpy(dst, src, size, kind)
+#define gpuMemcpyAsync(dst, src, size, kind, stream) \
+    hipMemcpyAsync(dst, src, size, kind, stream)
+#define gpuMemset(ptr, val, size)       hipMemset(ptr, val, size)
+#define gpuMemsetAsync(ptr, val, size, stream) \
+    hipMemsetAsync(ptr, val, size, stream)
+#define gpuMemcpyHostToDevice   hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost   hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+
+#define gpuStreamCreate(s)          hipStreamCreate(s)
+#define gpuStreamDestroy(s)         hipStreamDestroy(s)
+#define gpuStreamSynchronize(s)     hipStreamSynchronize(s)
+#define gpuDeviceSynchronize()      hipDeviceSynchronize()
+#define gpuGetLastError()           hipGetLastError()
+#define gpuGetErrorString(e)        hipGetErrorString(e)
+#define gpuSuccess                  hipSuccess
+
+#define gpuBlasCreate(h)            hipblasCreate(h)
+#define gpuBlasDestroy(h)           hipblasDestroy(h)
+#define gpuBlasSetStream(h, s)      hipblasSetStream(h, s)
+#define gpuBlasDdot(h,n,x,ix,y,iy,r)  hipblasDdot(h,n,x,ix,y,iy,r)
+#define gpuBlasDnrm2(h,n,x,ix,r)      hipblasDnrm2(h,n,x,ix,r)
+#define gpuBlasDaxpy(h,n,a,x,ix,y,iy) hipblasDaxpy(h,n,a,x,ix,y,iy)
+#define gpuBlasDscal(h,n,a,x,ix)       hipblasDscal(h,n,a,x,ix)
+#define GPU_BLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+
+#define gpuSparseCreate(h)          hipsparseCreate(h)
+#define gpuSparseDestroy(h)         hipsparseDestroy(h)
+#define gpuSparseSetStream(h, s)    hipsparseSetStream(h, s)
+#define gpuSparseCreateCsr(mat, r, c, nnz, rp, ci, cv, rpt, cit, vt, dt) \
+    hipsparseCreateCsr(mat, r, c, nnz, rp, ci, cv, rpt, cit, vt, dt)
+#define gpuSparseDestroySpMat(m)    hipsparseDestroySpMat(m)
+#define gpuSparseCreateDnVec(v, s, d, t) \
+    hipsparseCreateDnVec(v, s, d, t)
+#define gpuSparseDestroyDnVec(v)    hipsparseDestroyDnVec(v)
+#define gpuSparseSpMV_bufferSize(h, op, alpha, mat, x, beta, y, dt, alg, buf) \
+    hipsparseSpMV_bufferSize(h, op, alpha, mat, x, beta, y, dt, alg, buf)
+#define gpuSparseSpMV(h, op, alpha, mat, x, beta, y, dt, alg, buf) \
+    hipsparseSpMV(h, op, alpha, mat, x, beta, y, dt, alg, buf)
+#define GPU_SPARSE_STATUS_SUCCESS    HIPSPARSE_STATUS_SUCCESS
+#define gpuSparseGetErrorString(s)   hipsparseGetErrorString(s)
+
+#define GPU_OPERATION_NON_TRANSPOSE  HIPSPARSE_OPERATION_NON_TRANSPOSE
+#define GPU_OPERATION_TRANSPOSE      HIPSPARSE_OPERATION_TRANSPOSE
+#define GPU_INDEX_32I                HIPSPARSE_INDEX_32I
+#define GPU_INDEX_BASE_ZERO          HIPSPARSE_INDEX_BASE_ZERO
+#define GPU_R_64F                    HIP_R_64F
+#define GPU_SPMV_ALG_DEFAULT         HIPSPARSE_SPMV_ALG_DEFAULT
+#define GPU_SPMV_CSR_ALG2            HIPSPARSE_SPMV_CSR_ALG2
+
+// Device info
+#define gpuGetDeviceCount(n)         hipGetDeviceCount(n)
+#define gpuGetDeviceProperties(p, d) hipGetDeviceProperties(p, d)
+using gpuDeviceProp_t = hipDeviceProp_t;
+
+// Sparse vector update
+#define gpuSparseSetDnVecValues(v, p) hipsparseDnVecSetValues(v, p)
+
+#define gpuSparseSpMV_preprocess(h, op, a, mat, x, b, y, dt, alg, buf) \
+    hipsparseSpMV_preprocess(h, op, a, mat, x, b, y, dt, alg, buf)
+
+// Graph API
+using gpuGraph_t     = hipGraph_t;
+using gpuGraphExec_t = hipGraphExec_t;
+#define gpuStreamBeginCapture(s, m)          hipStreamBeginCapture(s, m)
+#define gpuStreamEndCapture(s, g)            hipStreamEndCapture(s, g)
+#define gpuStreamCaptureModeGlobal           hipStreamCaptureModeGlobal
+#define gpuGraphInstantiate(ge, g, n, e, f)  hipGraphInstantiate(ge, g, n, e, f)
+#define gpuGraphDestroy(g)                   hipGraphDestroy(g)
+#define gpuGraphLaunch(ge, s)                hipGraphLaunch(ge, s)
+#define gpuGraphExecDestroy(ge)              hipGraphExecDestroy(ge)
+
+#else
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cusparse.h>
+
+using gpuStream_t        = cudaStream_t;
+using gpuSparseHandle_t  = cusparseHandle_t;
+using gpuBlasHandle_t    = cublasHandle_t;
+using gpuSpMatDescr_t    = cusparseSpMatDescr_t;
+using gpuDnVecDescr_t    = cusparseDnVecDescr_t;
+using gpuError_t         = cudaError_t;
+using gpuBlasStatus_t    = cublasStatus_t;
+using gpuSparseStatus_t  = cusparseStatus_t;
+
+#define gpuMalloc(ptr, size)            cudaMalloc(ptr, size)
+#define gpuFree(ptr)                    cudaFree(ptr)
+#define gpuMemcpy(dst, src, size, kind) cudaMemcpy(dst, src, size, kind)
+#define gpuMemcpyAsync(dst, src, size, kind, stream) \
+    cudaMemcpyAsync(dst, src, size, kind, stream)
+#define gpuMemset(ptr, val, size)       cudaMemset(ptr, val, size)
+#define gpuMemsetAsync(ptr, val, size, stream) \
+    cudaMemsetAsync(ptr, val, size, stream)
+#define gpuMemcpyHostToDevice   cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost   cudaMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
+
+#define gpuStreamCreate(s)          cudaStreamCreate(s)
+#define gpuStreamDestroy(s)         cudaStreamDestroy(s)
+#define gpuStreamSynchronize(s)     cudaStreamSynchronize(s)
+#define gpuDeviceSynchronize()      cudaDeviceSynchronize()
+#define gpuGetLastError()           cudaGetLastError()
+#define gpuGetErrorString(e)        cudaGetErrorString(e)
+#define gpuSuccess                  cudaSuccess
+
+#define gpuBlasCreate(h)            cublasCreate(h)
+#define gpuBlasDestroy(h)           cublasDestroy(h)
+#define gpuBlasSetStream(h, s)      cublasSetStream(h, s)
+#define gpuBlasDdot(h,n,x,ix,y,iy,r)  cublasDdot(h,n,x,ix,y,iy,r)
+#define gpuBlasDnrm2(h,n,x,ix,r)      cublasDnrm2(h,n,x,ix,r)
+#define gpuBlasDaxpy(h,n,a,x,ix,y,iy) cublasDaxpy(h,n,a,x,ix,y,iy)
+#define gpuBlasDscal(h,n,a,x,ix)       cublasDscal(h,n,a,x,ix)
+#define GPU_BLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+
+#define gpuSparseCreate(h)          cusparseCreate(h)
+#define gpuSparseDestroy(h)         cusparseDestroy(h)
+#define gpuSparseSetStream(h, s)    cusparseSetStream(h, s)
+#define gpuSparseCreateCsr(mat, r, c, nnz, rp, ci, cv, rpt, cit, vt, dt) \
+    cusparseCreateCsr(mat, r, c, nnz, rp, ci, cv, rpt, cit, vt, dt)
+#define gpuSparseDestroySpMat(m)    cusparseDestroySpMat(m)
+#define gpuSparseCreateDnVec(v, s, d, t) \
+    cusparseCreateDnVec(v, s, d, t)
+#define gpuSparseDestroyDnVec(v)    cusparseDestroyDnVec(v)
+#define gpuSparseSpMV_bufferSize(h, op, alpha, mat, x, beta, y, dt, alg, buf) \
+    cusparseSpMV_bufferSize(h, op, alpha, mat, x, beta, y, dt, alg, buf)
+#define gpuSparseSpMV(h, op, alpha, mat, x, beta, y, dt, alg, buf) \
+    cusparseSpMV(h, op, alpha, mat, x, beta, y, dt, alg, buf)
+#define GPU_SPARSE_STATUS_SUCCESS    CUSPARSE_STATUS_SUCCESS
+#define gpuSparseGetErrorString(s)   cusparseGetErrorString(s)
+
+#define GPU_OPERATION_NON_TRANSPOSE  CUSPARSE_OPERATION_NON_TRANSPOSE
+#define GPU_OPERATION_TRANSPOSE      CUSPARSE_OPERATION_TRANSPOSE
+#define GPU_INDEX_32I                CUSPARSE_INDEX_32I
+#define GPU_INDEX_BASE_ZERO          CUSPARSE_INDEX_BASE_ZERO
+#define GPU_R_64F                    CUDA_R_64F
+#define GPU_SPMV_ALG_DEFAULT         CUSPARSE_SPMV_ALG_DEFAULT
+#define GPU_SPMV_CSR_ALG2            CUSPARSE_SPMV_CSR_ALG2
+
+// Device info
+#define gpuGetDeviceCount(n)         cudaGetDeviceCount(n)
+#define gpuGetDeviceProperties(p, d) cudaGetDeviceProperties(p, d)
+using gpuDeviceProp_t = cudaDeviceProp;
+
+// Sparse vector update
+#define gpuSparseSetDnVecValues(v, p) cusparseDnVecSetValues(v, p)
+
+// cuSPARSE preprocess for SpMV (no-op equivalent in hipSPARSE)
+#define gpuSparseSpMV_preprocess(h, op, a, mat, x, b, y, dt, alg, buf) \
+    cusparseSpMV_preprocess(h, op, a, mat, x, b, y, dt, alg, buf)
+
+// Graph API
+using gpuGraph_t     = cudaGraph_t;
+using gpuGraphExec_t = cudaGraphExec_t;
+#define gpuStreamBeginCapture(s, m)          cudaStreamBeginCapture(s, m)
+#define gpuStreamEndCapture(s, g)            cudaStreamEndCapture(s, g)
+#define gpuStreamCaptureModeGlobal           cudaStreamCaptureModeGlobal
+#define gpuGraphInstantiate(ge, g, n, e, f)  cudaGraphInstantiate(ge, g, n, e, f)
+#define gpuGraphDestroy(g)                   cudaGraphDestroy(g)
+#define gpuGraphLaunch(ge, s)                cudaGraphLaunch(ge, s)
+#define gpuGraphExecDestroy(ge)              cudaGraphExecDestroy(ge)
+
+#endif  // USE_HIP
+
+#define GPU_CHECK(call)                                                      \
+  do {                                                                       \
+    gpuError_t err = (call);                                                 \
+    if (err != gpuSuccess) {                                                 \
+      fprintf(stderr, "GPU Error at %s:%d: %s\n", __FILE__, __LINE__,       \
+              gpuGetErrorString(err));                                        \
+      exit(EXIT_FAILURE);                                                    \
+    }                                                                        \
+  } while (0)
+
+#define GPU_BLAS_CHECK(call)                                                 \
+  do {                                                                       \
+    gpuBlasStatus_t status = (call);                                         \
+    if (status != GPU_BLAS_STATUS_SUCCESS) {                                 \
+      fprintf(stderr, "GPU BLAS Error at %s:%d: %d\n", __FILE__, __LINE__,  \
+              (int)status);                                                  \
+      exit(EXIT_FAILURE);                                                    \
+    }                                                                        \
+  } while (0)
+
+#define GPU_SPARSE_CHECK(call)                                               \
+  do {                                                                       \
+    gpuSparseStatus_t status = (call);                                       \
+    if (status != GPU_SPARSE_STATUS_SUCCESS) {                               \
+      fprintf(stderr, "GPU Sparse Error at %s:%d: %s\n", __FILE__,          \
+              __LINE__, gpuSparseGetErrorString(status));                    \
+      exit(EXIT_FAILURE);                                                    \
+    }                                                                        \
+  } while (0)
+
+#endif  // CUPDLP_GPU or HIPDLP_GPU
diff --git a/highs/pdlp/hipdlp/pdhg.cc b/highs/pdlp/hipdlp/pdhg.cc
index 3e08c8d752d..9546f39ecaf 100644
--- a/highs/pdlp/hipdlp/pdhg.cc
+++ b/highs/pdlp/hipdlp/pdhg.cc
@@ -15,10 +15,8 @@
 #include <random>
 #include <tuple>
 
-#ifdef CUPDLP_GPU
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include <cusparse.h>
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
+#include "gpu_backend.hpp"
 #endif
 
 #include "defs.hpp"
@@ -497,7 +495,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
 #endif
 
   // 1. Initialization and setup
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   setupGpu();
 #endif
   initializeStepSizes();
@@ -521,27 +519,27 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
     halpern_iteration_ = 0;
   }
 
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   if (!params_.use_halpern_restart) {
-    CUDA_CHECK(cudaMemset(d_x_sum_, 0, lp_.num_col_ * sizeof(double)));
-    CUDA_CHECK(cudaMemset(d_y_sum_, 0, lp_.num_row_ * sizeof(double)));
-    CUDA_CHECK(cudaMemset(d_x_avg_, 0, lp_.num_col_ * sizeof(double)));
-    CUDA_CHECK(cudaMemset(d_y_avg_, 0, lp_.num_row_ * sizeof(double)));
+    GPU_CHECK(gpuMemset(d_x_sum_, 0, lp_.num_col_ * sizeof(double)));
+    GPU_CHECK(gpuMemset(d_y_sum_, 0, lp_.num_row_ * sizeof(double)));
+    GPU_CHECK(gpuMemset(d_x_avg_, 0, lp_.num_col_ * sizeof(double)));
+    GPU_CHECK(gpuMemset(d_y_avg_, 0, lp_.num_row_ * sizeof(double)));
   }
   sum_weights_gpu_ = 0.0;
 
-  CUDA_CHECK(cudaMemcpy(d_x_current_, x_current_.data(),
-                        lp_.num_col_ * sizeof(double), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_y_current_, y_current_.data(),
-                        lp_.num_row_ * sizeof(double), cudaMemcpyHostToDevice));
+  GPU_CHECK(gpuMemcpy(d_x_current_, x_current_.data(),
+                        lp_.num_col_ * sizeof(double), gpuMemcpyHostToDevice));
+  GPU_CHECK(gpuMemcpy(d_y_current_, y_current_.data(),
+                        lp_.num_row_ * sizeof(double), gpuMemcpyHostToDevice));
 
   if (params_.use_halpern_restart) {
-    CUDA_CHECK(cudaMemcpy(d_x_anchor_, d_x_current_,
+    GPU_CHECK(gpuMemcpy(d_x_anchor_, d_x_current_,
                           lp_.num_col_ * sizeof(double),
-                          cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaMemcpy(d_y_anchor_, d_y_current_,
+                          gpuMemcpyDeviceToDevice));
+    GPU_CHECK(gpuMemcpy(d_y_anchor_, d_y_current_,
                           lp_.num_row_ * sizeof(double),
-                          cudaMemcpyDeviceToDevice));
+                          gpuMemcpyDeviceToDevice));
   }
 
   linalgGpuAx(d_x_current_, d_ax_current_);
@@ -567,9 +565,9 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
     return;
   }
 
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   bool graph_created = false;
-  cudaGraphExec_t graphExec = nullptr;
+  gpuGraphExec_t graphExec = nullptr;
 #endif
 
   // 2. Main cuPDLPx-style Loop
@@ -582,15 +580,11 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
     }
 
     // -- Step 1 (Major, isolated for restart-FPE check) --
-#ifdef CUPDLP_GPU
-    CUDA_CHECK(cudaMemcpyAsync(d_primal_step_size_, &stepsize_.primal_step,
-                               sizeof(double), cudaMemcpyHostToDevice,
-                               gpu_stream_));
-    CUDA_CHECK(cudaMemcpyAsync(d_dual_step_size_, &stepsize_.dual_step,
-                               sizeof(double), cudaMemcpyHostToDevice,
-                               gpu_stream_));
-    CUDA_CHECK(cudaMemcpyAsync(d_halpern_iteration_, &halpern_iteration_,
-                               sizeof(int), cudaMemcpyHostToDevice,
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
+    GpuStepParams h_step_params{stepsize_.primal_step, stepsize_.dual_step,
+                                halpern_iteration_};
+    GPU_CHECK(gpuMemcpyAsync(d_step_params_, &h_step_params,
+                               sizeof(GpuStepParams), gpuMemcpyHostToDevice,
                                gpu_stream_));
     performHalpernPdhgStepGpu(true, 1);
 #else
@@ -598,7 +592,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
 #endif
 
     if (do_restart) {
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
       fpe_ = computeFixedPointErrorGpu();
 #else
       fpe_ = computeFixedPointError();
@@ -607,24 +601,24 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
       do_restart = false;
     }
     // -- Steps 2 to PDHG_CHECK_INTERVAL - 1 (Minor) --
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
     if (!graph_created) {
-      CUDA_CHECK(
-          cudaStreamBeginCapture(gpu_stream_, cudaStreamCaptureModeGlobal));
+      GPU_CHECK(
+          gpuStreamBeginCapture(gpu_stream_, gpuStreamCaptureModeGlobal));
 
       for (int i = 2; i <= PDHG_CHECK_INTERVAL - 1; i++) {
         performHalpernPdhgStepGpu(false, i);
       }
       performHalpernPdhgStepGpu(true, PDHG_CHECK_INTERVAL);
 
-      cudaGraph_t graph;
-      CUDA_CHECK(cudaStreamEndCapture(gpu_stream_, &graph));
-      CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
-      CUDA_CHECK(cudaGraphDestroy(graph));
+      gpuGraph_t graph;
+      GPU_CHECK(gpuStreamEndCapture(gpu_stream_, &graph));
+      GPU_CHECK(gpuGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
+      GPU_CHECK(gpuGraphDestroy(graph));
       graph_created = true;
     }
 
-    CUDA_CHECK(cudaGraphLaunch(graphExec, gpu_stream_));
+    GPU_CHECK(gpuGraphLaunch(graphExec, gpu_stream_));
 #else
     for (int i = 2; i <= PDHG_CHECK_INTERVAL - 1; i++) {
       performHalpernPdhgStep(false, i);
@@ -633,7 +627,7 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
 #endif
 
     // Compute Error for Restart Check
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
     fpe_ = computeFixedPointErrorGpu();
 #else
     fpe_ = computeFixedPointError();
@@ -664,19 +658,19 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
       if (params_.step_size_strategy == StepSizeStrategy::PID) {
         updatePrimalWeightAtRestart(results_);
       }
-#ifdef CUPDLP_GPU
-      CUDA_CHECK(cudaMemcpy(d_x_anchor_, d_pdhg_primal_,
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
+      GPU_CHECK(gpuMemcpy(d_x_anchor_, d_pdhg_primal_,
                             lp_.num_col_ * sizeof(double),
-                            cudaMemcpyDeviceToDevice));
-      CUDA_CHECK(cudaMemcpy(d_y_anchor_, d_pdhg_dual_,
+                            gpuMemcpyDeviceToDevice));
+      GPU_CHECK(gpuMemcpy(d_y_anchor_, d_pdhg_dual_,
                             lp_.num_row_ * sizeof(double),
-                            cudaMemcpyDeviceToDevice));
-      CUDA_CHECK(cudaMemcpy(d_x_current_, d_pdhg_primal_,
+                            gpuMemcpyDeviceToDevice));
+      GPU_CHECK(gpuMemcpy(d_x_current_, d_pdhg_primal_,
                             lp_.num_col_ * sizeof(double),
-                            cudaMemcpyDeviceToDevice));
-      CUDA_CHECK(cudaMemcpy(d_y_current_, d_pdhg_dual_,
+                            gpuMemcpyDeviceToDevice));
+      GPU_CHECK(gpuMemcpy(d_y_current_, d_pdhg_dual_,
                             lp_.num_row_ * sizeof(double),
-                            cudaMemcpyDeviceToDevice));
+                            gpuMemcpyDeviceToDevice));
 #else
       x_anchor_ = x_next_;
       y_anchor_ = y_next_;
@@ -691,9 +685,9 @@ void PDLPSolver::solve(std::vector<double>& x, std::vector<double>& y) {
     }
   }
 
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   if (graphExec) {
-    CUDA_CHECK(cudaGraphExecDestroy(graphExec));
+    GPU_CHECK(gpuGraphExecDestroy(graphExec));
   }
 #endif
 
@@ -738,23 +732,23 @@ double PDLPSolver::computeFixedPointError() {
   return std::sqrt(std::max(0.0, movement + interaction));
 }
 
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
 double PDLPSolver::computeFixedPointErrorGpu() {
   double alpha_minus_one = -1.0;
 
   // 1. delta_x = x_next_ - reflected_x_
   // (Assuming d_pdhg_primal_ maps to x_next_ and d_x_next_ is used as
   // reflected_x_ in your minor/major steps)
-  CUDA_CHECK(cudaMemcpy(d_delta_x_, d_pdhg_primal_,
+  GPU_CHECK(gpuMemcpy(d_delta_x_, d_pdhg_primal_,
                         a_num_cols_ * sizeof(double),
-                        cudaMemcpyDeviceToDevice));
-  CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_cols_, &alpha_minus_one,
+                        gpuMemcpyDeviceToDevice));
+  GPU_BLAS_CHECK(gpuBlasDaxpy(cublas_handle_, a_num_cols_, &alpha_minus_one,
                            d_x_next_, 1, d_delta_x_, 1));
 
   // 2. delta_y = y_next_ - reflected_y_
-  CUDA_CHECK(cudaMemcpy(d_delta_y_, d_pdhg_dual_, a_num_rows_ * sizeof(double),
-                        cudaMemcpyDeviceToDevice));
-  CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_rows_, &alpha_minus_one,
+  GPU_CHECK(gpuMemcpy(d_delta_y_, d_pdhg_dual_, a_num_rows_ * sizeof(double),
+                        gpuMemcpyDeviceToDevice));
+  GPU_BLAS_CHECK(gpuBlasDaxpy(cublas_handle_, a_num_rows_, &alpha_minus_one,
                            d_y_next_, 1, d_delta_y_, 1));
 
   // 3. AT_delta_y = A^T * delta_y
@@ -763,11 +757,11 @@ double PDLPSolver::computeFixedPointErrorGpu() {
   // 4. Compute norms and cross term
   double primal_norm = 0.0, dual_norm = 0.0, cross_term = 0.0;
 
-  CUBLAS_CHECK(
-      cublasDnrm2(cublas_handle_, a_num_cols_, d_delta_x_, 1, &primal_norm));
-  CUBLAS_CHECK(
-      cublasDnrm2(cublas_handle_, a_num_rows_, d_delta_y_, 1, &dual_norm));
-  CUBLAS_CHECK(cublasDdot(cublas_handle_, a_num_cols_, d_delta_x_, 1,
+  GPU_BLAS_CHECK(
+      gpuBlasDnrm2(cublas_handle_, a_num_cols_, d_delta_x_, 1, &primal_norm));
+  GPU_BLAS_CHECK(
+      gpuBlasDnrm2(cublas_handle_, a_num_rows_, d_delta_y_, 1, &dual_norm));
+  GPU_BLAS_CHECK(gpuBlasDdot(cublas_handle_, a_num_cols_, d_delta_x_, 1,
                           d_AT_delta_y_, 1, &cross_term));
 
   double primal_norm_sq = primal_norm * primal_norm;
@@ -785,7 +779,7 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector<double>& output_x,
                                      std::vector<double>& output_y,
                                      TerminationStatus& status) {
   // 1. Compute Average Iterate (GPU or CPU)
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   computeAverageIterateGpu();
 #else
   computeAverageIterate(Ax_avg_, ATy_avg_);
@@ -797,7 +791,7 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector<double>& output_x,
   bool average_converged = false;
 
   // 2. Check Convergence (GPU or CPU)
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   // Only use Halpern PDHG slack/iterates if we have performed at least one
   // iteration
   if (params_.use_halpern_restart && d_pdhg_primal_ != nullptr && iter > 0) {
@@ -843,7 +837,7 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector<double>& output_x,
     bool prefer_avg = average_converged;
 
     // Copy result to output vectors
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
     double* src_x = prefer_avg ? d_x_avg_
                                : (params_.use_halpern_restart ? d_pdhg_primal_
                                                               : d_x_current_);
@@ -853,16 +847,16 @@ bool PDLPSolver::runConvergenceCheck(size_t iter, std::vector<double>& output_x,
     double* src_sp = prefer_avg ? d_dSlackPosAvg_ : d_dSlackPos_;
     double* src_sn = prefer_avg ? d_dSlackNegAvg_ : d_dSlackNeg_;
 
-    CUDA_CHECK(cudaMemcpy(output_x.data(), src_x, lp_.num_col_ * sizeof(double),
-                          cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(output_y.data(), src_y, lp_.num_row_ * sizeof(double),
-                          cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(dSlackPos_.data(), src_sp,
+    GPU_CHECK(gpuMemcpy(output_x.data(), src_x, lp_.num_col_ * sizeof(double),
+                          gpuMemcpyDeviceToHost));
+    GPU_CHECK(gpuMemcpy(output_y.data(), src_y, lp_.num_row_ * sizeof(double),
+                          gpuMemcpyDeviceToHost));
+    GPU_CHECK(gpuMemcpy(dSlackPos_.data(), src_sp,
                           lp_.num_col_ * sizeof(double),
-                          cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(dSlackNeg_.data(), src_sn,
+                          gpuMemcpyDeviceToHost));
+    GPU_CHECK(gpuMemcpy(dSlackNeg_.data(), src_sn,
                           lp_.num_col_ * sizeof(double),
-                          cudaMemcpyDeviceToHost));
+                          gpuMemcpyDeviceToHost));
 #else
     if (prefer_avg) {
       output_x = x_avg_;
@@ -1028,7 +1022,7 @@ void PDLPSolver::accumulateAverages(size_t iter) {
   HighsInt inner_iter = iter - restart_scheme_.getLastRestartIter();
   double dMeanStepSize = std::sqrt(stepsize_.primal_step * stepsize_.dual_step);
 
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   if (params_.use_halpern_restart) {
     // If Halpern, we average the 'current' blended iterate
     launchKernelUpdateAverages_wrapper(d_x_sum_, d_y_sum_, d_x_current_,
@@ -1061,20 +1055,12 @@ void PDLPSolver::prepareNextIteration() {
     // Just ensure caches are correct if not done in PerformHalpernStep
     return;
   } else {
-    // Standard PDHG: x_current becomes x_next
-#ifdef CUPDLP_GPU
-    CUDA_CHECK(cudaMemcpy(d_x_current_, d_x_next_,
-                          lp_.num_col_ * sizeof(double),
-                          cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaMemcpy(d_y_current_, d_y_next_,
-                          lp_.num_row_ * sizeof(double),
-                          cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaMemcpy(d_ax_current_, d_ax_next_,
-                          lp_.num_row_ * sizeof(double),
-                          cudaMemcpyDeviceToDevice));
-    CUDA_CHECK(cudaMemcpy(d_aty_current_, d_aty_next_,
-                          lp_.num_col_ * sizeof(double),
-                          cudaMemcpyDeviceToDevice));
+    // Standard PDHG: swap device pointers instead of copying device memory.
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
+    std::swap(d_x_current_,   d_x_next_);
+    std::swap(d_y_current_,   d_y_next_);
+    std::swap(d_ax_current_,  d_ax_next_);
+    std::swap(d_aty_current_, d_aty_next_);
 #else
     x_current_ = x_next_;
     y_current_ = y_next_;
@@ -1083,7 +1069,7 @@ void PDLPSolver::prepareNextIteration() {
 #endif
   }
 }
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
 void PDLPSolver::performHalpernPdhgStepGpu(bool is_major, int k_offset) {
   linalgGpuATy(d_y_current_, d_aty_current_);
 
@@ -1126,7 +1112,7 @@ void PDLPSolver::performHalpernPdhgStepGpu(bool is_major, int k_offset) {
 #endif
 
 void PDLPSolver::solveReturn(const TerminationStatus term_code) {
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   cleanupGpu();
 #endif
   results_.term_code = term_code;
@@ -1669,7 +1655,7 @@ double PDLPSolver::powerMethod() {
   return op_norm_sq;
 }
 
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
 double PDLPSolver::powerMethodGpu() {
   if (a_num_rows_ == 0 || a_num_cols_ == 0) return 1.0;
 
@@ -1686,9 +1672,9 @@ double PDLPSolver::powerMethodGpu() {
   size_t buffer_size_at = 0;
   size_t buffer_size_a = 0;
 
-  CUDA_CHECK(cudaMalloc(&d_eigenvector, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_next_eigenvector, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_dual_product, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_eigenvector, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_next_eigenvector, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_dual_product, a_num_cols_ * sizeof(double)));
 
   std::vector<double> eigenvector_h(a_num_rows_);
   std::mt19937 engine_fixed_seed(12345);
@@ -1697,84 +1683,84 @@ double PDLPSolver::powerMethodGpu() {
     eigenvector_h[i] = distribution(engine_fixed_seed);
   }
 
-  CUDA_CHECK(cudaMemcpy(d_eigenvector, eigenvector_h.data(),
-                        a_num_rows_ * sizeof(double), cudaMemcpyHostToDevice));
+  GPU_CHECK(gpuMemcpy(d_eigenvector, eigenvector_h.data(),
+                      a_num_rows_ * sizeof(double), gpuMemcpyHostToDevice));
 
-  cusparseDnVecDescr_t vecEigen = nullptr;
-  cusparseDnVecDescr_t vecNextEigen = nullptr;
-  cusparseDnVecDescr_t vecDual = nullptr;
-  CUSPARSE_CHECK(
-      cusparseCreateDnVec(&vecEigen, a_num_rows_, d_eigenvector, CUDA_R_64F));
-  CUSPARSE_CHECK(cusparseCreateDnVec(&vecNextEigen, a_num_rows_,
-                                     d_next_eigenvector, CUDA_R_64F));
-  CUSPARSE_CHECK(
-      cusparseCreateDnVec(&vecDual, a_num_cols_, d_dual_product, CUDA_R_64F));
+  gpuDnVecDescr_t vecEigen = nullptr;
+  gpuDnVecDescr_t vecNextEigen = nullptr;
+  gpuDnVecDescr_t vecDual = nullptr;
+  GPU_SPARSE_CHECK(
+      gpuSparseCreateDnVec(&vecEigen, a_num_rows_, d_eigenvector, GPU_R_64F));
+  GPU_SPARSE_CHECK(gpuSparseCreateDnVec(&vecNextEigen, a_num_rows_,
+                                        d_next_eigenvector, GPU_R_64F));
+  GPU_SPARSE_CHECK(
+      gpuSparseCreateDnVec(&vecDual, a_num_cols_, d_dual_product, GPU_R_64F));
 
-  CUSPARSE_CHECK(cusparseSpMV_bufferSize(
-      cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, mat_a_T_csr_,
-      vecEigen, &zero, vecDual, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2,
+  GPU_SPARSE_CHECK(gpuSparseSpMV_bufferSize(
+      cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &one, mat_a_T_csr_,
+      vecEigen, &zero, vecDual, GPU_R_64F, GPU_SPMV_CSR_ALG2,
       &buffer_size_at));
-  CUSPARSE_CHECK(cusparseSpMV_bufferSize(
-      cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, mat_a_csr_,
-      vecDual, &zero, vecNextEigen, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2,
+  GPU_SPARSE_CHECK(gpuSparseSpMV_bufferSize(
+      cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &one, mat_a_csr_,
+      vecDual, &zero, vecNextEigen, GPU_R_64F, GPU_SPMV_CSR_ALG2,
       &buffer_size_a));
 
-  CUDA_CHECK(cudaMalloc(&d_buffer_at, buffer_size_at));
-  CUDA_CHECK(cudaMalloc(&d_buffer_a, buffer_size_a));
+  GPU_CHECK(gpuMalloc(&d_buffer_at, buffer_size_at));
+  GPU_CHECK(gpuMalloc(&d_buffer_a, buffer_size_a));
 
   double sigma_max_sq = 1.0;
 
   for (int iter = 0; iter < max_iter; ++iter) {
-    CUDA_CHECK(cudaMemcpy(d_next_eigenvector, d_eigenvector,
-                          a_num_rows_ * sizeof(double),
-                          cudaMemcpyDeviceToDevice));
+    GPU_CHECK(gpuMemcpy(d_next_eigenvector, d_eigenvector,
+                        a_num_rows_ * sizeof(double),
+                        gpuMemcpyDeviceToDevice));
 
     double eigenvector_norm = 0.0;
-    CUBLAS_CHECK(cublasDnrm2(cublas_handle_, a_num_rows_, d_next_eigenvector, 1,
-                             &eigenvector_norm));
+    GPU_BLAS_CHECK(gpuBlasDnrm2(cublas_handle_, a_num_rows_, d_next_eigenvector,
+                                1, &eigenvector_norm));
     if (!(eigenvector_norm > 0.0) || !std::isfinite(eigenvector_norm)) break;
 
     double inv_eigenvector_norm = 1.0 / eigenvector_norm;
-    CUBLAS_CHECK(cublasDscal(cublas_handle_, a_num_rows_, &inv_eigenvector_norm,
-                             d_next_eigenvector, 1));
+    GPU_BLAS_CHECK(gpuBlasDscal(cublas_handle_, a_num_rows_,
+                                &inv_eigenvector_norm, d_next_eigenvector, 1));
 
-    CUSPARSE_CHECK(cusparseDnVecSetValues(vecNextEigen, d_next_eigenvector));
-    CUSPARSE_CHECK(cusparseDnVecSetValues(vecDual, d_dual_product));
+    GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vecNextEigen, d_next_eigenvector));
+    GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vecDual, d_dual_product));
 
-    CUSPARSE_CHECK(
-        cusparseSpMV(cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &one,
-                     mat_a_T_csr_, vecNextEigen, &zero, vecDual, CUDA_R_64F,
-                     CUSPARSE_SPMV_CSR_ALG2, d_buffer_at));
+    GPU_SPARSE_CHECK(
+        gpuSparseSpMV(cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &one,
+                      mat_a_T_csr_, vecNextEigen, &zero, vecDual, GPU_R_64F,
+                      GPU_SPMV_CSR_ALG2, d_buffer_at));
 
-    CUSPARSE_CHECK(cusparseDnVecSetValues(vecEigen, d_eigenvector));
-    CUSPARSE_CHECK(
-        cusparseSpMV(cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &one,
-                     mat_a_csr_, vecDual, &zero, vecEigen, CUDA_R_64F,
-                     CUSPARSE_SPMV_CSR_ALG2, d_buffer_a));
+    GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vecEigen, d_eigenvector));
+    GPU_SPARSE_CHECK(
+        gpuSparseSpMV(cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &one,
+                      mat_a_csr_, vecDual, &zero, vecEigen, GPU_R_64F,
+                      GPU_SPMV_CSR_ALG2, d_buffer_a));
 
-    CUBLAS_CHECK(cublasDdot(cublas_handle_, a_num_rows_, d_next_eigenvector, 1,
-                            d_eigenvector, 1, &sigma_max_sq));
+    GPU_BLAS_CHECK(gpuBlasDdot(cublas_handle_, a_num_rows_, d_next_eigenvector,
+                               1, d_eigenvector, 1, &sigma_max_sq));
 
     double neg_sigma_sq = -sigma_max_sq;
-    CUBLAS_CHECK(cublasDscal(cublas_handle_, a_num_rows_, &neg_sigma_sq,
-                             d_next_eigenvector, 1));
-    CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_rows_, &one, d_eigenvector,
-                             1, d_next_eigenvector, 1));
+    GPU_BLAS_CHECK(gpuBlasDscal(cublas_handle_, a_num_rows_, &neg_sigma_sq,
+                                d_next_eigenvector, 1));
+    GPU_BLAS_CHECK(gpuBlasDaxpy(cublas_handle_, a_num_rows_, &one,
+                                d_eigenvector, 1, d_next_eigenvector, 1));
 
     double residual_norm = 0.0;
-    CUBLAS_CHECK(cublasDnrm2(cublas_handle_, a_num_rows_, d_next_eigenvector, 1,
-                             &residual_norm));
+    GPU_BLAS_CHECK(gpuBlasDnrm2(cublas_handle_, a_num_rows_, d_next_eigenvector,
+                                1, &residual_norm));
     if (residual_norm < tolerance) break;
   }
 
-  CUDA_CHECK(cudaFree(d_buffer_at));
-  CUDA_CHECK(cudaFree(d_buffer_a));
-  CUSPARSE_CHECK(cusparseDestroyDnVec(vecEigen));
-  CUSPARSE_CHECK(cusparseDestroyDnVec(vecNextEigen));
-  CUSPARSE_CHECK(cusparseDestroyDnVec(vecDual));
-  CUDA_CHECK(cudaFree(d_eigenvector));
-  CUDA_CHECK(cudaFree(d_next_eigenvector));
-  CUDA_CHECK(cudaFree(d_dual_product));
+  GPU_CHECK(gpuFree(d_buffer_at));
+  GPU_CHECK(gpuFree(d_buffer_a));
+  GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vecEigen));
+  GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vecNextEigen));
+  GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vecDual));
+  GPU_CHECK(gpuFree(d_eigenvector));
+  GPU_CHECK(gpuFree(d_next_eigenvector));
+  GPU_CHECK(gpuFree(d_dual_product));
 
   return sigma_max_sq;
 }
@@ -1786,22 +1772,22 @@ void PDLPSolver::setup(const HighsOptions& options, HighsTimer& timer) {
   highs_timer_p_ = &timer;
   highsLogUser(options.log_options, HighsLogType::kInfo,
                "Using HiPDLP first order PDLP solver on a %s\n",
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
                "GPU"
 #else
                "CPU: performance may be disappointing!"
 #endif
   );
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   HighsInt n_devices = 0;
-  cudaGetDeviceCount(&n_devices);
+  gpuGetDeviceCount(&n_devices);
   if (n_devices != 1)
     highsLogUser(
         options.log_options, HighsLogType::kInfo,
         "Number of CUDA-enabled devices is %d: device 0 will be used\n",
         n_devices);
-  cudaDeviceProp prop;
-  cudaGetDeviceProperties(&prop, 0);
+  gpuDeviceProp_t prop;
+  gpuGetDeviceProperties(&prop, 0);
   highsLogUser(options.log_options, HighsLogType::kInfo, "Cuda device: %s\n",
                prop.name);
   highsLogUser(options.log_options, HighsLogType::kInfo,
@@ -1954,7 +1940,7 @@ void PDLPSolver::initializeStepSizes() {
     params_.step_size_strategy = StepSizeStrategy::FIXED;
   }
 
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   double op_norm_sq = powerMethodGpu();
 #else
   double op_norm_sq = powerMethod();
@@ -1981,7 +1967,7 @@ void PDLPSolver::updatePrimalWeightAtRestart(const SolverResults& results) {
   double primal_dist = 0.0;
   double dual_dist = 0.0;
 
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   primal_dist = computeDiffNormCuBLAS(d_pdhg_primal_, d_x_anchor_, a_num_cols_);
   dual_dist = computeDiffNormCuBLAS(d_pdhg_dual_, d_y_anchor_, a_num_rows_);
 #else
@@ -2110,20 +2096,20 @@ void PDLPSolver::updateIteratesFixed() {
   hipdlpTimerStop(kHipdlpClockMatrixTransposeMultiply);
 #endif
 
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   // Add this check before the memcpy
   if (d_x_next_ == nullptr) {
     std::cerr << "Error1: d_x_next_ is null!" << std::endl;
     return;
   }
   launchKernelUpdateX(stepsize_.primal_step);
-  CUDA_CHECK(cudaDeviceSynchronize());
+  GPU_CHECK(gpuDeviceSynchronize());
   linalgGpuAx(d_x_next_, d_ax_next_);
-  CUDA_CHECK(cudaDeviceSynchronize());
+  GPU_CHECK(gpuDeviceSynchronize());
   launchKernelUpdateY(stepsize_.dual_step);
-  CUDA_CHECK(cudaDeviceSynchronize());
+  GPU_CHECK(gpuDeviceSynchronize());
   linalgGpuATy(d_y_next_, d_aty_next_);
-  CUDA_CHECK(cudaDeviceSynchronize());
+  GPU_CHECK(gpuDeviceSynchronize());
 
   // Add this check before the memcpy
   if (d_x_next_ == nullptr) {
@@ -2172,7 +2158,7 @@ void PDLPSolver::updateIteratesAdaptive() {
     double primal_step_update = dStepSizeUpdate / std::sqrt(stepsize_.beta);
     double dual_step_update = dStepSizeUpdate * std::sqrt(stepsize_.beta);
 
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
     launchKernelUpdateX_wrapper(d_x_next_,       // Output (Trial)
                                 d_x_current_,    // Input (Base)
                                 d_aty_current_,  // Input (Base aTy)
@@ -2391,15 +2377,15 @@ void PDLPSolver::closeDebugLog() {
 // =============================================================================
 //  SECTION 5: GPU Part
 // =============================================================================
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
 void PDLPSolver::setupGpu() {
-  CUDA_CHECK(cudaStreamCreate(&gpu_stream_));
+  GPU_CHECK(gpuStreamCreate(&gpu_stream_));
 
   // 1. initialize cuSPARSE
-  CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_));
-  CUBLAS_CHECK(cublasCreate(&cublas_handle_));
-  CUSPARSE_CHECK(cusparseSetStream(cusparse_handle_, gpu_stream_));
-  CUBLAS_CHECK(cublasSetStream(cublas_handle_, gpu_stream_));
+  GPU_SPARSE_CHECK(gpuSparseCreate(&cusparse_handle_));
+  GPU_BLAS_CHECK(gpuBlasCreate(&cublas_handle_));
+  GPU_SPARSE_CHECK(gpuSparseSetStream(cusparse_handle_, gpu_stream_));
+  GPU_BLAS_CHECK(gpuBlasSetStream(cublas_handle_, gpu_stream_));
   // 2. Get matrix data from lp_ (CSC)
   a_num_rows_ = lp_.num_row_;
   a_num_cols_ = lp_.num_col_;
@@ -2413,197 +2399,198 @@ void PDLPSolver::setupGpu() {
   const std::vector<double>& h_a_val = lp_csr.value_;
 
   // 3. Allocate and copy A's CSR data to GPU
-  CUDA_CHECK(
-      cudaMalloc((void**)&d_a_row_ptr_, (a_num_rows_ + 1) * sizeof(HighsInt)));
-  CUDA_CHECK(cudaMalloc((void**)&d_a_col_ind_, a_nnz_ * sizeof(HighsInt)));
-  CUDA_CHECK(cudaMalloc((void**)&d_a_val_, a_nnz_ * sizeof(double)));
-  CUDA_CHECK(cudaMemcpy(d_a_row_ptr_, h_a_row_ptr.data(),
+  GPU_CHECK(
+      gpuMalloc((void**)&d_a_row_ptr_, (a_num_rows_ + 1) * sizeof(HighsInt)));
+  GPU_CHECK(gpuMalloc((void**)&d_a_col_ind_, a_nnz_ * sizeof(HighsInt)));
+  GPU_CHECK(gpuMalloc((void**)&d_a_val_, a_nnz_ * sizeof(double)));
+  GPU_CHECK(gpuMemcpy(d_a_row_ptr_, h_a_row_ptr.data(),
                         (a_num_rows_ + 1) * sizeof(HighsInt),
-                        cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_a_col_ind_, h_a_col_ind.data(),
-                        a_nnz_ * sizeof(HighsInt), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_a_val_, h_a_val.data(), a_nnz_ * sizeof(double),
-                        cudaMemcpyHostToDevice));
+                        gpuMemcpyHostToDevice));
+  GPU_CHECK(gpuMemcpy(d_a_col_ind_, h_a_col_ind.data(),
+                        a_nnz_ * sizeof(HighsInt), gpuMemcpyHostToDevice));
+  GPU_CHECK(gpuMemcpy(d_a_val_, h_a_val.data(), a_nnz_ * sizeof(double),
+                        gpuMemcpyHostToDevice));
 
-  CUSPARSE_CHECK(cusparseCreateCsr(&mat_a_csr_, a_num_rows_, a_num_cols_,
+  GPU_SPARSE_CHECK(gpuSparseCreateCsr(&mat_a_csr_, a_num_rows_, a_num_cols_,
                                    a_nnz_, d_a_row_ptr_, d_a_col_ind_, d_a_val_,
-                                   CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-                                   CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
+                                   GPU_INDEX_32I, GPU_INDEX_32I,
+                                   GPU_INDEX_BASE_ZERO, GPU_R_64F));
 
   // 4. Create matrix AT in CSR format = A in CSC
   const std::vector<HighsInt>& h_at_row_ptr = lp_.a_matrix_.start_;
   const std::vector<HighsInt>& h_at_col_ind = lp_.a_matrix_.index_;
   const std::vector<double>& h_at_val = lp_.a_matrix_.value_;
 
-  CUDA_CHECK(cudaMalloc((void**)&d_at_row_ptr_,
+  GPU_CHECK(gpuMalloc((void**)&d_at_row_ptr_,
                         (a_num_cols_ + 1) * sizeof(HighsInt)));  // Fixed!
-  CUDA_CHECK(cudaMalloc((void**)&d_at_col_ind_, a_nnz_ * sizeof(HighsInt)));
-  CUDA_CHECK(cudaMalloc((void**)&d_at_val_, a_nnz_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc((void**)&d_at_col_ind_, a_nnz_ * sizeof(HighsInt)));
+  GPU_CHECK(gpuMalloc((void**)&d_at_val_, a_nnz_ * sizeof(double)));
 
-  CUDA_CHECK(cudaMemcpy(d_at_row_ptr_, h_at_row_ptr.data(),
+  GPU_CHECK(gpuMemcpy(d_at_row_ptr_, h_at_row_ptr.data(),
                         (a_num_cols_ + 1) * sizeof(HighsInt),
-                        cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_at_col_ind_, h_at_col_ind.data(),
-                        a_nnz_ * sizeof(HighsInt), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_at_val_, h_at_val.data(), a_nnz_ * sizeof(double),
-                        cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMalloc(&d_halpern_iteration_, sizeof(int)));
-  CUDA_CHECK(cudaMemset(d_halpern_iteration_, 0, sizeof(int)));
-  CUDA_CHECK(cudaMalloc(&d_primal_step_size_, sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_dual_step_size_, sizeof(double)));
+                        gpuMemcpyHostToDevice));
+  GPU_CHECK(gpuMemcpy(d_at_col_ind_, h_at_col_ind.data(),
+                        a_nnz_ * sizeof(HighsInt), gpuMemcpyHostToDevice));
+  GPU_CHECK(gpuMemcpy(d_at_val_, h_at_val.data(), a_nnz_ * sizeof(double),
+                        gpuMemcpyHostToDevice));
+  GPU_CHECK(gpuMalloc(&d_step_params_, sizeof(GpuStepParams)));
+  GPU_CHECK(gpuMemset(d_step_params_, 0, sizeof(GpuStepParams)));
+  d_primal_step_size_  = &d_step_params_->primal_step;
+  d_dual_step_size_    = &d_step_params_->dual_step;
+  d_halpern_iteration_ = &d_step_params_->halpern_iteration;
 
   // Create AT descriptor with swapped dimensions
-  CUSPARSE_CHECK(cusparseCreateCsr(
+  GPU_SPARSE_CHECK(gpuSparseCreateCsr(
       &mat_a_T_csr_, a_num_cols_, a_num_rows_, a_nnz_,  // Dimensions swapped!
-      d_at_row_ptr_, d_at_col_ind_, d_at_val_, CUSPARSE_INDEX_32I,
-      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
-
-  CUDA_CHECK(cudaMalloc(&d_col_cost_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_col_lower_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_col_upper_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_row_lower_, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_is_equality_row_, a_num_rows_ * sizeof(bool)));
-  CUDA_CHECK(cudaMalloc(&d_x_current_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_y_current_, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_x_avg_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_y_avg_, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_x_next_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_y_next_, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_x_at_last_restart_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_y_at_last_restart_, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_x_anchor_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_y_anchor_, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(
-      cudaMalloc(&d_x_temp_diff_norm_result_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(
-      cudaMalloc(&d_y_temp_diff_norm_result_, a_num_rows_ * sizeof(double)));
+      d_at_row_ptr_, d_at_col_ind_, d_at_val_, GPU_INDEX_32I,
+      GPU_INDEX_32I, GPU_INDEX_BASE_ZERO, GPU_R_64F));
+
+  GPU_CHECK(gpuMalloc(&d_col_cost_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_col_lower_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_col_upper_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_row_lower_, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_is_equality_row_, a_num_rows_ * sizeof(bool)));
+  GPU_CHECK(gpuMalloc(&d_x_current_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_y_current_, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_x_avg_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_y_avg_, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_x_next_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_y_next_, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_x_at_last_restart_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_y_at_last_restart_, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_x_anchor_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_y_anchor_, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(
+      gpuMalloc(&d_x_temp_diff_norm_result_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(
+      gpuMalloc(&d_y_temp_diff_norm_result_, a_num_rows_ * sizeof(double)));
   if (params_.use_halpern_restart) {
-    CUDA_CHECK(cudaMalloc(&d_pdhg_primal_, a_num_cols_ * sizeof(double)));
-    CUDA_CHECK(cudaMalloc(&d_pdhg_dual_, a_num_rows_ * sizeof(double)));
-    CUDA_CHECK(cudaMemset(d_pdhg_primal_, 0, a_num_cols_ * sizeof(double)));
-    CUDA_CHECK(cudaMemset(d_pdhg_dual_, 0, a_num_rows_ * sizeof(double)));
-  }
-  CUDA_CHECK(cudaMalloc(&d_delta_x_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_delta_y_, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_AT_delta_y_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_ax_current_, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_aty_current_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_ax_next_, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_aty_next_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_ax_avg_, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_aty_avg_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_x_sum_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_y_sum_, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_convergence_results_, 4 * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_dSlackPos_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_dSlackNeg_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_dSlackPosAvg_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_dSlackNegAvg_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_col_scale_, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_row_scale_, a_num_rows_ * sizeof(double)));
-
-  CUSPARSE_CHECK(
-      cusparseCreateDnVec(&vec_x_desc_, a_num_cols_, d_x_current_, CUDA_R_64F));
-  CUSPARSE_CHECK(
-      cusparseCreateDnVec(&vec_y_desc_, a_num_rows_, d_y_current_, CUDA_R_64F));
-  CUSPARSE_CHECK(cusparseCreateDnVec(&vec_ax_desc_, a_num_rows_, d_ax_current_,
-                                     CUDA_R_64F));
-  CUSPARSE_CHECK(cusparseCreateDnVec(&vec_aty_desc_, a_num_cols_,
-                                     d_aty_current_, CUDA_R_64F));
-
-  CUDA_CHECK(cudaMemcpy(d_col_cost_, lp_.col_cost_.data(),
-                        a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_col_lower_, lp_.col_lower_.data(),
-                        a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_col_upper_, lp_.col_upper_.data(),
-                        a_num_cols_ * sizeof(double), cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMemcpy(d_row_lower_, lp_.row_lower_.data(),
-                        a_num_rows_ * sizeof(double), cudaMemcpyHostToDevice));
+    GPU_CHECK(gpuMalloc(&d_pdhg_primal_, a_num_cols_ * sizeof(double)));
+    GPU_CHECK(gpuMalloc(&d_pdhg_dual_, a_num_rows_ * sizeof(double)));
+    GPU_CHECK(gpuMemset(d_pdhg_primal_, 0, a_num_cols_ * sizeof(double)));
+    GPU_CHECK(gpuMemset(d_pdhg_dual_, 0, a_num_rows_ * sizeof(double)));
+  }
+  GPU_CHECK(gpuMalloc(&d_delta_x_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_delta_y_, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_AT_delta_y_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_ax_current_, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_aty_current_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_ax_next_, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_aty_next_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_ax_avg_, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_aty_avg_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_x_sum_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_y_sum_, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_convergence_results_, 4 * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_dSlackPos_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_dSlackNeg_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_dSlackPosAvg_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_dSlackNegAvg_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_col_scale_, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_row_scale_, a_num_rows_ * sizeof(double)));
+
+  GPU_SPARSE_CHECK(
+      gpuSparseCreateDnVec(&vec_x_desc_, a_num_cols_, d_x_current_, GPU_R_64F));
+  GPU_SPARSE_CHECK(
+      gpuSparseCreateDnVec(&vec_y_desc_, a_num_rows_, d_y_current_, GPU_R_64F));
+  GPU_SPARSE_CHECK(gpuSparseCreateDnVec(&vec_ax_desc_, a_num_rows_, d_ax_current_,
+                                     GPU_R_64F));
+  GPU_SPARSE_CHECK(gpuSparseCreateDnVec(&vec_aty_desc_, a_num_cols_,
+                                     d_aty_current_, GPU_R_64F));
+
+  GPU_CHECK(gpuMemcpy(d_col_cost_, lp_.col_cost_.data(),
+                        a_num_cols_ * sizeof(double), gpuMemcpyHostToDevice));
+  GPU_CHECK(gpuMemcpy(d_col_lower_, lp_.col_lower_.data(),
+                        a_num_cols_ * sizeof(double), gpuMemcpyHostToDevice));
+  GPU_CHECK(gpuMemcpy(d_col_upper_, lp_.col_upper_.data(),
+                        a_num_cols_ * sizeof(double), gpuMemcpyHostToDevice));
+  GPU_CHECK(gpuMemcpy(d_row_lower_, lp_.row_lower_.data(),
+                        a_num_rows_ * sizeof(double), gpuMemcpyHostToDevice));
   std::vector<uint8_t> temp_equality(a_num_rows_);
   for (HighsInt i = 0; i < a_num_rows_; ++i) {
     temp_equality[i] = is_equality_row_[i] ? 1 : 0;
   }
 
   // Copy to device
-  CUDA_CHECK(cudaMemcpy(d_is_equality_row_, temp_equality.data(),
-                        a_num_rows_ * sizeof(uint8_t), cudaMemcpyHostToDevice));
+  GPU_CHECK(gpuMemcpy(d_is_equality_row_, temp_equality.data(),
+                        a_num_rows_ * sizeof(uint8_t), gpuMemcpyHostToDevice));
 
   // 6. Preallocate buffer for cuSPARSE SpMV
   // Buffer for ax
   double alpha = 1.0;
   double beta = 0.0;
-  cusparseDnVecDescr_t vec_x, vec_ax;
-  CUSPARSE_CHECK(
-      cusparseCreateDnVec(&vec_x, a_num_cols_, d_x_current_, CUDA_R_64F));
-  CUSPARSE_CHECK(
-      cusparseCreateDnVec(&vec_ax, a_num_rows_, d_ax_current_, CUDA_R_64F));
-
-  CUSPARSE_CHECK(cusparseSpMV_bufferSize(
-      cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_csr_,
-      vec_x, &beta, vec_ax, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2,
+  gpuDnVecDescr_t vec_x, vec_ax;
+  GPU_SPARSE_CHECK(
+      gpuSparseCreateDnVec(&vec_x, a_num_cols_, d_x_current_, GPU_R_64F));
+  GPU_SPARSE_CHECK(
+      gpuSparseCreateDnVec(&vec_ax, a_num_rows_, d_ax_current_, GPU_R_64F));
+
+  GPU_SPARSE_CHECK(gpuSparseSpMV_bufferSize(
+      cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, mat_a_csr_,
+      vec_x, &beta, vec_ax, GPU_R_64F, GPU_SPMV_ALG_DEFAULT,
       &spmv_buffer_size_ax_));
-  CUDA_CHECK(cudaMalloc(&d_spmv_buffer_ax_, spmv_buffer_size_ax_));
+  GPU_CHECK(gpuMalloc(&d_spmv_buffer_ax_, spmv_buffer_size_ax_));
 
-  CUSPARSE_CHECK(cusparseSpMV_preprocess(
-      cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_csr_,
-      vec_x_desc_, &beta, vec_ax_desc_, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2,
+  GPU_SPARSE_CHECK(gpuSparseSpMV_preprocess(
+      cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, mat_a_csr_,
+      vec_x_desc_, &beta, vec_ax_desc_, GPU_R_64F, GPU_SPMV_ALG_DEFAULT,
       d_spmv_buffer_ax_));
 
-  CUSPARSE_CHECK(cusparseDestroyDnVec(vec_x));
-  CUSPARSE_CHECK(cusparseDestroyDnVec(vec_ax));
+  GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_x));
+  GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_ax));
 
   // Buffer for aTy
-  cusparseDnVecDescr_t vec_y, vec_aty;
-  CUSPARSE_CHECK(
-      cusparseCreateDnVec(&vec_y, a_num_rows_, d_y_current_, CUDA_R_64F));
-  CUSPARSE_CHECK(
-      cusparseCreateDnVec(&vec_aty, a_num_cols_, d_aty_current_, CUDA_R_64F));
-  CUSPARSE_CHECK(cusparseSpMV_bufferSize(
-      cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_T_csr_,
-      vec_y, &beta, vec_aty, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2,
+  gpuDnVecDescr_t vec_y, vec_aty;
+  GPU_SPARSE_CHECK(
+      gpuSparseCreateDnVec(&vec_y, a_num_rows_, d_y_current_, GPU_R_64F));
+  GPU_SPARSE_CHECK(
+      gpuSparseCreateDnVec(&vec_aty, a_num_cols_, d_aty_current_, GPU_R_64F));
+  GPU_SPARSE_CHECK(gpuSparseSpMV_bufferSize(
+      cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, mat_a_T_csr_,
+      vec_y, &beta, vec_aty, GPU_R_64F, GPU_SPMV_ALG_DEFAULT,
       &spmv_buffer_size_aty_));
-  CUDA_CHECK(cudaMalloc(&d_spmv_buffer_aty_, spmv_buffer_size_aty_));
+  GPU_CHECK(gpuMalloc(&d_spmv_buffer_aty_, spmv_buffer_size_aty_));
 
-  CUSPARSE_CHECK(cusparseSpMV_preprocess(
-      cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_T_csr_,
-      vec_y_desc_, &beta, vec_aty_desc_, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2,
+  GPU_SPARSE_CHECK(gpuSparseSpMV_preprocess(
+      cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, mat_a_T_csr_,
+      vec_y_desc_, &beta, vec_aty_desc_, GPU_R_64F, GPU_SPMV_ALG_DEFAULT,
       d_spmv_buffer_aty_));
 
-  CUSPARSE_CHECK(cusparseDestroyDnVec(vec_y));
-  CUSPARSE_CHECK(cusparseDestroyDnVec(vec_aty));
-
-  CUDA_CHECK(cudaMemset(d_x_current_, 0, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_y_current_, 0, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_x_avg_, 0, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_y_avg_, 0, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_x_next_, 0, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_y_next_, 0, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_ax_current_, 0, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_ax_next_, 0, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_ax_avg_, 0, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_aty_avg_, 0, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_x_sum_, 0, a_num_cols_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_y_sum_, 0, a_num_rows_ * sizeof(double)));
-  CUDA_CHECK(cudaMemset(d_aty_current_, 0, a_num_cols_ * sizeof(double)));
+  GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_y));
+  GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_aty));
+
+  GPU_CHECK(gpuMemset(d_x_current_, 0, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMemset(d_y_current_, 0, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMemset(d_x_avg_, 0, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMemset(d_y_avg_, 0, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMemset(d_x_next_, 0, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMemset(d_y_next_, 0, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMemset(d_ax_current_, 0, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMemset(d_ax_next_, 0, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMemset(d_ax_avg_, 0, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMemset(d_aty_avg_, 0, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMemset(d_x_sum_, 0, a_num_cols_ * sizeof(double)));
+  GPU_CHECK(gpuMemset(d_y_sum_, 0, a_num_rows_ * sizeof(double)));
+  GPU_CHECK(gpuMemset(d_aty_current_, 0, a_num_cols_ * sizeof(double)));
   sum_weights_gpu_ = 0.0;
 
   if (scaling_.isScaled()) {
-    CUDA_CHECK(cudaMemcpy(d_col_scale_, scaling_.getColScaling().data(),
+    GPU_CHECK(gpuMemcpy(d_col_scale_, scaling_.getColScaling().data(),
                           a_num_cols_ * sizeof(double),
-                          cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(d_row_scale_, scaling_.getRowScaling().data(),
+                          gpuMemcpyHostToDevice));
+    GPU_CHECK(gpuMemcpy(d_row_scale_, scaling_.getRowScaling().data(),
                           a_num_rows_ * sizeof(double),
-                          cudaMemcpyHostToDevice));
+                          gpuMemcpyHostToDevice));
   } else {
-    cudaFree(d_col_scale_);
+    gpuFree(d_col_scale_);
     d_col_scale_ = nullptr;
-    cudaFree(d_row_scale_);
+    gpuFree(d_row_scale_);
     d_row_scale_ = nullptr;
   }
 
   size_t max_size = std::max(a_num_cols_, a_num_rows_);
-  CUDA_CHECK(cudaMalloc(&d_buffer_, max_size * sizeof(double)));
-  CUDA_CHECK(cudaMalloc(&d_buffer2_, max_size * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_buffer_, max_size * sizeof(double)));
+  GPU_CHECK(gpuMalloc(&d_buffer2_, max_size * sizeof(double)));
 
   highsLogDev(params_.log_options_, HighsLogType::kInfo,
               "GPU setup complete. Matrix A (CSR) and A^T (CSR) transferred "
@@ -2611,63 +2598,61 @@ void PDLPSolver::setupGpu() {
 }
 
 void PDLPSolver::cleanupGpu() {
-  if (gpu_stream_) CUDA_CHECK(cudaStreamDestroy(gpu_stream_));
-  if (cusparse_handle_) CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_));
-  if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_));
-  if (mat_a_csr_) CUSPARSE_CHECK(cusparseDestroySpMat(mat_a_csr_));
-  if (mat_a_T_csr_) CUSPARSE_CHECK(cusparseDestroySpMat(mat_a_T_csr_));
-  CUDA_CHECK(cudaFree(d_a_row_ptr_));
-  CUDA_CHECK(cudaFree(d_a_col_ind_));
-  CUDA_CHECK(cudaFree(d_a_val_));
-  CUDA_CHECK(cudaFree(d_at_row_ptr_));
-  CUDA_CHECK(cudaFree(d_at_col_ind_));
-  CUDA_CHECK(cudaFree(d_at_val_));
-  CUDA_CHECK(cudaFree(d_col_cost_));
-  CUDA_CHECK(cudaFree(d_col_lower_));
-  CUDA_CHECK(cudaFree(d_col_upper_));
-  CUDA_CHECK(cudaFree(d_row_lower_));
-  CUDA_CHECK(cudaFree(d_is_equality_row_));
-  CUDA_CHECK(cudaFree(d_x_at_last_restart_));
-  CUDA_CHECK(cudaFree(d_y_at_last_restart_));
-  CUDA_CHECK(cudaFree(d_halpern_iteration_));
-  CUDA_CHECK(cudaFree(d_primal_step_size_));
-  CUDA_CHECK(cudaFree(d_dual_step_size_));
-  if (d_x_anchor_) CUDA_CHECK(cudaFree(d_x_anchor_));
-  if (d_y_anchor_) CUDA_CHECK(cudaFree(d_y_anchor_));
-  if (d_pdhg_primal_) CUDA_CHECK(cudaFree(d_pdhg_primal_));
-  if (d_pdhg_dual_) CUDA_CHECK(cudaFree(d_pdhg_dual_));
-  CUDA_CHECK(cudaFree(d_delta_x_));
-  CUDA_CHECK(cudaFree(d_delta_y_));
-  CUDA_CHECK(cudaFree(d_AT_delta_y_));
-  CUDA_CHECK(cudaFree(d_x_temp_diff_norm_result_));
-  CUDA_CHECK(cudaFree(d_y_temp_diff_norm_result_));
-  CUDA_CHECK(cudaFree(d_x_current_));
-  CUDA_CHECK(cudaFree(d_y_current_));
-  CUDA_CHECK(cudaFree(d_x_next_));
-  CUDA_CHECK(cudaFree(d_y_next_));
-  CUDA_CHECK(cudaFree(d_ax_current_));
-  CUDA_CHECK(cudaFree(d_aty_current_));
-  CUDA_CHECK(cudaFree(d_ax_avg_));
-  CUDA_CHECK(cudaFree(d_aty_avg_));
-  CUDA_CHECK(cudaFree(d_ax_next_));
-  CUDA_CHECK(cudaFree(d_aty_next_));
-  CUDA_CHECK(cudaFree(d_x_sum_));
-  CUDA_CHECK(cudaFree(d_y_sum_));
-  CUDA_CHECK(cudaFree(d_spmv_buffer_ax_));
-  CUDA_CHECK(cudaFree(d_spmv_buffer_aty_));
-  CUDA_CHECK(cudaFree(d_convergence_results_));
-  CUDA_CHECK(cudaFree(d_dSlackPos_));
-  CUDA_CHECK(cudaFree(d_dSlackNeg_));
-  CUDA_CHECK(cudaFree(d_dSlackPosAvg_));
-  CUDA_CHECK(cudaFree(d_dSlackNegAvg_));
-  if (d_col_scale_) CUDA_CHECK(cudaFree(d_col_scale_));
-  if (d_row_scale_) CUDA_CHECK(cudaFree(d_row_scale_));
-  CUDA_CHECK(cudaFree(d_buffer_));
-  CUDA_CHECK(cudaFree(d_buffer2_));
-  if (vec_x_desc_) CUSPARSE_CHECK(cusparseDestroyDnVec(vec_x_desc_));
-  if (vec_y_desc_) CUSPARSE_CHECK(cusparseDestroyDnVec(vec_y_desc_));
-  if (vec_ax_desc_) CUSPARSE_CHECK(cusparseDestroyDnVec(vec_ax_desc_));
-  if (vec_aty_desc_) CUSPARSE_CHECK(cusparseDestroyDnVec(vec_aty_desc_));
+  if (gpu_stream_) GPU_CHECK(gpuStreamDestroy(gpu_stream_));
+  if (cusparse_handle_) GPU_SPARSE_CHECK(gpuSparseDestroy(cusparse_handle_));
+  if (cublas_handle_) GPU_BLAS_CHECK(gpuBlasDestroy(cublas_handle_));
+  if (mat_a_csr_) GPU_SPARSE_CHECK(gpuSparseDestroySpMat(mat_a_csr_));
+  if (mat_a_T_csr_) GPU_SPARSE_CHECK(gpuSparseDestroySpMat(mat_a_T_csr_));
+  GPU_CHECK(gpuFree(d_a_row_ptr_));
+  GPU_CHECK(gpuFree(d_a_col_ind_));
+  GPU_CHECK(gpuFree(d_a_val_));
+  GPU_CHECK(gpuFree(d_at_row_ptr_));
+  GPU_CHECK(gpuFree(d_at_col_ind_));
+  GPU_CHECK(gpuFree(d_at_val_));
+  GPU_CHECK(gpuFree(d_col_cost_));
+  GPU_CHECK(gpuFree(d_col_lower_));
+  GPU_CHECK(gpuFree(d_col_upper_));
+  GPU_CHECK(gpuFree(d_row_lower_));
+  GPU_CHECK(gpuFree(d_is_equality_row_));
+  GPU_CHECK(gpuFree(d_x_at_last_restart_));
+  GPU_CHECK(gpuFree(d_y_at_last_restart_));
+  GPU_CHECK(gpuFree(d_step_params_));
+  if (d_x_anchor_) GPU_CHECK(gpuFree(d_x_anchor_));
+  if (d_y_anchor_) GPU_CHECK(gpuFree(d_y_anchor_));
+  if (d_pdhg_primal_) GPU_CHECK(gpuFree(d_pdhg_primal_));
+  if (d_pdhg_dual_) GPU_CHECK(gpuFree(d_pdhg_dual_));
+  GPU_CHECK(gpuFree(d_delta_x_));
+  GPU_CHECK(gpuFree(d_delta_y_));
+  GPU_CHECK(gpuFree(d_AT_delta_y_));
+  GPU_CHECK(gpuFree(d_x_temp_diff_norm_result_));
+  GPU_CHECK(gpuFree(d_y_temp_diff_norm_result_));
+  GPU_CHECK(gpuFree(d_x_current_));
+  GPU_CHECK(gpuFree(d_y_current_));
+  GPU_CHECK(gpuFree(d_x_next_));
+  GPU_CHECK(gpuFree(d_y_next_));
+  GPU_CHECK(gpuFree(d_ax_current_));
+  GPU_CHECK(gpuFree(d_aty_current_));
+  GPU_CHECK(gpuFree(d_ax_avg_));
+  GPU_CHECK(gpuFree(d_aty_avg_));
+  GPU_CHECK(gpuFree(d_ax_next_));
+  GPU_CHECK(gpuFree(d_aty_next_));
+  GPU_CHECK(gpuFree(d_x_sum_));
+  GPU_CHECK(gpuFree(d_y_sum_));
+  GPU_CHECK(gpuFree(d_spmv_buffer_ax_));
+  GPU_CHECK(gpuFree(d_spmv_buffer_aty_));
+  GPU_CHECK(gpuFree(d_convergence_results_));
+  GPU_CHECK(gpuFree(d_dSlackPos_));
+  GPU_CHECK(gpuFree(d_dSlackNeg_));
+  GPU_CHECK(gpuFree(d_dSlackPosAvg_));
+  GPU_CHECK(gpuFree(d_dSlackNegAvg_));
+  if (d_col_scale_) GPU_CHECK(gpuFree(d_col_scale_));
+  if (d_row_scale_) GPU_CHECK(gpuFree(d_row_scale_));
+  GPU_CHECK(gpuFree(d_buffer_));
+  GPU_CHECK(gpuFree(d_buffer2_));
+  if (vec_x_desc_) GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_x_desc_));
+  if (vec_y_desc_) GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_y_desc_));
+  if (vec_ax_desc_) GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_ax_desc_));
+  if (vec_aty_desc_) GPU_SPARSE_CHECK(gpuSparseDestroyDnVec(vec_aty_desc_));
 }
 
 void PDLPSolver::linalgGpuAx(const double* d_x_in, double* d_ax_out) {
@@ -2675,20 +2660,20 @@ void PDLPSolver::linalgGpuAx(const double* d_x_in, double* d_ax_out) {
   double alpha = 1.0;
   double beta = 0.0;
 
-  CUSPARSE_CHECK(cusparseDnVecSetValues(vec_x_desc_, (void*)d_x_in));
-  CUSPARSE_CHECK(cusparseDnVecSetValues(vec_ax_desc_, (void*)d_ax_out));
+  GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vec_x_desc_, (void*)d_x_in));
+  GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vec_ax_desc_, (void*)d_ax_out));
   if (spmv_buffer_size_ax_ == 0) {
-    CUSPARSE_CHECK(cusparseSpMV_bufferSize(
-        cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, mat_a_csr_,
-        vec_x_desc_, &beta, vec_ax_desc_, CUDA_R_64F, CUSPARSE_SPMV_CSR_ALG2,
+    GPU_SPARSE_CHECK(gpuSparseSpMV_bufferSize(
+        cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha, mat_a_csr_,
+        vec_x_desc_, &beta, vec_ax_desc_, GPU_R_64F, GPU_SPMV_ALG_DEFAULT,
         &spmv_buffer_size_ax_));
-    CUDA_CHECK(cudaMalloc(&d_spmv_buffer_ax_, spmv_buffer_size_ax_));
+    GPU_CHECK(gpuMalloc(&d_spmv_buffer_ax_, spmv_buffer_size_ax_));
   }
 
-  CUSPARSE_CHECK(
-      cusparseSpMV(cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
-                   mat_a_csr_, vec_x_desc_, &beta, vec_ax_desc_, CUDA_R_64F,
-                   CUSPARSE_SPMV_CSR_ALG2, d_spmv_buffer_ax_));
+  GPU_SPARSE_CHECK(
+      gpuSparseSpMV(cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha,
+                   mat_a_csr_, vec_x_desc_, &beta, vec_ax_desc_, GPU_R_64F,
+                   GPU_SPMV_ALG_DEFAULT, d_spmv_buffer_ax_));
 }
 
 void PDLPSolver::linalgGpuATy(const double* d_y_in, double* d_aty_out) {
@@ -2696,46 +2681,46 @@ void PDLPSolver::linalgGpuATy(const double* d_y_in, double* d_aty_out) {
   double alpha = 1.0;
   double beta = 0.0;
 
-  CUSPARSE_CHECK(cusparseDnVecSetValues(vec_y_desc_, (void*)d_y_in));
-  CUSPARSE_CHECK(cusparseDnVecSetValues(vec_aty_desc_, (void*)d_aty_out));
+  GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vec_y_desc_, (void*)d_y_in));
+  GPU_SPARSE_CHECK(gpuSparseSetDnVecValues(vec_aty_desc_, (void*)d_aty_out));
   if (spmv_buffer_size_aty_ == 0) {
-    CUSPARSE_CHECK(cusparseSpMV_bufferSize(
-        cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
-        mat_a_T_csr_, vec_y_desc_, &beta, vec_aty_desc_, CUDA_R_64F,
-        CUSPARSE_SPMV_CSR_ALG2, &spmv_buffer_size_aty_));
-    CUDA_CHECK(cudaMalloc(&d_spmv_buffer_aty_, spmv_buffer_size_aty_));
-  }
-  CUSPARSE_CHECK(
-      cusparseSpMV(cusparse_handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
-                   mat_a_T_csr_, vec_y_desc_, &beta, vec_aty_desc_, CUDA_R_64F,
-                   CUSPARSE_SPMV_CSR_ALG2, d_spmv_buffer_aty_));
+    GPU_SPARSE_CHECK(gpuSparseSpMV_bufferSize(
+        cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha,
+        mat_a_T_csr_, vec_y_desc_, &beta, vec_aty_desc_, GPU_R_64F,
+        GPU_SPMV_ALG_DEFAULT, &spmv_buffer_size_aty_));
+    GPU_CHECK(gpuMalloc(&d_spmv_buffer_aty_, spmv_buffer_size_aty_));
+  }
+  GPU_SPARSE_CHECK(
+      gpuSparseSpMV(cusparse_handle_, GPU_OPERATION_NON_TRANSPOSE, &alpha,
+                   mat_a_T_csr_, vec_y_desc_, &beta, vec_aty_desc_, GPU_R_64F,
+                   GPU_SPMV_ALG_DEFAULT, d_spmv_buffer_aty_));
 }
 
 void PDLPSolver::launchKernelUpdateX(double primal_step) {
   launchKernelUpdateX_wrapper(d_x_next_, d_x_current_, d_aty_current_,
                               d_col_cost_, d_col_lower_, d_col_upper_,
                               primal_step, a_num_cols_, gpu_stream_);
-  CUDA_CHECK(cudaGetLastError());
+  GPU_CHECK(gpuGetLastError());
 }
 
 void PDLPSolver::launchKernelUpdateY(double dual_step) {
   launchKernelUpdateY_wrapper(d_y_next_, d_y_current_, d_ax_current_,
                               d_ax_next_, d_row_lower_, d_is_equality_row_,
                               dual_step, a_num_rows_, gpu_stream_);
-  CUDA_CHECK(cudaGetLastError());
+  GPU_CHECK(gpuGetLastError());
 }
 
 void PDLPSolver::launchKernelUpdateAverages(double weight) {
   launchKernelUpdateAverages_wrapper(d_x_sum_, d_y_sum_, d_x_next_, d_y_next_,
                                      weight, a_num_cols_, a_num_rows_,
                                      gpu_stream_);
-  CUDA_CHECK(cudaGetLastError());
+  GPU_CHECK(gpuGetLastError());
 }
 
 void PDLPSolver::launchKernelScaleVector(double* d_out, const double* d_in,
                                          double Scale, int n) {
   launchKernelScaleVector_wrapper(d_out, d_in, Scale, n, gpu_stream_);
-  CUDA_CHECK(cudaGetLastError());
+  GPU_CHECK(gpuGetLastError());
 }
 
 bool PDLPSolver::checkConvergenceGpu(const HighsInt iter, const double* d_x,
@@ -2754,8 +2739,8 @@ bool PDLPSolver::checkConvergenceGpu(const HighsInt iter, const double* d_x,
   // copy 4 doubles back to CPU
 
   double h_results[4];
-  CUDA_CHECK(cudaMemcpy(h_results, d_convergence_results_, 4 * sizeof(double),
-                        cudaMemcpyDeviceToHost));
+  GPU_CHECK(gpuMemcpy(h_results, d_convergence_results_, 4 * sizeof(double),
+                        gpuMemcpyDeviceToHost));
 
   double primal_feas_sq = h_results[0];
   double dual_feas_sq = h_results[1];
@@ -2840,8 +2825,8 @@ void PDLPSolver::computeAverageIterateGpu() {
 
 #if PDLP_DEBUG_LOG
   // copy x_avg to host
-  CUDA_CHECK(cudaMemcpy(x_avg_.data(), d_x_avg_, a_num_cols_ * sizeof(double),
-                        cudaMemcpyDeviceToHost));
+  GPU_CHECK(gpuMemcpy(x_avg_.data(), d_x_avg_, a_num_cols_ * sizeof(double),
+                        gpuMemcpyDeviceToHost));
   debug_pdlp_data_.x_average_norm = linalg::vectorNormSquared(x_avg_);
 #endif
 }
@@ -2871,21 +2856,21 @@ double PDLPSolver::computeNonlinearityGpu(const double* d_x_new,
                                           const double* d_aty_new,
                                           const double* d_aty_old) {
   // 1. Compute delta_x = x_new - x_old
-  CUDA_CHECK(cudaMemcpy(d_buffer_, d_x_new, a_num_cols_ * sizeof(double),
-                        cudaMemcpyDeviceToDevice));
+  GPU_CHECK(gpuMemcpy(d_buffer_, d_x_new, a_num_cols_ * sizeof(double),
+                        gpuMemcpyDeviceToDevice));
   double alpha = -1.0;
-  CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_cols_, &alpha, d_x_old, 1,
+  GPU_BLAS_CHECK(gpuBlasDaxpy(cublas_handle_, a_num_cols_, &alpha, d_x_old, 1,
                            d_buffer_, 1));
 
   // 2. Compute delta_aty = aty_new - aty_old
-  CUDA_CHECK(cudaMemcpy(d_buffer2_, d_aty_new, a_num_cols_ * sizeof(double),
-                        cudaMemcpyDeviceToDevice));
-  CUBLAS_CHECK(cublasDaxpy(cublas_handle_, a_num_cols_, &alpha, d_aty_old, 1,
+  GPU_CHECK(gpuMemcpy(d_buffer2_, d_aty_new, a_num_cols_ * sizeof(double),
+                        gpuMemcpyDeviceToDevice));
+  GPU_BLAS_CHECK(gpuBlasDaxpy(cublas_handle_, a_num_cols_, &alpha, d_aty_old, 1,
                            d_buffer2_, 1));
 
   // 3. Compute Dot product: delta_x' * delta_aty
   double result;
-  CUBLAS_CHECK(cublasDdot(cublas_handle_, a_num_cols_, d_buffer_, 1, d_buffer2_,
+  GPU_BLAS_CHECK(gpuBlasDdot(cublas_handle_, a_num_cols_, d_buffer_, 1, d_buffer2_,
                           1, &result));
 
   return result;
@@ -2894,16 +2879,16 @@ double PDLPSolver::computeNonlinearityGpu(const double* d_x_new,
 double PDLPSolver::computeDiffNormCuBLAS(const double* d_a, const double* d_b,
                                          HighsInt n) {
   // 1. Copy a to buffer: buffer = a
-  CUDA_CHECK(
-      cudaMemcpy(d_buffer_, d_a, n * sizeof(double), cudaMemcpyDeviceToDevice));
+  GPU_CHECK(
+      gpuMemcpy(d_buffer_, d_a, n * sizeof(double), gpuMemcpyDeviceToDevice));
 
   // 2. buffer = buffer - b  (using cuBLAS axpy)
   double alpha = -1.0;
-  CUBLAS_CHECK(cublasDaxpy(cublas_handle_, n, &alpha, d_b, 1, d_buffer_, 1));
+  GPU_BLAS_CHECK(gpuBlasDaxpy(cublas_handle_, n, &alpha, d_b, 1, d_buffer_, 1));
 
   // 3. result = ||buffer||_2  (using cuBLAS Norm2)
   double norm;
-  CUBLAS_CHECK(cublasDnrm2(cublas_handle_, n, d_buffer_, 1, &norm));
+  GPU_BLAS_CHECK(gpuBlasDnrm2(cublas_handle_, n, d_buffer_, 1, &norm));
 
   return norm;
 }
diff --git a/highs/pdlp/hipdlp/pdhg.cu b/highs/pdlp/hipdlp/pdhg.cu
index 6209faa3794..7362542ff7c 100644
--- a/highs/pdlp/hipdlp/pdhg.cu
+++ b/highs/pdlp/hipdlp/pdhg.cu
@@ -1,8 +1,8 @@
-#include <cuda_runtime.h>
-#include <device_launch_parameters.h>
-#include <cmath> 
+#include "HConfig.h"
+#include "gpu_backend.hpp"
+#include "pdhg_kernels.hpp"
+#include <cmath>
 #include <cstdio>
-#include <cublas_v2.h>
 
 #ifdef HIGHSINT64
 typedef int64_t HighsInt;
@@ -14,8 +14,8 @@ typedef unsigned int HighsUInt;
 #define HIGHSINT_FORMAT "d"
 #endif
 
-// Define Infinity for GPU 
-#define GPU_INF 1e20 
+// Define Infinity for GPU
+#define GPU_INF 1e20
 
 // Buffer Indices for the reduction array
 #define IDX_PRIMAL_FEAS 0
@@ -23,17 +23,21 @@ typedef unsigned int HighsUInt;
 #define IDX_PRIMAL_OBJ  2
 #define IDX_DUAL_OBJ    3
 
-// Add to pdhg.cu
-#define FULL_WARP_REDUCE(val) { \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 16); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 8); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 4); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 2); \
-  val += __shfl_down_sync(0xFFFFFFFF, val, 1); \
+// warpSize is a built-in device variable that returns the actual hardware
+// warp/wavefront width (32 on CUDA and RDNA, 64 on GCN/CDNA).
+__device__ __forceinline__ double gpuReduce(double val) {
+  for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
+#ifdef USE_HIP
+    val += __shfl_down(val, offset);
+#else
+    val += __shfl_down_sync(0xffffffff, val, offset);
+#endif
+  }
+  return val;
 }
 
 // Utility for robust 1D kernel launches
-#define CUDA_GRID_STRIDE_LOOP(i, n)                                  \
+#define GPU_GRID_STRIDE_LOOP(i, n)                                  \
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
        i += blockDim.x * gridDim.x)
 
@@ -58,26 +62,27 @@ __device__ __forceinline__ double fma_rn(double a, double b, double c) {
 
 // === KERNEL 1: Update X (Primal Step) ===
 __global__ void kernelUpdateX(
-    double* d_x_new, const double* d_x_old, const double* d_aty, 
-    const double* d_cost, const double* d_lower, const double* d_upper,
-    double primal_step, int n_cols) 
+    double* __restrict__ d_x_new, const double* __restrict__ d_x_old,
+    const double* __restrict__ d_aty, const double* __restrict__ d_cost,
+    const double* __restrict__ d_lower, const double* __restrict__ d_upper,
+    double primal_step, int n_cols)
 {
-  CUDA_GRID_STRIDE_LOOP(i, n_cols) {
+  GPU_GRID_STRIDE_LOOP(i, n_cols) {
     double x_updated = fma_rn(primal_step, d_aty[i] - d_cost[i], d_x_old[i]);
-    
-    // 3. Project to bounds [l, u]
+
+    // Project to bounds [l, u]
     d_x_new[i] = fmin(fmax(x_updated, d_lower[i]), d_upper[i]);
   }
 }
 
 // === KERNEL 2: Update Y (Dual Step) ===
 __global__ void kernelUpdateY(
-    double* d_y_new, const double* d_y_old,
-    const double* d_ax_old, const double* d_ax_new,  
-    const double* d_rhs, const bool* d_is_equality,
+    double* __restrict__ d_y_new, const double* __restrict__ d_y_old,
+    const double* __restrict__ d_ax_old, const double* __restrict__ d_ax_new,
+    const double* __restrict__ d_rhs, const bool* __restrict__ d_is_equality,
     double dual_step, int n_rows)
 {
-  CUDA_GRID_STRIDE_LOOP(j, n_rows) {
+  GPU_GRID_STRIDE_LOOP(j, n_rows) {
     double residual = fma_rn(-2.0, d_ax_new[j], d_rhs[j] + d_ax_old[j]);
     double dual_update = fma_rn(dual_step, residual, d_y_old[j]);
     d_y_new[j] = d_is_equality[j] ? dual_update : fmax(0.0, dual_update);
@@ -88,46 +93,46 @@ __global__ void kernelUpdateY(
 // x_sum = x_sum + weight * x_next.
 // y_sum = y_sum + weight * y_next
 __global__ void kernelUpdateAverages(
-    double* d_x_sum, double* d_y_sum,
-    const double* d_x_next, const double* d_y_next,
+    double* __restrict__ d_x_sum, double* __restrict__ d_y_sum,
+    const double* __restrict__ d_x_next, const double* __restrict__ d_y_next,
     double weight, int n_cols, int n_rows)
 {
     // Update x_sum
-    CUDA_GRID_STRIDE_LOOP(i, n_cols) {
+    GPU_GRID_STRIDE_LOOP(i, n_cols) {
         d_x_sum[i] = fma_rn(weight, d_x_next[i], d_x_sum[i]);
     }
-    
+
     // Update y_sum
-    CUDA_GRID_STRIDE_LOOP(j, n_rows) {
+    GPU_GRID_STRIDE_LOOP(j, n_rows) {
         d_y_sum[j] = fma_rn(weight, d_y_next[j], d_y_sum[j]);
     }
 }
 
 __global__ void kernelScaleVector(
-    double* d_out, const double* d_in, 
+    double* __restrict__ d_out, const double* __restrict__ d_in,
     double scale, int n)
 {
-  CUDA_GRID_STRIDE_LOOP(i, n) {
+  GPU_GRID_STRIDE_LOOP(i, n) {
     d_out[i] = d_in[i] * scale;
   }
 }
 
 // === KERNEL 4: Primal Convergence Check (Row-wise) ===
 __global__ void kernelCheckPrimal(
-  double* d_results,
-  const double* d_ax, const double* d_y,
-  const double* d_row_lower, const double* d_row_scale,
-  const bool* d_is_equality, int n_rows){
+  double* __restrict__ d_results,
+  const double* __restrict__ d_ax, const double* __restrict__ d_y,
+  const double* __restrict__ d_row_lower, const double* d_row_scale,
+  const bool* __restrict__ d_is_equality, int n_rows){
   double local_feas_sq = 0.0;
   double local_dual_obj = 0.0;
 
-  CUDA_GRID_STRIDE_LOOP(i,n_rows){
+  GPU_GRID_STRIDE_LOOP(i,n_rows){
     double val_y = d_y[i];
     double val_b = d_row_lower[i];
     double val_ax = d_ax[i];
 
-    if (abs(val_b) < GPU_INF){ 
-      local_dual_obj += val_b * val_y;
+    if (abs(val_b) < GPU_INF){
+      local_dual_obj = fma_rn(val_b, val_y, local_dual_obj);
     }
 
     double residual = val_ax - val_b;
@@ -140,46 +145,58 @@ __global__ void kernelCheckPrimal(
       residual *= d_row_scale[i];
     }
 
-    local_feas_sq += residual * residual;
+    local_feas_sq = fma_rn(residual, residual, local_feas_sq);
   }
 
-  // 1. wrap reduce
-  FULL_WARP_REDUCE(local_feas_sq);
-  FULL_WARP_REDUCE(local_dual_obj);
+  // Warp-level reduction
+  local_feas_sq  = gpuReduce(local_feas_sq);
+  local_dual_obj = gpuReduce(local_dual_obj);
+
+  // Block-level reduction: one atomicAdd per block to global memory instead of
+  // one per warp, cutting contention by ~blockDim.x/warpSize.
+  __shared__ double s_feas[2];
+  if (threadIdx.x == 0) { s_feas[0] = 0.0; s_feas[1] = 0.0; }
+  __syncthreads();
 
-  // 2. Atomic Add only from the first thread of each warp
-  if ((threadIdx.x & 31) == 0){
-    atomicAdd(&d_results[IDX_PRIMAL_FEAS], local_feas_sq);
-    atomicAdd(&d_results[IDX_DUAL_OBJ], local_dual_obj);
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicAdd(&s_feas[0], local_feas_sq);
+    atomicAdd(&s_feas[1], local_dual_obj);
+  }
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    atomicAdd(&d_results[IDX_PRIMAL_FEAS], s_feas[0]);
+    atomicAdd(&d_results[IDX_DUAL_OBJ],    s_feas[1]);
   }
 }
 
 // === KERNEL 5: Dual Convergence Check (Column-wise) ===
+// d_slack_pos doubles as input (raw Halpern slack) and output (s_pos).
 __global__ void kernelCheckDual(
-    double* d_results,          // [1]: D.Feas, [2]: P.Obj, [3]: D.Obj
-    double* d_slack_pos,        // Input (raw halpern slack) / Output (s_pos)
-    double* d_slack_neg,        // Output
-    const double* d_aty,
-    const double* d_x,
-    const double* d_cost,       // c
-    const double* d_col_lower,  // l
-    const double* d_col_upper,  // u
-    const double* d_col_scale,  // Can be nullptr
+    double* __restrict__ d_results,
+    double* __restrict__ d_slack_pos,   // Input (raw halpern slack) / Output (s_pos)
+    double* __restrict__ d_slack_neg,   // Output
+    const double* __restrict__ d_aty,
+    const double* __restrict__ d_x,
+    const double* __restrict__ d_cost,      // c
+    const double* __restrict__ d_col_lower, // l
+    const double* __restrict__ d_col_upper, // u
+    const double* d_col_scale,              // Can be nullptr
     int n_cols,
-    bool use_halpern_slack)     
+    bool use_halpern_slack)
 {
   double local_dual_feas_sq = 0.0;
   double local_primal_obj = 0.0;
   double local_dual_obj_part = 0.0;
 
-  CUDA_GRID_STRIDE_LOOP(i, n_cols){
+  GPU_GRID_STRIDE_LOOP(i, n_cols){
     double val_x = d_x[i];
     double val_c = d_cost[i];
     double val_aty = d_aty[i];
     double val_l = d_col_lower[i];
     double val_u = d_col_upper[i];
 
-    local_primal_obj += val_c * val_x;
+    local_primal_obj = fma_rn(val_c, val_x, local_primal_obj);
     double dual_residual = val_c - val_aty;
 
     double s_pos = 0.0;
@@ -187,7 +204,7 @@ __global__ void kernelCheckDual(
 
     if (use_halpern_slack) {
       // Read the raw Halpern slack previously saved in d_slack_pos by primal major step
-      double raw_slack = d_slack_pos[i]; 
+      double raw_slack = d_slack_pos[i];
       s_pos = fmax(0.0, raw_slack);
       s_neg = fmax(0.0, -raw_slack);
     } else {
@@ -206,19 +223,34 @@ __global__ void kernelCheckDual(
       eff_dual_residual *= d_col_scale[i];
     }
 
-    local_dual_feas_sq += eff_dual_residual * eff_dual_residual;
+    local_dual_feas_sq = fma_rn(eff_dual_residual, eff_dual_residual, local_dual_feas_sq);
+
+    if (val_l > -GPU_INF) local_dual_obj_part = fma_rn(val_l,  s_pos, local_dual_obj_part);
+    if (val_u <  GPU_INF) local_dual_obj_part = fma_rn(-val_u, s_neg, local_dual_obj_part);
+  }
+
+  // Warp-level reduction
+  local_dual_feas_sq  = gpuReduce(local_dual_feas_sq);
+  local_primal_obj    = gpuReduce(local_primal_obj);
+  local_dual_obj_part = gpuReduce(local_dual_obj_part);
 
-    double obj_term = 0.0;
-    if (val_l > -GPU_INF) obj_term += val_l * s_pos;
-    if (val_u < GPU_INF)  obj_term -= val_u * s_neg;
+  // Block-level reduction via shared memory before global atomicAdd.
+  __shared__ double s_dual[3];
+  if (threadIdx.x == 0) { s_dual[0] = 0.0; s_dual[1] = 0.0; s_dual[2] = 0.0; }
+  __syncthreads();
 
-    local_dual_obj_part += obj_term;
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicAdd(&s_dual[0], local_dual_feas_sq);
+    atomicAdd(&s_dual[1], local_primal_obj);
+    atomicAdd(&s_dual[2], local_dual_obj_part);
   }
+  __syncthreads();
 
-  // Atomic accumulation
-  atomicAdd(&d_results[IDX_DUAL_FEAS], local_dual_feas_sq);
-  atomicAdd(&d_results[IDX_PRIMAL_OBJ], local_primal_obj);
-  atomicAdd(&d_results[IDX_DUAL_OBJ], local_dual_obj_part);
+  if (threadIdx.x == 0) {
+    atomicAdd(&d_results[IDX_DUAL_FEAS],  s_dual[0]);
+    atomicAdd(&d_results[IDX_PRIMAL_OBJ], s_dual[1]);
+    atomicAdd(&d_results[IDX_DUAL_OBJ],   s_dual[2]);
+  }
 }
 
 // ============================================================================
@@ -236,7 +268,7 @@ __global__ void kernelHalpernPrimalMinor(
         const double* __restrict__ step_size_ptr, int n)
 {
         const double step_size = *step_size_ptr;
-    CUDA_GRID_STRIDE_LOOP(i, n) {
+    GPU_GRID_STRIDE_LOOP(i, n) {
         double temp = fma_rn(-step_size, objective[i] - dual_product[i],
                              current_primal[i]);
         double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i]));
@@ -258,12 +290,13 @@ __global__ void kernelHalpernPrimalMajor(
     double* __restrict__ dual_slack)
 {
         const double step_size = *step_size_ptr;
-    CUDA_GRID_STRIDE_LOOP(i, n) {
+        const double inv_step_size = 1.0 / step_size;
+    GPU_GRID_STRIDE_LOOP(i, n) {
         double temp = fma_rn(-step_size, objective[i] - dual_product[i],
                              current_primal[i]);
         double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i]));
         pdhg_primal[i] = temp_proj;
-        dual_slack[i] = (temp_proj - temp) / step_size;
+        dual_slack[i] = (temp_proj - temp) * inv_step_size;
         reflected_primal[i] = fma_rn(2.0, temp_proj, -current_primal[i]);
     }
 }
@@ -277,7 +310,7 @@ __global__ void kernelHalpernDualMinor(
         const double* __restrict__ step_size_ptr, int n)
 {
         const double step_size = *step_size_ptr;
-    CUDA_GRID_STRIDE_LOOP(i, n) {
+    GPU_GRID_STRIDE_LOOP(i, n) {
         double dual_update = fma_rn(step_size, rhs[i] - primal_product[i],
                                     current_dual[i]);
         double pdhg_dual_i = is_equality[i] ? dual_update
@@ -296,7 +329,7 @@ __global__ void kernelHalpernDualMajor(
         const double* __restrict__ step_size_ptr, int n)
 {
         const double step_size = *step_size_ptr;
-    CUDA_GRID_STRIDE_LOOP(i, n) {
+    GPU_GRID_STRIDE_LOOP(i, n) {
         double dual_update = fma_rn(step_size, rhs[i] - primal_product[i],
                                     current_dual[i]);
         double pdhg_dual_i = is_equality[i] ? dual_update
@@ -316,12 +349,15 @@ __global__ void kernelHalpernBlend(
     double reflection_coeff,   // ρ, typically 1.0
     HighsInt n)
 {
-    const int current_k = (*halpern_iteration) + k_offset;
-    const double weight = static_cast<double>(current_k) / (current_k + 1.0);
-    CUDA_GRID_STRIDE_LOOP(i, n) {
+    const int current_k           = (*halpern_iteration) + k_offset;
+    const double weight           = static_cast<double>(current_k) / (current_k + 1.0);
+    const double one_minus_weight = 1.0 - weight;
+    const double one_minus_rho    = 1.0 - reflection_coeff;
+    GPU_GRID_STRIDE_LOOP(i, n) {
         double blended = fma_rn(reflection_coeff, reflected[i],
-                                (1.0 - reflection_coeff) * current[i]);
-        current[i] = fma_rn(weight, blended, (1.0 - weight) * initial[i]);
+                                fma_rn(one_minus_rho, current[i], 0.0));
+        current[i]     = fma_rn(weight, blended,
+                                fma_rn(one_minus_weight, initial[i], 0.0));
     }
 }
 
@@ -330,63 +366,63 @@ extern "C" {
 void launchKernelUpdateX_wrapper(
     double* d_x_new, const double* d_x_old, const double* d_aty,
     const double* d_cost, const double* d_lower, const double* d_upper,
-    double primal_step, int n_cols, cudaStream_t stream) 
+    double primal_step, int n_cols, gpuStream_t stream)
 {
     const int block_size = 256;
     dim3 config = GetLaunchConfig(n_cols, block_size);
-    
+
     kernelUpdateX<<<config.x, block_size, 0, stream>>>(
         d_x_new, d_x_old, d_aty,
         d_cost, d_lower, d_upper,
         primal_step, n_cols);
-    
-    cudaGetLastError(); 
+
+    gpuGetLastError();
 }
 
 void launchKernelUpdateY_wrapper(
     double* d_y_new, const double* d_y_old,
-    const double* d_ax_old, const double* d_ax_new, 
+    const double* d_ax_old, const double* d_ax_new,
     const double* d_rhs, const bool* d_is_equality,
-    double dual_step, int n_rows, cudaStream_t stream) 
+    double dual_step, int n_rows, gpuStream_t stream)
 {
     const int block_size = 256;
     dim3 config = GetLaunchConfig(n_rows, block_size);
-    
+
     kernelUpdateY<<<config.x, block_size, 0, stream>>>(
         d_y_new, d_y_old,
         d_ax_old, d_ax_new,
         d_rhs, d_is_equality,
         dual_step, n_rows);
-    
-    cudaGetLastError();
+
+    gpuGetLastError();
 }
 
 void launchKernelUpdateAverages_wrapper(
     double* d_x_sum, double* d_y_sum,
     const double* d_x_next, const double* d_y_next,
-    double weight, int n_cols, int n_rows, cudaStream_t stream) 
+    double weight, int n_cols, int n_rows, gpuStream_t stream)
 {
     const int block_size = 256;
     dim3 config_x = GetLaunchConfig(n_cols, block_size);
-    dim3 config_y = GetLaunchConfig(n_rows, block_size);  
+    dim3 config_y = GetLaunchConfig(n_rows, block_size);
     kernelUpdateAverages<<<config_x.x > config_y.x ? config_x.x : config_y.x, block_size, 0, stream>>>(
         d_x_sum, d_y_sum,
         d_x_next, d_y_next,
         weight, n_cols, n_rows);
-    cudaGetLastError();
+    gpuGetLastError();
 }
 
 void launchKernelScaleVector_wrapper(
-    double* d_out, const double* d_in, 
-    double scale, int n, cudaStream_t stream)
+    double* d_out, const double* d_in,
+    double scale, int n, gpuStream_t stream)
 {
     const int block_size = 256;
     dim3 config = GetLaunchConfig(n, block_size);
-    
+
     kernelScaleVector<<<config.x, block_size, 0, stream>>>(
         d_out, d_in, scale, n);
-    
-    cudaGetLastError();
+
+    gpuGetLastError();
 }
 
 void launchCheckConvergenceKernels_wrapper(
@@ -400,10 +436,10 @@ void launchCheckConvergenceKernels_wrapper(
     const double* d_col_scale, const double* d_row_scale,
     int n_cols, int n_rows,
     bool use_halpern_slack,
-    cudaStream_t stream)
+    gpuStream_t stream)
 {
     // 1. Zero out results
-    cudaMemsetAsync(d_results, 0, 4 * sizeof(double), stream);
+    gpuMemsetAsync(d_results, 0, 4 * sizeof(double), stream);
 
     int block_size = 256;
 
@@ -416,26 +452,26 @@ void launchCheckConvergenceKernels_wrapper(
     // 3. Launch Dual Kernel
     dim3 grid_cols = GetLaunchConfig(n_cols, block_size);
     kernelCheckDual<<<grid_cols, block_size, 0, stream>>>(
-        d_results, d_slack_pos, d_slack_neg, d_aty, d_x, 
+        d_results, d_slack_pos, d_slack_neg, d_aty, d_x,
         d_col_cost, d_col_lower, d_col_upper, d_col_scale, n_cols,
         use_halpern_slack
     );
 
-    cudaGetLastError();
+    gpuGetLastError();
 }
 
 void launchKernelHalpernPrimalMinor_wrapper(
     const double* d_current_primal, double* d_reflected_primal,
     const double* d_dual_product, const double* d_objective,
     const double* d_var_lb, const double* d_var_ub,
-    const double* d_step_size, int n, cudaStream_t stream)
+    const double* d_step_size, int n, gpuStream_t stream)
 {
     const int block_size = 256;
     dim3 config = GetLaunchConfig(n, block_size);
     kernelHalpernPrimalMinor<<<config.x, block_size, 0, stream>>>(
         d_current_primal, d_reflected_primal, d_dual_product,
         d_objective, d_var_lb, d_var_ub, d_step_size, n);
-    cudaGetLastError();
+    gpuGetLastError();
 }
 
 void launchKernelHalpernPrimalMajor_wrapper(
@@ -443,7 +479,7 @@ void launchKernelHalpernPrimalMajor_wrapper(
     double* d_reflected_primal, const double* d_dual_product,
     const double* d_objective, const double* d_var_lb,
     const double* d_var_ub, const double* d_step_size, int n,
-    double* d_dual_slack, cudaStream_t stream)
+    double* d_dual_slack, gpuStream_t stream)
 {
     const int block_size = 256;
     dim3 config = GetLaunchConfig(n, block_size);
@@ -451,47 +487,47 @@ void launchKernelHalpernPrimalMajor_wrapper(
         d_current_primal, d_pdhg_primal, d_reflected_primal,
         d_dual_product, d_objective, d_var_lb, d_var_ub,
         d_step_size, n, d_dual_slack);
-    cudaGetLastError();
+    gpuGetLastError();
 }
 
 void launchKernelHalpernDualMinor_wrapper(
     const double* d_current_dual, double* d_reflected_dual,
     const double* d_primal_product, const double* d_rhs,
     const bool* d_is_equality, const double* d_step_size, int n,
-    cudaStream_t stream)
+    gpuStream_t stream)
 {
     const int block_size = 256;
     dim3 config = GetLaunchConfig(n, block_size);
     kernelHalpernDualMinor<<<config.x, block_size, 0, stream>>>(
         d_current_dual, d_reflected_dual, d_primal_product,
         d_rhs, d_is_equality, d_step_size, n);
-    cudaGetLastError();
+    gpuGetLastError();
 }
 
 void launchKernelHalpernDualMajor_wrapper(
     const double* d_current_dual, double* d_pdhg_dual,
     double* d_reflected_dual, const double* d_primal_product,
     const double* d_rhs, const bool* d_is_equality,
-    const double* d_step_size, int n, cudaStream_t stream)
+    const double* d_step_size, int n, gpuStream_t stream)
 {
     const int block_size = 256;
     dim3 config = GetLaunchConfig(n, block_size);
     kernelHalpernDualMajor<<<config.x, block_size, 0, stream>>>(
         d_current_dual, d_pdhg_dual, d_reflected_dual,
         d_primal_product, d_rhs, d_is_equality, d_step_size, n);
-    cudaGetLastError();
+    gpuGetLastError();
 }
 
 void launchKernelHalpernBlend_wrapper(
     double* d_current, const double* d_reflected,
     const double* d_initial, const int* d_halpern_iteration, int k_offset,
-    double reflection_coeff, int n, cudaStream_t stream)
+    double reflection_coeff, int n, gpuStream_t stream)
 {
     const int block_size = 256;
     dim3 config = GetLaunchConfig(n, block_size);
     kernelHalpernBlend<<<config.x, block_size, 0, stream>>>(
         d_current, d_reflected, d_initial,
         d_halpern_iteration, k_offset, reflection_coeff, n);
-    cudaGetLastError();
+    gpuGetLastError();
 }
-} // extern "C"
\ No newline at end of file
+} // extern "C"
diff --git a/highs/pdlp/hipdlp/pdhg.hpp b/highs/pdlp/hipdlp/pdhg.hpp
index aa3fb4a38f6..4a2e5b1ef7d 100644
--- a/highs/pdlp/hipdlp/pdhg.hpp
+++ b/highs/pdlp/hipdlp/pdhg.hpp
@@ -11,10 +11,9 @@
 #ifndef PDHG_HPP
 #define PDHG_HPP
 
-#ifdef CUPDLP_GPU
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include <cusparse.h>
+#include "HConfig.h"
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
+#include "gpu_backend.hpp"
 #endif
 
 #define PDLP_PROFILE (0)
@@ -36,42 +35,8 @@
 const bool kUseCupdlpx = true;
 const bool kTempSetting = true;
 
-// --- GPU Macros (Defined at file scope for visibility) ---
-#ifdef CUPDLP_GPU
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include <cusparse.h>
-
-#define CUDA_CHECK(call)                                               \
-  do {                                                                 \
-    cudaError_t err = (call);                                          \
-    if (err != cudaSuccess) {                                          \
-      fprintf(stderr, "CUDA Error at %s:%d: %s\n", __FILE__, __LINE__, \
-              cudaGetErrorString(err));                                \
-      exit(EXIT_FAILURE);                                              \
-    }                                                                  \
-  } while (0)
-
-#define CUSPARSE_CHECK(call)                                               \
-  do {                                                                     \
-    cusparseStatus_t status = (call);                                      \
-    if (status != CUSPARSE_STATUS_SUCCESS) {                               \
-      fprintf(stderr, "cuSPARSE Error at %s:%d: %s\n", __FILE__, __LINE__, \
-              cusparseGetErrorString(status));                             \
-      exit(EXIT_FAILURE);                                                  \
-    }                                                                      \
-  } while (0)
-
-#define CUBLAS_CHECK(call)                                               \
-  do {                                                                   \
-    cublasStatus_t status = call;                                        \
-    if (status != CUBLAS_STATUS_SUCCESS) {                               \
-      fprintf(stderr, "cuBLAS Error at %s:%d: %d\n", __FILE__, __LINE__, \
-              status);                                                   \
-      exit(EXIT_FAILURE);                                                \
-    }                                                                    \
-  } while (0)
-#endif
+// GPU error-checking macros are defined in gpu_backend.hpp as
+// GPU_CHECK / GPU_BLAS_CHECK / GPU_SPARSE_CHECK.
 
 // Forward declarations
 struct StepSizeConfig;
@@ -126,7 +91,7 @@ class PDLPSolver {
 
   // --- Convergence & Math Helpers ---
   double powerMethod();
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   double powerMethodGpu();
 #endif
   void initializeStepSizes();
@@ -263,28 +228,37 @@ class PDLPSolver {
   void hipdlpTimerStop(const HighsInt hipdlp_clock);
 #endif
 
-#ifdef CUPDLP_GPU
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
   HighsInt a_num_rows_ = 0;
   HighsInt a_num_cols_ = 0;
   HighsInt a_nnz_ = 0;
   double sum_weights_gpu_ = 0.0;
   // --- GPU Specifics ---
-  cudaStream_t gpu_stream_ = nullptr;
-  cusparseHandle_t cusparse_handle_ = nullptr;
-  cublasHandle_t cublas_handle_ = nullptr;
+  gpuStream_t gpu_stream_ = nullptr;
+  gpuSparseHandle_t cusparse_handle_ = nullptr;
+  gpuBlasHandle_t cublas_handle_ = nullptr;
 
   // Matrix descriptors
-  cusparseSpMatDescr_t mat_a_csr_ = nullptr;
-  cusparseSpMatDescr_t mat_a_T_csr_ = nullptr;
+  gpuSpMatDescr_t mat_a_csr_ = nullptr;
+  gpuSpMatDescr_t mat_a_T_csr_ = nullptr;
 
   // Device Pointers
   HighsInt *d_a_row_ptr_ = nullptr, *d_a_col_ind_ = nullptr;
   double* d_a_val_ = nullptr;
   HighsInt *d_at_row_ptr_ = nullptr, *d_at_col_ind_ = nullptr;
   double* d_at_val_ = nullptr;
-  int* d_halpern_iteration_ = nullptr;
-  double* d_primal_step_size_ = nullptr;
-  double* d_dual_step_size_ = nullptr;
+  // Packed into a single device allocation to avoid 3 separate H→D transfers
+  // per iteration. Layout: [primal_step, dual_step, halpern_iteration (as int)].
+  struct GpuStepParams {
+    double primal_step;
+    double dual_step;
+    int    halpern_iteration;
+  };
+  GpuStepParams* d_step_params_ = nullptr;
+  // Convenience device pointers aliasing into d_step_params_
+  int*    d_halpern_iteration_ = nullptr;
+  double* d_primal_step_size_  = nullptr;
+  double* d_dual_step_size_    = nullptr;
 
   // GPU Vectors (Device memory)
   double *d_x_current_ = nullptr, *d_y_current_ = nullptr;
@@ -323,8 +297,8 @@ class PDLPSolver {
   size_t spmv_buffer_size_aty_ = 0;
 
   // Vector Descriptors
-  cusparseDnVecDescr_t vec_x_desc_ = nullptr, vec_y_desc_ = nullptr;
-  cusparseDnVecDescr_t vec_ax_desc_ = nullptr, vec_aty_desc_ = nullptr;
+  gpuDnVecDescr_t vec_x_desc_ = nullptr, vec_y_desc_ = nullptr;
+  gpuDnVecDescr_t vec_ax_desc_ = nullptr, vec_aty_desc_ = nullptr;
 
   // GPU Methods
   void setupGpu();
diff --git a/highs/pdlp/hipdlp/pdhg_kernels.hpp b/highs/pdlp/hipdlp/pdhg_kernels.hpp
index 57a6f411318..eec1214d710 100644
--- a/highs/pdlp/hipdlp/pdhg_kernels.hpp
+++ b/highs/pdlp/hipdlp/pdhg_kernels.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#ifdef CUPDLP_GPU
-#include <cuda_runtime.h>
+#if defined(CUPDLP_GPU) || defined(HIPDLP_GPU)
+#include "gpu_backend.hpp"
 
 #ifdef __cplusplus
 extern "C" {
@@ -11,22 +11,22 @@ void launchKernelUpdateX_wrapper(double* d_x_new, const double* d_x_old,
                                  const double* d_aty, const double* d_cost,
                                  const double* d_lower, const double* d_upper,
                                  double primal_step, int n_cols,
-                                 cudaStream_t stream);
+                                 gpuStream_t stream);
 
 void launchKernelUpdateY_wrapper(double* d_y_new, const double* d_y_old,
                                  const double* d_ax_old, const double* d_ax_new,
                                  const double* d_row_lower,
                                  const bool* d_is_equality, double dual_step,
-                                 int n_rows, cudaStream_t stream);
+                                 int n_rows, gpuStream_t stream);
 
 void launchKernelUpdateAverages_wrapper(double* d_x_sum, double* d_y_sum,
                                         const double* d_x_current,
                                         const double* d_y_current,
                                         double weight, int n_cols, int n_rows,
-                                        cudaStream_t stream);
+                                        gpuStream_t stream);
 
 void launchKernelScaleVector_wrapper(double* d_out, const double* d_in,
-                                     double scale, int n, cudaStream_t stream);
+                                     double scale, int n, gpuStream_t stream);
 
 void launchCheckConvergenceKernels_wrapper(
     double* d_results, double* d_slack_pos, double* d_slack_neg,
@@ -35,20 +35,20 @@ void launchCheckConvergenceKernels_wrapper(
     const double* d_col_lower, const double* d_col_upper,
     const bool* d_is_equality, const double* d_col_scale,
     const double* d_row_scale, int n_cols, int n_rows, bool use_halpern_slack,
-    cudaStream_t stream);
+    gpuStream_t stream);
 
 void launchKernelHalpernPrimalMinor_wrapper(
     const double* d_current_primal, double* d_reflected_primal,
     const double* d_dual_product, const double* d_objective,
     const double* d_var_lb, const double* d_var_ub, const double* d_step_size,
-    int n, cudaStream_t stream);
+    int n, gpuStream_t stream);
 
 void launchKernelHalpernPrimalMajor_wrapper(
     const double* d_current_primal, double* d_pdhg_primal,
     double* d_reflected_primal, const double* d_dual_product,
     const double* d_objective, const double* d_var_lb, const double* d_var_ub,
     const double* d_step_size, int n, double* d_dual_slack,
-    cudaStream_t stream);
+    gpuStream_t stream);
 
 void launchKernelHalpernDualMinor_wrapper(const double* d_current_dual,
                                           double* d_reflected_dual,
@@ -56,20 +56,20 @@ void launchKernelHalpernDualMinor_wrapper(const double* d_current_dual,
                                           const double* d_rhs,
                                           const bool* d_is_equality,
                                           const double* d_step_size, int n,
-                                          cudaStream_t stream);
+                                          gpuStream_t stream);
 
 void launchKernelHalpernDualMajor_wrapper(
     const double* d_current_dual, double* d_pdhg_dual, double* d_reflected_dual,
     const double* d_primal_product, const double* d_rhs,
     const bool* d_is_equality, const double* d_step_size, int n,
-    cudaStream_t stream);
+    gpuStream_t stream);
 
 void launchKernelHalpernBlend_wrapper(double* d_current,
                                       const double* d_reflected,
                                       const double* d_initial,
                                       const int* d_halpern_iteration,
                                       int k_offset, double reflection_coeff,
-                                      int n, cudaStream_t stream);
+                                      int n, gpuStream_t stream);
 
 #ifdef __cplusplus
 }