From 01f346932591a781eb80252f3ed71938a6e01209 Mon Sep 17 00:00:00 2001
From: yuwenchen95 <yuwchen@nvidia.com>
Date: Wed, 24 Jun 2026 07:31:37 -0700
Subject: [PATCH 1/2] Add support for sparsity exploitation for SOCP of large
 dimensionality

Signed-off-by: yuwenchen95 <yuwchen@nvidia.com>
---
 .../cuopt/linear_programming/constants.h      |   1 +
 .../pdlp/solver_settings.hpp                  |   1 +
 cpp/src/barrier/barrier.cu                    | 491 +++++++++++---
 cpp/src/barrier/iterative_refinement.hpp      |   1 -
 cpp/src/barrier/second_order_cone_kernels.cuh | 601 ++++++++++++++++--
 .../barrier/second_order_cone_reduction.cuh   | 116 ++--
 .../dual_simplex/simplex_solver_settings.hpp  |   2 +
 cpp/src/math_optimization/solver_settings.cu  |   1 +
 cpp/src/pdlp/solve.cu                         |   1 +
 cpp/tests/socp/CMakeLists.txt                 |   1 +
 cpp/tests/socp/second_order_cone_kernels.cu   | 355 ++++++++++-
 cpp/tests/socp/solve_barrier_socp.cu          | 210 ++++++
 cpp/tests/socp/sparse_augmented_kkt_test.cu   | 353 ++++++++++
 13 files changed, 1920 insertions(+), 214 deletions(-)
 create mode 100644 cpp/tests/socp/sparse_augmented_kkt_test.cu
diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index 024fae7583..041cc38688 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -48,6 +48,7 @@
 #define CUOPT_ORDERING                             "ordering"
 #define CUOPT_BARRIER_DUAL_INITIAL_POINT           "barrier_dual_initial_point"
 #define CUOPT_BARRIER_ITERATIVE_REFINEMENT         "barrier_iterative_refinement"
+#define CUOPT_BARRIER_SOC_THRESHOLD                "barrier_soc_threshold"
 #define CUOPT_BARRIER_STEP_SCALE                   "barrier_step_scale"
 #define CUOPT_ELIMINATE_DENSE_COLUMNS              "eliminate_dense_columns"
 #define CUOPT_CUDSS_DETERMINISTIC                  "cudss_deterministic"
diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
index b30286f9ce..dd1979de6c 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
@@ -283,6 +283,7 @@ class pdlp_solver_settings_t {
   bool eliminate_dense_columns{true};
   pdlp_precision_t pdlp_precision{pdlp_precision_t::DefaultPrecision};
   bool barrier_iterative_refinement{true};
+  i_t barrier_soc_threshold{5};
   f_t barrier_step_scale{0.9};
   bool save_best_primal_so_far{false};
   /**
diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu
index 1cbeed5cf3..845624b6ef 100644
--- a/cpp/src/barrier/barrier.cu
+++ b/cpp/src/barrier/barrier.cu
@@ -40,6 +40,8 @@
 #include <optional>
 #include <span>
 
+#include <cstdio>
+
 #include <raft/sparse/detail/cusparse_wrappers.h>
 #include <raft/core/nvtx.hpp>
 #include <raft/linalg/dot.cuh>
@@ -262,6 +264,16 @@ class iteration_data_t {
       d_augmented_diagonal_indices_(0, lp.handle_ptr->get_stream()),
       d_cone_csr_indices_(0, lp.handle_ptr->get_stream()),
       d_cone_Q_values_(0, lp.handle_ptr->get_stream()),
+      d_dense_cone_block_offsets_(0, lp.handle_ptr->get_stream()),
+      d_dense_cone_ids_(0, lp.handle_ptr->get_stream()),
+      d_sparse_hessian_diag_(0, lp.handle_ptr->get_stream()),
+      d_sparse_hessian_Q_(0, lp.handle_ptr->get_stream()),
+      d_sparse_exp_v_col_(0, lp.handle_ptr->get_stream()),
+      d_sparse_exp_u_col_(0, lp.handle_ptr->get_stream()),
+      d_sparse_exp_v_row_(0, lp.handle_ptr->get_stream()),
+      d_sparse_exp_u_row_(0, lp.handle_ptr->get_stream()),
+      d_sparse_expansion_D_(0, lp.handle_ptr->get_stream()),
+      d_sparse_Hs_diag_(0, lp.handle_ptr->get_stream()),
       use_augmented(false),
       has_factorization(false),
       n_direct_free_linear(0),
@@ -406,7 +418,8 @@ class iteration_data_t {
         std::span<const i_t>(lp.second_order_cone_dims.data(), lp.second_order_cone_dims.size()),
         raft::device_span<f_t>{},
         raft::device_span<f_t>{},
-        stream_view_);
+        stream_view_,
+        settings.barrier_soc_threshold);
       cuopt_assert(cone_count() > 0, "second-order cone topology must contain at least one cone");
       cuopt_assert(cone_entry_count() == total_cone_dim, "second-order cone entry count mismatch");
     }
@@ -506,7 +519,7 @@ class iteration_data_t {
 
       if (use_augmented) {
         settings.log.printf("Linear system               : augmented\n");
-        const i_t augmented_size = lp.num_cols + lp.num_rows;
+        const i_t augmented_size = augmented_system_size(lp.num_cols, lp.num_rows);
         d_augmented_rhs_.resize(augmented_size, stream_view_);
         d_augmented_soln_.resize(augmented_size, stream_view_);
       } else {
@@ -599,8 +612,9 @@ class iteration_data_t {
     if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
     {
       raft::common::nvtx::range scope("Barrier: LP Data: Cholesky init");
-      i_t factorization_size = use_augmented ? lp.num_rows + lp.num_cols : lp.num_rows;
-      chol                   = std::make_unique<sparse_cholesky_cudss_t<i_t, f_t>>(
+      i_t factorization_size =
+        use_augmented ? augmented_system_size(lp.num_cols, lp.num_rows) : lp.num_rows;
+      chol = std::make_unique<sparse_cholesky_cudss_t<i_t, f_t>>(
         handle_ptr, settings, factorization_size);
       chol->set_positive_definite(false);
     }
@@ -653,16 +667,23 @@ class iteration_data_t {
 
   i_t cone_end() const { return cone_start() + cone_entry_count(); }
 
-  i_t linear_xz_size(std::size_t full_xz_size) const
+  i_t augmented_expansion_count() const
   {
-    return has_cones() ? cone_start() : static_cast<i_t>(full_xz_size);
+    return has_cones() && cones().has_sparse_cones() ? cones().expansion_var_count() : i_t(0);
   }
 
+  i_t augmented_system_size(i_t n, i_t m) const { return n + m + augmented_expansion_count(); }
+
   bool is_cone_variable(i_t variable) const
   {
     return has_cones() && variable >= cone_start() && variable < cone_end();
   }
 
+  i_t linear_xz_size(std::size_t full_xz_size) const
+  {
+    return has_cones() ? cone_start() : static_cast<i_t>(full_xz_size);
+  }
+
   f_t complementarity_degree(std::size_t num_primal_variables, i_t num_upper_bounds) const
   {
     const bool has_soc = has_cones();
@@ -678,52 +699,89 @@ class iteration_data_t {
 
   void form_augmented(bool first_call = false)
   {
-    i_t n                  = A.n;
-    i_t m                  = A.m;
-    i_t nnzA               = A.col_start[n];
-    i_t nnzQ               = Q.n > 0 ? Q.col_start[n] : 0;
-    i_t factorization_size = n + m;
+    i_t n    = A.n;
+    i_t m    = A.m;
+    i_t nnzA = A.col_start[n];
+    i_t nnzQ = Q.n > 0 ? Q.col_start[n] : 0;
+
+    const bool has_soc     = has_cones();
+    const i_t m_c          = cone_entry_count();
+    const i_t p            = augmented_expansion_count();
+    i_t factorization_size = augmented_system_size(n, m);
 
-    const bool has_soc  = has_cones();
-    const i_t m_c       = cone_entry_count();
-    i_t total_block_nnz = 0;
+    i_t dense_soc_kkt_nnz = 0;
 
     std::vector<std::size_t> cone_offsets_host;
-    std::vector<std::size_t> cone_block_offsets_host;
+    std::vector<std::size_t> dense_cone_block_offsets_host;
+    std::vector<i_t> dense_cone_ids_host;
+    std::vector<i_t> cone_sparse_idx;
+    std::vector<i_t> sparse_cone_ids_host;
+    std::vector<std::size_t> sparse_entry_offsets;
     std::vector<i_t> local_to_cone;
 
     if (first_call) {
+      const i_t n_sparse_soc = has_soc ? cones().n_sparse_cones : i_t(0);
       if (has_soc) {
         raft::common::nvtx::range scope("Barrier: augmented: SOC offsets");
-        // Initialize the cone block offsets
         const i_t n_cones = cone_count();
+        // TODO: Once form_augmented() is moved to GPU, we can avoid D2H round-trip.
         cone_offsets_host.resize(n_cones + 1);
-        cone_block_offsets_host.resize(n_cones + 1);
         raft::copy(
           cone_offsets_host.data(), cones().cone_offsets.data(), n_cones + 1, stream_view_);
         handle_ptr->sync_stream();
+
+        cone_sparse_idx.assign(n_cones, i_t(-1));
+        if (n_sparse_soc > 0) {
+          // TODO: Once form_augmented() is moved to GPU, we can avoid D2H round-trip for
+          // sparse_cone_ids / sparse_cone_dims.
+          sparse_cone_ids_host = cuopt::host_copy(cones().sparse_cone_ids, stream_view_);
+          for (i_t s = 0; s < n_sparse_soc; ++s) {
+            cone_sparse_idx[sparse_cone_ids_host[s]] = s;
+          }
+          sparse_entry_offsets.resize(n_sparse_soc + 1, 0);
+          const auto sparse_dims_host = cuopt::host_copy(cones().sparse_cone_dims, stream_view_);
+          for (i_t s = 0; s < n_sparse_soc; ++s) {
+            sparse_entry_offsets[s + 1] = sparse_entry_offsets[s] + sparse_dims_host[s];
+          }
+        }
+
+        dense_cone_ids_host.reserve(n_cones);
+        dense_cone_block_offsets_host = {0};
+        dense_cone_block_offsets_host.reserve(n_cones + 1);
         for (i_t k = 0; k < n_cones; ++k) {
-          const auto q_k                 = cone_offsets_host[k + 1] - cone_offsets_host[k];
-          cone_block_offsets_host[k + 1] = cone_block_offsets_host[k] + q_k * q_k;
+          if (cone_sparse_idx[k] < 0) {
+            dense_cone_ids_host.push_back(k);
+            const auto q_k = cone_offsets_host[k + 1] - cone_offsets_host[k];
+            dense_cone_block_offsets_host.push_back(dense_cone_block_offsets_host.back() +
+                                                    q_k * q_k);
+          }
         }
-        total_block_nnz = static_cast<i_t>(cone_block_offsets_host[n_cones]);
+        dense_soc_kkt_nnz = static_cast<i_t>(dense_cone_block_offsets_host.back());
 
-        // Precompute: for each local cone entry, which cone does it belong to?
         local_to_cone.resize(m_c);
         for (i_t k = 0; k < n_cones; ++k) {
           const i_t lo = cone_offsets_host[k];
           const i_t hi = cone_offsets_host[k + 1];
-          for (i_t p = lo; p < hi; p++) {
-            local_to_cone[p] = k;
+          for (i_t idx = lo; idx < hi; idx++) {
+            local_to_cone[idx] = k;
           }
         }
       }
 
-      i_t new_nnz = 2 * nnzA + n + m + nnzQ + total_block_nnz;  // conservative estimate of nnz
-      csr_matrix_t<i_t, f_t> augmented_CSR(n + m, n + m, new_nnz);
-      std::vector<i_t> augmented_diagonal_indices(n + m, -1);
-      std::vector<i_t> cone_csr_indices_host(total_block_nnz, -1);
-      std::vector<f_t> cone_Q_values_host(total_block_nnz, f_t(0));
+      const size_t n_sparse_entries = p > 0 ? cones().n_sparse_cone_entries : size_t{0};
+      const i_t sparse_soc_kkt_nnz  = static_cast<i_t>(5 * n_sparse_entries + p);
+      i_t new_nnz = 2 * nnzA + n + m + p + nnzQ + dense_soc_kkt_nnz + sparse_soc_kkt_nnz;
+      csr_matrix_t<i_t, f_t> augmented_CSR(factorization_size, factorization_size, new_nnz);
+      std::vector<i_t> augmented_diagonal_indices(factorization_size, -1);
+      std::vector<i_t> cone_csr_indices_host(dense_soc_kkt_nnz, -1);
+      std::vector<f_t> cone_Q_values_host(dense_soc_kkt_nnz, f_t(0));
+      std::vector<i_t> sparse_hessian_diag_host(n_sparse_entries, -1);
+      std::vector<f_t> sparse_hessian_Q_host(n_sparse_entries, f_t(0));
+      std::vector<i_t> sparse_exp_v_col_host(n_sparse_entries, -1);
+      std::vector<i_t> sparse_exp_u_col_host(n_sparse_entries, -1);
+      std::vector<i_t> sparse_exp_v_row_host(n_sparse_entries, -1);
+      std::vector<i_t> sparse_exp_u_row_host(n_sparse_entries, -1);
+      std::vector<i_t> sparse_expansion_D_host(p, -1);
       i_t q            = 0;
       i_t off_diag_Qnz = 0;
 
@@ -735,50 +793,101 @@ class iteration_data_t {
           const bool is_cone_row = is_cone_variable(i);
 
           if (is_cone_row) {
-            // Determine which cone this variable belongs to and its local row
             i_t local_idx = i - cone_start();
             i_t k         = local_to_cone[local_idx];
             i_t local_r =
               static_cast<i_t>(static_cast<std::size_t>(local_idx) - cone_offsets_host[k]);
             i_t q_k            = static_cast<i_t>(cone_offsets_host[k + 1] - cone_offsets_host[k]);
             i_t cone_col_start = cone_start() + static_cast<i_t>(cone_offsets_host[k]);
-            i_t block_base     = static_cast<i_t>(cone_block_offsets_host[k]) + local_r * q_k;
+            i_t cone_col_end   = cone_col_start + q_k;
 
-            // Merge-join: Q entries (sorted) with dense cone block columns (contiguous)
             i_t qp    = (nnzQ > 0) ? Q.col_start[i] : 0;
             i_t q_end = (nnzQ > 0) ? Q.col_start[i + 1] : 0;
 
-            // Q entries before cone block
-            for (; qp < q_end && Q.i[qp] < cone_col_start; qp++) {
-              augmented_CSR.j[q]   = Q.i[qp];
-              augmented_CSR.x[q++] = -Q.x[qp];
-              off_diag_Qnz++;
-            }
-
-            // Dense cone block, absorbing any Q entries that fall inside
-            for (i_t c = 0; c < q_k; c++) {
-              i_t col         = cone_col_start + c;
-              f_t q_contrib   = f_t(0);
-              f_t initial_val = (c == local_r) ? f_t(-dual_perturb)
-                                               : f_t(0);  // diagonal entry of the cone block column
+            // Row of sparse SOC.
+            if (cone_sparse_idx[k] >= 0) {
+              const i_t sparse_idx  = cone_sparse_idx[k];
+              const size_t flat_idx = sparse_entry_offsets[sparse_idx] + local_r;
+              const i_t exp_v_col   = n + m + 2 * sparse_idx;
+              const i_t exp_u_col   = n + m + 2 * sparse_idx + 1;
+
+              // Handle off-diagonal entries of sparse SOC before diagonal entry.
+              for (; qp < q_end && Q.i[qp] < cone_col_start; qp++) {
+                augmented_CSR.j[q]   = Q.i[qp];
+                augmented_CSR.x[q++] = -Q.x[qp];
+                off_diag_Qnz++;
+              }
 
-              if (qp < q_end && Q.i[qp] == col) {
+              // Handle diagonal entry of sparse SOC.
+              f_t q_contrib = f_t(0);
+              if (qp < q_end && Q.i[qp] == i) {
                 q_contrib = Q.x[qp];
                 qp++;
               }
+              sparse_hessian_diag_host[flat_idx] = q;
+              sparse_hessian_Q_host[flat_idx]    = q_contrib;
+              augmented_diagonal_indices[i]      = q;
+              augmented_CSR.j[q]                 = i;
+              augmented_CSR.x[q++]               = -dual_perturb - q_contrib;
+
+              // Handle expansion columns of sparse SOC.
+              sparse_exp_v_col_host[flat_idx] = q;
+              augmented_CSR.j[q]              = exp_v_col;
+              augmented_CSR.x[q++]            = f_t(0);
+
+              sparse_exp_u_col_host[flat_idx] = q;
+              augmented_CSR.j[q]              = exp_u_col;
+              augmented_CSR.x[q++]            = f_t(0);
+
+              // Handle off-diagonal entries of dense SOC before diagonal entry.
+              for (; qp < q_end && Q.i[qp] < cone_col_end; qp++) {
+                if (Q.i[qp] == i) { continue; }
+                augmented_CSR.j[q]   = Q.i[qp];
+                augmented_CSR.x[q++] = -Q.x[qp];
+                off_diag_Qnz++;
+              }
+              for (; qp < q_end; qp++) {
+                augmented_CSR.j[q]   = Q.i[qp];
+                augmented_CSR.x[q++] = -Q.x[qp];
+                off_diag_Qnz++;
+              }
+            } else {
+              // Row of dense SOC.
+              i_t dense_idx = 0;
+              for (i_t t = 0; t < k; ++t) {
+                if (cone_sparse_idx[t] < 0) { ++dense_idx; }
+              }
+              i_t block_base =
+                static_cast<i_t>(dense_cone_block_offsets_host[dense_idx]) + local_r * q_k;
 
-              cone_csr_indices_host[block_base + c] = q;
-              cone_Q_values_host[block_base + c]    = q_contrib;
-              if (col == i) { augmented_diagonal_indices[i] = q; }
-              augmented_CSR.j[q]   = col;
-              augmented_CSR.x[q++] = initial_val - q_contrib;
-            }
+              for (; qp < q_end && Q.i[qp] < cone_col_start; qp++) {
+                augmented_CSR.j[q]   = Q.i[qp];
+                augmented_CSR.x[q++] = -Q.x[qp];
+                off_diag_Qnz++;
+              }
+
+              for (i_t c = 0; c < q_k; c++) {
+                i_t col         = cone_col_start + c;
+                f_t q_contrib   = f_t(0);
+                f_t initial_val = (c == local_r) ? f_t(-dual_perturb) : f_t(0);
+
+                if (qp < q_end && Q.i[qp] == col) {
+                  q_contrib = Q.x[qp];
+                  qp++;
+                }
+
+                cone_csr_indices_host[block_base + c] = q;
+                cone_Q_values_host[block_base + c]    = q_contrib;
+                if (col == i) { augmented_diagonal_indices[i] = q; }
+                augmented_CSR.j[q]   = col;
+                augmented_CSR.x[q++] = initial_val - q_contrib;
+              }
 
-            // Q entries after cone block
-            for (; qp < q_end; qp++) {
-              augmented_CSR.j[q]   = Q.i[qp];
-              augmented_CSR.x[q++] = -Q.x[qp];
-              off_diag_Qnz++;
+              for (; qp < q_end; qp++) {
+                augmented_CSR.j[q]   = Q.i[qp];
+                augmented_CSR.x[q++] = -Q.x[qp];
+                off_diag_Qnz++;
+              }
             }
           } else if (nnzQ == 0) {
             augmented_diagonal_indices[i] = q;
@@ -816,27 +925,77 @@ class iteration_data_t {
         }
 
         for (i_t k = n; k < n + m; ++k) {
-          // A block, we can use AT in csc directly
           augmented_CSR.row_start[k] = q;
           const i_t l                = k - n;
           const i_t col_beg          = AT.col_start[l];
           const i_t col_end          = AT.col_start[l + 1];
-          for (i_t p = col_beg; p < col_end; ++p) {
-            augmented_CSR.j[q]   = AT.i[p];
-            augmented_CSR.x[q++] = AT.x[p];
+          for (i_t idx = col_beg; idx < col_end; ++idx) {
+            augmented_CSR.j[q]   = AT.i[idx];
+            augmented_CSR.x[q++] = AT.x[idx];
           }
           augmented_diagonal_indices[k] = q;
           augmented_CSR.j[q]            = k;
           augmented_CSR.x[q++]          = primal_perturb;
         }
-        augmented_CSR.row_start[n + m] = q;
-        augmented_CSR.nz_max           = q;
+
+        // Handle expansion rows of sparse SOC.
+        for (i_t s = 0; s < n_sparse_soc; ++s) {
+          const i_t k   = sparse_cone_ids_host[s];
+          const i_t q_k = static_cast<i_t>(cone_offsets_host[k + 1] - cone_offsets_host[k]);
+          const i_t cone_col_start = cone_start() + static_cast<i_t>(cone_offsets_host[k]);
+          const size_t flat_base   = sparse_entry_offsets[s];
+          const i_t prow_v         = n + m + 2 * s;
+          const i_t prow_u         = n + m + 2 * s + 1;
+
+          augmented_CSR.row_start[prow_v] = q;
+          for (i_t j = 0; j < q_k; ++j) {
+            sparse_exp_v_row_host[flat_base + j] = q;
+            augmented_CSR.j[q]                   = cone_col_start + j;
+            augmented_CSR.x[q++]                 = f_t(0);
+          }
+          sparse_expansion_D_host[2 * s] = q;
+          augmented_CSR.j[q]             = prow_v;
+          augmented_CSR.x[q++]           = f_t(0);
+
+          augmented_CSR.row_start[prow_u] = q;
+          for (i_t j = 0; j < q_k; ++j) {
+            sparse_exp_u_row_host[flat_base + j] = q;
+            augmented_CSR.j[q]                   = cone_col_start + j;
+            augmented_CSR.x[q++]                 = f_t(0);
+          }
+          sparse_expansion_D_host[2 * s + 1] = q;
+          augmented_CSR.j[q]                 = prow_u;
+          augmented_CSR.x[q++]               = f_t(0);
+        }
+
+        augmented_CSR.row_start[factorization_size] = q;
+        augmented_CSR.nz_max                        = q;
         augmented_CSR.j.resize(q);
         augmented_CSR.x.resize(q);
-        i_t expected_nnz = 2 * nnzA + (n - m_c) + total_block_nnz + m + off_diag_Qnz;
+        i_t expected_nnz =
+          2 * nnzA + (n - m_c) + dense_soc_kkt_nnz + m + off_diag_Qnz + sparse_soc_kkt_nnz;
         settings_.log.debug("augmented nz %d predicted %d\n", q, expected_nnz);
-        cuopt_assert(q == expected_nnz, "augmented nnz != predicted");
+        cuopt_assert(q == expected_nnz, "augmented nz != predicted");
         cuopt_assert(A.col_start[n] == AT.col_start[m], "A nz != AT nz");
+
+        if (n_sparse_entries > 0) {
+          for (size_t e = 0; e < n_sparse_entries; ++e) {
+            cuopt_assert(sparse_hessian_diag_host[e] >= 0,
+                         "sparse Hessian diagonal CSR index unset");
+            cuopt_assert(sparse_exp_v_col_host[e] >= 0,
+                         "sparse expansion v-column CSR index unset");
+            cuopt_assert(sparse_exp_u_col_host[e] >= 0,
+                         "sparse expansion u-column CSR index unset");
+            cuopt_assert(sparse_exp_v_row_host[e] >= 0, "sparse expansion v-row CSR index unset");
+            cuopt_assert(sparse_exp_u_row_host[e] >= 0, "sparse expansion u-row CSR index unset");
+          }
+          for (i_t s = 0; s < n_sparse_soc; ++s) {
+            cuopt_assert(sparse_expansion_D_host[2 * s] >= 0,
+                         "sparse expansion v diagonal CSR index unset");
+            cuopt_assert(sparse_expansion_D_host[2 * s + 1] >= 0,
+                         "sparse expansion u diagonal CSR index unset");
+          }
+        }
       }
 
       {
@@ -850,16 +1009,72 @@ class iteration_data_t {
                    handle_ptr->get_stream());
 
         if (has_soc) {
-          d_cone_csr_indices_.resize(total_block_nnz, handle_ptr->get_stream());
+          d_cone_csr_indices_.resize(dense_soc_kkt_nnz, handle_ptr->get_stream());
           raft::copy(d_cone_csr_indices_.data(),
                      cone_csr_indices_host.data(),
-                     total_block_nnz,
+                     dense_soc_kkt_nnz,
                      handle_ptr->get_stream());
-          d_cone_Q_values_.resize(total_block_nnz, handle_ptr->get_stream());
+          d_cone_Q_values_.resize(dense_soc_kkt_nnz, handle_ptr->get_stream());
           raft::copy(d_cone_Q_values_.data(),
                      cone_Q_values_host.data(),
-                     total_block_nnz,
+                     dense_soc_kkt_nnz,
+                     handle_ptr->get_stream());
+
+          d_dense_cone_block_offsets_.resize(dense_cone_block_offsets_host.size(),
+                                             handle_ptr->get_stream());
+          raft::copy(d_dense_cone_block_offsets_.data(),
+                     dense_cone_block_offsets_host.data(),
+                     dense_cone_block_offsets_host.size(),
                      handle_ptr->get_stream());
+          d_dense_cone_ids_.resize(dense_cone_ids_host.size(), handle_ptr->get_stream());
+          if (!dense_cone_ids_host.empty()) {
+            raft::copy(d_dense_cone_ids_.data(),
+                       dense_cone_ids_host.data(),
+                       dense_cone_ids_host.size(),
+                       handle_ptr->get_stream());
+          }
+
+          const size_t n_sparse_entries = cones().n_sparse_cone_entries;
+          d_sparse_hessian_diag_.resize(n_sparse_entries, handle_ptr->get_stream());
+          d_sparse_hessian_Q_.resize(n_sparse_entries, handle_ptr->get_stream());
+          d_sparse_exp_v_col_.resize(n_sparse_entries, handle_ptr->get_stream());
+          d_sparse_exp_u_col_.resize(n_sparse_entries, handle_ptr->get_stream());
+          d_sparse_exp_v_row_.resize(n_sparse_entries, handle_ptr->get_stream());
+          d_sparse_exp_u_row_.resize(n_sparse_entries, handle_ptr->get_stream());
+          d_sparse_expansion_D_.resize(p, handle_ptr->get_stream());
+          d_sparse_Hs_diag_.resize(n_sparse_entries, handle_ptr->get_stream());
+          if (n_sparse_entries > 0) {
+            raft::copy(d_sparse_hessian_diag_.data(),
+                       sparse_hessian_diag_host.data(),
+                       n_sparse_entries,
+                       handle_ptr->get_stream());
+            raft::copy(d_sparse_hessian_Q_.data(),
+                       sparse_hessian_Q_host.data(),
+                       n_sparse_entries,
+                       handle_ptr->get_stream());
+            raft::copy(d_sparse_exp_v_col_.data(),
+                       sparse_exp_v_col_host.data(),
+                       n_sparse_entries,
+                       handle_ptr->get_stream());
+            raft::copy(d_sparse_exp_u_col_.data(),
+                       sparse_exp_u_col_host.data(),
+                       n_sparse_entries,
+                       handle_ptr->get_stream());
+            raft::copy(d_sparse_exp_v_row_.data(),
+                       sparse_exp_v_row_host.data(),
+                       n_sparse_entries,
+                       handle_ptr->get_stream());
+            raft::copy(d_sparse_exp_u_row_.data(),
+                       sparse_exp_u_row_host.data(),
+                       n_sparse_entries,
+                       handle_ptr->get_stream());
+          }
+          if (p > 0) {
+            raft::copy(d_sparse_expansion_D_.data(),
+                       sparse_expansion_D_host.data(),
+                       sparse_expansion_D_host.size(),
+                       handle_ptr->get_stream());
+          }
         }
 
         handle_ptr->sync_stream();
@@ -868,11 +1083,11 @@ class iteration_data_t {
       csc_matrix_t<i_t, f_t> augmented_transpose(1, 1, 1);
       augmented.transpose(augmented_transpose);
       settings_.log.printf("Aug nnz %d Aug^T nnz %d\n",
-                           augmented.col_start[m + n],
-                           augmented_transpose.col_start[m + n]);
+                           augmented.col_start[factorization_size],
+                           augmented_transpose.col_start[factorization_size]);
       augmented.check_matrix();
       augmented_transpose.check_matrix();
-      csc_matrix_t<i_t, f_t> error(m + n, m + n, 1);
+      csc_matrix_t<i_t, f_t> error(factorization_size, factorization_size, 1);
       add(augmented, augmented_transpose, 1.0, -1.0, error);
       settings_.log.printf("|| Aug - Aug^T ||_1 %e\n", error.norm1());
       cuopt_assert(error.norm1() <= 1e-2, "|| Aug - Aug^T ||_1 > 1e-2");
@@ -909,14 +1124,41 @@ class iteration_data_t {
       RAFT_CHECK_CUDA(handle_ptr->get_stream());
 
       if (has_soc) {
-        scatter_hessian_into_augmented(cones(),
-                                       device_augmented.x,
-                                       d_cone_csr_indices_,
-                                       d_cone_Q_values_,
-                                       handle_ptr->get_stream(),
-                                       dual_perturb);
-        RAFT_CHECK_CUDA(handle_ptr->get_stream());
+        if (cones().has_sparse_cones()) {
+          launch_get_Hs_sparse(cones(), d_sparse_Hs_diag_, handle_ptr->get_stream());
+          scatter_sparse_hessian_diag_into_augmented(device_augmented.x,
+                                                     d_sparse_hessian_diag_,
+                                                     d_sparse_Hs_diag_,
+                                                     d_sparse_hessian_Q_,
+                                                     handle_ptr->get_stream(),
+                                                     dual_perturb);
+          update_sparse_expansion_in_augmented(device_augmented.x,
+                                               d_sparse_exp_v_col_,
+                                               d_sparse_exp_u_col_,
+                                               d_sparse_exp_v_row_,
+                                               d_sparse_exp_u_row_,
+                                               d_sparse_expansion_D_,
+                                               cones().sparse_v,
+                                               cones().sparse_u,
+                                               cones().eta,
+                                               cones().sparse_cone_ids,
+                                               handle_ptr->get_stream(),
+                                               dual_perturb);
+          RAFT_CHECK_CUDA(handle_ptr->get_stream());
+        }
+        if (cones().n_dense_cones() > 0) {
+          scatter_dense_hessian_into_augmented(cones(),
+                                               device_augmented.x,
+                                               d_cone_csr_indices_,
+                                               d_cone_Q_values_,
+                                               d_dense_cone_block_offsets_,
+                                               d_dense_cone_ids_,
+                                               handle_ptr->get_stream(),
+                                               dual_perturb);
+          RAFT_CHECK_CUDA(handle_ptr->get_stream());
+        }
       }
+      handle_ptr->sync_stream();
     }
   }
 
@@ -1400,6 +1642,15 @@ class iteration_data_t {
     }
   }
 
+  void restore_saved_iterate()
+  {
+    x = x_save;
+    y = y_save;
+    z = z_save;
+    v = v_save;
+    w = w_save;
+  }
+
   void to_solution(const lp_problem_t<i_t, f_t>& lp,
                    i_t iterations,
                    f_t objective,
@@ -1809,18 +2060,28 @@ class iteration_data_t {
     const i_t m        = A.m;
     const i_t n        = A.n;
     const bool has_soc = has_cones();
+    const i_t p        = augmented_expansion_count();
+    const i_t sys_size = augmented_system_size(n, m);
+    cuopt_assert(static_cast<i_t>(x.size()) >= sys_size, "augmented_multiply: x too small");
+    cuopt_assert(static_cast<i_t>(y.size()) >= sys_size, "augmented_multiply: y too small");
 
     rmm::device_uvector<f_t> d_x1(n, handle_ptr->get_stream());
     rmm::device_uvector<f_t> d_x2(m, handle_ptr->get_stream());
     rmm::device_uvector<f_t> d_y1(n, handle_ptr->get_stream());
     rmm::device_uvector<f_t> d_y2(m, handle_ptr->get_stream());
+    rmm::device_uvector<f_t> d_y_exp(p, handle_ptr->get_stream());
+    rmm::device_uvector<f_t> d_y_exp_orig(p, handle_ptr->get_stream());
 
     raft::copy(d_x1.data(), x.data(), n, handle_ptr->get_stream());
     raft::copy(d_x2.data(), x.data() + n, m, handle_ptr->get_stream());
     raft::copy(d_y1.data(), y.data(), n, handle_ptr->get_stream());
     raft::copy(d_y2.data(), y.data() + n, m, handle_ptr->get_stream());
+    if (p > 0) {
+      raft::copy(d_y_exp_orig.data(), y.data() + n + m, p, handle_ptr->get_stream());
+      thrust::fill_n(rmm::exec_policy(stream_view_), d_y_exp.begin(), p, f_t(0));
+    }
 
-    // y1 <- alpha ( -D * x_1 + A^T x_2) + beta * y1
+    // y1 <- alpha ( -(Q + D + H) * x_1 + A^T x_2) + beta * y1
 
     rmm::device_uvector<f_t> d_r1(n, handle_ptr->get_stream());
     thrust::fill_n(rmm::exec_policy(stream_view_), d_r1.begin(), n, f_t(0));
@@ -1835,14 +2096,31 @@ class iteration_data_t {
                                               stream_view_);
     RAFT_CHECK_CUDA(stream_view_);
 
-    // r1 <- D * x_1 + H x_1  (cone Hessian block H = S^T S; accumulate_cone_hessian_matvec)
+    // r1 <- D * x_1 + H x_1 on cone rows
+    // (dense cones: explicit dense H block; sparse cones: rank-2 expansion, which adds
+    //  Hs_diag .* x_cone to r1 here and writes the expansion rows into d_y_exp)
     if (has_soc) {
       const i_t m_c = cone_entry_count();
-      accumulate_cone_hessian_matvec(raft::device_span<const f_t>(d_x1.data() + cone_start(), m_c),
-                                     cones(),
-                                     raft::device_span<f_t>(d_r1.data() + cone_start(), m_c),
-                                     stream_view_);
-      RAFT_CHECK_CUDA(stream_view_);
+      if (cones().has_sparse_cones()) {
+        launch_sparse_augmented_matvec(
+          raft::device_span<const f_t>(x.data(), x.size()),
+          raft::device_span<f_t>(d_r1.data(), d_r1.size()),
+          raft::device_span<f_t>(d_y_exp.data(), d_y_exp.size()),
+          cones(),
+          raft::device_span<const f_t>(d_sparse_Hs_diag_.data(), d_sparse_Hs_diag_.size()),
+          cone_start(),
+          n,
+          m,
+          handle_ptr->get_stream());
+        RAFT_CHECK_CUDA(stream_view_);
+      }
+      if (cones().n_dense_cones() > 0) {
+        launch_dense_hessian_matvec(raft::device_span<const f_t>(d_x1.data() + cone_start(), m_c),
+                                    cones(),
+                                    raft::device_span<f_t>(d_r1.data() + cone_start(), m_c),
+                                    stream_view_);
+        RAFT_CHECK_CUDA(stream_view_);
+      }
     }
 
     // r1 <- Q x1 + D x1 + H x1  (cone: same H as above)
@@ -1861,8 +2139,13 @@ class iteration_data_t {
     // matrix_vector_multiply(A, alpha, x1, beta, y2);
     cusparse_view_.spmv(alpha, d_x1, beta, d_y2);
 
+    if (p > 0) {
+      axpy(alpha, d_y_exp.data(), beta, d_y_exp_orig.data(), d_y_exp.data(), p, stream_view_);
+    }
+
     raft::copy(y.data(), d_y1.data(), n, stream_view_);
     raft::copy(y.data() + n, d_y2.data(), m, stream_view_);
+    if (p > 0) { raft::copy(y.data() + n + m, d_y_exp.data(), p, stream_view_); }
     handle_ptr->sync_stream();
   }
 
@@ -1944,6 +2227,16 @@ class iteration_data_t {
   rmm::device_uvector<i_t> d_augmented_diagonal_indices_;
   rmm::device_uvector<i_t> d_cone_csr_indices_;
   rmm::device_uvector<f_t> d_cone_Q_values_;
+  rmm::device_uvector<size_t> d_dense_cone_block_offsets_;
+  rmm::device_uvector<i_t> d_dense_cone_ids_;
+  rmm::device_uvector<i_t> d_sparse_hessian_diag_;
+  rmm::device_uvector<f_t> d_sparse_hessian_Q_;
+  rmm::device_uvector<i_t> d_sparse_exp_v_col_;
+  rmm::device_uvector<i_t> d_sparse_exp_u_col_;
+  rmm::device_uvector<i_t> d_sparse_exp_v_row_;
+  rmm::device_uvector<i_t> d_sparse_exp_u_row_;
+  rmm::device_uvector<i_t> d_sparse_expansion_D_;
+  rmm::device_uvector<f_t> d_sparse_Hs_diag_;
   bool indefinite_Q;
   cusparse_view_t<i_t, f_t> cusparse_Q_view_;
 
@@ -2064,7 +2357,7 @@ void cholesky_debug_check(const iteration_data_t<i_t, f_t>& data,
   // return;
   srand(42);
 
-  i_t vec_size = use_augmented ? lp.num_cols + lp.num_rows : lp.num_rows;
+  i_t vec_size = use_augmented ? data.augmented_system_size(lp.num_cols, lp.num_rows) : lp.num_rows;
   // 1. Create a random test vector
   dense_vector_t<i_t, f_t> test_vec(vec_size);
   for (size_t i = 0; i < test_vec.size(); i++) {
@@ -2224,14 +2517,16 @@ int barrier_solver_t<i_t, f_t>::initial_point(iteration_data_t<i_t, f_t>& data)
   data.inv_diag.pairwise_product(Fu, DinvFu);
   dense_vector_t<i_t, f_t> q(lp.num_rows);
   if (use_augmented) {
-    dense_vector_t<i_t, f_t> rhs(lp.num_cols + lp.num_rows);
+    const i_t aug_size = data.augmented_system_size(lp.num_cols, lp.num_rows);
+    dense_vector_t<i_t, f_t> rhs(aug_size);
+    rhs.set_scalar(0.0);
     for (i_t k = 0; k < lp.num_cols; k++) {
       rhs[k] = -Fu[k];
     }
     for (i_t k = 0; k < lp.num_rows; k++) {
       rhs[lp.num_cols + k] = rhs_x[k];
     }
-    dense_vector_t<i_t, f_t> soln(lp.num_cols + lp.num_rows);
+    dense_vector_t<i_t, f_t> soln(aug_size);
     i_t solve_status = data.chol->solve(rhs, soln);
     struct op_t {
       op_t(iteration_data_t<i_t, f_t>& data) : data_(data) {}
@@ -2374,12 +2669,13 @@ int barrier_solver_t<i_t, f_t>::initial_point(iteration_data_t<i_t, f_t>& data)
       }
     }
   } else if (use_augmented) {
-    dense_vector_t<i_t, f_t> dual_rhs(lp.num_cols + lp.num_rows);
+    const i_t aug_size = data.augmented_system_size(lp.num_cols, lp.num_rows);
+    dense_vector_t<i_t, f_t> dual_rhs(aug_size);
     dual_rhs.set_scalar(0.0);
     for (i_t k = 0; k < lp.num_cols; k++) {
       dual_rhs[k] = data.c[k];
     }
-    dense_vector_t<i_t, f_t> py(lp.num_cols + lp.num_rows);
+    dense_vector_t<i_t, f_t> py(aug_size);
     data.chol->solve(dual_rhs, py);
     for (i_t k = 0; k < lp.num_cols; k++) {
       data.z[k] = py[k];
@@ -2646,6 +2942,7 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
     cones.x     = raft::device_span<f_t>(data.d_x_.data() + cone_var_start, m_c);
     cones.z     = raft::device_span<f_t>(data.d_z_.data() + cone_var_start, m_c);
     launch_nt_scaling(cones, stream_view_);
+    if (cones.has_sparse_cones()) { launch_update_scaling_sparse(cones, stream_view_); }
   }
 
   max_residual = 0.0;
@@ -2852,6 +3149,14 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
     raft::copy(data.d_augmented_rhs_.data(), data.d_r1_.data(), lp.num_cols, stream_view_);
     raft::copy(
       data.d_augmented_rhs_.data() + lp.num_cols, data.d_h_.data(), lp.num_rows, stream_view_);
+    const i_t expansion_count = data.augmented_expansion_count();
+    if (expansion_count > 0) {
+      const i_t expansion_start = lp.num_cols + lp.num_rows;
+      thrust::fill_n(rmm::exec_policy(stream_view_),
+                     data.d_augmented_rhs_.begin() + expansion_start,
+                     expansion_count,
+                     f_t(0));
+    }
     data.chol->solve(data.d_augmented_rhs_, data.d_augmented_soln_);
     struct op_t {
       op_t(iteration_data_t<i_t, f_t>& data) : data_(data) {}
@@ -3956,11 +4261,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::check_for_suboptimal_solution(
       data.relative_dual_residual_save < settings.barrier_relaxed_optimality_tol &&
       data.relative_complementarity_residual_save < settings.barrier_relaxed_complementarity_tol) {
     settings.log.printf("Restoring previous solution\n");
-    raft::copy(data.x.data(), data.d_x_.data(), data.d_x_.size(), stream_view_);
-    raft::copy(data.y.data(), data.d_y_.data(), data.d_y_.size(), stream_view_);
-    raft::copy(data.z.data(), data.d_z_.data(), data.d_z_.size(), stream_view_);
-    raft::copy(data.v.data(), data.d_v_.data(), data.d_v_.size(), stream_view_);
-    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+    data.restore_saved_iterate();
     data.to_solution(lp,
                      iter,
                      primal_objective_save,
diff --git a/cpp/src/barrier/iterative_refinement.hpp b/cpp/src/barrier/iterative_refinement.hpp
index 807d055787..bb87984185 100644
--- a/cpp/src/barrier/iterative_refinement.hpp
+++ b/cpp/src/barrier/iterative_refinement.hpp
@@ -24,7 +24,6 @@
 
 #include <algorithm>
 #include <cmath>
-#include <cstdio>
 #include <limits>
 #include <vector>
 
diff --git a/cpp/src/barrier/second_order_cone_kernels.cuh b/cpp/src/barrier/second_order_cone_kernels.cuh
index 5a674fef70..1c3e527299 100644
--- a/cpp/src/barrier/second_order_cone_kernels.cuh
+++ b/cpp/src/barrier/second_order_cone_kernels.cuh
@@ -202,12 +202,26 @@ struct cone_data_t {
   rmm::device_uvector<f_t> w;       // [n_cone_entries], unit-J-norm NT direction
   rmm::device_uvector<f_t> lambda;  // [n_cone_entries], NT point lambda = W^{-T} z
 
+  // Sparse SOC rank-2 scaling (cones with dimension > soc_threshold).
+  i_t soc_threshold;
+  i_t n_sparse_cones;
+  size_t n_sparse_cone_entries;               // sum of sparse cone dimensions
+  rmm::device_uvector<i_t> sparse_cone_ids;   // [n_sparse_cones], indices into cone arrays
+  rmm::device_uvector<i_t> sparse_cone_dims;  // [n_sparse_cones], dimension of each sparse cone
+  rmm::device_uvector<f_t> d;                 // [n_sparse_cones], corner of rank-2 diagonal block
+  rmm::device_uvector<f_t> sparse_v;  // [n_sparse_cone_entries], rank-2 vector v per sparse cone
+  rmm::device_uvector<f_t> sparse_u;  // [n_sparse_cone_entries], rank-2 vector u per sparse cone
+  rmm::device_uvector<i_t>
+    sparse_entry_offsets;  // [n_sparse_cones + 1], packed prefix offsets of sparse cone entries
+  rmm::device_uvector<i_t> cone_is_sparse;  // [n_cones], 1 if cone uses sparse KKT expansion
+
   cone_scratch_t<i_t, f_t> scratch;
 
   cone_data_t(std::span<const i_t> cone_dimensions_host,
               raft::device_span<f_t> x_in,
               raft::device_span<f_t> z_in,
-              rmm::cuda_stream_view stream)
+              rmm::cuda_stream_view stream,
+              i_t soc_threshold_in = 5)
     : n_cones(cone_dimensions_host.size()),
       n_cone_entries(
         std::reduce(cone_dimensions_host.begin(), cone_dimensions_host.end(), size_t{0})),
@@ -220,8 +234,58 @@ struct cone_data_t {
       eta(n_cones, stream),
       w(n_cone_entries, stream),
       lambda(n_cone_entries, stream),
-      scratch(n_cones, n_cone_entries, stream)
+      sparse_cone_ids(0, stream),
+      sparse_cone_dims(0, stream),
+      d(0, stream),
+      sparse_v(0, stream),
+      sparse_u(0, stream),
+      sparse_entry_offsets(0, stream),
+      cone_is_sparse(n_cones, stream),
+      scratch(n_cones, n_cone_entries, stream),
+      soc_threshold(soc_threshold_in),
+      n_sparse_cones(0),
+      n_sparse_cone_entries(0)
   {
+    thrust::fill(rmm::exec_policy(stream), cone_is_sparse.begin(), cone_is_sparse.end(), 0);
+
+    std::vector<i_t> sparse_cone_ids_host;
+    std::vector<i_t> sparse_cone_dims_host;
+    sparse_cone_ids_host.reserve(n_cones);
+    sparse_cone_dims_host.reserve(n_cones);
+    for (i_t cone = 0; cone < n_cones; ++cone) {
+      if (cone_dimensions_host[cone] > soc_threshold) {
+        sparse_cone_ids_host.push_back(cone);
+        sparse_cone_dims_host.push_back(cone_dimensions_host[cone]);
+      }
+    }
+    n_sparse_cones = static_cast<i_t>(sparse_cone_ids_host.size());
+    n_sparse_cone_entries =
+      std::reduce(sparse_cone_dims_host.begin(), sparse_cone_dims_host.end(), size_t{0});
+
+    if (n_sparse_cones > 0) {
+      sparse_cone_ids.resize(n_sparse_cones, stream);
+      sparse_cone_dims.resize(n_sparse_cones, stream);
+      d.resize(n_sparse_cones, stream);
+      sparse_v.resize(n_sparse_cone_entries, stream);
+      sparse_u.resize(n_sparse_cone_entries, stream);
+      sparse_entry_offsets.resize(n_sparse_cones + 1, stream);
+      raft::copy(sparse_cone_ids.data(), sparse_cone_ids_host.data(), n_sparse_cones, stream);
+      raft::copy(sparse_cone_dims.data(), sparse_cone_dims_host.data(), n_sparse_cones, stream);
+      std::vector<i_t> cone_is_sparse_host(n_cones, 0);
+      for (i_t cone : sparse_cone_ids_host) {
+        cone_is_sparse_host[cone] = 1;
+      }
+      raft::copy(cone_is_sparse.data(), cone_is_sparse_host.data(), n_cones, stream);
+
+      // Packed prefix offsets of sparse cone entries.
+      std::vector<i_t> sparse_entry_offsets_host(n_sparse_cones + 1, 0);
+      for (i_t s = 0; s < n_sparse_cones; ++s) {
+        sparse_entry_offsets_host[s + 1] = sparse_entry_offsets_host[s] + sparse_cone_dims_host[s];
+      }
+      raft::copy(
+        sparse_entry_offsets.data(), sparse_entry_offsets_host.data(), n_sparse_cones + 1, stream);
+    }
+
     raft::copy(cone_dimensions.data(), cone_dimensions_host.data(), n_cones, stream);
     cone_offsets.set_element_to_zero_async(0, stream);
     auto policy = rmm::exec_policy(stream);
@@ -242,6 +306,16 @@ struct cone_data_t {
                         element_cone_ids.begin());
     segmented_sum.template prepare_workspace<f_t>(stream);
   }
+
+  // True when at least one cone is large enough (dim > soc_threshold) to use the
+  // rank-2 sparse KKT expansion instead of a dense q x q block.
+  bool has_sparse_cones() const { return n_sparse_cones > 0; }
+
+  // Number of cones kept dense (dim <= soc_threshold).
+  i_t n_dense_cones() const { return n_cones - n_sparse_cones; }
+
+  // Extra augmented-system columns/rows: two expansion variables (v, u) per sparse cone.
+  i_t expansion_var_count() const { return 2 * n_sparse_cones; }
 };
 
 template <std::integral i_t, std::floating_point f_t>
@@ -482,6 +556,94 @@ void launch_nt_scaling(cone_data_t<i_t, f_t>& cones, rmm::cuda_stream_view strea
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
+// One block per sparse cone. Recompute the rank-2 factors (corner d and the
+// vectors v, u, both scaled by eta^2) from the current NT direction w so that
+// the implicit block reproduces the dense H = eta^2 (2 w w^T - J).
+template <std::integral i_t, std::floating_point f_t>
+__global__ void update_scaling_sparse_kernel(raft::device_span<const f_t> w,
+                                             raft::device_span<const f_t> eta,
+                                             raft::device_span<f_t> d,
+                                             raft::device_span<f_t> sparse_v,
+                                             raft::device_span<f_t> sparse_u,
+                                             raft::device_span<const size_t> cone_offsets,
+                                             raft::device_span<const i_t> sparse_cone_dims,
+                                             raft::device_span<const i_t> sparse_cone_ids,
+                                             raft::device_span<const i_t> sparse_entry_offsets,
+                                             i_t n_sparse_cones)
+{
+  const i_t sparse_idx = blockIdx.x;
+  if (sparse_idx >= n_sparse_cones) { return; }
+
+  __shared__ f_t s_mem[4];
+
+  const i_t cone_idx    = sparse_cone_ids[sparse_idx];
+  const size_t cone_off = cone_offsets[cone_idx];
+  const i_t cone_dim    = sparse_cone_dims[sparse_idx];
+  const i_t block_start = sparse_entry_offsets[sparse_idx];
+
+  if (threadIdx.x == 0) {
+    const f_t alpha    = f_t(2) * w[cone_off];
+    const f_t wsq      = f_t(2) * w[cone_off] * w[cone_off] - f_t(1);
+    const f_t wsq_safe = f_t(0.5) * (wsq + sqrt(wsq * wsq + f_t(1)));
+    const f_t wsqinv   = f_t(1) / wsq_safe;
+    const f_t di       = f_t(0.5) * wsqinv;
+    d[sparse_idx]      = di;
+    const f_t radicand = wsq_safe - di;
+    const f_t u0       = sqrt(max(radicand, f_t(0)));
+    const f_t u1       = (u0 > f_t(0)) ? alpha / u0 : f_t(0);
+    const f_t v0       = f_t(0);
+    const f_t denom    = f_t(2) * wsq_safe - wsqinv;
+    const f_t v1_arg   = (abs(denom) > f_t(1e-12)) ? f_t(2) * (f_t(2) + wsqinv) / denom : f_t(2);
+    const f_t v1       = sqrt(max(v1_arg, f_t(0)));
+    const f_t eta_sq   = eta[cone_idx] * eta[cone_idx];
+    s_mem[0]           = eta_sq * u0;
+    s_mem[1]           = eta_sq * u1;
+    s_mem[2]           = eta_sq * v0;
+    s_mem[3]           = eta_sq * v1;
+  }
+  __syncthreads();
+
+  const f_t scaled_u0 = s_mem[0];
+  const f_t scaled_u1 = s_mem[1];
+  const f_t scaled_v0 = s_mem[2];
+  const f_t scaled_v1 = s_mem[3];
+
+  for (i_t j = threadIdx.x; j < cone_dim; j += blockDim.x) {
+    if (j == 0) {
+      sparse_u[block_start + j] = scaled_u0;
+      sparse_v[block_start + j] = scaled_v0;
+    } else {
+      sparse_u[block_start + j] = scaled_u1 * w[cone_off + j];
+      sparse_v[block_start + j] = scaled_v1 * w[cone_off + j];
+    }
+  }
+}
+
+/**
+ * Refresh the rank-2 NT factors (d, v, u) of every sparse cone from the current
+ * scaling, so the implicit sparse block matches the dense Hessian for this
+ * iteration. Call after `launch_nt_scaling` has updated w and eta.
+ */
+template <std::integral i_t, std::floating_point f_t>
+void launch_update_scaling_sparse(cone_data_t<i_t, f_t>& cones, rmm::cuda_stream_view stream)
+{
+  if (!cones.has_sparse_cones()) { return; }
+
+  const i_t n_sparse = cones.n_sparse_cones;
+  update_scaling_sparse_kernel<i_t, f_t>
+    <<<n_sparse, soc_block_size, 0, stream.value()>>>(cuopt::make_span(cones.w),
+                                                      cuopt::make_span(cones.eta),
+                                                      cuopt::make_span(cones.d),
+                                                      cuopt::make_span(cones.sparse_v),
+                                                      cuopt::make_span(cones.sparse_u),
+                                                      cuopt::make_span(cones.cone_offsets),
+                                                      cuopt::make_span(cones.sparse_cone_dims),
+                                                      cuopt::make_span(cones.sparse_cone_ids),
+                                                      cuopt::make_span(cones.sparse_entry_offsets),
+                                                      n_sparse);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
 template <std::integral i_t, std::floating_point f_t>
 __global__ void __launch_bounds__(soc_block_size)
   apply_w_inv_write_kernel(raft::device_span<const f_t> v,
@@ -546,21 +708,24 @@ __global__ void __launch_bounds__(soc_block_size)
 
 template <std::integral i_t, std::floating_point f_t>
 __global__ void __launch_bounds__(soc_block_size)
-  apply_hessian_write_kernel(raft::device_span<const f_t> v,
-                             raft::device_span<f_t> out,
-                             raft::device_span<const f_t> w,
-                             raft::device_span<const f_t> eta,
-                             raft::device_span<const f_t> wv_dot,
-                             raft::device_span<const size_t> cone_offsets,
-                             raft::device_span<const i_t> element_cone_ids,
-                             raft::device_span<const f_t> bias,
-                             f_t output_scale,
-                             f_t bias_scale)
+  apply_hessian_kernel(raft::device_span<const f_t> v,
+                       raft::device_span<f_t> out,
+                       raft::device_span<const f_t> w,
+                       raft::device_span<const f_t> eta,
+                       raft::device_span<const f_t> wv_dot,
+                       raft::device_span<const size_t> cone_offsets,
+                       raft::device_span<const i_t> element_cone_ids,
+                       raft::device_span<const i_t> cone_is_sparse,
+                       bool dense_cones_only,
+                       raft::device_span<const f_t> bias,
+                       f_t output_scale,
+                       f_t bias_scale)
 {
   const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
   if (idx >= out.size()) { return; }
 
-  const i_t cone         = element_cone_ids[idx];
+  const i_t cone = element_cone_ids[idx];
+  if (dense_cones_only && !cone_is_sparse.empty() && cone_is_sparse[cone] != 0) { return; }
   const size_t cone_off  = cone_offsets[cone];
   const size_t local_idx = idx - cone_off;
 
@@ -767,7 +932,8 @@ void apply_hessian(raft::device_span<const f_t> v,
                    rmm::cuda_stream_view stream,
                    f_t output_scale                  = 1,
                    raft::device_span<const f_t> bias = {},
-                   f_t bias_scale                    = 0)
+                   f_t bias_scale                    = 0,
+                   bool dense_cones_only             = false)
 {
   auto w                = cuopt::make_span(cones.w);
   auto eta              = cuopt::make_span(cones.eta);
@@ -781,8 +947,19 @@ void apply_hessian(raft::device_span<const f_t> v,
   cones.segmented_sum(wv_terms, wv_dot, stream);
 
   const size_t grid_dim = raft::ceildiv<size_t>(out.size(), soc_block_size);
-  apply_hessian_write_kernel<i_t, f_t><<<grid_dim, soc_block_size, 0, stream.value()>>>(
-    v, out, w, eta, wv_dot, cone_offsets, element_cone_ids, bias, output_scale, bias_scale);
+  apply_hessian_kernel<i_t, f_t>
+    <<<grid_dim, soc_block_size, 0, stream.value()>>>(v,
+                                                      out,
+                                                      w,
+                                                      eta,
+                                                      wv_dot,
+                                                      cone_offsets,
+                                                      element_cone_ids,
+                                                      cuopt::make_span(cones.cone_is_sparse),
+                                                      dense_cones_only,
+                                                      bias,
+                                                      output_scale,
+                                                      bias_scale);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
@@ -804,100 +981,392 @@ void recover_cone_dz_from_target(raft::device_span<const f_t> dx,
 }
 
 /**
- * Accumulate the SOC cone-block matvec into an existing output vector.
+ * Accumulate the dense SOC cone-block matvec into an existing output vector:
+ *   out += H x, where H = S^2, applied to dense cones only.
+ */
+template <std::integral i_t, std::floating_point f_t>
+void launch_dense_hessian_matvec(raft::device_span<const f_t> x,
+                                 cone_data_t<i_t, f_t>& cones,
+                                 raft::device_span<f_t> out,
+                                 rmm::cuda_stream_view stream)
+{
+  auto out_input = raft::device_span<const f_t>(out.data(), out.size());
+  apply_hessian<i_t, f_t>(x, out, cones, stream, 1, out_input, 1, true);
+}
+
+// One block per sparse cone. Write the diagonal of the implicit cone Hessian:
+// eta^2 on every entry, with the head entry scaled by the rank-2 corner d.
+template <std::integral i_t, std::floating_point f_t>
+__global__ void get_Hs_sparse_kernel(raft::device_span<f_t> Hs_diag,
+                                     raft::device_span<const f_t> eta,
+                                     raft::device_span<const f_t> d,
+                                     raft::device_span<const i_t> sparse_cone_ids,
+                                     raft::device_span<const i_t> sparse_entry_offsets,
+                                     raft::device_span<const i_t> sparse_cone_dims,
+                                     i_t n_sparse_cones)
+{
+  const i_t sparse_idx = blockIdx.x;
+  if (sparse_idx >= n_sparse_cones) { return; }
+
+  const i_t block_start = sparse_entry_offsets[sparse_idx];
+  const i_t block_dim   = sparse_cone_dims[sparse_idx];
+  const i_t cone_idx    = sparse_cone_ids[sparse_idx];
+  const f_t eta_sq      = eta[cone_idx] * eta[cone_idx];
+
+  for (i_t j = threadIdx.x; j < block_dim; j += blockDim.x) {
+    Hs_diag[block_start + j] = eta_sq;
+  }
+  __syncthreads();
+
+  if (threadIdx.x == 0) { Hs_diag[block_start] *= d[sparse_idx]; }
+}
+
+/**
+ * Compute the packed diagonal `Hs_diag` of the implicit sparse cone Hessians,
+ * one contiguous block of `sparse_cone_dims[s]` entries per sparse cone. This is
+ * the diagonal that gets scattered onto the cone rows of the augmented system.
+ */
+template <std::integral i_t, std::floating_point f_t>
+void launch_get_Hs_sparse(cone_data_t<i_t, f_t>& cones,
+                          rmm::device_uvector<f_t>& Hs_diag,
+                          rmm::cuda_stream_view stream)
+{
+  if (!cones.has_sparse_cones()) { return; }
+
+  const i_t n_sparse = cones.n_sparse_cones;
+  cuopt_assert(Hs_diag.size() == cones.n_sparse_cone_entries, "Hs_diag size mismatch");
+
+  get_Hs_sparse_kernel<i_t, f_t>
+    <<<n_sparse, soc_block_size, 0, stream.value()>>>(cuopt::make_span(Hs_diag),
+                                                      cuopt::make_span(cones.eta),
+                                                      cuopt::make_span(cones.d),
+                                                      cuopt::make_span(cones.sparse_cone_ids),
+                                                      cuopt::make_span(cones.sparse_entry_offsets),
+                                                      cuopt::make_span(cones.sparse_cone_dims),
+                                                      n_sparse);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+// Scatter one sparse-cone diagonal entry into the augmented value buffer:
+// augmented_x[csr_indices[e]] = -Hs_diag[e] - q_values[e] - dual_perturb,
+// matching the sign convention of the dense scatter (negated KKT block plus
+// any quadratic-objective contribution and diagonal regularization).
+template <std::integral i_t, std::floating_point f_t>
+__global__ void __launch_bounds__(soc_block_size)
+  scatter_sparse_hessian_diag_kernel(raft::device_span<f_t> augmented_x,
+                                     raft::device_span<const i_t> csr_indices,
+                                     raft::device_span<const f_t> Hs_diag,
+                                     raft::device_span<const f_t> q_values,
+                                     f_t dual_perturb)
+{
+  const size_t e = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (e >= csr_indices.size()) { return; }
+
+  augmented_x[csr_indices[e]] = -Hs_diag[e] - q_values[e] - dual_perturb;
+}
+
+/**
+ * Write the sparse cone Hessian diagonals into the augmented system value buffer.
+ *
+ * `csr_indices[e]` gives the value-buffer position of the diagonal for packed
+ * cone entry `e`; each is set to `-Hs_diag[e] - q_values[e] - dual_perturb`.
+ */
+template <std::integral i_t, std::floating_point f_t>
+void scatter_sparse_hessian_diag_into_augmented(rmm::device_uvector<f_t>& augmented_x,
+                                                const rmm::device_uvector<i_t>& csr_indices,
+                                                const rmm::device_uvector<f_t>& Hs_diag,
+                                                const rmm::device_uvector<f_t>& q_values,
+                                                rmm::cuda_stream_view stream,
+                                                f_t dual_perturb)
+{
+  const size_t count = csr_indices.size();
+  if (count == 0) { return; }
+  cuopt_assert(count == Hs_diag.size(), "sparse Hessian index/value size mismatch");
+  cuopt_assert(count == q_values.size(), "sparse Hessian Q-value size mismatch");
+
+  const size_t grid = raft::ceildiv<size_t>(count, soc_block_size);
+  scatter_sparse_hessian_diag_kernel<i_t, f_t>
+    <<<grid, soc_block_size, 0, stream.value()>>>(cuopt::make_span(augmented_x),
+                                                  cuopt::make_span(csr_indices),
+                                                  cuopt::make_span(Hs_diag),
+                                                  cuopt::make_span(q_values),
+                                                  dual_perturb);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/**
+ * Refresh the expansion (rank-2 coupling) entries of the augmented system.
  *
- * Used by matrix-free products with the primal-reduced KKT block:
- *   out += H x, where H = S^2.
+ * For each sparse cone this scatters the rank-2 vectors v, u into their four
+ * coupling positions -- the v/u columns and the symmetric v/u rows linking the
+ * cone variables to the two expansion variables -- and writes the expansion
+ * diagonal +/-(eta^2 + dual_perturb). The Hessian diagonal itself is written
+ * separately by `scatter_sparse_hessian_diag_into_augmented`.
  */
 template <std::integral i_t, std::floating_point f_t>
-void accumulate_cone_hessian_matvec(raft::device_span<const f_t> x,
+void update_sparse_expansion_in_augmented(rmm::device_uvector<f_t>& augmented_x,
+                                          const rmm::device_uvector<i_t>& sparse_exp_v_col,
+                                          const rmm::device_uvector<i_t>& sparse_exp_u_col,
+                                          const rmm::device_uvector<i_t>& sparse_exp_v_row,
+                                          const rmm::device_uvector<i_t>& sparse_exp_u_row,
+                                          const rmm::device_uvector<i_t>& sparse_expansion_D,
+                                          const rmm::device_uvector<f_t>& sparse_v,
+                                          const rmm::device_uvector<f_t>& sparse_u,
+                                          const rmm::device_uvector<f_t>& eta,
+                                          const rmm::device_uvector<i_t>& sparse_cone_ids,
+                                          rmm::cuda_stream_view stream,
+                                          f_t dual_perturb)
+{
+  const size_t n_coupling = sparse_exp_v_col.size();
+  if (n_coupling == 0) { return; }
+
+  cuopt_assert(sparse_exp_u_col.size() == n_coupling, "sparse_exp_u_col size mismatch");
+  cuopt_assert(sparse_exp_v_row.size() == n_coupling, "sparse_exp_v_row size mismatch");
+  cuopt_assert(sparse_exp_u_row.size() == n_coupling, "sparse_exp_u_row size mismatch");
+  cuopt_assert(sparse_v.size() == n_coupling, "sparse_v size mismatch");
+  cuopt_assert(sparse_u.size() == n_coupling, "sparse_u size mismatch");
+
+  thrust::scatter(rmm::exec_policy(stream),
+                  sparse_v.begin(),
+                  sparse_v.end(),
+                  sparse_exp_v_col.begin(),
+                  augmented_x.begin());
+  thrust::scatter(rmm::exec_policy(stream),
+                  sparse_u.begin(),
+                  sparse_u.end(),
+                  sparse_exp_u_col.begin(),
+                  augmented_x.begin());
+  thrust::scatter(rmm::exec_policy(stream),
+                  sparse_v.begin(),
+                  sparse_v.end(),
+                  sparse_exp_v_row.begin(),
+                  augmented_x.begin());
+  thrust::scatter(rmm::exec_policy(stream),
+                  sparse_u.begin(),
+                  sparse_u.end(),
+                  sparse_exp_u_row.begin(),
+                  augmented_x.begin());
+
+  const i_t n_expansion = static_cast<i_t>(sparse_expansion_D.size());
+  if (n_expansion > 0) {
+    auto eta_span             = cuopt::make_span(eta);
+    auto sparse_cone_ids_span = cuopt::make_span(sparse_cone_ids);
+    // Expansion-row diagonal of sparse cone i/2: -(eta^2 + dual_perturb) for the
+    // v-row (even i), +(eta^2 + dual_perturb) for the u-row (odd i).
+    auto diag_values = thrust::make_transform_iterator(
+      thrust::counting_iterator<i_t>(0),
+      [eta_span, sparse_cone_ids_span, dual_perturb] HD(i_t i) -> f_t {
+        const i_t sparse_idx = i / 2;
+        const f_t eta_val    = eta_span[sparse_cone_ids_span[sparse_idx]];
+        const f_t sign       = (i % 2 == 0) ? f_t(-1) : f_t(1);
+        return sign * (eta_val * eta_val + dual_perturb);
+      });
+    thrust::scatter(rmm::exec_policy(stream),
+                    diag_values,
+                    diag_values + n_expansion,
+                    sparse_expansion_D.begin(),
+                    augmented_x.begin());
+  }
+}
+
+/**
+ * Accumulate the sparse-SOC expanded KKT block into a matrix-free product.
+ *
+ * For each sparse cone this adds to the primal cone rows (before augmented_multiply negates r1):
+ *   r1 += Hs_diag .* x_cone - v * x_exp_v - u * x_exp_u
+ * so y1 = -r1 matches the explicit CSR coupling (+v, +u on primal rows).
+ * and to the expansion rows:
+ *   y_exp[2s]   += -eta^2 * x_exp_v + v^T x_cone
+ *   y_exp[2s+1] += +eta^2 * x_exp_u + u^T x_cone
+ *
+ * `v` and `u` are the rank-2 vectors in cones.sparse_v / cones.sparse_u.
+ */
+template <std::integral i_t, std::floating_point f_t>
+__global__ void sparse_augmented_matvec_kernel(raft::device_span<const f_t> x,
+                                               raft::device_span<f_t> r1,
+                                               raft::device_span<f_t> y_exp,
+                                               raft::device_span<const f_t> Hs_diag,
+                                               raft::device_span<const f_t> sparse_v,
+                                               raft::device_span<const f_t> sparse_u,
+                                               raft::device_span<const f_t> eta,
+                                               raft::device_span<const i_t> sparse_cone_ids,
+                                               raft::device_span<const i_t> sparse_cone_dims,
+                                               raft::device_span<const i_t> sparse_entry_offsets,
+                                               raft::device_span<const size_t> cone_offsets,
+                                               i_t cone_var_start,
+                                               i_t n_primal,
+                                               i_t m_constraints,
+                                               i_t n_sparse_cones)
+{
+  const i_t sparse_idx = blockIdx.x;
+  if (sparse_idx >= n_sparse_cones) { return; }
+
+  const i_t cone    = sparse_cone_ids[sparse_idx];
+  const i_t q       = sparse_cone_dims[sparse_idx];
+  const i_t flat    = sparse_entry_offsets[sparse_idx];
+  const size_t off  = cone_offsets[cone];
+  const i_t base    = cone_var_start + static_cast<i_t>(off);
+  const i_t exp_v   = n_primal + m_constraints + 2 * sparse_idx;
+  const i_t exp_u   = exp_v + 1;
+  const f_t x_exp_v = x[exp_v];
+  const f_t x_exp_u = x[exp_u];
+  const f_t eta_sq  = eta[cone] * eta[cone];
+
+  f_t partial_dot_v = f_t(0);
+  f_t partial_dot_u = f_t(0);
+  for (i_t j = threadIdx.x; j < q; j += blockDim.x) {
+    const f_t xj = x[base + j];
+    const f_t vj = sparse_v[flat + j];
+    const f_t uj = sparse_u[flat + j];
+    partial_dot_v += vj * xj;
+    partial_dot_u += uj * xj;
+    r1[base + j] += Hs_diag[flat + j] * xj - vj * x_exp_v - uj * x_exp_u;
+  }
+
+  __shared__ f_t s_dot_v[soc_block_size];
+  __shared__ f_t s_dot_u[soc_block_size];
+  s_dot_v[threadIdx.x] = partial_dot_v;
+  s_dot_u[threadIdx.x] = partial_dot_u;
+  __syncthreads();
+
+  for (i_t stride = blockDim.x / 2; stride > 0; stride >>= 1) {
+    if (threadIdx.x < stride) {
+      s_dot_v[threadIdx.x] += s_dot_v[threadIdx.x + stride];
+      s_dot_u[threadIdx.x] += s_dot_u[threadIdx.x + stride];
+    }
+    __syncthreads();
+  }
+
+  if (threadIdx.x == 0) {
+    y_exp[2 * sparse_idx] += -eta_sq * x_exp_v + s_dot_v[0];
+    y_exp[2 * sparse_idx + 1] += eta_sq * x_exp_u + s_dot_u[0];
+  }
+}
+
+/**
+ * Matrix-free product of the implicit sparse cone blocks (see
+ * `sparse_augmented_matvec_kernel`). Accumulates the cone-row contribution into
+ * `r1` and the expansion-row contribution into `y_exp`, one block per sparse cone.
+ */
+template <std::integral i_t, std::floating_point f_t>
+void launch_sparse_augmented_matvec(raft::device_span<const f_t> x,
+                                    raft::device_span<f_t> r1,
+                                    raft::device_span<f_t> y_exp,
                                     cone_data_t<i_t, f_t>& cones,
-                                    raft::device_span<f_t> out,
+                                    raft::device_span<const f_t> Hs_diag,
+                                    i_t cone_var_start,
+                                    i_t n_primal,
+                                    i_t m_constraints,
                                     rmm::cuda_stream_view stream)
 {
-  auto out_input = raft::device_span<const f_t>(out.data(), out.size());
-  apply_hessian<i_t, f_t>(x, out, cones, stream, 1, out_input, 1);
+  if (!cones.has_sparse_cones()) { return; }
+
+  const i_t n_sparse = cones.n_sparse_cones;
+  cuopt_assert(Hs_diag.size() == cones.n_sparse_cone_entries, "Hs_diag size mismatch");
+  cuopt_assert(y_exp.size() == static_cast<size_t>(cones.expansion_var_count()),
+               "expansion output size mismatch");
+
+  sparse_augmented_matvec_kernel<i_t, f_t>
+    <<<n_sparse, soc_block_size, 0, stream.value()>>>(x,
+                                                      r1,
+                                                      y_exp,
+                                                      Hs_diag,
+                                                      cuopt::make_span(cones.sparse_v),
+                                                      cuopt::make_span(cones.sparse_u),
+                                                      cuopt::make_span(cones.eta),
+                                                      cuopt::make_span(cones.sparse_cone_ids),
+                                                      cuopt::make_span(cones.sparse_cone_dims),
+                                                      cuopt::make_span(cones.sparse_entry_offsets),
+                                                      cuopt::make_span(cones.cone_offsets),
+                                                      cone_var_start,
+                                                      n_primal,
+                                                      m_constraints,
+                                                      n_sparse);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
+// Scatter one nonzero of a dense cone's q x q NT Hessian block into the
+// augmented value buffer. Only cones with dim <= soc_threshold take this path.
 template <std::integral i_t, std::floating_point f_t>
 __global__ void __launch_bounds__(soc_block_size)
-  scatter_hessian_into_augmented_kernel(raft::device_span<f_t> augmented_x,
-                                        raft::device_span<const i_t> csr_indices,
-                                        raft::device_span<const f_t> q_values,
-                                        raft::device_span<const f_t> w,
-                                        raft::device_span<const f_t> eta,
-                                        raft::device_span<const size_t> cone_offsets,
-                                        raft::device_span<const size_t> block_offsets,
-                                        i_t n_cones,
-                                        f_t dual_perturb_value)
+  scatter_dense_hessian_into_augmented_kernel(raft::device_span<f_t> augmented_x,
+                                              raft::device_span<const i_t> csr_indices,
+                                              raft::device_span<const f_t> q_values,
+                                              raft::device_span<const f_t> w,
+                                              raft::device_span<const f_t> eta,
+                                              raft::device_span<const size_t> cone_offsets,
+                                              raft::device_span<const size_t> dense_block_offsets,
+                                              raft::device_span<const i_t> dense_cone_ids,
+                                              i_t n_dense_cones,
+                                              f_t dual_perturb_value)
 {
   const size_t e = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
   if (e >= csr_indices.size()) { return; }
 
   i_t lo = 0;
-  i_t hi = n_cones;
+  i_t hi = n_dense_cones;
   while (lo < hi) {
     const i_t mid = lo + (hi - lo) / 2;
-    if (block_offsets[mid + 1] <= e) {
+    if (dense_block_offsets[mid + 1] <= e) {
       lo = mid + 1;
     } else {
       hi = mid;
     }
   }
 
-  const i_t cone       = lo;
+  const i_t dense_idx  = lo;
+  const i_t cone       = dense_cone_ids[dense_idx];
   const size_t off     = cone_offsets[cone];
   const size_t q       = cone_offsets[cone + 1] - off;
-  const size_t blk_off = block_offsets[cone];
+  const size_t blk_off = dense_block_offsets[dense_idx];
   const size_t local   = e - blk_off;
   const size_t r       = local / q;
   const size_t c       = local % q;
 
-  const f_t eta_sq          = eta[cone] * eta[cone];
-  const f_t w0              = w[off];
-  const f_t u_r             = (r == 0) ? w0 : w[off + r];
-  const f_t u_c             = (c == 0) ? w0 : w[off + c];
-  f_t val                   = f_t{2} * u_r * eta_sq * u_c;
-  const f_t diag_correction = (r == 0) ? -eta_sq : eta_sq;
-  if (r == c) { val += diag_correction; }
+  const f_t eta_sq = eta[cone] * eta[cone];
+  const f_t w0     = w[off];
+  const f_t u_r    = (r == 0) ? w0 : w[off + r];
+  const f_t u_c    = (c == 0) ? w0 : w[off + c];
+  const f_t val    = f_t{2} * u_r * eta_sq * u_c;
 
-  augmented_x[csr_indices[e]] = -val - q_values[e];
+  f_t entry = -val - q_values[e];
+  if (r == c) {
+    const f_t diag_correction = (r == 0) ? -eta_sq : eta_sq;
+    entry -= diag_correction + dual_perturb_value;
+  }
+  augmented_x[csr_indices[e]] = entry;
 }
 
+/**
+ * Write the full q x q NT Hessian blocks of the dense cones (dim <= soc_threshold)
+ * into the augmented system value buffer at the precomputed CSR positions.
+ */
 template <std::integral i_t, std::floating_point f_t>
-void scatter_hessian_into_augmented(const cone_data_t<i_t, f_t>& cones,
-                                    rmm::device_uvector<f_t>& augmented_x,
-                                    const rmm::device_uvector<i_t>& csr_indices,
-                                    const rmm::device_uvector<f_t>& q_values,
-                                    rmm::cuda_stream_view stream,
-                                    f_t dual_perturb_value)
+void scatter_dense_hessian_into_augmented(const cone_data_t<i_t, f_t>& cones,
+                                          rmm::device_uvector<f_t>& augmented_x,
+                                          const rmm::device_uvector<i_t>& csr_indices,
+                                          const rmm::device_uvector<f_t>& q_values,
+                                          const rmm::device_uvector<size_t>& dense_block_offsets,
+                                          const rmm::device_uvector<i_t>& dense_cone_ids,
+                                          rmm::cuda_stream_view stream,
+                                          f_t dual_perturb_value)
 {
   const size_t count = csr_indices.size();
   if (count == 0) { return; }
-  cuopt_assert(count == q_values.size(), "cone CSR index and Q-value arrays must match");
-
-  // TODO: This offset calculation should be done in the barrier layer,
-  // because it is already done in the barrier layer for the augmented system, see
-  // cone_block_offsets_host.
-  rmm::device_uvector<size_t> block_offsets(cones.n_cones + 1, stream);
-  block_offsets.set_element_to_zero_async(0, stream);
-
-  auto block_sizes = thrust::make_transform_iterator(
-    cones.cone_dimensions.begin(), [] HD(i_t q) -> size_t { return static_cast<size_t>(q) * q; });
-  thrust::inclusive_scan(
-    rmm::exec_policy(stream), block_sizes, block_sizes + cones.n_cones, block_offsets.begin() + 1);
+  cuopt_assert(count == q_values.size(), "dense cone CSR index and Q-value arrays must match");
 
-  // TODO: use dual_perturb_value for regularization
+  const i_t n_dense = cones.n_dense_cones();
   const size_t grid = raft::ceildiv<size_t>(count, soc_block_size);
-  scatter_hessian_into_augmented_kernel<i_t, f_t>
+  scatter_dense_hessian_into_augmented_kernel<i_t, f_t>
     <<<grid, soc_block_size, 0, stream.value()>>>(cuopt::make_span(augmented_x),
                                                   cuopt::make_span(csr_indices),
                                                   cuopt::make_span(q_values),
                                                   cuopt::make_span(cones.w),
                                                   cuopt::make_span(cones.eta),
                                                   cuopt::make_span(cones.cone_offsets),
-                                                  cuopt::make_span(block_offsets),
-                                                  cones.n_cones,
+                                                  cuopt::make_span(dense_block_offsets),
+                                                  cuopt::make_span(dense_cone_ids),
+                                                  n_dense,
                                                   dual_perturb_value);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
diff --git a/cpp/src/barrier/second_order_cone_reduction.cuh b/cpp/src/barrier/second_order_cone_reduction.cuh
index ada9fcfb6a..6ab380102a 100644
--- a/cpp/src/barrier/second_order_cone_reduction.cuh
+++ b/cpp/src/barrier/second_order_cone_reduction.cuh
@@ -10,8 +10,8 @@
 #include <utilities/copy_helpers.hpp>
 #include <utilities/macros.cuh>
 
+#include <cub/block/block_reduce.cuh>
 #include <cub/device/device_reduce.cuh>
-#include <cub/device/device_segmented_reduce.cuh>
 #include <cub/warp/warp_reduce.cuh>
 
 #include <rmm/device_buffer.hpp>
@@ -24,7 +24,6 @@
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
-#include <thrust/iterator/permutation_iterator.h>
 
 #include <algorithm>
 #include <concepts>
@@ -46,14 +45,22 @@ __global__ void __launch_bounds__(warps_per_cta* raft::WarpSize)
                               OutputIt output,
                               value_t init);
 
+template <std::integral i_t, typename value_t, int block_dim, typename InputIt, typename OutputIt>
+__global__ void __launch_bounds__(block_dim)
+  block_per_cone_reduce_kernel(InputIt input,
+                               raft::device_span<const i_t> medium_cone_ids,
+                               raft::device_span<const std::size_t> cone_offsets,
+                               OutputIt output,
+                               value_t init);
+
 /**
  * Segmented-sum dispatcher for packed second-order cone vectors.
  *
  * Cone dimensions are fixed for a solve, so the constructor partitions cone
  * ids once by reduction strategy. Each call then reuses those partitions:
- * small cones use one warp per cone, medium cones use CUB DeviceSegmentedReduce,
+ * small cones use one warp per cone, medium cones use one block per cone,
  * and large cones use CUB DeviceReduce one cone at a time. The object owns the
- * CUB workspace for those medium/large paths. Call `prepare_workspace` once
+ * CUB workspace for the large-cone path. Call `prepare_workspace` once
  * before using a CUB-backed path.
  */
 template <std::integral i_t, i_t warp_cone_dim = 64, i_t large_cone_cutoff = 32768>
@@ -69,7 +76,7 @@ struct segmented_sum_t {
   std::vector<i_t> large_cone_ids;
   std::vector<i_t> large_cone_dimensions;
 
-  // Maximum CUB temporary storage needed by prepared medium/large reductions.
+  // Maximum CUB temporary storage needed by prepared large reductions.
   std::size_t cub_workspace_bytes = 0;
   rmm::device_buffer cub_workspace;
 
@@ -80,24 +87,6 @@ struct segmented_sum_t {
     auto input  = thrust::make_constant_iterator(value_t{});
     auto output = thrust::make_discard_iterator();
 
-    if (!medium_cone_ids.is_empty()) {
-      const auto medium_begin_offsets =
-        thrust::make_permutation_iterator(cone_offsets.data(), medium_cone_ids.begin());
-      const auto medium_end_offsets =
-        thrust::make_permutation_iterator(cone_offsets.data() + 1, medium_cone_ids.begin());
-
-      std::size_t temp_storage_bytes = 0;
-      RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
-                                                    temp_storage_bytes,
-                                                    input,
-                                                    output,
-                                                    medium_cone_ids.size(),
-                                                    medium_begin_offsets,
-                                                    medium_end_offsets,
-                                                    stream.value()));
-      cub_workspace_bytes = std::max(cub_workspace_bytes, temp_storage_bytes);
-    }
-
     for (std::size_t i = 0; i < large_cone_ids.size(); ++i) {
       std::size_t temp_storage_bytes = 0;
       RAFT_CUDA_TRY(cub::DeviceReduce::Sum(nullptr,
@@ -109,7 +98,7 @@ struct segmented_sum_t {
       cub_workspace_bytes = std::max(cub_workspace_bytes, temp_storage_bytes);
     }
 
-    if (cub_workspace.size() < cub_workspace_bytes) {
+    if (cub_workspace_bytes > 0 && cub_workspace.size() < cub_workspace_bytes) {
       cub_workspace.resize(cub_workspace_bytes, stream);
     }
   }
@@ -138,31 +127,17 @@ struct segmented_sum_t {
     }
 
     if (!medium_cone_ids.is_empty()) {
-      cuopt_assert(cub_workspace_bytes > 0 && cub_workspace.size() >= cub_workspace_bytes,
-                   "segmented_sum_t::prepare_workspace must be called before reducing medium or "
-                   "large cones");
-
-      const auto medium_output = thrust::make_permutation_iterator(output, medium_cone_ids.begin());
-      const auto medium_begin_offsets =
-        thrust::make_permutation_iterator(cone_offsets.data(), medium_cone_ids.begin());
-      const auto medium_end_offsets =
-        thrust::make_permutation_iterator(cone_offsets.data() + 1, medium_cone_ids.begin());
-
-      std::size_t temp_storage_bytes = cub_workspace_bytes;
-      RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(cub_workspace.data(),
-                                                    temp_storage_bytes,
-                                                    input,
-                                                    medium_output,
-                                                    medium_cone_ids.size(),
-                                                    medium_begin_offsets,
-                                                    medium_end_offsets,
-                                                    stream.value()));
+      constexpr int medium_block_dim = 256;
+      const auto n_medium            = medium_cone_ids.size();
+      block_per_cone_reduce_kernel<i_t, value_t, medium_block_dim>
+        <<<n_medium, medium_block_dim, 0, stream.value()>>>(
+          input, cuopt::make_span(medium_cone_ids), cone_offsets, output, init);
+      RAFT_CUDA_TRY(cudaPeekAtLastError());
     }
 
     if (!large_cone_ids.empty()) {
       cuopt_assert(cub_workspace_bytes > 0 && cub_workspace.size() >= cub_workspace_bytes,
-                   "segmented_sum_t::prepare_workspace must be called before reducing medium or "
-                   "large cones");
+                   "segmented_sum_t::prepare_workspace must be called before reducing large cones");
 
       for (std::size_t i = 0; i < large_cone_ids.size(); ++i) {
         std::size_t temp_storage_bytes = cub_workspace_bytes;
@@ -258,4 +233,55 @@ __global__ void __launch_bounds__(warps_per_cta* raft::WarpSize)
   if (lane_id == 0) { output[cone] = sum; }
 }
 
+template <std::integral i_t, typename value_t, int block_dim, typename InputIt, typename OutputIt>
+__global__ void __launch_bounds__(block_dim)
+  block_per_cone_reduce_kernel(InputIt input,
+                               raft::device_span<const i_t> medium_cone_ids,
+                               raft::device_span<const std::size_t> cone_offsets,
+                               OutputIt output,
+                               value_t init)
+{
+  static_assert(block_dim > 0);
+  static_assert(block_dim <= 1024);
+
+  constexpr int items_per_thread = 4;
+
+  using block_reduce_t = cub::BlockReduce<value_t, block_dim>;
+  __shared__ typename block_reduce_t::TempStorage temp_storage;
+
+  const auto slot = blockIdx.x;
+  if (slot >= medium_cone_ids.size()) { return; }
+
+  const auto cone = medium_cone_ids[slot];
+  const auto off  = cone_offsets[cone];
+  const auto dim  = cone_offsets[cone + 1] - off;
+
+  value_t acc[items_per_thread];
+#pragma unroll
+  for (int k = 0; k < items_per_thread; ++k) {
+    acc[k] = value_t{0};
+  }
+
+  // Thread t owns elements t, t+block_dim, ..., t+(items_per_thread-1)*block_dim
+  // within each tile of `block_dim * items_per_thread` consecutive entries, so
+  // every warp load stays coalesced.
+  const std::size_t tile = static_cast<std::size_t>(block_dim) * items_per_thread;
+  for (std::size_t tile_start = 0; tile_start < dim; tile_start += tile) {
+#pragma unroll
+    for (int k = 0; k < items_per_thread; ++k) {
+      const std::size_t idx = tile_start + threadIdx.x + static_cast<std::size_t>(k) * block_dim;
+      if (idx < dim) { acc[k] = acc[k] + input[off + idx]; }
+    }
+  }
+
+  value_t sum = init;
+#pragma unroll
+  for (int k = 0; k < items_per_thread; ++k) {
+    sum = sum + acc[k];
+  }
+
+  sum = block_reduce_t(temp_storage).Sum(sum);
+  if (threadIdx.x == 0) { output[cone] = sum; }
+}
+
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index 286ac7364f..46a551b9f1 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -69,6 +69,7 @@ struct simplex_solver_settings_t {
       eliminate_dense_columns(true),
       barrier_iterative_refinement(true),
       barrier_step_scale(0.9),
+      barrier_soc_threshold(5),
       num_gpus(1),
       folding(-1),
       augmented(0),
@@ -157,6 +158,7 @@ struct simplex_solver_settings_t {
   bool eliminate_dense_columns;       // true to eliminate dense columns from A*D*A^T
   bool barrier_iterative_refinement;  // true to use iterative refinement for barrier method
   f_t barrier_step_scale;             // step scale for barrier method
+  i_t barrier_soc_threshold;          // SOC dimension above which rank-2 sparse scaling is used
   int num_gpus;   // Number of GPUs to use (maximum of 2 gpus are supported at the moment)
   i_t folding;    // -1 automatic, 0 don't fold, 1 fold
   i_t augmented;  // -1 automatic, 0 to solve with ADAT, 1 to solve with augmented system
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index 4a000c6e05..235fc57f30 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -132,6 +132,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_DUALIZE, &pdlp_settings.dualize, -1, 1, -1},
     {CUOPT_ORDERING, &pdlp_settings.ordering, -1, 1, -1},
     {CUOPT_BARRIER_DUAL_INITIAL_POINT, &pdlp_settings.barrier_dual_initial_point, -1, 1, -1},
+    {CUOPT_BARRIER_SOC_THRESHOLD, &pdlp_settings.barrier_soc_threshold, 0, 1000000, 5, "SOC dimension above which rank-2 sparse KKT expansion is used"},
     {CUOPT_MIP_CUT_PASSES, &mip_settings.max_cut_passes, -1, std::numeric_limits<i_t>::max(), 10},
     {CUOPT_MIP_MIXED_INTEGER_ROUNDING_CUTS, &mip_settings.mir_cuts, -1, 1, -1},
     {CUOPT_MIP_MIXED_INTEGER_GOMORY_CUTS, &mip_settings.mixed_integer_gomory_cuts, -1, 1, -1},
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index 5d29e97fa0..3fc1fc7dda 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -511,6 +511,7 @@ run_barrier(dual_simplex::user_problem_t<i_t, f_t>& user_problem,
   barrier_settings.crossover                       = settings.crossover;
   barrier_settings.eliminate_dense_columns         = settings.eliminate_dense_columns;
   barrier_settings.barrier_iterative_refinement    = settings.barrier_iterative_refinement;
+  barrier_settings.barrier_soc_threshold           = settings.barrier_soc_threshold;
   barrier_settings.barrier_step_scale              = settings.barrier_step_scale;
   barrier_settings.cudss_deterministic             = settings.cudss_deterministic;
   barrier_settings.barrier_relaxed_feasibility_tol = settings.tolerances.relative_primal_tolerance;
diff --git a/cpp/tests/socp/CMakeLists.txt b/cpp/tests/socp/CMakeLists.txt
index f380984225..7ea1a732d5 100644
--- a/cpp/tests/socp/CMakeLists.txt
+++ b/cpp/tests/socp/CMakeLists.txt
@@ -5,6 +5,7 @@
 
 ConfigureTest(SOCP_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/second_order_cone_kernels.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/sparse_augmented_kkt_test.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/solve_barrier_socp.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/general_quadratic_test.cu
     LABELS numopt)
diff --git a/cpp/tests/socp/second_order_cone_kernels.cu b/cpp/tests/socp/second_order_cone_kernels.cu
index 49126ee335..43c162b4aa 100644
--- a/cpp/tests/socp/second_order_cone_kernels.cu
+++ b/cpp/tests/socp/second_order_cone_kernels.cu
@@ -202,6 +202,171 @@ TEST(second_order_cone_kernels, nt_scaling_matches_host_reference)
   }
 }
 
+TEST(second_order_cone_kernels, nt_scaling_many_small_one_medium_cone)
+{
+  auto stream = rmm::cuda_stream_default;
+
+  // chainsing-like topology: many small cones plus one dim-1000 medium cone (warp_cone_dim=64).
+  std::vector<int> cone_dimensions;
+  cone_dimensions.reserve(21);
+  for (int i = 0; i < 20; ++i) {
+    cone_dimensions.push_back(3);
+  }
+  cone_dimensions.push_back(1000);
+
+  std::size_t n_cone_entries = 0;
+  for (const auto dim : cone_dimensions) {
+    n_cone_entries += static_cast<std::size_t>(dim);
+  }
+
+  std::vector<double> x_host(n_cone_entries);
+  std::vector<double> z_host(n_cone_entries);
+  std::size_t offset = 0;
+  for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) {
+    const auto dim = cone_dimensions[cone];
+    x_host[offset] = 100.0 + static_cast<double>(cone);
+    z_host[offset] = 80.0 + static_cast<double>(cone);
+    for (int local_idx = 1; local_idx < dim; ++local_idx) {
+      x_host[offset + local_idx] = 0.001 * static_cast<double>((local_idx % 5) + 1);
+      z_host[offset + local_idx] = 0.0015 * static_cast<double>((local_idx % 7) + 1);
+    }
+    offset += static_cast<std::size_t>(dim);
+  }
+
+  auto x = cuopt::device_copy(x_host, stream);
+  auto z = cuopt::device_copy(z_host, stream);
+  cone_data_t<int, double> cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream);
+
+  EXPECT_EQ(cuopt::host_copy(cones.segmented_sum.medium_cone_ids, stream), (std::vector<int>{20}));
+  EXPECT_EQ(cones.n_sparse_cones, 1);
+
+  launch_nt_scaling(cones, stream);
+  launch_update_scaling_sparse(cones, stream);
+
+  auto eta_host = cuopt::host_copy(cones.eta, stream);
+  auto w_host   = cuopt::host_copy(cones.w, stream);
+
+  std::vector<double> expected_eta(cone_dimensions.size());
+  std::vector<double> expected_w(n_cone_entries);
+
+  offset = 0;
+  for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) {
+    const auto dim = cone_dimensions[cone];
+
+    double x_tail_sq = 0.0;
+    double z_tail_sq = 0.0;
+    for (int local_idx = 1; local_idx < dim; ++local_idx) {
+      const auto idx = offset + local_idx;
+      x_tail_sq += x_host[idx] * x_host[idx];
+      z_tail_sq += z_host[idx] * z_host[idx];
+    }
+
+    const auto x_tail_norm = std::sqrt(x_tail_sq);
+    const auto z_tail_norm = std::sqrt(z_tail_sq);
+    const auto x_det       = (x_host[offset] - x_tail_norm) * (x_host[offset] + x_tail_norm);
+    const auto z_det       = (z_host[offset] - z_tail_norm) * (z_host[offset] + z_tail_norm);
+    ASSERT_GT(x_det, 0.0) << "cone " << cone;
+    ASSERT_GT(z_det, 0.0) << "cone " << cone;
+
+    const auto x_scale = std::sqrt(x_det);
+    const auto z_scale = std::sqrt(z_det);
+
+    expected_eta[cone] = std::sqrt(z_scale / x_scale);
+
+    double normalized_xz_dot = 0.0;
+    for (int local_idx = 0; local_idx < dim; ++local_idx) {
+      const auto idx = offset + local_idx;
+      normalized_xz_dot += x_host[idx] * z_host[idx] / (x_scale * z_scale);
+    }
+    const auto w_det = 2.0 + 2.0 * normalized_xz_dot;
+    ASSERT_GT(w_det, 0.0) << "cone " << cone;
+    const auto w_scale = std::sqrt(w_det);
+
+    expected_w[offset] = 0.0;
+    for (int local_idx = 1; local_idx < dim; ++local_idx) {
+      const auto idx  = offset + local_idx;
+      expected_w[idx] = (z_host[idx] / z_scale - x_host[idx] / x_scale) / w_scale;
+    }
+
+    double normalized_tail_sq = 0.0;
+    for (int local_idx = 1; local_idx < dim; ++local_idx) {
+      const auto idx = offset + local_idx;
+      normalized_tail_sq += expected_w[idx] * expected_w[idx];
+    }
+    expected_w[offset] = std::sqrt(1.0 + normalized_tail_sq);
+
+    offset += static_cast<std::size_t>(dim);
+  }
+
+  for (std::size_t i = 0; i < expected_eta.size(); ++i) {
+    EXPECT_NEAR(eta_host[i], expected_eta[i], 1e-10) << "cone " << i;
+  }
+
+  for (std::size_t i = 0; i < expected_w.size(); ++i) {
+    EXPECT_NEAR(w_host[i], expected_w[i], 1e-10) << "entry " << i;
+  }
+}
+
+TEST(second_order_cone_kernels, cone_step_length_many_small_one_sparse_medium_cone)
+{
+  auto stream = rmm::cuda_stream_default;
+
+  std::vector<int> cone_dimensions;
+  for (int i = 0; i < 20; ++i) {
+    cone_dimensions.push_back(3);
+  }
+  cone_dimensions.push_back(1000);
+
+  std::size_t n_cone_entries = 0;
+  for (const auto dim : cone_dimensions) {
+    n_cone_entries += static_cast<std::size_t>(dim);
+  }
+
+  std::vector<double> x_host(n_cone_entries);
+  std::vector<double> z_host(n_cone_entries);
+  std::vector<double> dx_host(n_cone_entries);
+  std::vector<double> dz_host(n_cone_entries);
+
+  std::size_t offset = 0;
+  for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) {
+    const auto dim  = cone_dimensions[cone];
+    x_host[offset]  = 12.0 + static_cast<double>(cone);
+    z_host[offset]  = 14.0 + static_cast<double>(cone);
+    dx_host[offset] = 0.05;
+    dz_host[offset] = -0.04;
+    for (int local_idx = 1; local_idx < dim; ++local_idx) {
+      const auto idx = offset + local_idx;
+      x_host[idx]    = 0.001 * static_cast<double>((local_idx % 5) + 1);
+      z_host[idx]    = 0.0015 * static_cast<double>((local_idx % 7) + 1);
+      dx_host[idx]   = 0.02 * static_cast<double>((local_idx % 5) - 2);
+      dz_host[idx]   = -0.015 * static_cast<double>((local_idx % 7) - 3);
+    }
+    offset += static_cast<std::size_t>(dim);
+  }
+
+  auto x  = cuopt::device_copy(x_host, stream);
+  auto z  = cuopt::device_copy(z_host, stream);
+  auto dx = cuopt::device_copy(dx_host, stream);
+  auto dz = cuopt::device_copy(dz_host, stream);
+
+  cone_data_t<int, double> cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream);
+
+  launch_nt_scaling(cones, stream);
+  launch_update_scaling_sparse(cones, stream);
+
+  const auto [step_primal, step_dual] =
+    compute_cone_step_length(cones,
+                             raft::device_span<const double>(dx.data(), dx.size()),
+                             raft::device_span<const double>(dz.data(), dz.size()),
+                             1.0,
+                             stream);
+
+  EXPECT_GT(step_primal, 0.0);
+  EXPECT_GT(step_dual, 0.0);
+  EXPECT_LE(step_primal, 1.0);
+  EXPECT_LE(step_dual, 1.0);
+}
+
 TEST(second_order_cone_kernels, cone_step_length_keeps_iterate_in_cone)
 {
   auto stream = rmm::cuda_stream_default;
@@ -354,7 +519,7 @@ TEST(second_order_cone_kernels, scaling_operators_match_host_reference)
     raft::device_span<const double>(cone_target.data(), cone_target.size()),
     cuopt::make_span(recovered_dz),
     stream);
-  accumulate_cone_hessian_matvec(v_span, cones, cuopt::make_span(accum), stream);
+  launch_dense_hessian_matvec(v_span, cones, cuopt::make_span(accum), stream);
   apply_w(v_span, cuopt::make_span(w_tmp), cones, stream);
   apply_w(raft::device_span<const double>(w_tmp.data(), w_tmp.size()),
           cuopt::make_span(w_squared_v),
@@ -374,13 +539,15 @@ TEST(second_order_cone_kernels, scaling_operators_match_host_reference)
   std::vector<double> expected_w_inv(n_cone_entries);
   std::vector<double> expected_h(n_cone_entries);
   std::vector<double> expected_h_unscaled(n_cone_entries);
+  std::vector<double> expected_accum(n_cone_entries);
 
   offset = 0;
   for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) {
-    const auto dim = cone_dimensions[cone];
-    const auto w0  = w_host[offset];
-    const auto v0  = v_host[offset];
-    const auto eta = eta_host[cone];
+    const auto dim        = cone_dimensions[cone];
+    const auto w0         = w_host[offset];
+    const auto v0         = v_host[offset];
+    const auto eta        = eta_host[cone];
+    const bool cone_dense = dim <= cones.soc_threshold;
 
     double tail_dot = 0.0;
     for (int local_idx = 1; local_idx < dim; ++local_idx) {
@@ -394,6 +561,8 @@ TEST(second_order_cone_kernels, scaling_operators_match_host_reference)
     const auto rho              = w0 * v0 + tail_dot;
     expected_h_unscaled[offset] = (eta * eta) * (2.0 * w0 * rho - v0);
     expected_h[offset]          = expected_h_unscaled[offset];
+    expected_accum[offset] =
+      accum_initial_host[offset] + (cone_dense ? expected_h_unscaled[offset] : 0.0);
 
     for (int local_idx = 1; local_idx < dim; ++local_idx) {
       const auto idx = offset + local_idx;
@@ -402,6 +571,7 @@ TEST(second_order_cone_kernels, scaling_operators_match_host_reference)
       expected_w_inv[idx]      = (v_host[idx] + (-v0 + tail_dot / (1.0 + w0)) * w_host[idx]) / eta;
       expected_h_unscaled[idx] = (eta * eta) * (2.0 * w_host[idx] * rho + v_host[idx]);
       expected_h[idx]          = expected_h_unscaled[idx];
+      expected_accum[idx] = accum_initial_host[idx] + (cone_dense ? expected_h_unscaled[idx] : 0.0);
     }
 
     offset += static_cast<std::size_t>(dim);
@@ -414,8 +584,7 @@ TEST(second_order_cone_kernels, scaling_operators_match_host_reference)
     EXPECT_NEAR(h_out_host[i], w_squared_v_host[i], 1e-9) << "W^2 v vs H v entry " << i;
     EXPECT_NEAR(recovered_dz_host[i], cone_target_host[i] - expected_h_unscaled[i], 1e-9)
       << "recovered dz entry " << i;
-    EXPECT_NEAR(accum_host[i], accum_initial_host[i] + expected_h_unscaled[i], 1e-9)
-      << "accumulated H entry " << i;
+    EXPECT_NEAR(accum_host[i], expected_accum[i], 1e-9) << "accumulated H entry " << i;
   }
 }
 
@@ -581,4 +750,176 @@ TEST(second_order_cone_kernels, combined_cone_rhs_matches_host_reference)
   }
 }
 
+TEST(second_order_cone_kernels, sparse_cone_classification)
+{
+  auto stream = rmm::cuda_stream_default;
+
+  // threshold=5: cones of dim 3,2 are dense; dim 6,32769 are sparse
+  std::vector<int> cone_dimensions{3, 6, 2, 32769};
+  rmm::device_uvector<double> x(32780, stream);
+  rmm::device_uvector<double> z(32780, stream);
+
+  cone_data_t<int, double> cones(
+    cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/5);
+
+  EXPECT_EQ(cones.soc_threshold, 5);
+  EXPECT_EQ(cones.n_sparse_cones, 2);
+  EXPECT_EQ(cones.n_dense_cones(), 2);
+  EXPECT_EQ(cones.expansion_var_count(), 4);
+  EXPECT_EQ(cones.n_sparse_cone_entries, std::size_t{32775});
+  EXPECT_EQ(cones.d.size(), 2u);
+  EXPECT_EQ(cones.sparse_v.size(), 32775u);
+  EXPECT_EQ(cones.sparse_u.size(), 32775u);
+  EXPECT_EQ(cuopt::host_copy(cones.sparse_cone_ids, stream), (std::vector<int>{1, 3}));
+  EXPECT_EQ(cuopt::host_copy(cones.sparse_cone_dims, stream), (std::vector<int>{6, 32769}));
+}
+
+namespace {
+
+struct sparse_scaling_head_t {
+  double d;
+  double u0;
+  double u1;
+  double v0;
+  double v1;
+};
+
+// Must match update_scaling_sparse_kernel in second_order_cone_kernels.cuh.
+sparse_scaling_head_t sparse_scaling_head_reference(double w0)
+{
+  const double alpha    = 2.0 * w0;
+  const double wsq      = 2.0 * w0 * w0 - 1.0;
+  const double wsq_safe = 0.5 * (wsq + std::sqrt(wsq * wsq + 1.0));
+  const double wsqinv   = 1.0 / wsq_safe;
+  const double d        = 0.5 * wsqinv;
+  const double radicand = wsq_safe - d;
+  const double u0       = std::sqrt(std::max(radicand, 0.0));
+  const double u1       = (u0 > 0.0) ? alpha / u0 : 0.0;
+  const double v0       = 0.0;
+  const double denom    = 2.0 * wsq_safe - wsqinv;
+  const double v1_arg   = (std::abs(denom) > 1e-12) ? 2.0 * (2.0 + wsqinv) / denom : 2.0;
+  const double v1       = std::sqrt(std::max(v1_arg, 0.0));
+  return {d, u0, u1, v0, v1};
+}
+
+}  // namespace
+
+TEST(second_order_cone_kernels, update_scaling_sparse_matches_reference)
+{
+  auto stream = rmm::cuda_stream_default;
+
+  std::vector<int> cone_dimensions{3, 6};
+  rmm::device_uvector<double> x(9, stream);
+  rmm::device_uvector<double> z(9, stream);
+
+  cone_data_t<int, double> cones(
+    cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/4);
+
+  ASSERT_EQ(cones.n_sparse_cones, 1);
+
+  // Place interior point in the sparse cone (second cone, dim 6).
+  std::vector<double> x_host(9, 0.0);
+  std::vector<double> z_host(9, 0.0);
+  x_host[3] = 2.0;
+  z_host[3] = 1.5;
+  for (int j = 1; j < 6; ++j) {
+    x_host[3 + j] = 0.1 * j;
+    z_host[3 + j] = 0.08 * j;
+  }
+  x_host[0] = 1.5;
+  z_host[0] = 1.2;
+  for (int j = 1; j < 3; ++j) {
+    x_host[j] = 0.05;
+    z_host[j] = 0.04;
+  }
+
+  raft::copy(cones.x.data(), x_host.data(), x_host.size(), stream);
+  raft::copy(cones.z.data(), z_host.data(), z_host.size(), stream);
+
+  launch_nt_scaling(cones, stream);
+  launch_update_scaling_sparse(cones, stream);
+
+  auto d_host   = cuopt::host_copy(cones.d, stream);
+  auto v_host   = cuopt::host_copy(cones.sparse_v, stream);
+  auto u_host   = cuopt::host_copy(cones.sparse_u, stream);
+  auto w_host   = cuopt::host_copy(cones.w, stream);
+  auto eta_host = cuopt::host_copy(cones.eta, stream);
+
+  const int sparse_cone = 1;
+  const int cone_dim    = 6;
+  const int cone_off    = 3;
+  const double eta      = eta_host[sparse_cone];
+
+  const auto head    = sparse_scaling_head_reference(w_host[cone_off]);
+  const double scale = eta * eta;
+
+  EXPECT_NEAR(d_host[0], head.d, 1e-10);
+  EXPECT_NEAR(v_host[0], scale * head.v0, 1e-10);
+  EXPECT_NEAR(u_host[0], scale * head.u0, 1e-10);
+  for (int j = 1; j < cone_dim; ++j) {
+    EXPECT_NEAR(v_host[j], scale * head.v1 * w_host[cone_off + j], 1e-10) << "v tail " << j;
+    EXPECT_NEAR(u_host[j], scale * head.u1 * w_host[cone_off + j], 1e-10) << "u tail " << j;
+  }
+}
+
+TEST(second_order_cone_kernels, update_scaling_sparse_two_cones)
+{
+  auto stream = rmm::cuda_stream_default;
+
+  // sparse cones: dim 6 (offset 3) and dim 5 (offset 9)
+  std::vector<int> cone_dimensions{3, 6, 5};
+  rmm::device_uvector<double> x(14, stream);
+  rmm::device_uvector<double> z(14, stream);
+
+  cone_data_t<int, double> cones(
+    cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/4);
+
+  ASSERT_EQ(cones.n_sparse_cones, 2);
+  ASSERT_EQ(cones.n_sparse_cone_entries, std::size_t{11});
+
+  std::vector<double> x_host(14, 0.0);
+  std::vector<double> z_host(14, 0.0);
+  for (int cone = 0; cone < 3; ++cone) {
+    const int off = (cone == 0) ? 0 : (cone == 1 ? 3 : 9);
+    const int dim = cone_dimensions[cone];
+    x_host[off]   = 2.0 + 0.1 * cone;
+    z_host[off]   = 1.5 + 0.05 * cone;
+    for (int j = 1; j < dim; ++j) {
+      x_host[off + j] = 0.1 * j;
+      z_host[off + j] = 0.08 * j;
+    }
+  }
+
+  raft::copy(cones.x.data(), x_host.data(), x_host.size(), stream);
+  raft::copy(cones.z.data(), z_host.data(), z_host.size(), stream);
+
+  launch_nt_scaling(cones, stream);
+  launch_update_scaling_sparse(cones, stream);
+
+  auto v_host   = cuopt::host_copy(cones.sparse_v, stream);
+  auto u_host   = cuopt::host_copy(cones.sparse_u, stream);
+  auto w_host   = cuopt::host_copy(cones.w, stream);
+  auto eta_host = cuopt::host_copy(cones.eta, stream);
+
+  const auto check_cone = [&](int sparse_idx, int cone_idx, int cone_off, int cone_dim) {
+    const double eta   = eta_host[cone_idx];
+    const auto head    = sparse_scaling_head_reference(w_host[cone_off]);
+    const double scale = eta * eta;
+
+    const int block_start = (sparse_idx == 0) ? 0 : 6;
+
+    EXPECT_NEAR(v_host[block_start], scale * head.v0, 1e-10) << "sparse " << sparse_idx;
+    EXPECT_NEAR(u_host[block_start], scale * head.u0, 1e-10) << "sparse " << sparse_idx;
+    for (int j = 1; j < cone_dim; ++j) {
+      EXPECT_NEAR(v_host[block_start + j], scale * head.v1 * w_host[cone_off + j], 1e-10)
+        << "v tail sparse " << sparse_idx << " j " << j;
+      EXPECT_NEAR(u_host[block_start + j], scale * head.u1 * w_host[cone_off + j], 1e-10)
+        << "u tail sparse " << sparse_idx << " j " << j;
+    }
+  };
+
+  check_cone(0, 1, 3, 6);
+  check_cone(1, 2, 9, 5);
+}
+
 }  // namespace cuopt::linear_programming::dual_simplex::test
diff --git a/cpp/tests/socp/solve_barrier_socp.cu b/cpp/tests/socp/solve_barrier_socp.cu
index f960c3dc3a..1878e43898 100644
--- a/cpp/tests/socp/solve_barrier_socp.cu
+++ b/cpp/tests/socp/solve_barrier_socp.cu
@@ -824,4 +824,214 @@ TEST(barrier, qp_with_soc_block)
   EXPECT_NEAR(std::abs(solution.x[3]), 0.0, 1e-4);
 }
 
+TEST(barrier, sparse_soc_expansion_solves_large_single_cone)
+{
+  // minimize x_0
+  // subject to x_1 = 1
+  //            (x_0, ..., x_5) in Q^6
+  //
+  // Optimal: x* = (1, 1, 0, 0, 0, 0), obj* = 1
+
+  raft::handle_t handle{};
+  init_handler(&handle);
+
+  using namespace cuopt::linear_programming::dual_simplex;
+  user_problem_t<int, double> user_problem(&handle);
+
+  constexpr int m  = 1;
+  constexpr int n  = 6;
+  constexpr int nz = 1;
+
+  user_problem.num_rows = m;
+  user_problem.num_cols = n;
+  user_problem.objective.assign(n, 0.0);
+  user_problem.objective[0] = 1.0;
+
+  user_problem.A.m      = m;
+  user_problem.A.n      = n;
+  user_problem.A.nz_max = nz;
+  user_problem.A.reallocate(nz);
+  user_problem.A.col_start = {0, 0, 1, 1, 1, 1, 1};
+  user_problem.A.i[0]      = 0;
+  user_problem.A.x[0]      = 1.0;
+
+  user_problem.rhs       = {1.0};
+  user_problem.row_sense = {'E'};
+  user_problem.lower.assign(n, 0.0);
+  user_problem.upper.assign(n, inf);
+
+  user_problem.num_range_rows         = 0;
+  user_problem.problem_name           = "sparse_soc_single_large_cone";
+  user_problem.cone_var_start         = 0;
+  user_problem.second_order_cone_dims = {6};
+  user_problem.var_types.assign(n, variable_type_t::CONTINUOUS);
+
+  simplex_solver_settings_t<int, double> settings;
+  settings.barrier                      = true;
+  settings.barrier_presolve             = true;
+  settings.dualize                      = 0;
+  settings.barrier_soc_threshold        = 4;
+  settings.barrier_iterative_refinement = true;
+
+  lp_solution_t<int, double> solution(m, n);
+  auto status = solve_linear_program_with_barrier(user_problem, settings, solution);
+  EXPECT_EQ(status, lp_status_t::OPTIMAL);
+  EXPECT_NEAR(solution.objective, 1.0, 1e-4);
+  EXPECT_NEAR(solution.x[0], 1.0, 1e-4);
+  EXPECT_NEAR(solution.x[1], 1.0, 1e-4);
+  for (int j = 2; j < n; ++j) {
+    EXPECT_NEAR(std::abs(solution.x[j]), 0.0, 1e-4) << "index " << j;
+  }
+}
+
+TEST(barrier, mixed_dense_and_sparse_soc_blocks)
+{
+  // Variables ordered as [l1, l2 | dense Q^3 | sparse Q^6].
+  //
+  // minimize   t_dense + t_sparse
+  // subject to l1 - u_dense = 0
+  //            l2 - u_sparse = 0
+  //            l1 + l2 >= 3
+  //            l1 - l2 = 1
+  //
+  // Same optimum as mixed_linear_and_two_soc_blocks_with_inequality: obj* = 3.
+  raft::handle_t handle{};
+  init_handler(&handle);
+
+  using namespace cuopt::linear_programming::dual_simplex;
+  user_problem_t<int, double> user_problem(&handle);
+
+  constexpr int m  = 4;
+  constexpr int n  = 11;
+  constexpr int nz = 8;
+
+  user_problem.num_rows = m;
+  user_problem.num_cols = n;
+  user_problem.objective.assign(n, 0.0);
+  user_problem.objective[2] = 1.0;  // t_dense (head of dense Q^3 block)
+  user_problem.objective[5] = 1.0;  // t_sparse (head of sparse Q^6 block)
+
+  user_problem.A.m      = m;
+  user_problem.A.n      = n;
+  user_problem.A.nz_max = nz;
+  user_problem.A.reallocate(nz);
+  // Columns: l1, l2, t_d, u_d, v_d, t_s, u_s, v_s, w_s, y_s, z_s
+  user_problem.A.col_start = {0, 3, 6, 6, 7, 7, 7, 8, 8, 8, 8, 8};
+  user_problem.A.i[0]      = 0;
+  user_problem.A.x[0]      = 1.0;
+  user_problem.A.i[1]      = 2;
+  user_problem.A.x[1]      = 1.0;
+  user_problem.A.i[2]      = 3;
+  user_problem.A.x[2]      = 1.0;
+  user_problem.A.i[3]      = 1;
+  user_problem.A.x[3]      = 1.0;
+  user_problem.A.i[4]      = 2;
+  user_problem.A.x[4]      = 1.0;
+  user_problem.A.i[5]      = 3;
+  user_problem.A.x[5]      = -1.0;
+  user_problem.A.i[6]      = 0;
+  user_problem.A.x[6]      = -1.0;
+  user_problem.A.i[7]      = 1;
+  user_problem.A.x[7]      = -1.0;
+
+  user_problem.rhs       = {0.0, 0.0, 3.0, 1.0};
+  user_problem.row_sense = {'E', 'E', 'G', 'E'};
+  user_problem.lower.assign(n, 0.0);
+  user_problem.upper.assign(n, inf);
+
+  user_problem.num_range_rows         = 0;
+  user_problem.problem_name           = "mixed_dense_and_sparse_soc_blocks";
+  user_problem.cone_var_start         = 2;
+  user_problem.second_order_cone_dims = {3, 6};
+  user_problem.var_types.assign(n, variable_type_t::CONTINUOUS);
+
+  simplex_solver_settings_t<int, double> settings;
+  settings.barrier                      = true;
+  settings.barrier_presolve             = true;
+  settings.dualize                      = 0;
+  settings.scale_columns                = true;
+  settings.barrier_soc_threshold        = 4;
+  settings.barrier_iterative_refinement = true;
+
+  lp_solution_t<int, double> solution(m, n);
+  auto status = solve_linear_program_with_barrier(user_problem, settings, solution);
+
+  EXPECT_EQ(status, lp_status_t::OPTIMAL);
+  EXPECT_NEAR(solution.objective, 3.0, 1e-4);
+  EXPECT_NEAR(solution.x[0], 2.0, 1e-4);
+  EXPECT_NEAR(solution.x[1], 1.0, 1e-4);
+  EXPECT_NEAR(solution.x[2], 2.0, 1e-4);
+  EXPECT_NEAR(solution.x[3], 2.0, 1e-4);
+  EXPECT_NEAR(std::abs(solution.x[4]), 0.0, 1e-4);
+  EXPECT_NEAR(solution.x[5], 1.0, 1e-4);
+  EXPECT_NEAR(solution.x[6], 1.0, 1e-4);
+  for (int j = 7; j < n; ++j) {
+    EXPECT_NEAR(std::abs(solution.x[j]), 0.0, 1e-4) << "index " << j;
+  }
+}
+
+TEST(barrier, sparse_soc_expansion_solves_dim_500_cone)
+{
+  // minimize x_0
+  // subject to x_1 = 1
+  //            (x_0, ..., x_499) in Q^500
+  //
+  // Optimal: x* = (1, 1, 0, ..., 0), obj* = 1
+  // Sparse cone (dim 500 > default threshold 5) with barrier IR enabled.
+  raft::handle_t handle{};
+  init_handler(&handle);
+
+  using namespace cuopt::linear_programming::dual_simplex;
+  user_problem_t<int, double> user_problem(&handle);
+
+  constexpr int m  = 1;
+  constexpr int n  = 500;
+  constexpr int nz = 1;
+
+  user_problem.num_rows = m;
+  user_problem.num_cols = n;
+  user_problem.objective.assign(n, 0.0);
+  user_problem.objective[0] = 1.0;
+
+  user_problem.A.m      = m;
+  user_problem.A.n      = n;
+  user_problem.A.nz_max = nz;
+  user_problem.A.reallocate(nz);
+  // x_1 = 1: nonzero in column 1.
+  user_problem.A.col_start.assign(n + 1, 1);
+  user_problem.A.col_start[0] = 0;
+  user_problem.A.col_start[1] = 0;
+  user_problem.A.i[0]         = 0;
+  user_problem.A.x[0]         = 1.0;
+
+  user_problem.rhs       = {1.0};
+  user_problem.row_sense = {'E'};
+  user_problem.lower.assign(n, 0.0);
+  user_problem.upper.assign(n, inf);
+
+  user_problem.num_range_rows         = 0;
+  user_problem.problem_name           = "sparse_soc_dim_500";
+  user_problem.cone_var_start         = 0;
+  user_problem.second_order_cone_dims = {500};
+  user_problem.var_types.assign(n, variable_type_t::CONTINUOUS);
+
+  simplex_solver_settings_t<int, double> settings;
+  settings.barrier                      = true;
+  settings.barrier_presolve             = true;
+  settings.dualize                      = 0;
+  settings.barrier_soc_threshold        = 5;
+  settings.barrier_iterative_refinement = true;
+
+  lp_solution_t<int, double> solution(m, n);
+  auto status = solve_linear_program_with_barrier(user_problem, settings, solution);
+
+  EXPECT_EQ(status, lp_status_t::OPTIMAL);
+  EXPECT_NEAR(solution.objective, 1.0, 1e-3);
+  EXPECT_NEAR(solution.x[0], 1.0, 1e-3);
+  EXPECT_NEAR(solution.x[1], 1.0, 1e-3);
+  for (int j = 2; j < n; ++j) {
+    EXPECT_NEAR(std::abs(solution.x[j]), 0.0, 1e-3) << "index " << j;
+  }
+}
+
 }  // namespace cuopt::linear_programming::dual_simplex::test
diff --git a/cpp/tests/socp/sparse_augmented_kkt_test.cu b/cpp/tests/socp/sparse_augmented_kkt_test.cu
new file mode 100644
index 0000000000..400a66a536
--- /dev/null
+++ b/cpp/tests/socp/sparse_augmented_kkt_test.cu
@@ -0,0 +1,353 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#include <barrier/second_order_cone_kernels.cuh>
+
+#include <utilities/copy_helpers.hpp>
+
+#include <gtest/gtest.h>
+
+#include <thrust/fill.h>
+
+#include <cmath>
+#include <vector>
+
+namespace cuopt::linear_programming::dual_simplex::test {
+
+TEST(sparse_augmented_kkt, cone_counts_and_expansion_size)
+{
+  auto stream = rmm::cuda_stream_default;
+
+  std::vector<int> cone_dimensions{3, 6, 5};
+  rmm::device_uvector<double> x(14, stream);
+  rmm::device_uvector<double> z(14, stream);
+
+  cone_data_t<int, double> cones(
+    cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/4);
+
+  EXPECT_EQ(cones.n_sparse_cones, 2);
+  EXPECT_EQ(cones.n_dense_cones(), 1);
+  EXPECT_EQ(cones.expansion_var_count(), 4);
+  EXPECT_EQ(cones.n_sparse_cone_entries, 11u);
+}
+
+TEST(sparse_augmented_kkt, launch_get_Hs_sparse)
+{
+  auto stream = rmm::cuda_stream_default;
+
+  std::vector<int> cone_dimensions{3, 6};
+  rmm::device_uvector<double> x(9, stream);
+  rmm::device_uvector<double> z(9, stream);
+
+  cone_data_t<int, double> cones(
+    cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/4);
+
+  ASSERT_EQ(cones.n_sparse_cones, 1);
+
+  std::vector<double> x_host(9, 0.0);
+  std::vector<double> z_host(9, 0.0);
+  x_host[3] = 2.0;
+  z_host[3] = 1.5;
+  for (int j = 1; j < 6; ++j) {
+    x_host[3 + j] = 0.1 * j;
+    z_host[3 + j] = 0.08 * j;
+  }
+  x_host[0] = 1.5;
+  z_host[0] = 1.2;
+  for (int j = 1; j < 3; ++j) {
+    x_host[j] = 0.05;
+    z_host[j] = 0.04;
+  }
+
+  raft::copy(cones.x.data(), x_host.data(), x_host.size(), stream);
+  raft::copy(cones.z.data(), z_host.data(), z_host.size(), stream);
+
+  launch_nt_scaling(cones, stream);
+  launch_update_scaling_sparse(cones, stream);
+
+  rmm::device_uvector<double> Hs_diag(cones.n_sparse_cone_entries, stream);
+  launch_get_Hs_sparse(cones, Hs_diag, stream);
+
+  auto d_host   = cuopt::host_copy(cones.d, stream);
+  auto eta_host = cuopt::host_copy(cones.eta, stream);
+  auto hs_host  = cuopt::host_copy(Hs_diag, stream);
+
+  const int sparse_cone = 1;
+  const double eta_sq   = eta_host[sparse_cone] * eta_host[sparse_cone];
+
+  EXPECT_NEAR(hs_host[0], eta_sq * d_host[0], 1e-10);
+  for (int j = 1; j < 6; ++j) {
+    EXPECT_NEAR(hs_host[j], eta_sq, 1e-10) << "tail index " << j;
+  }
+}
+
+TEST(sparse_augmented_kkt, scatter_and_update_sparse_expansion)
+{
+  auto stream = rmm::cuda_stream_default;
+
+  std::vector<int> cone_dimensions{6};
+  rmm::device_uvector<double> x(6, stream);
+  rmm::device_uvector<double> z(6, stream);
+
+  cone_data_t<int, double> cones(
+    cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/4);
+
+  ASSERT_EQ(cones.n_sparse_cones, 1);
+  ASSERT_EQ(cones.expansion_var_count(), 2);
+
+  std::vector<double> x_host{2.0, 0.2, 0.3, 0.4, 0.5, 0.6};
+  std::vector<double> z_host{1.5, 0.1, 0.15, 0.2, 0.25, 0.3};
+  raft::copy(cones.x.data(), x_host.data(), x_host.size(), stream);
+  raft::copy(cones.z.data(), z_host.data(), z_host.size(), stream);
+
+  launch_nt_scaling(cones, stream);
+  launch_update_scaling_sparse(cones, stream);
+
+  const int nnz = 32;
+
+  rmm::device_uvector<double> augmented_x(nnz, stream);
+  thrust::fill(rmm::exec_policy(stream), augmented_x.begin(), augmented_x.end(), 0.0);
+
+  // One coupling/Hessian CSR entry per sparse-cone element, matching production where every
+  // *_col/*_row index array has size n_sparse_cone_entries (== cones.sparse_v.size()).
+  const int q_dim = 6;
+  ASSERT_EQ(cones.n_sparse_cone_entries, static_cast<std::size_t>(q_dim));
+
+  std::vector<int> sparse_hessian_diag(q_dim);
+  std::vector<double> sparse_hessian_Q(q_dim, 0.0);
+  std::vector<int> sparse_exp_v_col(q_dim);
+  std::vector<int> sparse_exp_u_col(q_dim);
+  std::vector<int> sparse_exp_v_row(q_dim);
+  std::vector<int> sparse_exp_u_row(q_dim);
+  std::vector<int> sparse_expansion_D{30, 31};
+  for (int j = 0; j < q_dim; ++j) {
+    sparse_hessian_diag[j] = j;       // CSR slots 0..5
+    sparse_exp_v_col[j]    = 6 + j;   // 6..11
+    sparse_exp_u_col[j]    = 12 + j;  // 12..17
+    sparse_exp_v_row[j]    = 18 + j;  // 18..23
+    sparse_exp_u_row[j]    = 24 + j;  // 24..29
+  }
+
+  auto d_sparse_hessian_diag = cuopt::device_copy(sparse_hessian_diag, stream);
+  auto d_sparse_hessian_Q    = cuopt::device_copy(sparse_hessian_Q, stream);
+  auto d_sparse_exp_v_col    = cuopt::device_copy(sparse_exp_v_col, stream);
+  auto d_sparse_exp_u_col    = cuopt::device_copy(sparse_exp_u_col, stream);
+  auto d_sparse_exp_v_row    = cuopt::device_copy(sparse_exp_v_row, stream);
+  auto d_sparse_exp_u_row    = cuopt::device_copy(sparse_exp_u_row, stream);
+  auto d_sparse_expansion_D  = cuopt::device_copy(sparse_expansion_D, stream);
+  rmm::device_uvector<double> d_sparse_Hs_diag(cones.n_sparse_cone_entries, stream);
+
+  launch_get_Hs_sparse(cones, d_sparse_Hs_diag, stream);
+  scatter_sparse_hessian_diag_into_augmented(
+    augmented_x, d_sparse_hessian_diag, d_sparse_Hs_diag, d_sparse_hessian_Q, stream, 0.0);
+  update_sparse_expansion_in_augmented(augmented_x,
+                                       d_sparse_exp_v_col,
+                                       d_sparse_exp_u_col,
+                                       d_sparse_exp_v_row,
+                                       d_sparse_exp_u_row,
+                                       d_sparse_expansion_D,
+                                       cones.sparse_v,
+                                       cones.sparse_u,
+                                       cones.eta,
+                                       cones.sparse_cone_ids,
+                                       stream,
+                                       0.0);
+
+  auto aug_host = cuopt::host_copy(augmented_x, stream);
+  auto v_host   = cuopt::host_copy(cones.sparse_v, stream);
+  auto u_host   = cuopt::host_copy(cones.sparse_u, stream);
+  auto hs_host  = cuopt::host_copy(d_sparse_Hs_diag, stream);
+  auto eta_host = cuopt::host_copy(cones.eta, stream);
+
+  for (int j = 0; j < q_dim; ++j) {
+    EXPECT_NEAR(aug_host[j], -hs_host[j], 1e-10) << "hessian diag " << j;
+    EXPECT_NEAR(aug_host[6 + j], v_host[j], 1e-10) << "v col " << j;
+    EXPECT_NEAR(aug_host[12 + j], u_host[j], 1e-10) << "u col " << j;
+    EXPECT_NEAR(aug_host[18 + j], v_host[j], 1e-10) << "v row " << j;
+    EXPECT_NEAR(aug_host[24 + j], u_host[j], 1e-10) << "u row " << j;
+  }
+
+  const double eta_sq = eta_host[0] * eta_host[0];
+  EXPECT_NEAR(aug_host[30], -eta_sq, 1e-10);
+  EXPECT_NEAR(aug_host[31], eta_sq, 1e-10);
+}
+
+TEST(sparse_augmented_kkt, sparse_augmented_matvec)
+{
+  auto stream = rmm::cuda_stream_default;
+
+  std::vector<int> cone_dimensions{6};
+  rmm::device_uvector<double> x(6, stream);
+  rmm::device_uvector<double> z(6, stream);
+
+  cone_data_t<int, double> cones(
+    cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/4);
+
+  ASSERT_EQ(cones.n_sparse_cones, 1);
+  ASSERT_EQ(cones.expansion_var_count(), 2);
+
+  std::vector<double> x_host{2.0, 0.2, 0.3, 0.4, 0.5, 0.6};
+  std::vector<double> z_host{1.5, 0.1, 0.15, 0.2, 0.25, 0.3};
+  raft::copy(cones.x.data(), x_host.data(), x_host.size(), stream);
+  raft::copy(cones.z.data(), z_host.data(), z_host.size(), stream);
+
+  launch_nt_scaling(cones, stream);
+  launch_update_scaling_sparse(cones, stream);
+
+  const int n_primal = 6;
+  const int m_rows   = 1;
+  const int p        = cones.expansion_var_count();
+  const int sys_size = n_primal + m_rows + p;
+
+  std::vector<double> x_vec(sys_size, 0.0);
+  x_vec[0]                     = 1.1;
+  x_vec[1]                     = 0.3;
+  x_vec[2]                     = 0.4;
+  x_vec[3]                     = 0.2;
+  x_vec[4]                     = 0.5;
+  x_vec[5]                     = 0.6;
+  x_vec[n_primal + m_rows]     = 0.25;   // expansion v
+  x_vec[n_primal + m_rows + 1] = -0.15;  // expansion u
+
+  rmm::device_uvector<double> d_x(sys_size, stream);
+  rmm::device_uvector<double> d_r1(n_primal, stream);
+  rmm::device_uvector<double> d_y_exp(p, stream);
+  rmm::device_uvector<double> d_hs(cones.n_sparse_cone_entries, stream);
+
+  raft::copy(d_x.data(), x_vec.data(), sys_size, stream);
+  thrust::fill(rmm::exec_policy(stream), d_r1.begin(), d_r1.end(), 0.0);
+  thrust::fill(rmm::exec_policy(stream), d_y_exp.begin(), d_y_exp.end(), 0.0);
+
+  launch_get_Hs_sparse(cones, d_hs, stream);
+  launch_sparse_augmented_matvec(raft::device_span<const double>(d_x.data(), d_x.size()),
+                                 raft::device_span<double>(d_r1.data(), d_r1.size()),
+                                 raft::device_span<double>(d_y_exp.data(), d_y_exp.size()),
+                                 cones,
+                                 raft::device_span<const double>(d_hs.data(), d_hs.size()),
+                                 /*cone_var_start=*/0,
+                                 n_primal,
+                                 m_rows,
+                                 stream);
+
+  auto r1_host   = cuopt::host_copy(d_r1, stream);
+  auto yexp_host = cuopt::host_copy(d_y_exp, stream);
+  auto hs_host   = cuopt::host_copy(d_hs, stream);
+  auto v_host    = cuopt::host_copy(cones.sparse_v, stream);
+  auto u_host    = cuopt::host_copy(cones.sparse_u, stream);
+  auto eta_host  = cuopt::host_copy(cones.eta, stream);
+
+  const double eta_sq = eta_host[0] * eta_host[0];
+  double dot_v        = 0.0;
+  double dot_u        = 0.0;
+  for (int j = 0; j < 6; ++j) {
+    dot_v += v_host[j] * x_vec[j];
+    dot_u += u_host[j] * x_vec[j];
+    const double expected = hs_host[j] * x_vec[j] - v_host[j] * x_vec[n_primal + m_rows] -
+                            u_host[j] * x_vec[n_primal + m_rows + 1];
+    EXPECT_NEAR(r1_host[j], expected, 1e-10) << "primal row " << j;
+  }
+
+  EXPECT_NEAR(yexp_host[0], -eta_sq * x_vec[n_primal + m_rows] + dot_v, 1e-10);
+  EXPECT_NEAR(yexp_host[1], eta_sq * x_vec[n_primal + m_rows + 1] + dot_u, 1e-10);
+}
+
+TEST(sparse_augmented_kkt, update_scaling_sparse_dim_1000)
+{
+  auto stream = rmm::cuda_stream_default;
+
+  std::vector<int> cone_dimensions{1000};
+  rmm::device_uvector<double> x(1000, stream);
+  rmm::device_uvector<double> z(1000, stream);
+
+  cone_data_t<int, double> cones(
+    cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/5);
+
+  ASSERT_EQ(cones.n_sparse_cones, 1);
+  ASSERT_EQ(cones.n_sparse_cone_entries, 1000u);
+  ASSERT_EQ(cones.expansion_var_count(), 2);
+
+  std::vector<double> x_host(1000);
+  std::vector<double> z_host(1000);
+  x_host[0] = 100.0;
+  z_host[0] = 80.0;
+  for (int j = 1; j < 1000; ++j) {
+    x_host[j] = 0.001 * ((j % 5) + 1);
+    z_host[j] = 0.0015 * ((j % 7) + 1);
+  }
+
+  raft::copy(cones.x.data(), x_host.data(), x_host.size(), stream);
+  raft::copy(cones.z.data(), z_host.data(), z_host.size(), stream);
+
+  launch_nt_scaling(cones, stream);
+  launch_update_scaling_sparse(cones, stream);
+
+  auto d_host   = cuopt::host_copy(cones.d, stream);
+  auto v_host   = cuopt::host_copy(cones.sparse_v, stream);
+  auto u_host   = cuopt::host_copy(cones.sparse_u, stream);
+  auto eta_host = cuopt::host_copy(cones.eta, stream);
+
+  EXPECT_GT(d_host[0], 0.0);
+  EXPECT_GT(eta_host[0], 0.0);
+  for (int j = 0; j < 1000; ++j) {
+    EXPECT_TRUE(std::isfinite(v_host[j])) << "v entry " << j;
+    EXPECT_TRUE(std::isfinite(u_host[j])) << "u entry " << j;
+  }
+
+  rmm::device_uvector<double> Hs_diag(cones.n_sparse_cone_entries, stream);
+  launch_get_Hs_sparse(cones, Hs_diag, stream);
+
+  const int n_primal = 1000;
+  const int m_rows   = 1;
+  const int p        = cones.expansion_var_count();
+  const int sys_size = n_primal + m_rows + p;
+
+  std::vector<double> x_vec(sys_size, 0.0);
+  for (int j = 0; j < 1000; ++j) {
+    x_vec[j] = x_host[j];
+  }
+  x_vec[n_primal + m_rows]     = 0.25;
+  x_vec[n_primal + m_rows + 1] = -0.15;
+
+  rmm::device_uvector<double> d_x(sys_size, stream);
+  rmm::device_uvector<double> d_r1(n_primal, stream);
+  rmm::device_uvector<double> d_y_exp(p, stream);
+
+  raft::copy(d_x.data(), x_vec.data(), sys_size, stream);
+  thrust::fill(rmm::exec_policy(stream), d_r1.begin(), d_r1.end(), 0.0);
+  thrust::fill(rmm::exec_policy(stream), d_y_exp.begin(), d_y_exp.end(), 0.0);
+
+  launch_sparse_augmented_matvec(raft::device_span<const double>(d_x.data(), d_x.size()),
+                                 raft::device_span<double>(d_r1.data(), d_r1.size()),
+                                 raft::device_span<double>(d_y_exp.data(), d_y_exp.size()),
+                                 cones,
+                                 raft::device_span<const double>(Hs_diag.data(), Hs_diag.size()),
+                                 /*cone_var_start=*/0,
+                                 n_primal,
+                                 m_rows,
+                                 stream);
+
+  auto r1_host   = cuopt::host_copy(d_r1, stream);
+  auto yexp_host = cuopt::host_copy(d_y_exp, stream);
+  auto hs_host   = cuopt::host_copy(Hs_diag, stream);
+
+  const double eta_sq  = eta_host[0] * eta_host[0];
+  const double x_exp_v = x_vec[n_primal + m_rows];
+  const double x_exp_u = x_vec[n_primal + m_rows + 1];
+  double dot_v         = 0.0;
+  double dot_u         = 0.0;
+  for (int j = 0; j < 1000; ++j) {
+    dot_v += v_host[j] * x_vec[j];
+    dot_u += u_host[j] * x_vec[j];
+    const double expected = hs_host[j] * x_vec[j] - v_host[j] * x_exp_v - u_host[j] * x_exp_u;
+    EXPECT_NEAR(r1_host[j], expected, 1e-9) << "primal row " << j;
+  }
+
+  EXPECT_NEAR(yexp_host[0], -eta_sq * x_exp_v + dot_v, 1e-9);
+  EXPECT_NEAR(yexp_host[1], eta_sq * x_exp_u + dot_u, 1e-9);
+}
+
+}  // namespace cuopt::linear_programming::dual_simplex::test

From 4ed64ef5f011f44a998e260809bfb6ab8bdabc9a Mon Sep 17 00:00:00 2001
From: yuwenchen95 <yuwchen@nvidia.com>
Date: Mon, 29 Jun 2026 07:04:28 -0700
Subject: [PATCH 2/2] Stricter IR tolerance for SOCP with sparse socs

Signed-off-by: yuwenchen95 <yuwchen@nvidia.com>
---
 cpp/src/barrier/barrier.cu               | 12 ++++++++----
 cpp/src/barrier/iterative_refinement.hpp | 25 +++++++++++++++---------
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu
index 0a5377ea01..45b1feab4a 100644
--- a/cpp/src/barrier/barrier.cu
+++ b/cpp/src/barrier/barrier.cu
@@ -683,9 +683,11 @@ class iteration_data_t {
 
   i_t cone_end() const { return cone_start() + cone_entry_count(); }
 
+  bool has_sparse_cones() const { return has_cones() && cones_->n_sparse_cones > 0; }
+
   i_t augmented_expansion_count() const
   {
-    return has_cones() && cones().has_sparse_cones() ? cones().expansion_var_count() : i_t(0);
+    return has_sparse_cones() ? cones().expansion_var_count() : i_t(0);
   }
 
   i_t augmented_system_size(i_t n, i_t m) const { return n + m + augmented_expansion_count(); }
@@ -2563,7 +2565,8 @@ int barrier_solver_t<i_t, f_t>::initial_point(iteration_data_t<i_t, f_t>& data)
     } op(data);
 
     if (settings.barrier_iterative_refinement) {
-      iterative_refinement<i_t, f_t, op_t>(op, rhs, soln);
+      const f_t ir_tol = data.has_sparse_cones() ? f_t(1e-12) : f_t(1e-8);
+      iterative_refinement<i_t, f_t, op_t>(op, rhs, soln, ir_tol);
     }
 
     for (i_t k = 0; k < lp.num_cols; k++) {
@@ -3196,8 +3199,9 @@ i_t barrier_solver_t<i_t, f_t>::gpu_compute_search_direction(iteration_data_t<i_
       }
     } op(data);
     if (settings.barrier_iterative_refinement) {
-      const f_t solve_err =
-        iterative_refinement<i_t, f_t, op_t>(op, data.d_augmented_rhs_, data.d_augmented_soln_);
+      const f_t ir_tol    = data.has_sparse_cones() ? f_t(1e-12) : f_t(1e-8);
+      const f_t solve_err = iterative_refinement<i_t, f_t, op_t>(
+        op, data.d_augmented_rhs_, data.d_augmented_soln_, ir_tol);
       if (solve_err > 1e-1) {
         settings.log.printf("|| Aug (dx, dy) - aug_rhs || %e after IR\n", solve_err);
       }
diff --git a/cpp/src/barrier/iterative_refinement.hpp b/cpp/src/barrier/iterative_refinement.hpp
index 8ad0bb4c49..d7bc8276a7 100644
--- a/cpp/src/barrier/iterative_refinement.hpp
+++ b/cpp/src/barrier/iterative_refinement.hpp
@@ -88,7 +88,8 @@ f_t vector_norm2(const rmm::device_uvector<f_t>& x)
 template <typename i_t, typename f_t, typename T>
 f_t iterative_refinement_simple(T& op,
                                 const rmm::device_uvector<f_t>& b,
-                                rmm::device_uvector<f_t>& x)
+                                rmm::device_uvector<f_t>& x,
+                                f_t tol = 1e-8)
 {
   rmm::device_uvector<f_t> x_sav(x, x.stream());
 
@@ -105,7 +106,7 @@ f_t iterative_refinement_simple(T& op,
   }
   rmm::device_uvector<f_t> delta_x(x.size(), op.data_.handle_ptr->get_stream());
   i_t iter = 0;
-  while (error > 1e-8 && iter < 30) {
+  while (error > tol && iter < 30) {
     thrust::fill(op.data_.handle_ptr->get_thrust_policy(),
                  delta_x.data(),
                  delta_x.data() + delta_x.size(),
@@ -154,7 +155,8 @@ f_t iterative_refinement_simple(T& op,
 template <typename i_t, typename f_t, typename T>
 f_t iterative_refinement_gmres(T& op,
                                const rmm::device_uvector<f_t>& b,
-                               rmm::device_uvector<f_t>& x)
+                               rmm::device_uvector<f_t>& x,
+                               f_t tol = 1e-8)
 {
   // Parameters
   // Ideally, we do not need to restart here. But having restarts helps as a checkpoint to get
@@ -162,7 +164,6 @@ f_t iterative_refinement_gmres(T& op,
   // are not converging after some point
   const int max_restarts = 3;
   const int m            = 10;  // Krylov space dimension
-  const f_t tol          = 1e-8;
 
   rmm::device_uvector<f_t> r(x.size(), x.stream());
   rmm::device_uvector<f_t> x_sav(x, x.stream());
@@ -188,7 +189,7 @@ f_t iterative_refinement_gmres(T& op,
 
   f_t norm_r = vector_norm_inf<f_t>(r);
   if (show_info) { CUOPT_LOG_INFO("GMRES IR: initial residual = %e, |b| = %e", norm_r, bnorm); }
-  if (norm_r <= 1e-8) { return norm_r; }
+  if (norm_r <= tol) { return norm_r; }
 
   f_t residual      = norm_r;
   f_t best_residual = norm_r;
@@ -392,13 +393,16 @@ f_t iterative_refinement_gmres(T& op,
 }
 
 template <typename i_t, typename f_t, typename T>
-f_t iterative_refinement(T& op, const dense_vector_t<i_t, f_t>& b, dense_vector_t<i_t, f_t>& x)
+f_t iterative_refinement(T& op,
+                         const dense_vector_t<i_t, f_t>& b,
+                         dense_vector_t<i_t, f_t>& x,
+                         f_t tol = 1e-8)
 {
   rmm::device_uvector<f_t> d_b(b.size(), op.data_.handle_ptr->get_stream());
   raft::copy(d_b.data(), b.data(), b.size(), op.data_.handle_ptr->get_stream());
   rmm::device_uvector<f_t> d_x(x.size(), op.data_.handle_ptr->get_stream());
   raft::copy(d_x.data(), x.data(), x.size(), op.data_.handle_ptr->get_stream());
-  auto err = iterative_refinement_gmres<i_t, f_t, T>(op, d_b, d_x);
+  auto err = iterative_refinement_gmres<i_t, f_t, T>(op, d_b, d_x, tol);
 
   raft::copy(x.data(), d_x.data(), x.size(), op.data_.handle_ptr->get_stream());
 
@@ -407,9 +411,12 @@ f_t iterative_refinement(T& op, const dense_vector_t<i_t, f_t>& b, dense_vector_
 }
 
 template <typename i_t, typename f_t, typename T>
-f_t iterative_refinement(T& op, const rmm::device_uvector<f_t>& b, rmm::device_uvector<f_t>& x)
+f_t iterative_refinement(T& op,
+                         const rmm::device_uvector<f_t>& b,
+                         rmm::device_uvector<f_t>& x,
+                         f_t tol = 1e-8)
 {
-  return iterative_refinement_gmres<i_t, f_t, T>(op, b, x);
+  return iterative_refinement_gmres<i_t, f_t, T>(op, b, x, tol);
 }
 
 }  // namespace cuopt::mathematical_optimization::barrier