From 01f346932591a781eb80252f3ed71938a6e01209 Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Wed, 24 Jun 2026 07:31:37 -0700 Subject: [PATCH 1/2] Add support for sparsity exploitation for SOCP of large dimensionality Signed-off-by: yuwenchen95 --- .../cuopt/linear_programming/constants.h | 1 + .../pdlp/solver_settings.hpp | 1 + cpp/src/barrier/barrier.cu | 491 +++++++++++--- cpp/src/barrier/iterative_refinement.hpp | 1 - cpp/src/barrier/second_order_cone_kernels.cuh | 601 ++++++++++++++++-- .../barrier/second_order_cone_reduction.cuh | 116 ++-- .../dual_simplex/simplex_solver_settings.hpp | 2 + cpp/src/math_optimization/solver_settings.cu | 1 + cpp/src/pdlp/solve.cu | 1 + cpp/tests/socp/CMakeLists.txt | 1 + cpp/tests/socp/second_order_cone_kernels.cu | 355 ++++++++++- cpp/tests/socp/solve_barrier_socp.cu | 210 ++++++ cpp/tests/socp/sparse_augmented_kkt_test.cu | 353 ++++++++++ 13 files changed, 1920 insertions(+), 214 deletions(-) create mode 100644 cpp/tests/socp/sparse_augmented_kkt_test.cu diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index 024fae7583..041cc38688 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -48,6 +48,7 @@ #define CUOPT_ORDERING "ordering" #define CUOPT_BARRIER_DUAL_INITIAL_POINT "barrier_dual_initial_point" #define CUOPT_BARRIER_ITERATIVE_REFINEMENT "barrier_iterative_refinement" +#define CUOPT_BARRIER_SOC_THRESHOLD "barrier_soc_threshold" #define CUOPT_BARRIER_STEP_SCALE "barrier_step_scale" #define CUOPT_ELIMINATE_DENSE_COLUMNS "eliminate_dense_columns" #define CUOPT_CUDSS_DETERMINISTIC "cudss_deterministic" diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index b30286f9ce..dd1979de6c 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -283,6 +283,7 @@ class pdlp_solver_settings_t { bool eliminate_dense_columns{true}; pdlp_precision_t pdlp_precision{pdlp_precision_t::DefaultPrecision}; bool barrier_iterative_refinement{true}; + i_t barrier_soc_threshold{5}; f_t barrier_step_scale{0.9}; bool save_best_primal_so_far{false}; /** diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 1cbeed5cf3..845624b6ef 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -40,6 +40,8 @@ #include #include +#include + #include #include #include @@ -262,6 +264,16 @@ class iteration_data_t { d_augmented_diagonal_indices_(0, lp.handle_ptr->get_stream()), d_cone_csr_indices_(0, lp.handle_ptr->get_stream()), d_cone_Q_values_(0, lp.handle_ptr->get_stream()), + d_dense_cone_block_offsets_(0, lp.handle_ptr->get_stream()), + d_dense_cone_ids_(0, lp.handle_ptr->get_stream()), + d_sparse_hessian_diag_(0, lp.handle_ptr->get_stream()), + d_sparse_hessian_Q_(0, lp.handle_ptr->get_stream()), + d_sparse_exp_v_col_(0, lp.handle_ptr->get_stream()), + d_sparse_exp_u_col_(0, lp.handle_ptr->get_stream()), + d_sparse_exp_v_row_(0, lp.handle_ptr->get_stream()), + d_sparse_exp_u_row_(0, lp.handle_ptr->get_stream()), + d_sparse_expansion_D_(0, lp.handle_ptr->get_stream()), + d_sparse_Hs_diag_(0, lp.handle_ptr->get_stream()), use_augmented(false), has_factorization(false), n_direct_free_linear(0), @@ -406,7 +418,8 @@ class iteration_data_t { std::span(lp.second_order_cone_dims.data(), lp.second_order_cone_dims.size()), raft::device_span{}, raft::device_span{}, - stream_view_); + stream_view_, + settings.barrier_soc_threshold); cuopt_assert(cone_count() > 0, "second-order cone topology must contain at least one cone"); cuopt_assert(cone_entry_count() == total_cone_dim, "second-order cone entry count mismatch"); } @@ -506,7 +519,7 @@ class iteration_data_t { if (use_augmented) { settings.log.printf("Linear system : augmented\n"); - const i_t augmented_size = lp.num_cols + lp.num_rows; + const i_t augmented_size = augmented_system_size(lp.num_cols, lp.num_rows); d_augmented_rhs_.resize(augmented_size, stream_view_); d_augmented_soln_.resize(augmented_size, stream_view_); } else { @@ -599,8 +612,9 @@ class iteration_data_t { if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } { raft::common::nvtx::range scope("Barrier: LP Data: Cholesky init"); - i_t factorization_size = use_augmented ? lp.num_rows + lp.num_cols : lp.num_rows; - chol = std::make_unique>( + i_t factorization_size = + use_augmented ? augmented_system_size(lp.num_cols, lp.num_rows) : lp.num_rows; + chol = std::make_unique>( handle_ptr, settings, factorization_size); chol->set_positive_definite(false); } @@ -653,16 +667,23 @@ class iteration_data_t { i_t cone_end() const { return cone_start() + cone_entry_count(); } - i_t linear_xz_size(std::size_t full_xz_size) const + i_t augmented_expansion_count() const { - return has_cones() ? cone_start() : static_cast(full_xz_size); + return has_cones() && cones().has_sparse_cones() ? cones().expansion_var_count() : i_t(0); } + i_t augmented_system_size(i_t n, i_t m) const { return n + m + augmented_expansion_count(); } + bool is_cone_variable(i_t variable) const { return has_cones() && variable >= cone_start() && variable < cone_end(); } + i_t linear_xz_size(std::size_t full_xz_size) const + { + return has_cones() ? cone_start() : static_cast(full_xz_size); + } + f_t complementarity_degree(std::size_t num_primal_variables, i_t num_upper_bounds) const { const bool has_soc = has_cones(); @@ -678,52 +699,89 @@ class iteration_data_t { void form_augmented(bool first_call = false) { - i_t n = A.n; - i_t m = A.m; - i_t nnzA = A.col_start[n]; - i_t nnzQ = Q.n > 0 ? Q.col_start[n] : 0; - i_t factorization_size = n + m; + i_t n = A.n; + i_t m = A.m; + i_t nnzA = A.col_start[n]; + i_t nnzQ = Q.n > 0 ? Q.col_start[n] : 0; + + const bool has_soc = has_cones(); + const i_t m_c = cone_entry_count(); + const i_t p = augmented_expansion_count(); + i_t factorization_size = augmented_system_size(n, m); - const bool has_soc = has_cones(); - const i_t m_c = cone_entry_count(); - i_t total_block_nnz = 0; + i_t dense_soc_kkt_nnz = 0; std::vector cone_offsets_host; - std::vector cone_block_offsets_host; + std::vector dense_cone_block_offsets_host; + std::vector dense_cone_ids_host; + std::vector cone_sparse_idx; + std::vector sparse_cone_ids_host; + std::vector sparse_entry_offsets; std::vector local_to_cone; if (first_call) { + const i_t n_sparse_soc = has_soc ? cones().n_sparse_cones : i_t(0); if (has_soc) { raft::common::nvtx::range scope("Barrier: augmented: SOC offsets"); - // Initialize the cone block offsets const i_t n_cones = cone_count(); + // TODO: Once form_augmented() is moved to GPU, we can avoid D2H round-trip. cone_offsets_host.resize(n_cones + 1); - cone_block_offsets_host.resize(n_cones + 1); raft::copy( cone_offsets_host.data(), cones().cone_offsets.data(), n_cones + 1, stream_view_); handle_ptr->sync_stream(); + + cone_sparse_idx.assign(n_cones, i_t(-1)); + if (n_sparse_soc > 0) { + // TODO: Once form_augmented() is moved to GPU, we can avoid D2H round-trip for + // sparse_cone_ids / sparse_cone_dims. + sparse_cone_ids_host = cuopt::host_copy(cones().sparse_cone_ids, stream_view_); + for (i_t s = 0; s < n_sparse_soc; ++s) { + cone_sparse_idx[sparse_cone_ids_host[s]] = s; + } + sparse_entry_offsets.resize(n_sparse_soc + 1, 0); + const auto sparse_dims_host = cuopt::host_copy(cones().sparse_cone_dims, stream_view_); + for (i_t s = 0; s < n_sparse_soc; ++s) { + sparse_entry_offsets[s + 1] = sparse_entry_offsets[s] + sparse_dims_host[s]; + } + } + + dense_cone_ids_host.reserve(n_cones); + dense_cone_block_offsets_host = {0}; + dense_cone_block_offsets_host.reserve(n_cones + 1); for (i_t k = 0; k < n_cones; ++k) { - const auto q_k = cone_offsets_host[k + 1] - cone_offsets_host[k]; - cone_block_offsets_host[k + 1] = cone_block_offsets_host[k] + q_k * q_k; + if (cone_sparse_idx[k] < 0) { + dense_cone_ids_host.push_back(k); + const auto q_k = cone_offsets_host[k + 1] - cone_offsets_host[k]; + dense_cone_block_offsets_host.push_back(dense_cone_block_offsets_host.back() + + q_k * q_k); + } } - total_block_nnz = static_cast(cone_block_offsets_host[n_cones]); + dense_soc_kkt_nnz = static_cast(dense_cone_block_offsets_host.back()); - // Precompute: for each local cone entry, which cone does it belong to? local_to_cone.resize(m_c); for (i_t k = 0; k < n_cones; ++k) { const i_t lo = cone_offsets_host[k]; const i_t hi = cone_offsets_host[k + 1]; - for (i_t p = lo; p < hi; p++) { - local_to_cone[p] = k; + for (i_t idx = lo; idx < hi; idx++) { + local_to_cone[idx] = k; } } } - i_t new_nnz = 2 * nnzA + n + m + nnzQ + total_block_nnz; // conservative estimate of nnz - csr_matrix_t augmented_CSR(n + m, n + m, new_nnz); - std::vector augmented_diagonal_indices(n + m, -1); - std::vector cone_csr_indices_host(total_block_nnz, -1); - std::vector cone_Q_values_host(total_block_nnz, f_t(0)); + const size_t n_sparse_entries = p > 0 ? cones().n_sparse_cone_entries : size_t{0}; + const i_t sparse_soc_kkt_nnz = static_cast(5 * n_sparse_entries + p); + i_t new_nnz = 2 * nnzA + n + m + p + nnzQ + dense_soc_kkt_nnz + sparse_soc_kkt_nnz; + csr_matrix_t augmented_CSR(factorization_size, factorization_size, new_nnz); + std::vector augmented_diagonal_indices(factorization_size, -1); + std::vector cone_csr_indices_host(dense_soc_kkt_nnz, -1); + std::vector cone_Q_values_host(dense_soc_kkt_nnz, f_t(0)); + std::vector sparse_hessian_diag_host(n_sparse_entries, -1); + std::vector sparse_hessian_Q_host(n_sparse_entries, f_t(0)); + std::vector sparse_exp_v_col_host(n_sparse_entries, -1); + std::vector sparse_exp_u_col_host(n_sparse_entries, -1); + std::vector sparse_exp_v_row_host(n_sparse_entries, -1); + std::vector sparse_exp_u_row_host(n_sparse_entries, -1); + std::vector sparse_expansion_D_host(p, -1); i_t q = 0; i_t off_diag_Qnz = 0; @@ -735,50 +793,101 @@ class iteration_data_t { const bool is_cone_row = is_cone_variable(i); if (is_cone_row) { - // Determine which cone this variable belongs to and its local row i_t local_idx = i - cone_start(); i_t k = local_to_cone[local_idx]; i_t local_r = static_cast(static_cast(local_idx) - cone_offsets_host[k]); i_t q_k = static_cast(cone_offsets_host[k + 1] - cone_offsets_host[k]); i_t cone_col_start = cone_start() + static_cast(cone_offsets_host[k]); - i_t block_base = static_cast(cone_block_offsets_host[k]) + local_r * q_k; + i_t cone_col_end = cone_col_start + q_k; - // Merge-join: Q entries (sorted) with dense cone block columns (contiguous) i_t qp = (nnzQ > 0) ? Q.col_start[i] : 0; i_t q_end = (nnzQ > 0) ? Q.col_start[i + 1] : 0; - // Q entries before cone block - for (; qp < q_end && Q.i[qp] < cone_col_start; qp++) { - augmented_CSR.j[q] = Q.i[qp]; - augmented_CSR.x[q++] = -Q.x[qp]; - off_diag_Qnz++; - } - - // Dense cone block, absorbing any Q entries that fall inside - for (i_t c = 0; c < q_k; c++) { - i_t col = cone_col_start + c; - f_t q_contrib = f_t(0); - f_t initial_val = (c == local_r) ? f_t(-dual_perturb) - : f_t(0); // diagonal entry of the cone block column + // Row of sparse SOC. + if (cone_sparse_idx[k] >= 0) { + const i_t sparse_idx = cone_sparse_idx[k]; + const size_t flat_idx = sparse_entry_offsets[sparse_idx] + local_r; + const i_t exp_v_col = n + m + 2 * sparse_idx; + const i_t exp_u_col = n + m + 2 * sparse_idx + 1; + + // Handle off-diagonal entries of sparse SOC before diagonal entry. + for (; qp < q_end && Q.i[qp] < cone_col_start; qp++) { + augmented_CSR.j[q] = Q.i[qp]; + augmented_CSR.x[q++] = -Q.x[qp]; + off_diag_Qnz++; + } - if (qp < q_end && Q.i[qp] == col) { + // Handle diagonal entry of sparse SOC. + f_t q_contrib = f_t(0); + if (qp < q_end && Q.i[qp] == i) { q_contrib = Q.x[qp]; qp++; } + sparse_hessian_diag_host[flat_idx] = q; + sparse_hessian_Q_host[flat_idx] = q_contrib; + augmented_diagonal_indices[i] = q; + augmented_CSR.j[q] = i; + augmented_CSR.x[q++] = -dual_perturb - q_contrib; + + // Handle expansion columns of sparse SOC. + sparse_exp_v_col_host[flat_idx] = q; + augmented_CSR.j[q] = exp_v_col; + augmented_CSR.x[q++] = f_t(0); + + sparse_exp_u_col_host[flat_idx] = q; + augmented_CSR.j[q] = exp_u_col; + augmented_CSR.x[q++] = f_t(0); + + // Handle off-diagonal entries of dense SOC before diagonal entry. + for (; qp < q_end && Q.i[qp] < cone_col_end; qp++) { + if (Q.i[qp] == i) { continue; } + augmented_CSR.j[q] = Q.i[qp]; + augmented_CSR.x[q++] = -Q.x[qp]; + off_diag_Qnz++; + } + for (; qp < q_end; qp++) { + augmented_CSR.j[q] = Q.i[qp]; + augmented_CSR.x[q++] = -Q.x[qp]; + off_diag_Qnz++; + } + } else { + // Row of dense SOC. + i_t dense_idx = 0; + for (i_t t = 0; t < k; ++t) { + if (cone_sparse_idx[t] < 0) { ++dense_idx; } + } + i_t block_base = + static_cast(dense_cone_block_offsets_host[dense_idx]) + local_r * q_k; - cone_csr_indices_host[block_base + c] = q; - cone_Q_values_host[block_base + c] = q_contrib; - if (col == i) { augmented_diagonal_indices[i] = q; } - augmented_CSR.j[q] = col; - augmented_CSR.x[q++] = initial_val - q_contrib; - } + for (; qp < q_end && Q.i[qp] < cone_col_start; qp++) { + augmented_CSR.j[q] = Q.i[qp]; + augmented_CSR.x[q++] = -Q.x[qp]; + off_diag_Qnz++; + } + + for (i_t c = 0; c < q_k; c++) { + i_t col = cone_col_start + c; + f_t q_contrib = f_t(0); + f_t initial_val = (c == local_r) ? f_t(-dual_perturb) : f_t(0); + + if (qp < q_end && Q.i[qp] == col) { + q_contrib = Q.x[qp]; + qp++; + } + + cone_csr_indices_host[block_base + c] = q; + cone_Q_values_host[block_base + c] = q_contrib; + if (col == i) { augmented_diagonal_indices[i] = q; } + augmented_CSR.j[q] = col; + augmented_CSR.x[q++] = initial_val - q_contrib; + } - // Q entries after cone block - for (; qp < q_end; qp++) { - augmented_CSR.j[q] = Q.i[qp]; - augmented_CSR.x[q++] = -Q.x[qp]; - off_diag_Qnz++; + for (; qp < q_end; qp++) { + augmented_CSR.j[q] = Q.i[qp]; + augmented_CSR.x[q++] = -Q.x[qp]; + off_diag_Qnz++; + } } } else if (nnzQ == 0) { augmented_diagonal_indices[i] = q; @@ -816,27 +925,77 @@ class iteration_data_t { } for (i_t k = n; k < n + m; ++k) { - // A block, we can use AT in csc directly augmented_CSR.row_start[k] = q; const i_t l = k - n; const i_t col_beg = AT.col_start[l]; const i_t col_end = AT.col_start[l + 1]; - for (i_t p = col_beg; p < col_end; ++p) { - augmented_CSR.j[q] = AT.i[p]; - augmented_CSR.x[q++] = AT.x[p]; + for (i_t idx = col_beg; idx < col_end; ++idx) { + augmented_CSR.j[q] = AT.i[idx]; + augmented_CSR.x[q++] = AT.x[idx]; } augmented_diagonal_indices[k] = q; augmented_CSR.j[q] = k; augmented_CSR.x[q++] = primal_perturb; } - augmented_CSR.row_start[n + m] = q; - augmented_CSR.nz_max = q; + + // Handle expansion rows of sparse SOC. + for (i_t s = 0; s < n_sparse_soc; ++s) { + const i_t k = sparse_cone_ids_host[s]; + const i_t q_k = static_cast(cone_offsets_host[k + 1] - cone_offsets_host[k]); + const i_t cone_col_start = cone_start() + static_cast(cone_offsets_host[k]); + const size_t flat_base = sparse_entry_offsets[s]; + const i_t prow_v = n + m + 2 * s; + const i_t prow_u = n + m + 2 * s + 1; + + augmented_CSR.row_start[prow_v] = q; + for (i_t j = 0; j < q_k; ++j) { + sparse_exp_v_row_host[flat_base + j] = q; + augmented_CSR.j[q] = cone_col_start + j; + augmented_CSR.x[q++] = f_t(0); + } + sparse_expansion_D_host[2 * s] = q; + augmented_CSR.j[q] = prow_v; + augmented_CSR.x[q++] = f_t(0); + + augmented_CSR.row_start[prow_u] = q; + for (i_t j = 0; j < q_k; ++j) { + sparse_exp_u_row_host[flat_base + j] = q; + augmented_CSR.j[q] = cone_col_start + j; + augmented_CSR.x[q++] = f_t(0); + } + sparse_expansion_D_host[2 * s + 1] = q; + augmented_CSR.j[q] = prow_u; + augmented_CSR.x[q++] = f_t(0); + } + + augmented_CSR.row_start[factorization_size] = q; + augmented_CSR.nz_max = q; augmented_CSR.j.resize(q); augmented_CSR.x.resize(q); - i_t expected_nnz = 2 * nnzA + (n - m_c) + total_block_nnz + m + off_diag_Qnz; + i_t expected_nnz = + 2 * nnzA + (n - m_c) + dense_soc_kkt_nnz + m + off_diag_Qnz + sparse_soc_kkt_nnz; settings_.log.debug("augmented nz %d predicted %d\n", q, expected_nnz); - cuopt_assert(q == expected_nnz, "augmented nnz != predicted"); + cuopt_assert(q == expected_nnz, "augmented nz != predicted"); cuopt_assert(A.col_start[n] == AT.col_start[m], "A nz != AT nz"); + + if (n_sparse_entries > 0) { + for (size_t e = 0; e < n_sparse_entries; ++e) { + cuopt_assert(sparse_hessian_diag_host[e] >= 0, + "sparse Hessian diagonal CSR index unset"); + cuopt_assert(sparse_exp_v_col_host[e] >= 0, + "sparse expansion v-column CSR index unset"); + cuopt_assert(sparse_exp_u_col_host[e] >= 0, + "sparse expansion u-column CSR index unset"); + cuopt_assert(sparse_exp_v_row_host[e] >= 0, "sparse expansion v-row CSR index unset"); + cuopt_assert(sparse_exp_u_row_host[e] >= 0, "sparse expansion u-row CSR index unset"); + } + for (i_t s = 0; s < n_sparse_soc; ++s) { + cuopt_assert(sparse_expansion_D_host[2 * s] >= 0, + "sparse expansion v diagonal CSR index unset"); + cuopt_assert(sparse_expansion_D_host[2 * s + 1] >= 0, + "sparse expansion u diagonal CSR index unset"); + } + } } { @@ -850,16 +1009,72 @@ class iteration_data_t { handle_ptr->get_stream()); if (has_soc) { - d_cone_csr_indices_.resize(total_block_nnz, handle_ptr->get_stream()); + d_cone_csr_indices_.resize(dense_soc_kkt_nnz, handle_ptr->get_stream()); raft::copy(d_cone_csr_indices_.data(), cone_csr_indices_host.data(), - total_block_nnz, + dense_soc_kkt_nnz, handle_ptr->get_stream()); - d_cone_Q_values_.resize(total_block_nnz, handle_ptr->get_stream()); + d_cone_Q_values_.resize(dense_soc_kkt_nnz, handle_ptr->get_stream()); raft::copy(d_cone_Q_values_.data(), cone_Q_values_host.data(), - total_block_nnz, + dense_soc_kkt_nnz, + handle_ptr->get_stream()); + + d_dense_cone_block_offsets_.resize(dense_cone_block_offsets_host.size(), + handle_ptr->get_stream()); + raft::copy(d_dense_cone_block_offsets_.data(), + dense_cone_block_offsets_host.data(), + dense_cone_block_offsets_host.size(), handle_ptr->get_stream()); + d_dense_cone_ids_.resize(dense_cone_ids_host.size(), handle_ptr->get_stream()); + if (!dense_cone_ids_host.empty()) { + raft::copy(d_dense_cone_ids_.data(), + dense_cone_ids_host.data(), + dense_cone_ids_host.size(), + handle_ptr->get_stream()); + } + + const size_t n_sparse_entries = cones().n_sparse_cone_entries; + d_sparse_hessian_diag_.resize(n_sparse_entries, handle_ptr->get_stream()); + d_sparse_hessian_Q_.resize(n_sparse_entries, handle_ptr->get_stream()); + d_sparse_exp_v_col_.resize(n_sparse_entries, handle_ptr->get_stream()); + d_sparse_exp_u_col_.resize(n_sparse_entries, handle_ptr->get_stream()); + d_sparse_exp_v_row_.resize(n_sparse_entries, handle_ptr->get_stream()); + d_sparse_exp_u_row_.resize(n_sparse_entries, handle_ptr->get_stream()); + d_sparse_expansion_D_.resize(p, handle_ptr->get_stream()); + d_sparse_Hs_diag_.resize(n_sparse_entries, handle_ptr->get_stream()); + if (n_sparse_entries > 0) { + raft::copy(d_sparse_hessian_diag_.data(), + sparse_hessian_diag_host.data(), + n_sparse_entries, + handle_ptr->get_stream()); + raft::copy(d_sparse_hessian_Q_.data(), + sparse_hessian_Q_host.data(), + n_sparse_entries, + handle_ptr->get_stream()); + raft::copy(d_sparse_exp_v_col_.data(), + sparse_exp_v_col_host.data(), + n_sparse_entries, + handle_ptr->get_stream()); + raft::copy(d_sparse_exp_u_col_.data(), + sparse_exp_u_col_host.data(), + n_sparse_entries, + handle_ptr->get_stream()); + raft::copy(d_sparse_exp_v_row_.data(), + sparse_exp_v_row_host.data(), + n_sparse_entries, + handle_ptr->get_stream()); + raft::copy(d_sparse_exp_u_row_.data(), + sparse_exp_u_row_host.data(), + n_sparse_entries, + handle_ptr->get_stream()); + } + if (p > 0) { + raft::copy(d_sparse_expansion_D_.data(), + sparse_expansion_D_host.data(), + sparse_expansion_D_host.size(), + handle_ptr->get_stream()); + } } handle_ptr->sync_stream(); @@ -868,11 +1083,11 @@ class iteration_data_t { csc_matrix_t augmented_transpose(1, 1, 1); augmented.transpose(augmented_transpose); settings_.log.printf("Aug nnz %d Aug^T nnz %d\n", - augmented.col_start[m + n], - augmented_transpose.col_start[m + n]); + augmented.col_start[factorization_size], + augmented_transpose.col_start[factorization_size]); augmented.check_matrix(); augmented_transpose.check_matrix(); - csc_matrix_t error(m + n, m + n, 1); + csc_matrix_t error(factorization_size, factorization_size, 1); add(augmented, augmented_transpose, 1.0, -1.0, error); settings_.log.printf("|| Aug - Aug^T ||_1 %e\n", error.norm1()); cuopt_assert(error.norm1() <= 1e-2, "|| Aug - Aug^T ||_1 > 1e-2"); @@ -909,14 +1124,41 @@ class iteration_data_t { RAFT_CHECK_CUDA(handle_ptr->get_stream()); if (has_soc) { - scatter_hessian_into_augmented(cones(), - device_augmented.x, - d_cone_csr_indices_, - d_cone_Q_values_, - handle_ptr->get_stream(), - dual_perturb); - RAFT_CHECK_CUDA(handle_ptr->get_stream()); + if (cones().has_sparse_cones()) { + launch_get_Hs_sparse(cones(), d_sparse_Hs_diag_, handle_ptr->get_stream()); + scatter_sparse_hessian_diag_into_augmented(device_augmented.x, + d_sparse_hessian_diag_, + d_sparse_Hs_diag_, + d_sparse_hessian_Q_, + handle_ptr->get_stream(), + dual_perturb); + update_sparse_expansion_in_augmented(device_augmented.x, + d_sparse_exp_v_col_, + d_sparse_exp_u_col_, + d_sparse_exp_v_row_, + d_sparse_exp_u_row_, + d_sparse_expansion_D_, + cones().sparse_v, + cones().sparse_u, + cones().eta, + cones().sparse_cone_ids, + handle_ptr->get_stream(), + dual_perturb); + RAFT_CHECK_CUDA(handle_ptr->get_stream()); + } + if (cones().n_dense_cones() > 0) { + scatter_dense_hessian_into_augmented(cones(), + device_augmented.x, + d_cone_csr_indices_, + d_cone_Q_values_, + d_dense_cone_block_offsets_, + d_dense_cone_ids_, + handle_ptr->get_stream(), + dual_perturb); + RAFT_CHECK_CUDA(handle_ptr->get_stream()); + } } + handle_ptr->sync_stream(); } } @@ -1400,6 +1642,15 @@ class iteration_data_t { } } + void restore_saved_iterate() + { + x = x_save; + y = y_save; + z = z_save; + v = v_save; + w = w_save; + } + void to_solution(const lp_problem_t& lp, i_t iterations, f_t objective, @@ -1809,18 +2060,28 @@ class iteration_data_t { const i_t m = A.m; const i_t n = A.n; const bool has_soc = has_cones(); + const i_t p = augmented_expansion_count(); + const i_t sys_size = augmented_system_size(n, m); + cuopt_assert(static_cast(x.size()) >= sys_size, "augmented_multiply: x too small"); + cuopt_assert(static_cast(y.size()) >= sys_size, "augmented_multiply: y too small"); rmm::device_uvector d_x1(n, handle_ptr->get_stream()); rmm::device_uvector d_x2(m, handle_ptr->get_stream()); rmm::device_uvector d_y1(n, handle_ptr->get_stream()); rmm::device_uvector d_y2(m, handle_ptr->get_stream()); + rmm::device_uvector d_y_exp(p, handle_ptr->get_stream()); + rmm::device_uvector d_y_exp_orig(p, handle_ptr->get_stream()); raft::copy(d_x1.data(), x.data(), n, handle_ptr->get_stream()); raft::copy(d_x2.data(), x.data() + n, m, handle_ptr->get_stream()); raft::copy(d_y1.data(), y.data(), n, handle_ptr->get_stream()); raft::copy(d_y2.data(), y.data() + n, m, handle_ptr->get_stream()); + if (p > 0) { + raft::copy(d_y_exp_orig.data(), y.data() + n + m, p, handle_ptr->get_stream()); + thrust::fill_n(rmm::exec_policy(stream_view_), d_y_exp.begin(), p, f_t(0)); + } - // y1 <- alpha ( -D * x_1 + A^T x_2) + beta * y1 + // y1 <- alpha ( -(Q + D + H) * x_1 + A^T x_2) + beta * y1 rmm::device_uvector d_r1(n, handle_ptr->get_stream()); thrust::fill_n(rmm::exec_policy(stream_view_), d_r1.begin(), n, f_t(0)); @@ -1835,14 +2096,31 @@ class iteration_data_t { stream_view_); RAFT_CHECK_CUDA(stream_view_); - // r1 <- D * x_1 + H x_1 (cone Hessian block H = S^T S; accumulate_cone_hessian_matvec) + // r1 <- D * x_1 + H x_1 on cone rows + // (dense cones: explicit dense H block; sparse cones: rank-2 expansion, which adds + // Hs_diag .* x_cone to r1 here and writes the expansion rows into d_y_exp) if (has_soc) { const i_t m_c = cone_entry_count(); - accumulate_cone_hessian_matvec(raft::device_span(d_x1.data() + cone_start(), m_c), - cones(), - raft::device_span(d_r1.data() + cone_start(), m_c), - stream_view_); - RAFT_CHECK_CUDA(stream_view_); + if (cones().has_sparse_cones()) { + launch_sparse_augmented_matvec( + raft::device_span(x.data(), x.size()), + raft::device_span(d_r1.data(), d_r1.size()), + raft::device_span(d_y_exp.data(), d_y_exp.size()), + cones(), + raft::device_span(d_sparse_Hs_diag_.data(), d_sparse_Hs_diag_.size()), + cone_start(), + n, + m, + handle_ptr->get_stream()); + RAFT_CHECK_CUDA(stream_view_); + } + if (cones().n_dense_cones() > 0) { + launch_dense_hessian_matvec(raft::device_span(d_x1.data() + cone_start(), m_c), + cones(), + raft::device_span(d_r1.data() + cone_start(), m_c), + stream_view_); + RAFT_CHECK_CUDA(stream_view_); + } } // r1 <- Q x1 + D x1 + H x1 (cone: same H as above) @@ -1861,8 +2139,13 @@ class iteration_data_t { // matrix_vector_multiply(A, alpha, x1, beta, y2); cusparse_view_.spmv(alpha, d_x1, beta, d_y2); + if (p > 0) { + axpy(alpha, d_y_exp.data(), beta, d_y_exp_orig.data(), d_y_exp.data(), p, stream_view_); + } + raft::copy(y.data(), d_y1.data(), n, stream_view_); raft::copy(y.data() + n, d_y2.data(), m, stream_view_); + if (p > 0) { raft::copy(y.data() + n + m, d_y_exp.data(), p, stream_view_); } handle_ptr->sync_stream(); } @@ -1944,6 +2227,16 @@ class iteration_data_t { rmm::device_uvector d_augmented_diagonal_indices_; rmm::device_uvector d_cone_csr_indices_; rmm::device_uvector d_cone_Q_values_; + rmm::device_uvector d_dense_cone_block_offsets_; + rmm::device_uvector d_dense_cone_ids_; + rmm::device_uvector d_sparse_hessian_diag_; + rmm::device_uvector d_sparse_hessian_Q_; + rmm::device_uvector d_sparse_exp_v_col_; + rmm::device_uvector d_sparse_exp_u_col_; + rmm::device_uvector d_sparse_exp_v_row_; + rmm::device_uvector d_sparse_exp_u_row_; + rmm::device_uvector d_sparse_expansion_D_; + rmm::device_uvector d_sparse_Hs_diag_; bool indefinite_Q; cusparse_view_t cusparse_Q_view_; @@ -2064,7 +2357,7 @@ void cholesky_debug_check(const iteration_data_t& data, // return; srand(42); - i_t vec_size = use_augmented ? lp.num_cols + lp.num_rows : lp.num_rows; + i_t vec_size = use_augmented ? data.augmented_system_size(lp.num_cols, lp.num_rows) : lp.num_rows; // 1. Create a random test vector dense_vector_t test_vec(vec_size); for (size_t i = 0; i < test_vec.size(); i++) { @@ -2224,14 +2517,16 @@ int barrier_solver_t::initial_point(iteration_data_t& data) data.inv_diag.pairwise_product(Fu, DinvFu); dense_vector_t q(lp.num_rows); if (use_augmented) { - dense_vector_t rhs(lp.num_cols + lp.num_rows); + const i_t aug_size = data.augmented_system_size(lp.num_cols, lp.num_rows); + dense_vector_t rhs(aug_size); + rhs.set_scalar(0.0); for (i_t k = 0; k < lp.num_cols; k++) { rhs[k] = -Fu[k]; } for (i_t k = 0; k < lp.num_rows; k++) { rhs[lp.num_cols + k] = rhs_x[k]; } - dense_vector_t soln(lp.num_cols + lp.num_rows); + dense_vector_t soln(aug_size); i_t solve_status = data.chol->solve(rhs, soln); struct op_t { op_t(iteration_data_t& data) : data_(data) {} @@ -2374,12 +2669,13 @@ int barrier_solver_t::initial_point(iteration_data_t& data) } } } else if (use_augmented) { - dense_vector_t dual_rhs(lp.num_cols + lp.num_rows); + const i_t aug_size = data.augmented_system_size(lp.num_cols, lp.num_rows); + dense_vector_t dual_rhs(aug_size); dual_rhs.set_scalar(0.0); for (i_t k = 0; k < lp.num_cols; k++) { dual_rhs[k] = data.c[k]; } - dense_vector_t py(lp.num_cols + lp.num_rows); + dense_vector_t py(aug_size); data.chol->solve(dual_rhs, py); for (i_t k = 0; k < lp.num_cols; k++) { data.z[k] = py[k]; @@ -2646,6 +2942,7 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t(data.d_x_.data() + cone_var_start, m_c); cones.z = raft::device_span(data.d_z_.data() + cone_var_start, m_c); launch_nt_scaling(cones, stream_view_); + if (cones.has_sparse_cones()) { launch_update_scaling_sparse(cones, stream_view_); } } max_residual = 0.0; @@ -2852,6 +3149,14 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t 0) { + const i_t expansion_start = lp.num_cols + lp.num_rows; + thrust::fill_n(rmm::exec_policy(stream_view_), + data.d_augmented_rhs_.begin() + expansion_start, + expansion_count, + f_t(0)); + } data.chol->solve(data.d_augmented_rhs_, data.d_augmented_soln_); struct op_t { op_t(iteration_data_t& data) : data_(data) {} @@ -3956,11 +4261,7 @@ lp_status_t barrier_solver_t::check_for_suboptimal_solution( data.relative_dual_residual_save < settings.barrier_relaxed_optimality_tol && data.relative_complementarity_residual_save < settings.barrier_relaxed_complementarity_tol) { settings.log.printf("Restoring previous solution\n"); - raft::copy(data.x.data(), data.d_x_.data(), data.d_x_.size(), stream_view_); - raft::copy(data.y.data(), data.d_y_.data(), data.d_y_.size(), stream_view_); - raft::copy(data.z.data(), data.d_z_.data(), data.d_z_.size(), stream_view_); - raft::copy(data.v.data(), data.d_v_.data(), data.d_v_.size(), stream_view_); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + data.restore_saved_iterate(); data.to_solution(lp, iter, primal_objective_save, diff --git a/cpp/src/barrier/iterative_refinement.hpp b/cpp/src/barrier/iterative_refinement.hpp index 807d055787..bb87984185 100644 --- a/cpp/src/barrier/iterative_refinement.hpp +++ b/cpp/src/barrier/iterative_refinement.hpp @@ -24,7 +24,6 @@ #include #include -#include #include #include diff --git a/cpp/src/barrier/second_order_cone_kernels.cuh b/cpp/src/barrier/second_order_cone_kernels.cuh index 5a674fef70..1c3e527299 100644 --- a/cpp/src/barrier/second_order_cone_kernels.cuh +++ b/cpp/src/barrier/second_order_cone_kernels.cuh @@ -202,12 +202,26 @@ struct cone_data_t { rmm::device_uvector w; // [n_cone_entries], unit-J-norm NT direction rmm::device_uvector lambda; // [n_cone_entries], NT point lambda = W^{-T} z + // Sparse SOC rank-2 scaling (cones with dimension > soc_threshold). + i_t soc_threshold; + i_t n_sparse_cones; + size_t n_sparse_cone_entries; // sum of sparse cone dimensions + rmm::device_uvector sparse_cone_ids; // [n_sparse_cones], indices into cone arrays + rmm::device_uvector sparse_cone_dims; // [n_sparse_cones], dimension of each sparse cone + rmm::device_uvector d; // [n_sparse_cones], corner of rank-2 diagonal block + rmm::device_uvector sparse_v; // [n_sparse_cone_entries], rank-2 vector v per sparse cone + rmm::device_uvector sparse_u; // [n_sparse_cone_entries], rank-2 vector u per sparse cone + rmm::device_uvector + sparse_entry_offsets; // [n_sparse_cones + 1], packed prefix offsets of sparse cone entries + rmm::device_uvector cone_is_sparse; // [n_cones], 1 if cone uses sparse KKT expansion + cone_scratch_t scratch; cone_data_t(std::span cone_dimensions_host, raft::device_span x_in, raft::device_span z_in, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + i_t soc_threshold_in = 5) : n_cones(cone_dimensions_host.size()), n_cone_entries( std::reduce(cone_dimensions_host.begin(), cone_dimensions_host.end(), size_t{0})), @@ -220,8 +234,58 @@ struct cone_data_t { eta(n_cones, stream), w(n_cone_entries, stream), lambda(n_cone_entries, stream), - scratch(n_cones, n_cone_entries, stream) + sparse_cone_ids(0, stream), + sparse_cone_dims(0, stream), + d(0, stream), + sparse_v(0, stream), + sparse_u(0, stream), + sparse_entry_offsets(0, stream), + cone_is_sparse(n_cones, stream), + scratch(n_cones, n_cone_entries, stream), + soc_threshold(soc_threshold_in), + n_sparse_cones(0), + n_sparse_cone_entries(0) { + thrust::fill(rmm::exec_policy(stream), cone_is_sparse.begin(), cone_is_sparse.end(), 0); + + std::vector sparse_cone_ids_host; + std::vector sparse_cone_dims_host; + sparse_cone_ids_host.reserve(n_cones); + sparse_cone_dims_host.reserve(n_cones); + for (i_t cone = 0; cone < n_cones; ++cone) { + if (cone_dimensions_host[cone] > soc_threshold) { + sparse_cone_ids_host.push_back(cone); + sparse_cone_dims_host.push_back(cone_dimensions_host[cone]); + } + } + n_sparse_cones = static_cast(sparse_cone_ids_host.size()); + n_sparse_cone_entries = + std::reduce(sparse_cone_dims_host.begin(), sparse_cone_dims_host.end(), size_t{0}); + + if (n_sparse_cones > 0) { + sparse_cone_ids.resize(n_sparse_cones, stream); + sparse_cone_dims.resize(n_sparse_cones, stream); + d.resize(n_sparse_cones, stream); + sparse_v.resize(n_sparse_cone_entries, stream); + sparse_u.resize(n_sparse_cone_entries, stream); + sparse_entry_offsets.resize(n_sparse_cones + 1, stream); + raft::copy(sparse_cone_ids.data(), sparse_cone_ids_host.data(), n_sparse_cones, stream); + raft::copy(sparse_cone_dims.data(), sparse_cone_dims_host.data(), n_sparse_cones, stream); + std::vector cone_is_sparse_host(n_cones, 0); + for (i_t cone : sparse_cone_ids_host) { + cone_is_sparse_host[cone] = 1; + } + raft::copy(cone_is_sparse.data(), cone_is_sparse_host.data(), n_cones, stream); + + // Packed prefix offsets of sparse cone entries. + std::vector sparse_entry_offsets_host(n_sparse_cones + 1, 0); + for (i_t s = 0; s < n_sparse_cones; ++s) { + sparse_entry_offsets_host[s + 1] = sparse_entry_offsets_host[s] + sparse_cone_dims_host[s]; + } + raft::copy( + sparse_entry_offsets.data(), sparse_entry_offsets_host.data(), n_sparse_cones + 1, stream); + } + raft::copy(cone_dimensions.data(), cone_dimensions_host.data(), n_cones, stream); cone_offsets.set_element_to_zero_async(0, stream); auto policy = rmm::exec_policy(stream); @@ -242,6 +306,16 @@ struct cone_data_t { element_cone_ids.begin()); segmented_sum.template prepare_workspace(stream); } + + // True when at least one cone is large enough (dim > soc_threshold) to use the + // rank-2 sparse KKT expansion instead of a dense q x q block. + bool has_sparse_cones() const { return n_sparse_cones > 0; } + + // Number of cones kept dense (dim <= soc_threshold). + i_t n_dense_cones() const { return n_cones - n_sparse_cones; } + + // Extra augmented-system columns/rows: two expansion variables (v, u) per sparse cone. + i_t expansion_var_count() const { return 2 * n_sparse_cones; } }; template @@ -482,6 +556,94 @@ void launch_nt_scaling(cone_data_t& cones, rmm::cuda_stream_view strea RAFT_CUDA_TRY(cudaPeekAtLastError()); } +// One block per sparse cone. Recompute the rank-2 factors (corner d and the +// vectors v, u, both scaled by eta^2) from the current NT direction w so that +// the implicit block reproduces the dense H = eta^2 (2 w w^T - J). +template +__global__ void update_scaling_sparse_kernel(raft::device_span w, + raft::device_span eta, + raft::device_span d, + raft::device_span sparse_v, + raft::device_span sparse_u, + raft::device_span cone_offsets, + raft::device_span sparse_cone_dims, + raft::device_span sparse_cone_ids, + raft::device_span sparse_entry_offsets, + i_t n_sparse_cones) +{ + const i_t sparse_idx = blockIdx.x; + if (sparse_idx >= n_sparse_cones) { return; } + + __shared__ f_t s_mem[4]; + + const i_t cone_idx = sparse_cone_ids[sparse_idx]; + const size_t cone_off = cone_offsets[cone_idx]; + const i_t cone_dim = sparse_cone_dims[sparse_idx]; + const i_t block_start = sparse_entry_offsets[sparse_idx]; + + if (threadIdx.x == 0) { + const f_t alpha = f_t(2) * w[cone_off]; + const f_t wsq = f_t(2) * w[cone_off] * w[cone_off] - f_t(1); + const f_t wsq_safe = f_t(0.5) * (wsq + sqrt(wsq * wsq + f_t(1))); + const f_t wsqinv = f_t(1) / wsq_safe; + const f_t di = f_t(0.5) * wsqinv; + d[sparse_idx] = di; + const f_t radicand = wsq_safe - di; + const f_t u0 = sqrt(max(radicand, f_t(0))); + const f_t u1 = (u0 > f_t(0)) ? alpha / u0 : f_t(0); + const f_t v0 = f_t(0); + const f_t denom = f_t(2) * wsq_safe - wsqinv; + const f_t v1_arg = (abs(denom) > f_t(1e-12)) ? f_t(2) * (f_t(2) + wsqinv) / denom : f_t(2); + const f_t v1 = sqrt(max(v1_arg, f_t(0))); + const f_t eta_sq = eta[cone_idx] * eta[cone_idx]; + s_mem[0] = eta_sq * u0; + s_mem[1] = eta_sq * u1; + s_mem[2] = eta_sq * v0; + s_mem[3] = eta_sq * v1; + } + __syncthreads(); + + const f_t scaled_u0 = s_mem[0]; + const f_t scaled_u1 = s_mem[1]; + const f_t scaled_v0 = s_mem[2]; + const f_t scaled_v1 = s_mem[3]; + + for (i_t j = threadIdx.x; j < cone_dim; j += blockDim.x) { + if (j == 0) { + sparse_u[block_start + j] = scaled_u0; + sparse_v[block_start + j] = scaled_v0; + } else { + sparse_u[block_start + j] = scaled_u1 * w[cone_off + j]; + sparse_v[block_start + j] = scaled_v1 * w[cone_off + j]; + } + } +} + +/** + * Refresh the rank-2 NT factors (d, v, u) of every sparse cone from the current + * scaling, so the implicit sparse block matches the dense Hessian for this + * iteration. Call after `launch_nt_scaling` has updated w and eta. + */ +template +void launch_update_scaling_sparse(cone_data_t& cones, rmm::cuda_stream_view stream) +{ + if (!cones.has_sparse_cones()) { return; } + + const i_t n_sparse = cones.n_sparse_cones; + update_scaling_sparse_kernel + <<>>(cuopt::make_span(cones.w), + cuopt::make_span(cones.eta), + cuopt::make_span(cones.d), + cuopt::make_span(cones.sparse_v), + cuopt::make_span(cones.sparse_u), + cuopt::make_span(cones.cone_offsets), + cuopt::make_span(cones.sparse_cone_dims), + cuopt::make_span(cones.sparse_cone_ids), + cuopt::make_span(cones.sparse_entry_offsets), + n_sparse); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + template __global__ void __launch_bounds__(soc_block_size) apply_w_inv_write_kernel(raft::device_span v, @@ -546,21 +708,24 @@ __global__ void __launch_bounds__(soc_block_size) template __global__ void __launch_bounds__(soc_block_size) - apply_hessian_write_kernel(raft::device_span v, - raft::device_span out, - raft::device_span w, - raft::device_span eta, - raft::device_span wv_dot, - raft::device_span cone_offsets, - raft::device_span element_cone_ids, - raft::device_span bias, - f_t output_scale, - f_t bias_scale) + apply_hessian_kernel(raft::device_span v, + raft::device_span out, + raft::device_span w, + raft::device_span eta, + raft::device_span wv_dot, + raft::device_span cone_offsets, + raft::device_span element_cone_ids, + raft::device_span cone_is_sparse, + bool dense_cones_only, + raft::device_span bias, + f_t output_scale, + f_t bias_scale) { const size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; if (idx >= out.size()) { return; } - const i_t cone = element_cone_ids[idx]; + const i_t cone = element_cone_ids[idx]; + if (dense_cones_only && !cone_is_sparse.empty() && cone_is_sparse[cone] != 0) { return; } const size_t cone_off = cone_offsets[cone]; const size_t local_idx = idx - cone_off; @@ -767,7 +932,8 @@ void apply_hessian(raft::device_span v, rmm::cuda_stream_view stream, f_t output_scale = 1, raft::device_span bias = {}, - f_t bias_scale = 0) + f_t bias_scale = 0, + bool dense_cones_only = false) { auto w = cuopt::make_span(cones.w); auto eta = cuopt::make_span(cones.eta); @@ -781,8 +947,19 @@ void apply_hessian(raft::device_span v, cones.segmented_sum(wv_terms, wv_dot, stream); const size_t grid_dim = raft::ceildiv(out.size(), soc_block_size); - apply_hessian_write_kernel<<>>( - v, out, w, eta, wv_dot, cone_offsets, element_cone_ids, bias, output_scale, bias_scale); + apply_hessian_kernel + <<>>(v, + out, + w, + eta, + wv_dot, + cone_offsets, + element_cone_ids, + cuopt::make_span(cones.cone_is_sparse), + dense_cones_only, + bias, + output_scale, + bias_scale); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -804,100 +981,392 @@ void recover_cone_dz_from_target(raft::device_span dx, } /** - * Accumulate the SOC cone-block matvec into an existing output vector. + * Accumulate the dense SOC cone-block matvec into an existing output vector: + * out += H x, where H = S^2, applied to dense cones only. + */ +template +void launch_dense_hessian_matvec(raft::device_span x, + cone_data_t& cones, + raft::device_span out, + rmm::cuda_stream_view stream) +{ + auto out_input = raft::device_span(out.data(), out.size()); + apply_hessian(x, out, cones, stream, 1, out_input, 1, true); +} + +// One block per sparse cone. Write the diagonal of the implicit cone Hessian: +// eta^2 on every entry, with the head entry scaled by the rank-2 corner d. +template +__global__ void get_Hs_sparse_kernel(raft::device_span Hs_diag, + raft::device_span eta, + raft::device_span d, + raft::device_span sparse_cone_ids, + raft::device_span sparse_entry_offsets, + raft::device_span sparse_cone_dims, + i_t n_sparse_cones) +{ + const i_t sparse_idx = blockIdx.x; + if (sparse_idx >= n_sparse_cones) { return; } + + const i_t block_start = sparse_entry_offsets[sparse_idx]; + const i_t block_dim = sparse_cone_dims[sparse_idx]; + const i_t cone_idx = sparse_cone_ids[sparse_idx]; + const f_t eta_sq = eta[cone_idx] * eta[cone_idx]; + + for (i_t j = threadIdx.x; j < block_dim; j += blockDim.x) { + Hs_diag[block_start + j] = eta_sq; + } + __syncthreads(); + + if (threadIdx.x == 0) { Hs_diag[block_start] *= d[sparse_idx]; } +} + +/** + * Compute the packed diagonal `Hs_diag` of the implicit sparse cone Hessians, + * one contiguous block of `sparse_cone_dims[s]` entries per sparse cone. This is + * the diagonal that gets scattered onto the cone rows of the augmented system. + */ +template +void launch_get_Hs_sparse(cone_data_t& cones, + rmm::device_uvector& Hs_diag, + rmm::cuda_stream_view stream) +{ + if (!cones.has_sparse_cones()) { return; } + + const i_t n_sparse = cones.n_sparse_cones; + cuopt_assert(Hs_diag.size() == cones.n_sparse_cone_entries, "Hs_diag size mismatch"); + + get_Hs_sparse_kernel + <<>>(cuopt::make_span(Hs_diag), + cuopt::make_span(cones.eta), + cuopt::make_span(cones.d), + cuopt::make_span(cones.sparse_cone_ids), + cuopt::make_span(cones.sparse_entry_offsets), + cuopt::make_span(cones.sparse_cone_dims), + n_sparse); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +// Scatter one sparse-cone diagonal entry into the augmented value buffer: +// augmented_x[csr_indices[e]] = -Hs_diag[e] - q_values[e] - dual_perturb, +// matching the sign convention of the dense scatter (negated KKT block plus +// any quadratic-objective contribution and diagonal regularization). +template +__global__ void __launch_bounds__(soc_block_size) + scatter_sparse_hessian_diag_kernel(raft::device_span augmented_x, + raft::device_span csr_indices, + raft::device_span Hs_diag, + raft::device_span q_values, + f_t dual_perturb) +{ + const size_t e = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (e >= csr_indices.size()) { return; } + + augmented_x[csr_indices[e]] = -Hs_diag[e] - q_values[e] - dual_perturb; +} + +/** + * Write the sparse cone Hessian diagonals into the augmented system value buffer. + * + * `csr_indices[e]` gives the value-buffer position of the diagonal for packed + * cone entry `e`; each is set to `-Hs_diag[e] - q_values[e] - dual_perturb`. + */ +template +void scatter_sparse_hessian_diag_into_augmented(rmm::device_uvector& augmented_x, + const rmm::device_uvector& csr_indices, + const rmm::device_uvector& Hs_diag, + const rmm::device_uvector& q_values, + rmm::cuda_stream_view stream, + f_t dual_perturb) +{ + const size_t count = csr_indices.size(); + if (count == 0) { return; } + cuopt_assert(count == Hs_diag.size(), "sparse Hessian index/value size mismatch"); + cuopt_assert(count == q_values.size(), "sparse Hessian Q-value size mismatch"); + + const size_t grid = raft::ceildiv(count, soc_block_size); + scatter_sparse_hessian_diag_kernel + <<>>(cuopt::make_span(augmented_x), + cuopt::make_span(csr_indices), + cuopt::make_span(Hs_diag), + cuopt::make_span(q_values), + dual_perturb); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * Refresh the expansion (rank-2 coupling) entries of the augmented system. * - * Used by matrix-free products with the primal-reduced KKT block: - * out += H x, where H = S^2. + * For each sparse cone this scatters the rank-2 vectors v, u into their four + * coupling positions -- the v/u columns and the symmetric v/u rows linking the + * cone variables to the two expansion variables -- and writes the expansion + * diagonal +/-(eta^2 + dual_perturb). The Hessian diagonal itself is written + * separately by `scatter_sparse_hessian_diag_into_augmented`. */ template -void accumulate_cone_hessian_matvec(raft::device_span x, +void update_sparse_expansion_in_augmented(rmm::device_uvector& augmented_x, + const rmm::device_uvector& sparse_exp_v_col, + const rmm::device_uvector& sparse_exp_u_col, + const rmm::device_uvector& sparse_exp_v_row, + const rmm::device_uvector& sparse_exp_u_row, + const rmm::device_uvector& sparse_expansion_D, + const rmm::device_uvector& sparse_v, + const rmm::device_uvector& sparse_u, + const rmm::device_uvector& eta, + const rmm::device_uvector& sparse_cone_ids, + rmm::cuda_stream_view stream, + f_t dual_perturb) +{ + const size_t n_coupling = sparse_exp_v_col.size(); + if (n_coupling == 0) { return; } + + cuopt_assert(sparse_exp_u_col.size() == n_coupling, "sparse_exp_u_col size mismatch"); + cuopt_assert(sparse_exp_v_row.size() == n_coupling, "sparse_exp_v_row size mismatch"); + cuopt_assert(sparse_exp_u_row.size() == n_coupling, "sparse_exp_u_row size mismatch"); + cuopt_assert(sparse_v.size() == n_coupling, "sparse_v size mismatch"); + cuopt_assert(sparse_u.size() == n_coupling, "sparse_u size mismatch"); + + thrust::scatter(rmm::exec_policy(stream), + sparse_v.begin(), + sparse_v.end(), + sparse_exp_v_col.begin(), + augmented_x.begin()); + thrust::scatter(rmm::exec_policy(stream), + sparse_u.begin(), + sparse_u.end(), + sparse_exp_u_col.begin(), + augmented_x.begin()); + thrust::scatter(rmm::exec_policy(stream), + sparse_v.begin(), + sparse_v.end(), + sparse_exp_v_row.begin(), + augmented_x.begin()); + thrust::scatter(rmm::exec_policy(stream), + sparse_u.begin(), + sparse_u.end(), + sparse_exp_u_row.begin(), + augmented_x.begin()); + + const i_t n_expansion = static_cast(sparse_expansion_D.size()); + if (n_expansion > 0) { + auto eta_span = cuopt::make_span(eta); + auto sparse_cone_ids_span = cuopt::make_span(sparse_cone_ids); + // Expansion-row diagonal of sparse cone i/2: -(eta^2 + dual_perturb) for the + // v-row (even i), +(eta^2 + dual_perturb) for the u-row (odd i). + auto diag_values = thrust::make_transform_iterator( + thrust::counting_iterator(0), + [eta_span, sparse_cone_ids_span, dual_perturb] HD(i_t i) -> f_t { + const i_t sparse_idx = i / 2; + const f_t eta_val = eta_span[sparse_cone_ids_span[sparse_idx]]; + const f_t sign = (i % 2 == 0) ? f_t(-1) : f_t(1); + return sign * (eta_val * eta_val + dual_perturb); + }); + thrust::scatter(rmm::exec_policy(stream), + diag_values, + diag_values + n_expansion, + sparse_expansion_D.begin(), + augmented_x.begin()); + } +} + +/** + * Accumulate the sparse-SOC expanded KKT block into a matrix-free product. + * + * For each sparse cone this adds to the primal cone rows (before augmented_multiply negates r1): + * r1 += Hs_diag .* x_cone - v * x_exp_v - u * x_exp_u + * so y1 = -r1 matches the explicit CSR coupling (+v, +u on primal rows). + * and to the expansion rows: + * y_exp[2s] += -eta^2 * x_exp_v + v^T x_cone + * y_exp[2s+1] += +eta^2 * x_exp_u + u^T x_cone + * + * `v` and `u` are the rank-2 vectors in cones.sparse_v / cones.sparse_u. + */ +template +__global__ void sparse_augmented_matvec_kernel(raft::device_span x, + raft::device_span r1, + raft::device_span y_exp, + raft::device_span Hs_diag, + raft::device_span sparse_v, + raft::device_span sparse_u, + raft::device_span eta, + raft::device_span sparse_cone_ids, + raft::device_span sparse_cone_dims, + raft::device_span sparse_entry_offsets, + raft::device_span cone_offsets, + i_t cone_var_start, + i_t n_primal, + i_t m_constraints, + i_t n_sparse_cones) +{ + const i_t sparse_idx = blockIdx.x; + if (sparse_idx >= n_sparse_cones) { return; } + + const i_t cone = sparse_cone_ids[sparse_idx]; + const i_t q = sparse_cone_dims[sparse_idx]; + const i_t flat = sparse_entry_offsets[sparse_idx]; + const size_t off = cone_offsets[cone]; + const i_t base = cone_var_start + static_cast(off); + const i_t exp_v = n_primal + m_constraints + 2 * sparse_idx; + const i_t exp_u = exp_v + 1; + const f_t x_exp_v = x[exp_v]; + const f_t x_exp_u = x[exp_u]; + const f_t eta_sq = eta[cone] * eta[cone]; + + f_t partial_dot_v = f_t(0); + f_t partial_dot_u = f_t(0); + for (i_t j = threadIdx.x; j < q; j += blockDim.x) { + const f_t xj = x[base + j]; + const f_t vj = sparse_v[flat + j]; + const f_t uj = sparse_u[flat + j]; + partial_dot_v += vj * xj; + partial_dot_u += uj * xj; + r1[base + j] += Hs_diag[flat + j] * xj - vj * x_exp_v - uj * x_exp_u; + } + + __shared__ f_t s_dot_v[soc_block_size]; + __shared__ f_t s_dot_u[soc_block_size]; + s_dot_v[threadIdx.x] = partial_dot_v; + s_dot_u[threadIdx.x] = partial_dot_u; + __syncthreads(); + + for (i_t stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + s_dot_v[threadIdx.x] += s_dot_v[threadIdx.x + stride]; + s_dot_u[threadIdx.x] += s_dot_u[threadIdx.x + stride]; + } + __syncthreads(); + } + + if (threadIdx.x == 0) { + y_exp[2 * sparse_idx] += -eta_sq * x_exp_v + s_dot_v[0]; + y_exp[2 * sparse_idx + 1] += eta_sq * x_exp_u + s_dot_u[0]; + } +} + +/** + * Matrix-free product of the implicit sparse cone blocks (see + * `sparse_augmented_matvec_kernel`). Accumulates the cone-row contribution into + * `r1` and the expansion-row contribution into `y_exp`, one block per sparse cone. + */ +template +void launch_sparse_augmented_matvec(raft::device_span x, + raft::device_span r1, + raft::device_span y_exp, cone_data_t& cones, - raft::device_span out, + raft::device_span Hs_diag, + i_t cone_var_start, + i_t n_primal, + i_t m_constraints, rmm::cuda_stream_view stream) { - auto out_input = raft::device_span(out.data(), out.size()); - apply_hessian(x, out, cones, stream, 1, out_input, 1); + if (!cones.has_sparse_cones()) { return; } + + const i_t n_sparse = cones.n_sparse_cones; + cuopt_assert(Hs_diag.size() == cones.n_sparse_cone_entries, "Hs_diag size mismatch"); + cuopt_assert(y_exp.size() == static_cast(cones.expansion_var_count()), + "expansion output size mismatch"); + + sparse_augmented_matvec_kernel + <<>>(x, + r1, + y_exp, + Hs_diag, + cuopt::make_span(cones.sparse_v), + cuopt::make_span(cones.sparse_u), + cuopt::make_span(cones.eta), + cuopt::make_span(cones.sparse_cone_ids), + cuopt::make_span(cones.sparse_cone_dims), + cuopt::make_span(cones.sparse_entry_offsets), + cuopt::make_span(cones.cone_offsets), + cone_var_start, + n_primal, + m_constraints, + n_sparse); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } +// Scatter one nonzero of a dense cone's q x q NT Hessian block into the +// augmented value buffer. Only cones with dim <= soc_threshold take this path. template __global__ void __launch_bounds__(soc_block_size) - scatter_hessian_into_augmented_kernel(raft::device_span augmented_x, - raft::device_span csr_indices, - raft::device_span q_values, - raft::device_span w, - raft::device_span eta, - raft::device_span cone_offsets, - raft::device_span block_offsets, - i_t n_cones, - f_t dual_perturb_value) + scatter_dense_hessian_into_augmented_kernel(raft::device_span augmented_x, + raft::device_span csr_indices, + raft::device_span q_values, + raft::device_span w, + raft::device_span eta, + raft::device_span cone_offsets, + raft::device_span dense_block_offsets, + raft::device_span dense_cone_ids, + i_t n_dense_cones, + f_t dual_perturb_value) { const size_t e = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; if (e >= csr_indices.size()) { return; } i_t lo = 0; - i_t hi = n_cones; + i_t hi = n_dense_cones; while (lo < hi) { const i_t mid = lo + (hi - lo) / 2; - if (block_offsets[mid + 1] <= e) { + if (dense_block_offsets[mid + 1] <= e) { lo = mid + 1; } else { hi = mid; } } - const i_t cone = lo; + const i_t dense_idx = lo; + const i_t cone = dense_cone_ids[dense_idx]; const size_t off = cone_offsets[cone]; const size_t q = cone_offsets[cone + 1] - off; - const size_t blk_off = block_offsets[cone]; + const size_t blk_off = dense_block_offsets[dense_idx]; const size_t local = e - blk_off; const size_t r = local / q; const size_t c = local % q; - const f_t eta_sq = eta[cone] * eta[cone]; - const f_t w0 = w[off]; - const f_t u_r = (r == 0) ? w0 : w[off + r]; - const f_t u_c = (c == 0) ? w0 : w[off + c]; - f_t val = f_t{2} * u_r * eta_sq * u_c; - const f_t diag_correction = (r == 0) ? -eta_sq : eta_sq; - if (r == c) { val += diag_correction; } + const f_t eta_sq = eta[cone] * eta[cone]; + const f_t w0 = w[off]; + const f_t u_r = (r == 0) ? w0 : w[off + r]; + const f_t u_c = (c == 0) ? w0 : w[off + c]; + const f_t val = f_t{2} * u_r * eta_sq * u_c; - augmented_x[csr_indices[e]] = -val - q_values[e]; + f_t entry = -val - q_values[e]; + if (r == c) { + const f_t diag_correction = (r == 0) ? -eta_sq : eta_sq; + entry -= diag_correction + dual_perturb_value; + } + augmented_x[csr_indices[e]] = entry; } +/** + * Write the full q x q NT Hessian blocks of the dense cones (dim <= soc_threshold) + * into the augmented system value buffer at the precomputed CSR positions. + */ template -void scatter_hessian_into_augmented(const cone_data_t& cones, - rmm::device_uvector& augmented_x, - const rmm::device_uvector& csr_indices, - const rmm::device_uvector& q_values, - rmm::cuda_stream_view stream, - f_t dual_perturb_value) +void scatter_dense_hessian_into_augmented(const cone_data_t& cones, + rmm::device_uvector& augmented_x, + const rmm::device_uvector& csr_indices, + const rmm::device_uvector& q_values, + const rmm::device_uvector& dense_block_offsets, + const rmm::device_uvector& dense_cone_ids, + rmm::cuda_stream_view stream, + f_t dual_perturb_value) { const size_t count = csr_indices.size(); if (count == 0) { return; } - cuopt_assert(count == q_values.size(), "cone CSR index and Q-value arrays must match"); - - // TODO: This offset calculation should be done in the barrier layer, - // because it is already done in the barrier layer for the augmented system, see - // cone_block_offsets_host. - rmm::device_uvector block_offsets(cones.n_cones + 1, stream); - block_offsets.set_element_to_zero_async(0, stream); - - auto block_sizes = thrust::make_transform_iterator( - cones.cone_dimensions.begin(), [] HD(i_t q) -> size_t { return static_cast(q) * q; }); - thrust::inclusive_scan( - rmm::exec_policy(stream), block_sizes, block_sizes + cones.n_cones, block_offsets.begin() + 1); + cuopt_assert(count == q_values.size(), "dense cone CSR index and Q-value arrays must match"); - // TODO: use dual_perturb_value for regularization + const i_t n_dense = cones.n_dense_cones(); const size_t grid = raft::ceildiv(count, soc_block_size); - scatter_hessian_into_augmented_kernel + scatter_dense_hessian_into_augmented_kernel <<>>(cuopt::make_span(augmented_x), cuopt::make_span(csr_indices), cuopt::make_span(q_values), cuopt::make_span(cones.w), cuopt::make_span(cones.eta), cuopt::make_span(cones.cone_offsets), - cuopt::make_span(block_offsets), - cones.n_cones, + cuopt::make_span(dense_block_offsets), + cuopt::make_span(dense_cone_ids), + n_dense, dual_perturb_value); RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/src/barrier/second_order_cone_reduction.cuh b/cpp/src/barrier/second_order_cone_reduction.cuh index ada9fcfb6a..6ab380102a 100644 --- a/cpp/src/barrier/second_order_cone_reduction.cuh +++ b/cpp/src/barrier/second_order_cone_reduction.cuh @@ -10,8 +10,8 @@ #include #include +#include #include -#include #include #include @@ -24,7 +24,6 @@ #include #include -#include #include #include @@ -46,14 +45,22 @@ __global__ void __launch_bounds__(warps_per_cta* raft::WarpSize) OutputIt output, value_t init); +template +__global__ void __launch_bounds__(block_dim) + block_per_cone_reduce_kernel(InputIt input, + raft::device_span medium_cone_ids, + raft::device_span cone_offsets, + OutputIt output, + value_t init); + /** * Segmented-sum dispatcher for packed second-order cone vectors. * * Cone dimensions are fixed for a solve, so the constructor partitions cone * ids once by reduction strategy. Each call then reuses those partitions: - * small cones use one warp per cone, medium cones use CUB DeviceSegmentedReduce, + * small cones use one warp per cone, medium cones use one block per cone, * and large cones use CUB DeviceReduce one cone at a time. The object owns the - * CUB workspace for those medium/large paths. Call `prepare_workspace` once + * CUB workspace for the large-cone path. Call `prepare_workspace` once * before using a CUB-backed path. */ template @@ -69,7 +76,7 @@ struct segmented_sum_t { std::vector large_cone_ids; std::vector large_cone_dimensions; - // Maximum CUB temporary storage needed by prepared medium/large reductions. + // Maximum CUB temporary storage needed by prepared large reductions. std::size_t cub_workspace_bytes = 0; rmm::device_buffer cub_workspace; @@ -80,24 +87,6 @@ struct segmented_sum_t { auto input = thrust::make_constant_iterator(value_t{}); auto output = thrust::make_discard_iterator(); - if (!medium_cone_ids.is_empty()) { - const auto medium_begin_offsets = - thrust::make_permutation_iterator(cone_offsets.data(), medium_cone_ids.begin()); - const auto medium_end_offsets = - thrust::make_permutation_iterator(cone_offsets.data() + 1, medium_cone_ids.begin()); - - std::size_t temp_storage_bytes = 0; - RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr, - temp_storage_bytes, - input, - output, - medium_cone_ids.size(), - medium_begin_offsets, - medium_end_offsets, - stream.value())); - cub_workspace_bytes = std::max(cub_workspace_bytes, temp_storage_bytes); - } - for (std::size_t i = 0; i < large_cone_ids.size(); ++i) { std::size_t temp_storage_bytes = 0; RAFT_CUDA_TRY(cub::DeviceReduce::Sum(nullptr, @@ -109,7 +98,7 @@ struct segmented_sum_t { cub_workspace_bytes = std::max(cub_workspace_bytes, temp_storage_bytes); } - if (cub_workspace.size() < cub_workspace_bytes) { + if (cub_workspace_bytes > 0 && cub_workspace.size() < cub_workspace_bytes) { cub_workspace.resize(cub_workspace_bytes, stream); } } @@ -138,31 +127,17 @@ struct segmented_sum_t { } if (!medium_cone_ids.is_empty()) { - cuopt_assert(cub_workspace_bytes > 0 && cub_workspace.size() >= cub_workspace_bytes, - "segmented_sum_t::prepare_workspace must be called before reducing medium or " - "large cones"); - - const auto medium_output = thrust::make_permutation_iterator(output, medium_cone_ids.begin()); - const auto medium_begin_offsets = - thrust::make_permutation_iterator(cone_offsets.data(), medium_cone_ids.begin()); - const auto medium_end_offsets = - thrust::make_permutation_iterator(cone_offsets.data() + 1, medium_cone_ids.begin()); - - std::size_t temp_storage_bytes = cub_workspace_bytes; - RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(cub_workspace.data(), - temp_storage_bytes, - input, - medium_output, - medium_cone_ids.size(), - medium_begin_offsets, - medium_end_offsets, - stream.value())); + constexpr int medium_block_dim = 256; + const auto n_medium = medium_cone_ids.size(); + block_per_cone_reduce_kernel + <<>>( + input, cuopt::make_span(medium_cone_ids), cone_offsets, output, init); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } if (!large_cone_ids.empty()) { cuopt_assert(cub_workspace_bytes > 0 && cub_workspace.size() >= cub_workspace_bytes, - "segmented_sum_t::prepare_workspace must be called before reducing medium or " - "large cones"); + "segmented_sum_t::prepare_workspace must be called before reducing large cones"); for (std::size_t i = 0; i < large_cone_ids.size(); ++i) { std::size_t temp_storage_bytes = cub_workspace_bytes; @@ -258,4 +233,55 @@ __global__ void __launch_bounds__(warps_per_cta* raft::WarpSize) if (lane_id == 0) { output[cone] = sum; } } +template +__global__ void __launch_bounds__(block_dim) + block_per_cone_reduce_kernel(InputIt input, + raft::device_span medium_cone_ids, + raft::device_span cone_offsets, + OutputIt output, + value_t init) +{ + static_assert(block_dim > 0); + static_assert(block_dim <= 1024); + + constexpr int items_per_thread = 4; + + using block_reduce_t = cub::BlockReduce; + __shared__ typename block_reduce_t::TempStorage temp_storage; + + const auto slot = blockIdx.x; + if (slot >= medium_cone_ids.size()) { return; } + + const auto cone = medium_cone_ids[slot]; + const auto off = cone_offsets[cone]; + const auto dim = cone_offsets[cone + 1] - off; + + value_t acc[items_per_thread]; +#pragma unroll + for (int k = 0; k < items_per_thread; ++k) { + acc[k] = value_t{0}; + } + + // Thread t owns elements t, t+block_dim, ..., t+(items_per_thread-1)*block_dim + // within each tile of `block_dim * items_per_thread` consecutive entries, so + // every warp load stays coalesced. + const std::size_t tile = static_cast(block_dim) * items_per_thread; + for (std::size_t tile_start = 0; tile_start < dim; tile_start += tile) { +#pragma unroll + for (int k = 0; k < items_per_thread; ++k) { + const std::size_t idx = tile_start + threadIdx.x + static_cast(k) * block_dim; + if (idx < dim) { acc[k] = acc[k] + input[off + idx]; } + } + } + + value_t sum = init; +#pragma unroll + for (int k = 0; k < items_per_thread; ++k) { + sum = sum + acc[k]; + } + + sum = block_reduce_t(temp_storage).Sum(sum); + if (threadIdx.x == 0) { output[cone] = sum; } +} + } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index 286ac7364f..46a551b9f1 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -69,6 +69,7 @@ struct simplex_solver_settings_t { eliminate_dense_columns(true), barrier_iterative_refinement(true), barrier_step_scale(0.9), + barrier_soc_threshold(5), num_gpus(1), folding(-1), augmented(0), @@ -157,6 +158,7 @@ struct simplex_solver_settings_t { bool eliminate_dense_columns; // true to eliminate dense columns from A*D*A^T bool barrier_iterative_refinement; // true to use iterative refinement for barrier method f_t barrier_step_scale; // step scale for barrier method + i_t barrier_soc_threshold; // SOC dimension above which rank-2 sparse scaling is used int num_gpus; // Number of GPUs to use (maximum of 2 gpus are supported at the moment) i_t folding; // -1 automatic, 0 don't fold, 1 fold i_t augmented; // -1 automatic, 0 to solve with ADAT, 1 to solve with augmented system diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index 4a000c6e05..235fc57f30 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -132,6 +132,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_DUALIZE, &pdlp_settings.dualize, -1, 1, -1}, {CUOPT_ORDERING, &pdlp_settings.ordering, -1, 1, -1}, {CUOPT_BARRIER_DUAL_INITIAL_POINT, &pdlp_settings.barrier_dual_initial_point, -1, 1, -1}, + {CUOPT_BARRIER_SOC_THRESHOLD, &pdlp_settings.barrier_soc_threshold, 0, 1000000, 5, "SOC dimension above which rank-2 sparse KKT expansion is used"}, {CUOPT_MIP_CUT_PASSES, &mip_settings.max_cut_passes, -1, std::numeric_limits::max(), 10}, {CUOPT_MIP_MIXED_INTEGER_ROUNDING_CUTS, &mip_settings.mir_cuts, -1, 1, -1}, {CUOPT_MIP_MIXED_INTEGER_GOMORY_CUTS, &mip_settings.mixed_integer_gomory_cuts, -1, 1, -1}, diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 5d29e97fa0..3fc1fc7dda 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -511,6 +511,7 @@ run_barrier(dual_simplex::user_problem_t& user_problem, barrier_settings.crossover = settings.crossover; barrier_settings.eliminate_dense_columns = settings.eliminate_dense_columns; barrier_settings.barrier_iterative_refinement = settings.barrier_iterative_refinement; + barrier_settings.barrier_soc_threshold = settings.barrier_soc_threshold; barrier_settings.barrier_step_scale = settings.barrier_step_scale; barrier_settings.cudss_deterministic = settings.cudss_deterministic; barrier_settings.barrier_relaxed_feasibility_tol = settings.tolerances.relative_primal_tolerance; diff --git a/cpp/tests/socp/CMakeLists.txt b/cpp/tests/socp/CMakeLists.txt index f380984225..7ea1a732d5 100644 --- a/cpp/tests/socp/CMakeLists.txt +++ b/cpp/tests/socp/CMakeLists.txt @@ -5,6 +5,7 @@ ConfigureTest(SOCP_TEST ${CMAKE_CURRENT_SOURCE_DIR}/second_order_cone_kernels.cu + ${CMAKE_CURRENT_SOURCE_DIR}/sparse_augmented_kkt_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/solve_barrier_socp.cu ${CMAKE_CURRENT_SOURCE_DIR}/general_quadratic_test.cu LABELS numopt) diff --git a/cpp/tests/socp/second_order_cone_kernels.cu b/cpp/tests/socp/second_order_cone_kernels.cu index 49126ee335..43c162b4aa 100644 --- a/cpp/tests/socp/second_order_cone_kernels.cu +++ b/cpp/tests/socp/second_order_cone_kernels.cu @@ -202,6 +202,171 @@ TEST(second_order_cone_kernels, nt_scaling_matches_host_reference) } } +TEST(second_order_cone_kernels, nt_scaling_many_small_one_medium_cone) +{ + auto stream = rmm::cuda_stream_default; + + // chainsing-like topology: many small cones plus one dim-1000 medium cone (warp_cone_dim=64). + std::vector cone_dimensions; + cone_dimensions.reserve(21); + for (int i = 0; i < 20; ++i) { + cone_dimensions.push_back(3); + } + cone_dimensions.push_back(1000); + + std::size_t n_cone_entries = 0; + for (const auto dim : cone_dimensions) { + n_cone_entries += static_cast(dim); + } + + std::vector x_host(n_cone_entries); + std::vector z_host(n_cone_entries); + std::size_t offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + x_host[offset] = 100.0 + static_cast(cone); + z_host[offset] = 80.0 + static_cast(cone); + for (int local_idx = 1; local_idx < dim; ++local_idx) { + x_host[offset + local_idx] = 0.001 * static_cast((local_idx % 5) + 1); + z_host[offset + local_idx] = 0.0015 * static_cast((local_idx % 7) + 1); + } + offset += static_cast(dim); + } + + auto x = cuopt::device_copy(x_host, stream); + auto z = cuopt::device_copy(z_host, stream); + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + + EXPECT_EQ(cuopt::host_copy(cones.segmented_sum.medium_cone_ids, stream), (std::vector{20})); + EXPECT_EQ(cones.n_sparse_cones, 1); + + launch_nt_scaling(cones, stream); + launch_update_scaling_sparse(cones, stream); + + auto eta_host = cuopt::host_copy(cones.eta, stream); + auto w_host = cuopt::host_copy(cones.w, stream); + + std::vector expected_eta(cone_dimensions.size()); + std::vector expected_w(n_cone_entries); + + offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + + double x_tail_sq = 0.0; + double z_tail_sq = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + x_tail_sq += x_host[idx] * x_host[idx]; + z_tail_sq += z_host[idx] * z_host[idx]; + } + + const auto x_tail_norm = std::sqrt(x_tail_sq); + const auto z_tail_norm = std::sqrt(z_tail_sq); + const auto x_det = (x_host[offset] - x_tail_norm) * (x_host[offset] + x_tail_norm); + const auto z_det = (z_host[offset] - z_tail_norm) * (z_host[offset] + z_tail_norm); + ASSERT_GT(x_det, 0.0) << "cone " << cone; + ASSERT_GT(z_det, 0.0) << "cone " << cone; + + const auto x_scale = std::sqrt(x_det); + const auto z_scale = std::sqrt(z_det); + + expected_eta[cone] = std::sqrt(z_scale / x_scale); + + double normalized_xz_dot = 0.0; + for (int local_idx = 0; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + normalized_xz_dot += x_host[idx] * z_host[idx] / (x_scale * z_scale); + } + const auto w_det = 2.0 + 2.0 * normalized_xz_dot; + ASSERT_GT(w_det, 0.0) << "cone " << cone; + const auto w_scale = std::sqrt(w_det); + + expected_w[offset] = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + expected_w[idx] = (z_host[idx] / z_scale - x_host[idx] / x_scale) / w_scale; + } + + double normalized_tail_sq = 0.0; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + normalized_tail_sq += expected_w[idx] * expected_w[idx]; + } + expected_w[offset] = std::sqrt(1.0 + normalized_tail_sq); + + offset += static_cast(dim); + } + + for (std::size_t i = 0; i < expected_eta.size(); ++i) { + EXPECT_NEAR(eta_host[i], expected_eta[i], 1e-10) << "cone " << i; + } + + for (std::size_t i = 0; i < expected_w.size(); ++i) { + EXPECT_NEAR(w_host[i], expected_w[i], 1e-10) << "entry " << i; + } +} + +TEST(second_order_cone_kernels, cone_step_length_many_small_one_sparse_medium_cone) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions; + for (int i = 0; i < 20; ++i) { + cone_dimensions.push_back(3); + } + cone_dimensions.push_back(1000); + + std::size_t n_cone_entries = 0; + for (const auto dim : cone_dimensions) { + n_cone_entries += static_cast(dim); + } + + std::vector x_host(n_cone_entries); + std::vector z_host(n_cone_entries); + std::vector dx_host(n_cone_entries); + std::vector dz_host(n_cone_entries); + + std::size_t offset = 0; + for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { + const auto dim = cone_dimensions[cone]; + x_host[offset] = 12.0 + static_cast(cone); + z_host[offset] = 14.0 + static_cast(cone); + dx_host[offset] = 0.05; + dz_host[offset] = -0.04; + for (int local_idx = 1; local_idx < dim; ++local_idx) { + const auto idx = offset + local_idx; + x_host[idx] = 0.001 * static_cast((local_idx % 5) + 1); + z_host[idx] = 0.0015 * static_cast((local_idx % 7) + 1); + dx_host[idx] = 0.02 * static_cast((local_idx % 5) - 2); + dz_host[idx] = -0.015 * static_cast((local_idx % 7) - 3); + } + offset += static_cast(dim); + } + + auto x = cuopt::device_copy(x_host, stream); + auto z = cuopt::device_copy(z_host, stream); + auto dx = cuopt::device_copy(dx_host, stream); + auto dz = cuopt::device_copy(dz_host, stream); + + cone_data_t cones(cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream); + + launch_nt_scaling(cones, stream); + launch_update_scaling_sparse(cones, stream); + + const auto [step_primal, step_dual] = + compute_cone_step_length(cones, + raft::device_span(dx.data(), dx.size()), + raft::device_span(dz.data(), dz.size()), + 1.0, + stream); + + EXPECT_GT(step_primal, 0.0); + EXPECT_GT(step_dual, 0.0); + EXPECT_LE(step_primal, 1.0); + EXPECT_LE(step_dual, 1.0); +} + TEST(second_order_cone_kernels, cone_step_length_keeps_iterate_in_cone) { auto stream = rmm::cuda_stream_default; @@ -354,7 +519,7 @@ TEST(second_order_cone_kernels, scaling_operators_match_host_reference) raft::device_span(cone_target.data(), cone_target.size()), cuopt::make_span(recovered_dz), stream); - accumulate_cone_hessian_matvec(v_span, cones, cuopt::make_span(accum), stream); + launch_dense_hessian_matvec(v_span, cones, cuopt::make_span(accum), stream); apply_w(v_span, cuopt::make_span(w_tmp), cones, stream); apply_w(raft::device_span(w_tmp.data(), w_tmp.size()), cuopt::make_span(w_squared_v), @@ -374,13 +539,15 @@ TEST(second_order_cone_kernels, scaling_operators_match_host_reference) std::vector expected_w_inv(n_cone_entries); std::vector expected_h(n_cone_entries); std::vector expected_h_unscaled(n_cone_entries); + std::vector expected_accum(n_cone_entries); offset = 0; for (std::size_t cone = 0; cone < cone_dimensions.size(); ++cone) { - const auto dim = cone_dimensions[cone]; - const auto w0 = w_host[offset]; - const auto v0 = v_host[offset]; - const auto eta = eta_host[cone]; + const auto dim = cone_dimensions[cone]; + const auto w0 = w_host[offset]; + const auto v0 = v_host[offset]; + const auto eta = eta_host[cone]; + const bool cone_dense = dim <= cones.soc_threshold; double tail_dot = 0.0; for (int local_idx = 1; local_idx < dim; ++local_idx) { @@ -394,6 +561,8 @@ TEST(second_order_cone_kernels, scaling_operators_match_host_reference) const auto rho = w0 * v0 + tail_dot; expected_h_unscaled[offset] = (eta * eta) * (2.0 * w0 * rho - v0); expected_h[offset] = expected_h_unscaled[offset]; + expected_accum[offset] = + accum_initial_host[offset] + (cone_dense ? expected_h_unscaled[offset] : 0.0); for (int local_idx = 1; local_idx < dim; ++local_idx) { const auto idx = offset + local_idx; @@ -402,6 +571,7 @@ TEST(second_order_cone_kernels, scaling_operators_match_host_reference) expected_w_inv[idx] = (v_host[idx] + (-v0 + tail_dot / (1.0 + w0)) * w_host[idx]) / eta; expected_h_unscaled[idx] = (eta * eta) * (2.0 * w_host[idx] * rho + v_host[idx]); expected_h[idx] = expected_h_unscaled[idx]; + expected_accum[idx] = accum_initial_host[idx] + (cone_dense ? expected_h_unscaled[idx] : 0.0); } offset += static_cast(dim); @@ -414,8 +584,7 @@ TEST(second_order_cone_kernels, scaling_operators_match_host_reference) EXPECT_NEAR(h_out_host[i], w_squared_v_host[i], 1e-9) << "W^2 v vs H v entry " << i; EXPECT_NEAR(recovered_dz_host[i], cone_target_host[i] - expected_h_unscaled[i], 1e-9) << "recovered dz entry " << i; - EXPECT_NEAR(accum_host[i], accum_initial_host[i] + expected_h_unscaled[i], 1e-9) - << "accumulated H entry " << i; + EXPECT_NEAR(accum_host[i], expected_accum[i], 1e-9) << "accumulated H entry " << i; } } @@ -581,4 +750,176 @@ TEST(second_order_cone_kernels, combined_cone_rhs_matches_host_reference) } } +TEST(second_order_cone_kernels, sparse_cone_classification) +{ + auto stream = rmm::cuda_stream_default; + + // threshold=5: cones of dim 3,2 are dense; dim 6,32769 are sparse + std::vector cone_dimensions{3, 6, 2, 32769}; + rmm::device_uvector x(32780, stream); + rmm::device_uvector z(32780, stream); + + cone_data_t cones( + cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/5); + + EXPECT_EQ(cones.soc_threshold, 5); + EXPECT_EQ(cones.n_sparse_cones, 2); + EXPECT_EQ(cones.n_dense_cones(), 2); + EXPECT_EQ(cones.expansion_var_count(), 4); + EXPECT_EQ(cones.n_sparse_cone_entries, std::size_t{32775}); + EXPECT_EQ(cones.d.size(), 2u); + EXPECT_EQ(cones.sparse_v.size(), 32775u); + EXPECT_EQ(cones.sparse_u.size(), 32775u); + EXPECT_EQ(cuopt::host_copy(cones.sparse_cone_ids, stream), (std::vector{1, 3})); + EXPECT_EQ(cuopt::host_copy(cones.sparse_cone_dims, stream), (std::vector{6, 32769})); +} + +namespace { + +struct sparse_scaling_head_t { + double d; + double u0; + double u1; + double v0; + double v1; +}; + +// Must match update_scaling_sparse_kernel in second_order_cone_kernels.cuh. +sparse_scaling_head_t sparse_scaling_head_reference(double w0) +{ + const double alpha = 2.0 * w0; + const double wsq = 2.0 * w0 * w0 - 1.0; + const double wsq_safe = 0.5 * (wsq + std::sqrt(wsq * wsq + 1.0)); + const double wsqinv = 1.0 / wsq_safe; + const double d = 0.5 * wsqinv; + const double radicand = wsq_safe - d; + const double u0 = std::sqrt(std::max(radicand, 0.0)); + const double u1 = (u0 > 0.0) ? alpha / u0 : 0.0; + const double v0 = 0.0; + const double denom = 2.0 * wsq_safe - wsqinv; + const double v1_arg = (std::abs(denom) > 1e-12) ? 2.0 * (2.0 + wsqinv) / denom : 2.0; + const double v1 = std::sqrt(std::max(v1_arg, 0.0)); + return {d, u0, u1, v0, v1}; +} + +} // namespace + +TEST(second_order_cone_kernels, update_scaling_sparse_matches_reference) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 6}; + rmm::device_uvector x(9, stream); + rmm::device_uvector z(9, stream); + + cone_data_t cones( + cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/4); + + ASSERT_EQ(cones.n_sparse_cones, 1); + + // Place interior point in the sparse cone (second cone, dim 6). + std::vector x_host(9, 0.0); + std::vector z_host(9, 0.0); + x_host[3] = 2.0; + z_host[3] = 1.5; + for (int j = 1; j < 6; ++j) { + x_host[3 + j] = 0.1 * j; + z_host[3 + j] = 0.08 * j; + } + x_host[0] = 1.5; + z_host[0] = 1.2; + for (int j = 1; j < 3; ++j) { + x_host[j] = 0.05; + z_host[j] = 0.04; + } + + raft::copy(cones.x.data(), x_host.data(), x_host.size(), stream); + raft::copy(cones.z.data(), z_host.data(), z_host.size(), stream); + + launch_nt_scaling(cones, stream); + launch_update_scaling_sparse(cones, stream); + + auto d_host = cuopt::host_copy(cones.d, stream); + auto v_host = cuopt::host_copy(cones.sparse_v, stream); + auto u_host = cuopt::host_copy(cones.sparse_u, stream); + auto w_host = cuopt::host_copy(cones.w, stream); + auto eta_host = cuopt::host_copy(cones.eta, stream); + + const int sparse_cone = 1; + const int cone_dim = 6; + const int cone_off = 3; + const double eta = eta_host[sparse_cone]; + + const auto head = sparse_scaling_head_reference(w_host[cone_off]); + const double scale = eta * eta; + + EXPECT_NEAR(d_host[0], head.d, 1e-10); + EXPECT_NEAR(v_host[0], scale * head.v0, 1e-10); + EXPECT_NEAR(u_host[0], scale * head.u0, 1e-10); + for (int j = 1; j < cone_dim; ++j) { + EXPECT_NEAR(v_host[j], scale * head.v1 * w_host[cone_off + j], 1e-10) << "v tail " << j; + EXPECT_NEAR(u_host[j], scale * head.u1 * w_host[cone_off + j], 1e-10) << "u tail " << j; + } +} + +TEST(second_order_cone_kernels, update_scaling_sparse_two_cones) +{ + auto stream = rmm::cuda_stream_default; + + // sparse cones: dim 6 (offset 3) and dim 5 (offset 9) + std::vector cone_dimensions{3, 6, 5}; + rmm::device_uvector x(14, stream); + rmm::device_uvector z(14, stream); + + cone_data_t cones( + cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/4); + + ASSERT_EQ(cones.n_sparse_cones, 2); + ASSERT_EQ(cones.n_sparse_cone_entries, std::size_t{11}); + + std::vector x_host(14, 0.0); + std::vector z_host(14, 0.0); + for (int cone = 0; cone < 3; ++cone) { + const int off = (cone == 0) ? 0 : (cone == 1 ? 3 : 9); + const int dim = cone_dimensions[cone]; + x_host[off] = 2.0 + 0.1 * cone; + z_host[off] = 1.5 + 0.05 * cone; + for (int j = 1; j < dim; ++j) { + x_host[off + j] = 0.1 * j; + z_host[off + j] = 0.08 * j; + } + } + + raft::copy(cones.x.data(), x_host.data(), x_host.size(), stream); + raft::copy(cones.z.data(), z_host.data(), z_host.size(), stream); + + launch_nt_scaling(cones, stream); + launch_update_scaling_sparse(cones, stream); + + auto v_host = cuopt::host_copy(cones.sparse_v, stream); + auto u_host = cuopt::host_copy(cones.sparse_u, stream); + auto w_host = cuopt::host_copy(cones.w, stream); + auto eta_host = cuopt::host_copy(cones.eta, stream); + + const auto check_cone = [&](int sparse_idx, int cone_idx, int cone_off, int cone_dim) { + const double eta = eta_host[cone_idx]; + const auto head = sparse_scaling_head_reference(w_host[cone_off]); + const double scale = eta * eta; + + const int block_start = (sparse_idx == 0) ? 0 : 6; + + EXPECT_NEAR(v_host[block_start], scale * head.v0, 1e-10) << "sparse " << sparse_idx; + EXPECT_NEAR(u_host[block_start], scale * head.u0, 1e-10) << "sparse " << sparse_idx; + for (int j = 1; j < cone_dim; ++j) { + EXPECT_NEAR(v_host[block_start + j], scale * head.v1 * w_host[cone_off + j], 1e-10) + << "v tail sparse " << sparse_idx << " j " << j; + EXPECT_NEAR(u_host[block_start + j], scale * head.u1 * w_host[cone_off + j], 1e-10) + << "u tail sparse " << sparse_idx << " j " << j; + } + }; + + check_cone(0, 1, 3, 6); + check_cone(1, 2, 9, 5); +} + } // namespace cuopt::linear_programming::dual_simplex::test diff --git a/cpp/tests/socp/solve_barrier_socp.cu b/cpp/tests/socp/solve_barrier_socp.cu index f960c3dc3a..1878e43898 100644 --- a/cpp/tests/socp/solve_barrier_socp.cu +++ b/cpp/tests/socp/solve_barrier_socp.cu @@ -824,4 +824,214 @@ TEST(barrier, qp_with_soc_block) EXPECT_NEAR(std::abs(solution.x[3]), 0.0, 1e-4); } +TEST(barrier, sparse_soc_expansion_solves_large_single_cone) +{ + // minimize x_0 + // subject to x_1 = 1 + // (x_0, ..., x_5) in Q^6 + // + // Optimal: x* = (1, 1, 0, 0, 0, 0), obj* = 1 + + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 1; + constexpr int n = 6; + constexpr int nz = 1; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective.assign(n, 0.0); + user_problem.objective[0] = 1.0; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + user_problem.A.col_start = {0, 0, 1, 1, 1, 1, 1}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + + user_problem.rhs = {1.0}; + user_problem.row_sense = {'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + + user_problem.num_range_rows = 0; + user_problem.problem_name = "sparse_soc_single_large_cone"; + user_problem.cone_var_start = 0; + user_problem.second_order_cone_dims = {6}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.barrier_soc_threshold = 4; + settings.barrier_iterative_refinement = true; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-4); + EXPECT_NEAR(solution.x[0], 1.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + for (int j = 2; j < n; ++j) { + EXPECT_NEAR(std::abs(solution.x[j]), 0.0, 1e-4) << "index " << j; + } +} + +TEST(barrier, mixed_dense_and_sparse_soc_blocks) +{ + // Variables ordered as [l1, l2 | dense Q^3 | sparse Q^6]. + // + // minimize t_dense + t_sparse + // subject to l1 - u_dense = 0 + // l2 - u_sparse = 0 + // l1 + l2 >= 3 + // l1 - l2 = 1 + // + // Same optimum as mixed_linear_and_two_soc_blocks_with_inequality: obj* = 3. + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 4; + constexpr int n = 11; + constexpr int nz = 8; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective.assign(n, 0.0); + user_problem.objective[2] = 1.0; // t_dense (head of dense Q^3 block) + user_problem.objective[5] = 1.0; // t_sparse (head of sparse Q^6 block) + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // Columns: l1, l2, t_d, u_d, v_d, t_s, u_s, v_s, w_s, y_s, z_s + user_problem.A.col_start = {0, 3, 6, 6, 7, 7, 7, 8, 8, 8, 8, 8}; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + user_problem.A.i[1] = 2; + user_problem.A.x[1] = 1.0; + user_problem.A.i[2] = 3; + user_problem.A.x[2] = 1.0; + user_problem.A.i[3] = 1; + user_problem.A.x[3] = 1.0; + user_problem.A.i[4] = 2; + user_problem.A.x[4] = 1.0; + user_problem.A.i[5] = 3; + user_problem.A.x[5] = -1.0; + user_problem.A.i[6] = 0; + user_problem.A.x[6] = -1.0; + user_problem.A.i[7] = 1; + user_problem.A.x[7] = -1.0; + + user_problem.rhs = {0.0, 0.0, 3.0, 1.0}; + user_problem.row_sense = {'E', 'E', 'G', 'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + + user_problem.num_range_rows = 0; + user_problem.problem_name = "mixed_dense_and_sparse_soc_blocks"; + user_problem.cone_var_start = 2; + user_problem.second_order_cone_dims = {3, 6}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.scale_columns = true; + settings.barrier_soc_threshold = 4; + settings.barrier_iterative_refinement = true; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 3.0, 1e-4); + EXPECT_NEAR(solution.x[0], 2.0, 1e-4); + EXPECT_NEAR(solution.x[1], 1.0, 1e-4); + EXPECT_NEAR(solution.x[2], 2.0, 1e-4); + EXPECT_NEAR(solution.x[3], 2.0, 1e-4); + EXPECT_NEAR(std::abs(solution.x[4]), 0.0, 1e-4); + EXPECT_NEAR(solution.x[5], 1.0, 1e-4); + EXPECT_NEAR(solution.x[6], 1.0, 1e-4); + for (int j = 7; j < n; ++j) { + EXPECT_NEAR(std::abs(solution.x[j]), 0.0, 1e-4) << "index " << j; + } +} + +TEST(barrier, sparse_soc_expansion_solves_dim_500_cone) +{ + // minimize x_0 + // subject to x_1 = 1 + // (x_0, ..., x_499) in Q^500 + // + // Optimal: x* = (1, 1, 0, ..., 0), obj* = 1 + // Sparse cone (dim 500 > default threshold 5) with barrier IR enabled. + raft::handle_t handle{}; + init_handler(&handle); + + using namespace cuopt::linear_programming::dual_simplex; + user_problem_t user_problem(&handle); + + constexpr int m = 1; + constexpr int n = 500; + constexpr int nz = 1; + + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective.assign(n, 0.0); + user_problem.objective[0] = 1.0; + + user_problem.A.m = m; + user_problem.A.n = n; + user_problem.A.nz_max = nz; + user_problem.A.reallocate(nz); + // x_1 = 1: nonzero in column 1. + user_problem.A.col_start.assign(n + 1, 1); + user_problem.A.col_start[0] = 0; + user_problem.A.col_start[1] = 0; + user_problem.A.i[0] = 0; + user_problem.A.x[0] = 1.0; + + user_problem.rhs = {1.0}; + user_problem.row_sense = {'E'}; + user_problem.lower.assign(n, 0.0); + user_problem.upper.assign(n, inf); + + user_problem.num_range_rows = 0; + user_problem.problem_name = "sparse_soc_dim_500"; + user_problem.cone_var_start = 0; + user_problem.second_order_cone_dims = {500}; + user_problem.var_types.assign(n, variable_type_t::CONTINUOUS); + + simplex_solver_settings_t settings; + settings.barrier = true; + settings.barrier_presolve = true; + settings.dualize = 0; + settings.barrier_soc_threshold = 5; + settings.barrier_iterative_refinement = true; + + lp_solution_t solution(m, n); + auto status = solve_linear_program_with_barrier(user_problem, settings, solution); + + EXPECT_EQ(status, lp_status_t::OPTIMAL); + EXPECT_NEAR(solution.objective, 1.0, 1e-3); + EXPECT_NEAR(solution.x[0], 1.0, 1e-3); + EXPECT_NEAR(solution.x[1], 1.0, 1e-3); + for (int j = 2; j < n; ++j) { + EXPECT_NEAR(std::abs(solution.x[j]), 0.0, 1e-3) << "index " << j; + } +} + } // namespace cuopt::linear_programming::dual_simplex::test diff --git a/cpp/tests/socp/sparse_augmented_kkt_test.cu b/cpp/tests/socp/sparse_augmented_kkt_test.cu new file mode 100644 index 0000000000..400a66a536 --- /dev/null +++ b/cpp/tests/socp/sparse_augmented_kkt_test.cu @@ -0,0 +1,353 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#include + +#include + +#include + +#include +#include + +namespace cuopt::linear_programming::dual_simplex::test { + +TEST(sparse_augmented_kkt, cone_counts_and_expansion_size) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 6, 5}; + rmm::device_uvector x(14, stream); + rmm::device_uvector z(14, stream); + + cone_data_t cones( + cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/4); + + EXPECT_EQ(cones.n_sparse_cones, 2); + EXPECT_EQ(cones.n_dense_cones(), 1); + EXPECT_EQ(cones.expansion_var_count(), 4); + EXPECT_EQ(cones.n_sparse_cone_entries, 11u); +} + +TEST(sparse_augmented_kkt, launch_get_Hs_sparse) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{3, 6}; + rmm::device_uvector x(9, stream); + rmm::device_uvector z(9, stream); + + cone_data_t cones( + cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/4); + + ASSERT_EQ(cones.n_sparse_cones, 1); + + std::vector x_host(9, 0.0); + std::vector z_host(9, 0.0); + x_host[3] = 2.0; + z_host[3] = 1.5; + for (int j = 1; j < 6; ++j) { + x_host[3 + j] = 0.1 * j; + z_host[3 + j] = 0.08 * j; + } + x_host[0] = 1.5; + z_host[0] = 1.2; + for (int j = 1; j < 3; ++j) { + x_host[j] = 0.05; + z_host[j] = 0.04; + } + + raft::copy(cones.x.data(), x_host.data(), x_host.size(), stream); + raft::copy(cones.z.data(), z_host.data(), z_host.size(), stream); + + launch_nt_scaling(cones, stream); + launch_update_scaling_sparse(cones, stream); + + rmm::device_uvector Hs_diag(cones.n_sparse_cone_entries, stream); + launch_get_Hs_sparse(cones, Hs_diag, stream); + + auto d_host = cuopt::host_copy(cones.d, stream); + auto eta_host = cuopt::host_copy(cones.eta, stream); + auto hs_host = cuopt::host_copy(Hs_diag, stream); + + const int sparse_cone = 1; + const double eta_sq = eta_host[sparse_cone] * eta_host[sparse_cone]; + + EXPECT_NEAR(hs_host[0], eta_sq * d_host[0], 1e-10); + for (int j = 1; j < 6; ++j) { + EXPECT_NEAR(hs_host[j], eta_sq, 1e-10) << "tail index " << j; + } +} + +TEST(sparse_augmented_kkt, scatter_and_update_sparse_expansion) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{6}; + rmm::device_uvector x(6, stream); + rmm::device_uvector z(6, stream); + + cone_data_t cones( + cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/4); + + ASSERT_EQ(cones.n_sparse_cones, 1); + ASSERT_EQ(cones.expansion_var_count(), 2); + + std::vector x_host{2.0, 0.2, 0.3, 0.4, 0.5, 0.6}; + std::vector z_host{1.5, 0.1, 0.15, 0.2, 0.25, 0.3}; + raft::copy(cones.x.data(), x_host.data(), x_host.size(), stream); + raft::copy(cones.z.data(), z_host.data(), z_host.size(), stream); + + launch_nt_scaling(cones, stream); + launch_update_scaling_sparse(cones, stream); + + const int nnz = 32; + + rmm::device_uvector augmented_x(nnz, stream); + thrust::fill(rmm::exec_policy(stream), augmented_x.begin(), augmented_x.end(), 0.0); + + // One coupling/Hessian CSR entry per sparse-cone element, matching production where every + // *_col/*_row index array has size n_sparse_cone_entries (== cones.sparse_v.size()). + const int q_dim = 6; + ASSERT_EQ(cones.n_sparse_cone_entries, static_cast(q_dim)); + + std::vector sparse_hessian_diag(q_dim); + std::vector sparse_hessian_Q(q_dim, 0.0); + std::vector sparse_exp_v_col(q_dim); + std::vector sparse_exp_u_col(q_dim); + std::vector sparse_exp_v_row(q_dim); + std::vector sparse_exp_u_row(q_dim); + std::vector sparse_expansion_D{30, 31}; + for (int j = 0; j < q_dim; ++j) { + sparse_hessian_diag[j] = j; // CSR slots 0..5 + sparse_exp_v_col[j] = 6 + j; // 6..11 + sparse_exp_u_col[j] = 12 + j; // 12..17 + sparse_exp_v_row[j] = 18 + j; // 18..23 + sparse_exp_u_row[j] = 24 + j; // 24..29 + } + + auto d_sparse_hessian_diag = cuopt::device_copy(sparse_hessian_diag, stream); + auto d_sparse_hessian_Q = cuopt::device_copy(sparse_hessian_Q, stream); + auto d_sparse_exp_v_col = cuopt::device_copy(sparse_exp_v_col, stream); + auto d_sparse_exp_u_col = cuopt::device_copy(sparse_exp_u_col, stream); + auto d_sparse_exp_v_row = cuopt::device_copy(sparse_exp_v_row, stream); + auto d_sparse_exp_u_row = cuopt::device_copy(sparse_exp_u_row, stream); + auto d_sparse_expansion_D = cuopt::device_copy(sparse_expansion_D, stream); + rmm::device_uvector d_sparse_Hs_diag(cones.n_sparse_cone_entries, stream); + + launch_get_Hs_sparse(cones, d_sparse_Hs_diag, stream); + scatter_sparse_hessian_diag_into_augmented( + augmented_x, d_sparse_hessian_diag, d_sparse_Hs_diag, d_sparse_hessian_Q, stream, 0.0); + update_sparse_expansion_in_augmented(augmented_x, + d_sparse_exp_v_col, + d_sparse_exp_u_col, + d_sparse_exp_v_row, + d_sparse_exp_u_row, + d_sparse_expansion_D, + cones.sparse_v, + cones.sparse_u, + cones.eta, + cones.sparse_cone_ids, + stream, + 0.0); + + auto aug_host = cuopt::host_copy(augmented_x, stream); + auto v_host = cuopt::host_copy(cones.sparse_v, stream); + auto u_host = cuopt::host_copy(cones.sparse_u, stream); + auto hs_host = cuopt::host_copy(d_sparse_Hs_diag, stream); + auto eta_host = cuopt::host_copy(cones.eta, stream); + + for (int j = 0; j < q_dim; ++j) { + EXPECT_NEAR(aug_host[j], -hs_host[j], 1e-10) << "hessian diag " << j; + EXPECT_NEAR(aug_host[6 + j], v_host[j], 1e-10) << "v col " << j; + EXPECT_NEAR(aug_host[12 + j], u_host[j], 1e-10) << "u col " << j; + EXPECT_NEAR(aug_host[18 + j], v_host[j], 1e-10) << "v row " << j; + EXPECT_NEAR(aug_host[24 + j], u_host[j], 1e-10) << "u row " << j; + } + + const double eta_sq = eta_host[0] * eta_host[0]; + EXPECT_NEAR(aug_host[30], -eta_sq, 1e-10); + EXPECT_NEAR(aug_host[31], eta_sq, 1e-10); +} + +TEST(sparse_augmented_kkt, sparse_augmented_matvec) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{6}; + rmm::device_uvector x(6, stream); + rmm::device_uvector z(6, stream); + + cone_data_t cones( + cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/4); + + ASSERT_EQ(cones.n_sparse_cones, 1); + ASSERT_EQ(cones.expansion_var_count(), 2); + + std::vector x_host{2.0, 0.2, 0.3, 0.4, 0.5, 0.6}; + std::vector z_host{1.5, 0.1, 0.15, 0.2, 0.25, 0.3}; + raft::copy(cones.x.data(), x_host.data(), x_host.size(), stream); + raft::copy(cones.z.data(), z_host.data(), z_host.size(), stream); + + launch_nt_scaling(cones, stream); + launch_update_scaling_sparse(cones, stream); + + const int n_primal = 6; + const int m_rows = 1; + const int p = cones.expansion_var_count(); + const int sys_size = n_primal + m_rows + p; + + std::vector x_vec(sys_size, 0.0); + x_vec[0] = 1.1; + x_vec[1] = 0.3; + x_vec[2] = 0.4; + x_vec[3] = 0.2; + x_vec[4] = 0.5; + x_vec[5] = 0.6; + x_vec[n_primal + m_rows] = 0.25; // expansion v + x_vec[n_primal + m_rows + 1] = -0.15; // expansion u + + rmm::device_uvector d_x(sys_size, stream); + rmm::device_uvector d_r1(n_primal, stream); + rmm::device_uvector d_y_exp(p, stream); + rmm::device_uvector d_hs(cones.n_sparse_cone_entries, stream); + + raft::copy(d_x.data(), x_vec.data(), sys_size, stream); + thrust::fill(rmm::exec_policy(stream), d_r1.begin(), d_r1.end(), 0.0); + thrust::fill(rmm::exec_policy(stream), d_y_exp.begin(), d_y_exp.end(), 0.0); + + launch_get_Hs_sparse(cones, d_hs, stream); + launch_sparse_augmented_matvec(raft::device_span(d_x.data(), d_x.size()), + raft::device_span(d_r1.data(), d_r1.size()), + raft::device_span(d_y_exp.data(), d_y_exp.size()), + cones, + raft::device_span(d_hs.data(), d_hs.size()), + /*cone_var_start=*/0, + n_primal, + m_rows, + stream); + + auto r1_host = cuopt::host_copy(d_r1, stream); + auto yexp_host = cuopt::host_copy(d_y_exp, stream); + auto hs_host = cuopt::host_copy(d_hs, stream); + auto v_host = cuopt::host_copy(cones.sparse_v, stream); + auto u_host = cuopt::host_copy(cones.sparse_u, stream); + auto eta_host = cuopt::host_copy(cones.eta, stream); + + const double eta_sq = eta_host[0] * eta_host[0]; + double dot_v = 0.0; + double dot_u = 0.0; + for (int j = 0; j < 6; ++j) { + dot_v += v_host[j] * x_vec[j]; + dot_u += u_host[j] * x_vec[j]; + const double expected = hs_host[j] * x_vec[j] - v_host[j] * x_vec[n_primal + m_rows] - + u_host[j] * x_vec[n_primal + m_rows + 1]; + EXPECT_NEAR(r1_host[j], expected, 1e-10) << "primal row " << j; + } + + EXPECT_NEAR(yexp_host[0], -eta_sq * x_vec[n_primal + m_rows] + dot_v, 1e-10); + EXPECT_NEAR(yexp_host[1], eta_sq * x_vec[n_primal + m_rows + 1] + dot_u, 1e-10); +} + +TEST(sparse_augmented_kkt, update_scaling_sparse_dim_1000) +{ + auto stream = rmm::cuda_stream_default; + + std::vector cone_dimensions{1000}; + rmm::device_uvector x(1000, stream); + rmm::device_uvector z(1000, stream); + + cone_data_t cones( + cone_dimensions, cuopt::make_span(x), cuopt::make_span(z), stream, /*soc_threshold=*/5); + + ASSERT_EQ(cones.n_sparse_cones, 1); + ASSERT_EQ(cones.n_sparse_cone_entries, 1000u); + ASSERT_EQ(cones.expansion_var_count(), 2); + + std::vector x_host(1000); + std::vector z_host(1000); + x_host[0] = 100.0; + z_host[0] = 80.0; + for (int j = 1; j < 1000; ++j) { + x_host[j] = 0.001 * ((j % 5) + 1); + z_host[j] = 0.0015 * ((j % 7) + 1); + } + + raft::copy(cones.x.data(), x_host.data(), x_host.size(), stream); + raft::copy(cones.z.data(), z_host.data(), z_host.size(), stream); + + launch_nt_scaling(cones, stream); + launch_update_scaling_sparse(cones, stream); + + auto d_host = cuopt::host_copy(cones.d, stream); + auto v_host = cuopt::host_copy(cones.sparse_v, stream); + auto u_host = cuopt::host_copy(cones.sparse_u, stream); + auto eta_host = cuopt::host_copy(cones.eta, stream); + + EXPECT_GT(d_host[0], 0.0); + EXPECT_GT(eta_host[0], 0.0); + for (int j = 0; j < 1000; ++j) { + EXPECT_TRUE(std::isfinite(v_host[j])) << "v entry " << j; + EXPECT_TRUE(std::isfinite(u_host[j])) << "u entry " << j; + } + + rmm::device_uvector Hs_diag(cones.n_sparse_cone_entries, stream); + launch_get_Hs_sparse(cones, Hs_diag, stream); + + const int n_primal = 1000; + const int m_rows = 1; + const int p = cones.expansion_var_count(); + const int sys_size = n_primal + m_rows + p; + + std::vector x_vec(sys_size, 0.0); + for (int j = 0; j < 1000; ++j) { + x_vec[j] = x_host[j]; + } + x_vec[n_primal + m_rows] = 0.25; + x_vec[n_primal + m_rows + 1] = -0.15; + + rmm::device_uvector d_x(sys_size, stream); + rmm::device_uvector d_r1(n_primal, stream); + rmm::device_uvector d_y_exp(p, stream); + + raft::copy(d_x.data(), x_vec.data(), sys_size, stream); + thrust::fill(rmm::exec_policy(stream), d_r1.begin(), d_r1.end(), 0.0); + thrust::fill(rmm::exec_policy(stream), d_y_exp.begin(), d_y_exp.end(), 0.0); + + launch_sparse_augmented_matvec(raft::device_span(d_x.data(), d_x.size()), + raft::device_span(d_r1.data(), d_r1.size()), + raft::device_span(d_y_exp.data(), d_y_exp.size()), + cones, + raft::device_span(Hs_diag.data(), Hs_diag.size()), + /*cone_var_start=*/0, + n_primal, + m_rows, + stream); + + auto r1_host = cuopt::host_copy(d_r1, stream); + auto yexp_host = cuopt::host_copy(d_y_exp, stream); + auto hs_host = cuopt::host_copy(Hs_diag, stream); + + const double eta_sq = eta_host[0] * eta_host[0]; + const double x_exp_v = x_vec[n_primal + m_rows]; + const double x_exp_u = x_vec[n_primal + m_rows + 1]; + double dot_v = 0.0; + double dot_u = 0.0; + for (int j = 0; j < 1000; ++j) { + dot_v += v_host[j] * x_vec[j]; + dot_u += u_host[j] * x_vec[j]; + const double expected = hs_host[j] * x_vec[j] - v_host[j] * x_exp_v - u_host[j] * x_exp_u; + EXPECT_NEAR(r1_host[j], expected, 1e-9) << "primal row " << j; + } + + EXPECT_NEAR(yexp_host[0], -eta_sq * x_exp_v + dot_v, 1e-9); + EXPECT_NEAR(yexp_host[1], eta_sq * x_exp_u + dot_u, 1e-9); +} + +} // namespace cuopt::linear_programming::dual_simplex::test From 4ed64ef5f011f44a998e260809bfb6ab8bdabc9a Mon Sep 17 00:00:00 2001 From: yuwenchen95 Date: Mon, 29 Jun 2026 07:04:28 -0700 Subject: [PATCH 2/2] Stricter IR tolerance for SOCP with sparse socs Signed-off-by: yuwenchen95 --- cpp/src/barrier/barrier.cu | 12 ++++++++---- cpp/src/barrier/iterative_refinement.hpp | 25 +++++++++++++++--------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 0a5377ea01..45b1feab4a 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -683,9 +683,11 @@ class iteration_data_t { i_t cone_end() const { return cone_start() + cone_entry_count(); } + bool has_sparse_cones() const { return has_cones() && cones_->n_sparse_cones > 0; } + i_t augmented_expansion_count() const { - return has_cones() && cones().has_sparse_cones() ? cones().expansion_var_count() : i_t(0); + return has_sparse_cones() ? cones().expansion_var_count() : i_t(0); } i_t augmented_system_size(i_t n, i_t m) const { return n + m + augmented_expansion_count(); } @@ -2563,7 +2565,8 @@ int barrier_solver_t::initial_point(iteration_data_t& data) } op(data); if (settings.barrier_iterative_refinement) { - iterative_refinement(op, rhs, soln); + const f_t ir_tol = data.has_sparse_cones() ? f_t(1e-12) : f_t(1e-8); + iterative_refinement(op, rhs, soln, ir_tol); } for (i_t k = 0; k < lp.num_cols; k++) { @@ -3196,8 +3199,9 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t(op, data.d_augmented_rhs_, data.d_augmented_soln_); + const f_t ir_tol = data.has_sparse_cones() ? f_t(1e-12) : f_t(1e-8); + const f_t solve_err = iterative_refinement( + op, data.d_augmented_rhs_, data.d_augmented_soln_, ir_tol); if (solve_err > 1e-1) { settings.log.printf("|| Aug (dx, dy) - aug_rhs || %e after IR\n", solve_err); } diff --git a/cpp/src/barrier/iterative_refinement.hpp b/cpp/src/barrier/iterative_refinement.hpp index 8ad0bb4c49..d7bc8276a7 100644 --- a/cpp/src/barrier/iterative_refinement.hpp +++ b/cpp/src/barrier/iterative_refinement.hpp @@ -88,7 +88,8 @@ f_t vector_norm2(const rmm::device_uvector& x) template f_t iterative_refinement_simple(T& op, const rmm::device_uvector& b, - rmm::device_uvector& x) + rmm::device_uvector& x, + f_t tol = 1e-8) { rmm::device_uvector x_sav(x, x.stream()); @@ -105,7 +106,7 @@ f_t iterative_refinement_simple(T& op, } rmm::device_uvector delta_x(x.size(), op.data_.handle_ptr->get_stream()); i_t iter = 0; - while (error > 1e-8 && iter < 30) { + while (error > tol && iter < 30) { thrust::fill(op.data_.handle_ptr->get_thrust_policy(), delta_x.data(), delta_x.data() + delta_x.size(), @@ -154,7 +155,8 @@ f_t iterative_refinement_simple(T& op, template f_t iterative_refinement_gmres(T& op, const rmm::device_uvector& b, - rmm::device_uvector& x) + rmm::device_uvector& x, + f_t tol = 1e-8) { // Parameters // Ideally, we do not need to restart here. But having restarts helps as a checkpoint to get @@ -162,7 +164,6 @@ f_t iterative_refinement_gmres(T& op, // are not converging after some point const int max_restarts = 3; const int m = 10; // Krylov space dimension - const f_t tol = 1e-8; rmm::device_uvector r(x.size(), x.stream()); rmm::device_uvector x_sav(x, x.stream()); @@ -188,7 +189,7 @@ f_t iterative_refinement_gmres(T& op, f_t norm_r = vector_norm_inf(r); if (show_info) { CUOPT_LOG_INFO("GMRES IR: initial residual = %e, |b| = %e", norm_r, bnorm); } - if (norm_r <= 1e-8) { return norm_r; } + if (norm_r <= tol) { return norm_r; } f_t residual = norm_r; f_t best_residual = norm_r; @@ -392,13 +393,16 @@ f_t iterative_refinement_gmres(T& op, } template -f_t iterative_refinement(T& op, const dense_vector_t& b, dense_vector_t& x) +f_t iterative_refinement(T& op, + const dense_vector_t& b, + dense_vector_t& x, + f_t tol = 1e-8) { rmm::device_uvector d_b(b.size(), op.data_.handle_ptr->get_stream()); raft::copy(d_b.data(), b.data(), b.size(), op.data_.handle_ptr->get_stream()); rmm::device_uvector d_x(x.size(), op.data_.handle_ptr->get_stream()); raft::copy(d_x.data(), x.data(), x.size(), op.data_.handle_ptr->get_stream()); - auto err = iterative_refinement_gmres(op, d_b, d_x); + auto err = iterative_refinement_gmres(op, d_b, d_x, tol); raft::copy(x.data(), d_x.data(), x.size(), op.data_.handle_ptr->get_stream()); @@ -407,9 +411,12 @@ f_t iterative_refinement(T& op, const dense_vector_t& b, dense_vector_ } template -f_t iterative_refinement(T& op, const rmm::device_uvector& b, rmm::device_uvector& x) +f_t iterative_refinement(T& op, + const rmm::device_uvector& b, + rmm::device_uvector& x, + f_t tol = 1e-8) { - return iterative_refinement_gmres(op, b, x); + return iterative_refinement_gmres(op, b, x, tol); } } // namespace cuopt::mathematical_optimization::barrier