diff --git a/ci/build_wheel_libcuopt.sh b/ci/build_wheel_libcuopt.sh
index faa28c147b..7a15684f5d 100755
--- a/ci/build_wheel_libcuopt.sh
+++ b/ci/build_wheel_libcuopt.sh
@@ -17,17 +17,34 @@ fi
 # Install Boost and TBB
 bash ci/utils/install_boost_tbb.sh
 
-# Install libuuid (needed by cuopt_grpc_server)
+# Install libuuid and LLVM's OpenMP runtime
 if command -v dnf &> /dev/null; then
-    dnf install -y libuuid-devel
+    # LLVM Toolset is distributed as a module on Rocky/RHEL 8.
+    dnf module install -y llvm-toolset
+    dnf install -y libuuid-devel libomp-devel
 elif command -v apt-get &> /dev/null; then
-    apt-get update && apt-get install -y uuid-dev
+    apt-get update
+    apt-get install -y uuid-dev libomp-dev
 fi
 
 # Install Protobuf + gRPC (protoc + grpc_cpp_plugin)
 bash ci/utils/install_protobuf_grpc.sh
 
-export SKBUILD_CMAKE_ARGS="-DCUOPT_BUILD_WHEELS=ON;-DDISABLE_DEPRECATION_WARNING=ON"
+# Compile with GCC, but use LLVM libomp as the OpenMP runtime bundled in the wheel. Resolve the
+# versioned ELF library rather than an unversioned linker script or compiler-toolset indirection.
+LIBOMP_LIBRARY="$(
+    ldconfig -p |
+        awk '$1 ~ /^libomp\.so(\.[0-9]+)*$/ && !library { library = $NF }
+             END { print library }'
+)"
+if [[ "${LIBOMP_LIBRARY}" != /* || ! -f "${LIBOMP_LIBRARY}" ]]; then
+    echo "Could not resolve the LLVM OpenMP runtime: '${LIBOMP_LIBRARY}'" >&2
+    exit 1
+fi
+
+echo "Using LLVM OpenMP runtime: ${LIBOMP_LIBRARY}"
+# Keep the existing wheel flags; OpenMP_gomp_LIBRARY makes FindOpenMP link libomp instead of libgomp.
+export SKBUILD_CMAKE_ARGS="-DCUOPT_BUILD_WHEELS=ON;-DDISABLE_DEPRECATION_WARNING=ON;-DOpenMP_gomp_LIBRARY:FILEPATH=${LIBOMP_LIBRARY}"
 
 # OpenSSL 3 hints for libcuopt's own find_package(OpenSSL).
 #
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 44f9caf2cb..98dc5c152d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -186,7 +186,6 @@ list(APPEND CUOPT_CUDA_FLAGS -Xfatbin=-compress-all)
 if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9 AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.0)
     list(APPEND CUOPT_CUDA_FLAGS -Xfatbin=--compress-level=3)
 endif ()
-list(APPEND CUOPT_CUDA_FLAGS -fopenmp)
 
 # Add jobserver flags for parallel compilation if PARALLEL_LEVEL is set
 if (PARALLEL_LEVEL AND NOT "${PARALLEL_LEVEL}" STREQUAL "")
@@ -198,13 +197,14 @@ if (PARALLEL_LEVEL AND NOT "${PARALLEL_LEVEL}" STREQUAL "")
     endif ()
 endif ()
 
-# The MIP solver requires OpenMP to work
-find_package(OpenMP REQUIRED)
+# The MIP solver requires OpenMP for both C++ and CUDA host code.
+find_package(OpenMP REQUIRED COMPONENTS CXX CUDA)
 message(VERBOSE "cuOpt: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}")
 
-# MPS/QPS parser supports compressed inputs via bzip2 and zlib
+# MPS/QPS parser supports compressed inputs via bzip2, zlib and lz4
 option(CUOPT_PARSER_WITH_BZIP2 "Build MPS parser with bzip2 decompression" ON)
 option(CUOPT_PARSER_WITH_ZLIB "Build MPS parser with zlib decompression" ON)
+option(CUOPT_PARSER_WITH_LZ4 "Build experimental fast MPS parser with LZ4 decompression" ON)
 if (CUOPT_PARSER_WITH_BZIP2)
     find_package(BZip2 REQUIRED)
     add_compile_definitions(MPS_PARSER_WITH_BZIP2)
@@ -213,6 +213,10 @@ if (CUOPT_PARSER_WITH_ZLIB)
     find_package(ZLIB REQUIRED)
     add_compile_definitions(MPS_PARSER_WITH_ZLIB)
 endif ()
+if (CUOPT_PARSER_WITH_LZ4)
+    # No headers or link target needed; the experimental reader loads one liblz4 symbol at runtime.
+    add_compile_definitions(MPS_PARSER_WITH_LZ4)
+endif ()
 
 # Debug options
 if (CMAKE_BUILD_TYPE MATCHES Debug)
@@ -250,6 +254,20 @@ else ()
     find_package(RAFT REQUIRED)
 endif ()
 
+rapids_cpm_find(simde 0.8.2
+        CPM_ARGS
+        GIT_REPOSITORY https://github.com/simd-everywhere/simde.git
+        GIT_TAG v0.8.2
+        GIT_SHALLOW TRUE
+        DOWNLOAD_ONLY TRUE
+)
+# simde is used header-only: expose its checkout root as the include path of simde::simde.
+if (NOT TARGET simde::simde)
+    add_library(simde::simde INTERFACE IMPORTED GLOBAL)
+    set_target_properties(simde::simde
+            PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${simde_SOURCE_DIR}")
+endif ()
+
 FetchContent_Declare(
         papilo
         GIT_REPOSITORY "https://github.com/scipopt/papilo.git"
@@ -436,6 +454,7 @@ if (BUILD_TESTS)
 endif ()
 
 set(CUOPT_SRC_FILES)
+set(MPS_FAST_SRC_FILES)
 add_subdirectory(src)
 
 # nvcc 13.0.3 ICE (signal 11) compiling sliding_window.cu with 7 GPU architectures;
@@ -445,14 +464,24 @@ set_source_files_properties(
     PROPERTIES COMPILE_OPTIONS "--split-compile=0")
 
 if (HOST_LINEINFO)
-    set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1")
+    set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1")
+endif ()
+
+# Fast MPS parser sources only: enable BMI2/AVX2/SSE4.2 (all part of x86-64-v3, roughly Haswell 2013+)
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$" AND
+        CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$")
+    set_property(SOURCE ${MPS_FAST_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+            APPEND PROPERTY COMPILE_OPTIONS "-mbmi2;-mavx2;-msse4.2")
+endif ()
 
+# TODO: figure out a set of flags for ARM that fits the range of CPUs we wish to support (neoverse?)
+# NEON should be universal on aarch64 and enough for our purposes (parsing) though
+
 # Apply -UNDEBUG only to solver source files (not gRPC infrastructure).
 # Must happen before gRPC files are appended to CUOPT_SRC_FILES.
 # Uses APPEND to preserve any existing per-file options (e.g. -g1 from HOST_LINEINFO).
 if (DEFINE_ASSERT)
-    set_property(SOURCE ${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR}
+    set_property(SOURCE ${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
             APPEND PROPERTY COMPILE_OPTIONS "-UNDEBUG")
 endif ()
 
@@ -477,7 +506,7 @@ if (NOT SKIP_GRPC_BUILD)
     # The conda-forge abseil shared library is built with NDEBUG and does not
     # export that symbol (abseil-cpp#1624).  Without this, Debug builds fail
     # at runtime with "undefined symbol: absl::…::Mutex::Dtor".
-    set_property(SOURCE ${GRPC_INFRA_FILES} DIRECTORY ${CMAKE_SOURCE_DIR}
+    set_property(SOURCE ${GRPC_INFRA_FILES} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
             APPEND PROPERTY COMPILE_OPTIONS "-DNDEBUG")
 endif (NOT SKIP_GRPC_BUILD)
 
@@ -603,11 +632,13 @@ target_link_libraries(cuopt
         ${CUDSS_LIB_FILE}
         PRIVATE
         ${CUOPT_PRIVATE_CUDA_LIBS}
+        simde::simde
+        OpenMP::OpenMP_CXX
+        OpenMP::OpenMP_CUDA
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:protobuf::libprotobuf>
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:gRPC::grpc++>
 )
 
-
 # ##################################################################################################
 # - generate tests --------------------------------------------------------------------------------
 if (BUILD_TESTS)
diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp
index bf16e9e9da..bbc9cfd4d8 100644
--- a/cpp/cuopt_cli.cpp
+++ b/cpp/cuopt_cli.cpp
@@ -90,11 +90,13 @@ inline cuopt::init_logger_t dummy_logger(
  *                  .mps/.qps and their .gz/.bz2 variants → MPS parser;
  *                  anything else is rejected.
  * @param initial_solution_file Path to initial solution file in SOL format
+ * @param mps_reader MPS reader implementation selected by the CLI
  * @param settings Merged solver settings (config file loaded in main, then CLI overrides applied)
  */
 int run_single_file(const std::string& file_path,
                     const std::string& initial_solution_file,
                     bool solve_relaxation,
+                    cuopt::mathematical_optimization::io::mps_reader_type_t mps_reader,
                     cuopt::mathematical_optimization::solver_settings_t<int, double>& settings)
 {
   cuopt::init_logger_t log(settings.get_parameter<std::string>(CUOPT_LOG_FILE),
@@ -108,7 +110,8 @@ int run_single_file(const std::string& file_path,
   {
     CUOPT_LOG_INFO("Reading file %s", base_filename.c_str());
     try {
-      mps_data_model = cuopt::mathematical_optimization::io::read<int, double>(file_path);
+      mps_data_model =
+        cuopt::mathematical_optimization::io::read<int, double>(file_path, mps_reader);
     } catch (const std::logic_error& e) {
       CUOPT_LOG_ERROR("Parser exception: %s", e.what());
       parsing_failed = true;
@@ -287,8 +290,8 @@ int main(int argc, char* argv[])
   program.add_argument("filename")
     .help(
       "input problem file; format dispatched by extension (case-insensitive). "
-      "Supported: .lp, .mps, .qps and their .gz / .bz2 compressed variants "
-      "(e.g. .lp.gz, .mps.bz2, .qps.gz)")
+      "Supported: .lp, .mps, .qps and their .gz / .bz2 / .lz4 compressed variants "
+      "(e.g. .lp.gz, .mps.bz2, .qps.lz4).")
     .nargs(1)
     .required();
 
@@ -306,6 +309,14 @@ int main(int argc, char* argv[])
     .help("path to parameter config file (key = value format, supports all parameters)")
     .default_value(std::string(""));
 
+  program.add_argument("--mps-reader")
+    .help(
+      "MPS reader implementation: default uses the production parser; experimental-fast uses the "
+      "experimental SIMD parser for free-format LP/MIP/QP/QCQP (SOCP) .mps/.qps files and their "
+      ".gz/.bz2/.lz4 compressed variants")
+    .default_value(std::string("default"))
+    .choices("default", "experimental-fast");
+
   program.add_argument("--dump-hyper-params")
     .help("print hyper-parameters only in config file format and exit")
     .default_value(false)
@@ -406,6 +417,12 @@ int main(int argc, char* argv[])
   const auto initial_solution_file = program.get<std::string>("--initial-solution");
   const auto solve_relaxation      = program.get<bool>("--relaxation");
   const auto params_file           = program.get<std::string>("--params-file");
+  const auto mps_reader_arg        = program.get<std::string>("--mps-reader");
+
+  auto mps_reader = cuopt::mathematical_optimization::io::mps_reader_type_t::default_reader;
+  if (mps_reader_arg == "experimental-fast") {
+    mps_reader = cuopt::mathematical_optimization::io::mps_reader_type_t::fast_experimental;
+  }
 
   cuopt::mathematical_optimization::solver_settings_t<int, double> settings;
   try {
@@ -435,5 +452,5 @@ int main(int argc, char* argv[])
     RAFT_CUDA_TRY(cudaSetDevice(0));
   }
 
-  return run_single_file(file_name, initial_solution_file, solve_relaxation, settings);
+  return run_single_file(file_name, initial_solution_file, solve_relaxation, mps_reader, settings);
 }
diff --git a/cpp/include/cuopt/mathematical_optimization/io/parser.hpp b/cpp/include/cuopt/mathematical_optimization/io/parser.hpp
index bbdaa1a819..a9348c9adb 100644
--- a/cpp/include/cuopt/mathematical_optimization/io/parser.hpp
+++ b/cpp/include/cuopt/mathematical_optimization/io/parser.hpp
@@ -11,17 +11,26 @@
 
 #include <algorithm>
 #include <cctype>
+#include <cstring>
 #include <stdexcept>
 #include <string>
 #include <string_view>
 
 namespace cuopt::mathematical_optimization::io {
 
+/**
+ * @brief Selects which MPS reader implementation should be used by dispatching entry points.
+ *
+ * The experimental fast reader is intentionally opt-in. It supports the same free-format
+ * MPS/QPS scope as read_mps(): LP, MIP, QP (QUADOBJ/QMATRIX), and QCQP/SOCP (QCMATRIX).
+ */
+enum class mps_reader_type_t { default_reader, fast_experimental };
+
 /**
  * @brief Reads the equation from an MPS or QPS file.
  *
  * The input file can be a plain text file in MPS-/QPS-format or a compressed MPS/QPS
- * file (.mps.gz or .mps.bz2).
+ * file (.mps.gz, .mps.bz2, or .mps.lz4).
  *
  * Read this link http://lpsolve.sourceforge.net/5.5/mps-format.htm for more
  * details on both free and fixed MPS format.
@@ -32,8 +41,8 @@ namespace cuopt::mathematical_optimization::io {
  * - QMATRIX: Full symmetric quadratic objective matrix (alternative to QUADOBJ)
  * - QCMATRIX: Symmetric quadratic terms for a named constraint row (QCQP)
  *
- * Note: Compressed MPS files .mps.gz, .mps.bz2 can only be read if the compression
- * libraries zlib or libbzip2 are installed, respectively.
+ * Note: Compressed MPS files .mps.gz, .mps.bz2, and .mps.lz4 can only be read if
+ * zlib, libbzip2, or liblz4 are installed, respectively.
  *
  * @param[in] mps_file_path Path to MPS/QPSfile.
  * @param[in] fixed_mps_format If MPS/QPS file should be parsed as fixed, false by default
@@ -43,6 +52,19 @@ template <typename i_t, typename f_t>
 mps_data_model_t<i_t, f_t> read_mps(const std::string& mps_file_path,
                                     bool fixed_mps_format = false);
 
+/**
+ * @brief Reads an MPS/QPS problem with the experimental SIMD-optimized reader.
+ *
+ * Supports the same free-format LP/MIP/QP/QCQP (SOCP-relevant QCMATRIX) scope as read_mps().
+ * Fixed MPS format forcing is not supported. Accepts .mps/.qps and their .gz/.bz2/.lz4 variants
+ * (compression is detected from the file path, same as read_mps()).
+ *
+ * @param[in] mps_file_path Path to a raw or compressed .mps or .qps file.
+ * @return mps_data_model_t A fully formed LP/MIP/QP problem which represents the given file.
+ */
+template <typename i_t, typename f_t>
+mps_data_model_t<i_t, f_t> read_mps_fast_experimental(const std::string& mps_file_path);
+
 /**
  * @brief Reads an MPS problem from in-memory file contents.
  *
@@ -111,38 +133,72 @@ mps_data_model_t<i_t, f_t> read_lp_from_string(std::string_view lp_contents);
  * @brief Reads an optimization problem from a file, dispatching on the file
  *        extension. Extension matching is case-insensitive.
  *
- * Routing:
- *   - .mps, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2 → read_mps()
- *   - .lp,  .lp.gz,  .lp.bz2                            → read_lp()
+ * Routing (case-insensitive extensions):
+ *   - .mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4
+ *     → read_mps() when mps_reader == default_reader, or read_mps_fast_experimental()
+ *       when mps_reader == fast_experimental (fixed_mps_format must be false)
+ *   - .lp,  .lp.gz,  .lp.bz2, .lp.lz4 → read_lp()
  *   - anything else → std::logic_error
  *
  * This is the entry point of choice for user-facing tools (CLI, C API) that
  * want both formats to "just work" without an explicit format flag.
  *
  * @param[in] path Path to the input file.
+ * @param[in] mps_reader Selects the MPS reader implementation for MPS/QPS inputs.
  * @param[in] fixed_mps_format If the MPS/QPS reader should use fixed format;
  *             ignored for LP inputs. False by default.
  * @return mps_data_model_t The parsed problem.
  */
 template <typename i_t, typename f_t>
-inline mps_data_model_t<i_t, f_t> read(const std::string& path, bool fixed_mps_format = false)
+inline mps_data_model_t<i_t, f_t> read(const std::string& path,
+                                       mps_reader_type_t mps_reader,
+                                       bool fixed_mps_format = false)
 {
   std::string lower(path);
   std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) {
     return static_cast<char>(std::tolower(c));
   });
-  if (lower.ends_with(".mps") || lower.ends_with(".mps.gz") || lower.ends_with(".mps.bz2") ||
-      lower.ends_with(".qps") || lower.ends_with(".qps.gz") || lower.ends_with(".qps.bz2")) {
-    return read_mps<i_t, f_t>(path, fixed_mps_format);
+  for (const char* compression_suffix : {".bz2", ".gz", ".lz4"}) {
+    if (lower.ends_with(compression_suffix)) {
+      lower.resize(lower.size() - std::strlen(compression_suffix));
+      break;
+    }
   }
-  if (lower.ends_with(".lp") || lower.ends_with(".lp.gz") || lower.ends_with(".lp.bz2")) {
-    return read_lp<i_t, f_t>(path);
+  if (lower.ends_with(".mps") || lower.ends_with(".qps")) {
+    if (mps_reader == mps_reader_type_t::fast_experimental) {
+      if (fixed_mps_format) {
+        throw std::logic_error(
+          "experimental fast MPS reader does not support fixed MPS format forcing");
+      }
+      return read_mps_fast_experimental<i_t, f_t>(path);
+    }
+    return read_mps<i_t, f_t>(path, fixed_mps_format);
   }
+  if (lower.ends_with(".lp")) { return read_lp<i_t, f_t>(path); }
   throw std::logic_error(
     "read: unrecognized input file extension. Supported (case-insensitive): "
-    ".mps, .mps.gz, .mps.bz2, .qps, .qps.gz, .qps.bz2, .lp, .lp.gz, .lp.bz2. "
+    ".mps, .mps.gz, .mps.bz2, .mps.lz4, .qps, .qps.gz, .qps.bz2, .qps.lz4, "
+    ".lp, .lp.gz, .lp.bz2, .lp.lz4. "
     "Given path: " +
     path);
 }
 
+/**
+ * @brief Reads an optimization problem from a file, dispatching on the file
+ *        extension. Extension matching is case-insensitive.
+ *
+ * Convenience overload: forwards to the 3-argument read() with
+ * mps_reader_type_t::default_reader; see that overload for routing and extensions.
+ *
+ * @param[in] path Path to the input file.
+ * @param[in] fixed_mps_format If the MPS/QPS reader should use fixed format;
+ *             ignored for LP inputs. False by default.
+ * @return mps_data_model_t The parsed problem.
+ */
+template <typename i_t, typename f_t>
+inline mps_data_model_t<i_t, f_t> read(const std::string& path, bool fixed_mps_format = false)
+{
+  return read<i_t, f_t>(path, mps_reader_type_t::default_reader, fixed_mps_format);
+}
+
 }  // namespace cuopt::mathematical_optimization::io
diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt
index 928f1db76c..e8737cf6da 100644
--- a/cpp/src/CMakeLists.txt
+++ b/cpp/src/CMakeLists.txt
@@ -26,3 +26,4 @@ add_subdirectory(branch_and_bound)
 add_subdirectory(cuts)
 
 set(CUOPT_SRC_FILES ${CUOPT_SRC_FILES} ${UTIL_SRC_FILES} PARENT_SCOPE)
+set(MPS_FAST_SRC_FILES ${MPS_FAST_SRC_FILES} PARENT_SCOPE)
diff --git a/cpp/src/io/CMakeLists.txt b/cpp/src/io/CMakeLists.txt
index cc4affa890..cafcffb23f 100644
--- a/cpp/src/io/CMakeLists.txt
+++ b/cpp/src/io/CMakeLists.txt
@@ -3,6 +3,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # cmake-format: on
 
+set(MPS_FAST_SRC_FILES
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/file_reader.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/lz4_file_reader.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/mps_section_scanner.cpp
+)
+
 set(PARSERS_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/data_model_view.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/file_to_string.cpp
@@ -13,6 +20,8 @@ set(PARSERS_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/parser.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/writer.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/utilities/cython_parser.cpp
+  ${MPS_FAST_SRC_FILES}
 )
 
 set(CUOPT_SRC_FILES ${CUOPT_SRC_FILES} ${PARSERS_SRC_FILES} PARENT_SCOPE)
+set(MPS_FAST_SRC_FILES ${MPS_FAST_SRC_FILES} PARENT_SCOPE)
diff --git a/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
new file mode 100644
index 0000000000..4f99c1cab9
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_fp64_parser.hpp
@@ -0,0 +1,436 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <utilities/error.hpp>
+
+#include <array>
+#include <bit>
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <stdexcept>
+#include <string_view>
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+using cuopt::mathematical_optimization::io::error_type_t;
+using cuopt::mathematical_optimization::io::mps_parser_expects;
+using cuopt::mathematical_optimization::io::mps_parser_fail;
+
+namespace fp64 {
+
+#define FASTP64_MIN_EXP_10    (-307)  // smallest decimal exponent present in the power-of-ten LUT
+#define FASTP64_MAX_EXP_10    288     // largest decimal exponent present in the power-of-ten LUT
+#define FASTP64_POWER_COUNT   (FASTP64_MAX_EXP_10 - FASTP64_MIN_EXP_10 + 1)  // LUT entry count
+#define FASTP64_MANTISSA_MASK ((uint64_t{1} << 52) - 1)  // low 52 bits: FP64 fraction field
+#define FASTP64_EXPONENT_MASK 0x7FF  // biased exponents >= this are rejected by eisel_lemire()
+#define FASTP64_HALF_MASK     0x1FF  // low 9 bits of the 64-bit product high word
+
+// Fast FP64 parser optimized for the <=19digits case, based on the Eisel-Lemire algorithm
+// see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51
+// (8), 2021.
+// verified on a large corpus of FP64 values: https://github.com/lemire/simple_fastfloat_benchmark
+
+struct power_10_lut_entry_t {
+  uint64_t high;  // most significant 64 bits of the normalized power of ten
+  uint64_t low;   // next 64 bits, read only when eisel_lemire() needs the wider product
+  int biased_e2;  // binary exponent with the offset applied by make_power()
+};
+
+// Minimal 256-bit unsigned integer (four 64-bit limbs, limb[0] least significant), usable in
+// constexpr code to build the Eisel-Lemire lookup table.
+struct cuopt_uint256_t {
+  std::array<uint64_t, 4> limb{};
+  // Multiplies in place by m; returns the part of the product that overflows the top limb.
+  constexpr uint32_t mul_u32(uint32_t m)
+  {
+    unsigned __int128 carry = 0;
+    for (uint64_t& v : limb) {
+      unsigned __int128 x = (unsigned __int128)v * m + carry;
+      v                   = (uint64_t)x;
+      carry               = x >> 64;
+    }
+    return (uint32_t)carry;
+  }
+  // Returns *this << bits; presumably callers pass 0 <= bits < 64 (TODO confirm).
+  constexpr cuopt_uint256_t shl_small(int bits) const
+  {
+    cuopt_uint256_t out;
+    if (bits == 0) return *this;  // avoids the 64-bit shift by (64 - 0) below
+    for (int i = 3; i >= 0; --i) {
+      uint64_t v = limb[i] << bits;
+      if (i > 0) v |= limb[i - 1] >> (64 - bits);
+      out.limb[i] = v;
+    }
+    return out;
+  }
+};
+
+struct cuopt_normalized_uint256_t {
+  cuopt_uint256_t sig;  // significand; one()/mul10()/div10() keep its top bit set
+  int exp2 = 0;         // represented value is sig * 2^exp2
+  // The value 1: sig = 2^255 and exp2 = -255.
+  static constexpr cuopt_normalized_uint256_t one()
+  {
+    cuopt_normalized_uint256_t x;
+    x.sig.limb[3] = uint64_t{1} << 63;
+    x.exp2        = -255;
+    return x;
+  }
+  // Multiplies by 10, then shifts right by the carry width so sig fits in 256 bits again.
+  constexpr void mul10()
+  {
+    uint32_t carry = sig.mul_u32(10);
+    int shift      = 32 - std::countl_zero(carry);
+    // The normalized 256-bit value always overflows into carry after *10; keep
+    // the guard explicit because the cross-limb path shifts by 64 - shift.
+    if (shift == 0) { return; }
+    cuopt_uint256_t out;
+    for (int i = 0; i < 4; ++i) {
+      uint64_t lower = sig.limb[i] >> shift;
+      uint64_t upper = 0;
+      if (i + 1 < 4) {
+        upper = sig.limb[i + 1] << (64 - shift);
+      } else {
+        upper = (uint64_t)carry << (64 - shift);
+      }
+      out.limb[i] = lower | upper;
+    }
+    sig = out;
+    exp2 += shift;
+  }
+  // Divides by 10 (truncating) after pre-shifting sig left by 3 or 4 bits to stay normalized.
+  constexpr void div10()
+  {
+    constexpr uint64_t div10_shift_4_threshold = 0xA000000000000000ULL;
+    int shift                                  = sig.limb[3] < div10_shift_4_threshold ? 4 : 3;
+    uint64_t extra                             = sig.limb[3] >> (64 - shift);
+    cuopt_uint256_t shifted                    = sig.shl_small(shift);
+    // Long division of (extra : shifted) by 10, most significant limb first.
+    cuopt_uint256_t quotient;
+    unsigned __int128 rem = extra;
+    for (int i = 3; i >= 0; --i) {
+      unsigned __int128 cur = (rem << 64) | shifted.limb[i];
+      quotient.limb[i]      = (uint64_t)(cur / 10);
+      rem                   = cur % 10;
+    }
+    sig = quotient;
+    exp2 -= shift;
+  }
+};
+
+constexpr power_10_lut_entry_t make_power(const cuopt_normalized_uint256_t& p)
+{
+  int e2 = p.exp2 + 192;  // exponent once the low 192 bits of sig are dropped
+  return {p.sig.limb[3], p.sig.limb[2], 1150 + e2};  // 1150 is presumably 1023 + 127 (confirm)
+}
+
+// build time LUT for the lemire trick: entry [e - FASTP64_MIN_EXP_10] describes 10^e
+constexpr std::array<power_10_lut_entry_t, FASTP64_POWER_COUNT> make_power_table()
+{
+  std::array<power_10_lut_entry_t, FASTP64_POWER_COUNT> table{};
+  cuopt_normalized_uint256_t p = cuopt_normalized_uint256_t::one();
+  table[-FASTP64_MIN_EXP_10]   = make_power(p);
+  // Positive exponents: repeated multiplication by 10, starting from 1.
+  for (int e = 1; e <= FASTP64_MAX_EXP_10; ++e) {
+    p.mul10();
+    table[e - FASTP64_MIN_EXP_10] = make_power(p);
+  }
+  // Negative exponents: repeated truncating division by 10, restarting from 1.
+  p = cuopt_normalized_uint256_t::one();
+  for (int e = -1; e >= FASTP64_MIN_EXP_10; --e) {
+    p.div10();
+    table[e - FASTP64_MIN_EXP_10] = make_power(p);
+  }
+  return table;
+}
+
+inline constexpr auto fast_fp64_parse_lut = make_power_table();
+
+inline constexpr std::array<double, 23> small_powers = {
+  1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,  1e8,  1e9,  1e10, 1e11,
+  1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};
+
+inline constexpr std::array<uint64_t, 16> small_integer_powers = {1ULL,
+                                                                  10ULL,
+                                                                  100ULL,
+                                                                  1000ULL,
+                                                                  10000ULL,
+                                                                  100000ULL,
+                                                                  1000000ULL,
+                                                                  10000000ULL,
+                                                                  100000000ULL,
+                                                                  1000000000ULL,
+                                                                  10000000000ULL,
+                                                                  100000000000ULL,
+                                                                  1000000000000ULL,
+                                                                  10000000000000ULL,
+                                                                  100000000000000ULL,
+                                                                  1000000000000000ULL};
+
+struct parsed_decimal_t {
+  bool negative      = false;  // a leading '-' was seen
+  bool fast_eligible = false;  // every significant digit fit into mantissa
+  uint64_t mantissa  = 0;      // significant decimal digits accumulated as an integer
+  int exp10          = 0;      // explicit exponent minus the number of fractional digits
+};
+
+static inline bool is_digit(char c) noexcept { return c >= '0' && c <= '9'; }
+
+// SWAR 8char run of digits -> integer representation; returns false unless all 8 bytes are digits
+// better and more portable than AVX2 stuff since AVX2 doesn't like swizzling across 16B lanes
+// saw no real difference w/ 16B SSE
+static inline bool parse_8_digits(const char* p, uint32_t& out)
+{
+  // comply with strict aliasing rules; reads 8 bytes, callers check end - p >= 8 first
+  std::array<char, sizeof(uint64_t)> bytes{};
+  std::memcpy(bytes.data(), p, bytes.size());
+  uint64_t raw       = std::bit_cast<uint64_t>(bytes);
+  uint64_t high      = raw & 0xF0F0F0F0F0F0F0F0ULL;
+  uint64_t low_check = (raw + 0x0606060606060606ULL) & 0xF0F0F0F0F0F0F0F0ULL;
+  if (high != 0x3030303030303030ULL || low_check != 0x3030303030303030ULL) { return false; }
+  // Pairwise merge (x10, x100, x10000); relies on little-endian byte order of raw.
+  uint64_t v     = raw - 0x3030303030303030ULL;
+  uint64_t pairs = (v * 10 + (v >> 8)) & 0x00FF00FF00FF00FFULL;
+  uint64_t quads = (pairs * 100 + (pairs >> 16)) & 0x0000FFFF0000FFFFULL;
+  out            = (uint32_t)((quads * 10000 + (quads >> 32)) & 0xFFFFFFFFULL);
+  return true;
+}
+
+static inline void parse_u64_digits_advance(const char*& p, const char* end, uint64_t& out)
+{
+  while (p < end && is_digit(*p)) {  // appends digits to out; overflow of out is not detected
+    if (end - p >= 8) {  // try the 8-digits-at-once path first
+      uint32_t chunk = 0;
+      if (parse_8_digits(p, chunk)) {
+        out = out * 100000000ULL + (uint64_t)chunk;
+        p += 8;
+        continue;
+      }
+    }
+    out = out * 10 + (uint64_t)(*p - '0');
+    ++p;
+  }
+}
+
+static inline void scan_digit_run(const char*& p,
+                                  const char* end,
+                                  bool after_dot,
+                                  parsed_decimal_t& out,
+                                  bool& saw_digit,
+                                  int& frac_digits,
+                                  int& sig_digits,
+                                  bool& too_many_digits)
+{
+  while (p < end) {  // consume one run of decimal digits, 8 at a time when possible
+    uint32_t chunk = 0;
+    if (end - p >= 8 && parse_8_digits(p, chunk)) {
+      saw_digit = true;
+      if (after_dot) frac_digits += 8;
+      // Once the 19-digit budget is exceeded, digits are still consumed but no longer accumulated.
+      if (!too_many_digits) {
+        if (sig_digits == 0 && chunk == 0) {  // leading zeros carry no significance
+          p += 8;
+          continue;
+        }
+        // A whole chunk is charged as 8 significant digits, even if it starts with zeros.
+        if (sig_digits + 8 <= 19) {
+          out.mantissa = out.mantissa * 100000000ULL + chunk;
+          sig_digits += 8;
+        } else {
+          too_many_digits = true;
+        }
+      }
+
+      p += 8;
+      continue;
+    }
+    // Fewer than 8 digits ahead: handle a single character, stopping at the first non-digit.
+    if (!is_digit(*p)) return;
+    saw_digit = true;
+    int digit = *p - '0';
+    if (after_dot) ++frac_digits;
+    if (!too_many_digits && (digit != 0 || sig_digits != 0)) {
+      if (sig_digits < 19) {
+        out.mantissa = (out.mantissa * 10) + (uint64_t)digit;
+        ++sig_digits;
+      } else {
+        too_many_digits = true;
+      }
+    }
+    ++p;
+  }
+}
+
+static inline bool parse_decimal_advance(const char*& p, const char* end, parsed_decimal_t& out)
+{
+  if (p < end && (*p == '-' || *p == '+')) {
+    out.negative = *p == '-';
+    ++p;
+  }
+  // Digits before and after an optional '.' are folded into one integer mantissa.
+  bool saw_digit       = false;
+  int frac_digits      = 0;
+  int sig_digits       = 0;
+  bool too_many_digits = false;
+
+  scan_digit_run(p, end, false, out, saw_digit, frac_digits, sig_digits, too_many_digits);
+  if (p < end && *p == '.') {
+    ++p;
+    scan_digit_run(p, end, true, out, saw_digit, frac_digits, sig_digits, too_many_digits);
+  }
+
+  if (!saw_digit) return false;
+  // Optional exponent; 'd'/'D' markers are accepted exactly like 'e'/'E'.
+  int explicit_exp = 0;
+  if (p < end && (*p == 'e' || *p == 'E' || *p == 'd' || *p == 'D')) {
+    const char* exp_start = p;
+    ++p;
+    bool exp_negative = false;
+    if (p < end && (*p == '-' || *p == '+')) {
+      exp_negative = *p == '-';
+      ++p;
+    }
+    if (p == end || !is_digit(*p)) {
+      p = exp_start;  // no exponent digits: leave the marker unconsumed
+    } else {
+      int exp_value = 0;
+      while (p < end && is_digit(*p)) {
+        if (exp_value < 1000000) exp_value = exp_value * 10 + (*p - '0');  // saturate
+        ++p;
+      }
+      explicit_exp = exp_negative ? -exp_value : exp_value;
+    }
+  }
+  // Fractional digits lower the exponent: the parsed value is mantissa * 10^exp10.
+  out.exp10         = explicit_exp - frac_digits;
+  out.fast_eligible = !too_many_digits;
+  return true;
+}
+
+// fallback to stdlib for edge case or ambiguous roundings (very rare); fails on invalid tokens
+static inline double fallback_strtod(std::string_view s)
+{
+  char stack_buf[32];
+  // The MPS specs mandate that numeric tokens are not longer than 25 characters
+  if (s.size() >= sizeof(stack_buf)) {
+    mps_parser_fail(error_type_t::ValidationError, "MPS numeric token exceeds supported length");
+  }
+  std::memcpy(stack_buf, s.data(), s.size());
+  stack_buf[s.size()] = '\0';
+  for (size_t i = 0; i < s.size(); ++i) {
+    if (stack_buf[i] == 'd' || stack_buf[i] == 'D') stack_buf[i] = 'e';
+  }
+  // With an empty token strtod converts nothing, yet parse_end equals the expected end.
+  char* parse_end = nullptr;
+  errno           = 0;
+  double value    = std::strtod(stack_buf, &parse_end);
+  if (parse_end == stack_buf || parse_end != stack_buf + s.size() || errno == ERANGE) {
+    mps_parser_fail(error_type_t::ValidationError, "Invalid or out-of-range MPS numeric token");
+  }
+  return value;
+}
+
+// see Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51
+// (8), 2021. Writes the FP64 magnitude bits of man * 10^exp10; false = caller must fall back.
+static inline bool eisel_lemire(uint64_t man, int exp10, uint64_t& bits)
+{
+  if (exp10 < FASTP64_MIN_EXP_10 || exp10 > FASTP64_MAX_EXP_10) { return false; }
+  // man must be non-zero: countl_zero(0) is 64 and the shift below would then be undefined.
+  const power_10_lut_entry_t p = fast_fp64_parse_lut[exp10 - FASTP64_MIN_EXP_10];
+  int lz                       = std::countl_zero(man);
+  uint64_t norm                = man << lz;
+  int adj_e2                   = p.biased_e2 - lz;
+  // 128-bit product of the normalized mantissa and the top 64 bits of the power of ten.
+  unsigned __int128 product = (unsigned __int128)norm * p.high;
+  uint64_t hi               = (uint64_t)(product >> 64);
+  uint64_t lo               = (uint64_t)product;
+
+  // If the high product lands near the 9-bit halfway window, include the low
+  // 64x64 product to disambiguate rounding before deciding whether to fallback.
+  if ((hi & FASTP64_HALF_MASK) == FASTP64_HALF_MASK && lo + norm < norm) {
+    unsigned __int128 low_product = (unsigned __int128)norm * p.low;
+    uint64_t low_hi               = (uint64_t)(low_product >> 64);
+    uint64_t low_lo               = (uint64_t)low_product;
+    uint64_t old_lo               = lo;
+    lo += low_hi;
+    hi += lo < old_lo ? 1 : 0;
+    if ((hi & FASTP64_HALF_MASK) == FASTP64_HALF_MASK &&
+        lo == std::numeric_limits<uint64_t>::max() && low_lo + norm < low_lo) {
+      return false;
+    }
+  }
+
+  uint64_t hi_msb = hi >> 63;
+  // Extract 54 bits: 53 significand bits plus one rounding bit. The product
+  // may be shifted by one depending on whether hi already has its top bit set.
+  uint64_t x54 = hi >> (9 + hi_msb);
+  adj_e2 -= (int)(1 - hi_msb);
+
+  // Exact halfway with round-to-even ambiguity; let strtod handle the rare tie.
+  if (lo == 0 && (hi & FASTP64_HALF_MASK) == 0 && (x54 & 3) == 1) { return false; }
+
+  // Round 54 -> 53 bits, carry into the exponent if rounding overflows.
+  uint64_t x53      = (x54 + (x54 & 1)) >> 1;
+  uint64_t overflow = x53 >> 53;
+  uint64_t ret_man  = (x53 >> overflow) & FASTP64_MANTISSA_MASK;
+  int ret_exp       = adj_e2 + (int)overflow;
+  if (ret_exp <= 0 || ret_exp >= FASTP64_EXPONENT_MASK) { return false; }
+
+  bits = ((uint64_t)ret_exp << 52) | ret_man;
+  return true;
+}
+
+static inline double assemble_fp64(const parsed_decimal_t& dec)
+{
+  uint64_t bits = dec.negative ? (uint64_t{1} << 63) : 0;
+  if (dec.mantissa == 0) { return std::bit_cast<double>(bits); }  // signed zero
+  // Exact paths: the mantissa and the power of ten involved are both exactly representable.
+  if (dec.fast_eligible) {
+    double small    = 0.0;
+    bool used_small = false;
+    if (dec.exp10 >= 0 && dec.exp10 < (int)small_integer_powers.size()) {
+      uint64_t limit = (uint64_t{1} << 53) / small_integer_powers[dec.exp10];
+      if (dec.mantissa <= limit) {
+        small      = (double)dec.mantissa * small_powers[dec.exp10];
+        used_small = true;
+      }
+    } else if (dec.exp10 < 0 && dec.exp10 >= -22 && dec.mantissa < (uint64_t{1} << 53)) {
+      small      = (double)dec.mantissa / small_powers[-dec.exp10];
+      used_small = true;
+    }
+    if (used_small) { return dec.negative ? -small : small; }
+    // General case: Eisel-Lemire, which declines when it cannot round with certainty.
+    uint64_t mag_bits = 0;
+    if (eisel_lemire(dec.mantissa, dec.exp10, mag_bits)) {
+      return std::bit_cast<double>(bits | mag_bits);
+    }
+  }
+  // NaN is the sentinel that makes parse_fp64_advance() fall back to strtod.
+  return std::numeric_limits<double>::quiet_NaN();
+}
+
+static inline double parse_fp64_advance(const char*& p, const char* end)
+{
+  // Parses one numeric token at p and leaves p just past it; invalid tokens raise a parser error.
+  const char* start = p;
+  parsed_decimal_t dec;
+  if (!parse_decimal_advance(p, end, dec)) {
+    return fallback_strtod(std::string_view(start, (size_t)(p - start)));
+  }
+  // Require a delimiter (byte <= 32) or end of input after the token, whichever path is taken.
+  if (p < end && (unsigned char)*p > 32) {
+    mps_parser_fail(error_type_t::ValidationError, "Invalid or out-of-range MPS numeric token");
+  }
+  // assemble_fp64() yields NaN when the fast paths cannot decide; strtod then parses the token.
+  double v = assemble_fp64(dec);
+  if (v == v) { return v; }
+  return fallback_strtod(std::string_view(start, (size_t)(p - start)));
+}
+
+}  // namespace fp64
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
new file mode 100644
index 0000000000..3d1700e579
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_parse_primitives.hpp
@@ -0,0 +1,386 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "fast_fp64_parser.hpp"
+
+#include <cstdarg>
+#include <cstddef>
+#include <utility>
+
+#include <simde/x86/avx2.h>
+#include <simde/x86/sse4.2.h>
+
+#ifndef LIKELY
+#define LIKELY(x) __builtin_expect(!!(x), 1)
+#endif
+
+#ifndef UNLIKELY
+#define UNLIKELY(x) __builtin_expect(!!(x), 0)
+#endif
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+// Selects what cursor_t::scalar_scan / cursor_t::simd_scan search for.
+enum scan_mode {
+  // step over whitespace; stop at the first byte > 32 or at '\n'
+  skip_whitespace,
+  // step over a field; stop at the first byte <= 32
+  until_whitespace,
+};
+
+// util to serially scan along an in-memory input buffer
+// contains optimized primitives for most parsing operations
+//
+// A byte counts as whitespace when its value, read as unsigned char, is <= 32. The scalar and
+// SIMD paths must agree on this so that a field is split the same way wherever it sits in the
+// buffer (the last < 32 bytes are always handled by the scalar path).
+struct cursor_t {
+  const char* start;  // first byte of the buffer; line/column reporting counts from here
+  const char* ptr;    // current read position
+  const char* end;    // one past the last byte of the buffer
+
+  cursor_t(const char* data, std::size_t size) : start(data), ptr(data), end(data + size) {}
+
+  // true once the read position has reached the end of the buffer
+  bool done() const { return ptr >= end; }
+
+  // used in error reporting
+  // returns the 1-based {line, column} of ptr; lines are delimited by '\n'
+  std::pair<std::size_t, std::size_t> linecol_position() const
+  {
+    std::size_t line       = 1;
+    const char* line_start = start;
+    for (const char* p = start; p < ptr; ++p) {
+      if (*p == '\n') {
+        ++line;
+        line_start = p + 1;
+      }
+    }
+    std::size_t column = (std::size_t)(ptr - line_start) + 1;
+    return {line, column};
+  }
+
+  // Formats a printf-style message (truncated to the 512-byte local buffer) and raises it as a
+  // ValidationError prefixed with the "line:column" of the current position.
+  [[noreturn]] void error(const char* msg, ...)
+  {
+    auto [line, col] = linecol_position();
+    va_list args;
+    va_start(args, msg);
+    char msg_buf[512];
+    std::vsnprintf(msg_buf, sizeof(msg_buf), msg, args);
+    va_end(args);
+    mps_parser_fail(error_type_t::ValidationError, "%zu:%zu: %s", line, col, msg_buf);
+  }
+
+  // Moves ptr forward by n bytes; fails with a ValidationError when fewer than n bytes remain.
+  void advance(std::size_t n)
+  {
+    // compare against the remaining byte count: forming ptr + n first is undefined behaviour
+    // (and may wrap around) when n exceeds what is left in the buffer
+    if (ptr > end || n > (std::size_t)(end - ptr)) {
+      mps_parser_fail(error_type_t::ValidationError, "Unexpected end of file");
+    }
+    ptr += n;
+  }
+
+  // Byte-at-a-time scan. Returns the first position in [p, end) that ends the scan for `mode`
+  // (see scan_mode), or `end` if there is none.
+  template <scan_mode mode>
+  static const char* scalar_scan(const char* p, const char* end)
+  {
+    while (p < end) {
+      unsigned char c = (unsigned char)*p;
+      if constexpr (mode == skip_whitespace) {
+        if (c > 32 || c == '\n') return p;
+      } else {
+        if (c <= 32) return p;
+      }
+      p++;
+    }
+    return end;
+  }
+
+  // Per-byte mask (0xFF / 0x00) of the bytes that are > 32 when read as unsigned char.
+  // cmpgt_epi8 compares *signed* bytes and would report 0x80..0xFF as "not greater than 32",
+  // i.e. as whitespace, contradicting scalar_scan(). max_epu8(x, 33) == x holds exactly when
+  // x >= 33 as an unsigned byte.
+  static inline simde__m256i gt32_unsigned(simde__m256i data)
+  {
+    const simde__m256i v33 = simde_mm256_set1_epi8(33);
+    return simde_mm256_cmpeq_epi8(simde_mm256_max_epu8(data, v33), data);
+  }
+
+  // scans for the first non-whitespace (or vice versa)
+  // processes 32 bytes per step and hands the remaining tail to scalar_scan()
+  template <scan_mode mode>
+  static const char* simd_scan(const char* p, const char* end)
+  {
+    const simde__m256i vnl = simde_mm256_set1_epi8('\n');
+
+    // end - p instead of p + 32 <= end: never form a pointer past the end of the buffer
+    while (end - p >= 32) {
+      simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)p);
+      simde__m256i gt32 = gt32_unsigned(data);
+
+      unsigned int mask;
+      if constexpr (mode == skip_whitespace) {
+        simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl);
+        mask = (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, is_nl));
+      } else {
+        mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32);
+      }
+
+      if (mask != 0) { return p + __builtin_ctz(mask); }
+      p += 32;
+    }
+    return scalar_scan<mode>(p, end);
+  }
+
+  // advances ptr to the next non-whitespace byte, '\n', or the end of the buffer
+  void skip_ws() { ptr = simd_scan<skip_whitespace>(ptr, end); }
+
+  // true when ptr sits on a line-break byte ('\n' or '\r')
+  bool eol() const { return ptr < end && (*ptr == '\n' || *ptr == '\r'); }
+
+  // consumes one line break at ptr: "\r\n", a lone '\r', or a lone '\n'; otherwise a no-op
+  void consume_eol()
+  {
+    if (ptr < end && *ptr == '\r') {
+      ptr++;
+      if (ptr < end && *ptr == '\n') { ptr++; }
+      return;
+    }
+    if (ptr < end && *ptr == '\n') { ptr++; }
+  }
+
+  // could be SIMD but comments are usually rare
+  // skips the rest of the current line including its line break
+  void skip_comment_line()
+  {
+    while (!done() && *ptr != '\n' && *ptr != '\r') {
+      ptr++;
+    }
+    consume_eol();
+  }
+
+  // skips the rest of the current line but leaves ptr on the line break
+  void skip_to_eol()
+  {
+    while (!done() && *ptr != '\n' && *ptr != '\r') {
+      ptr++;
+    }
+  }
+
+  // useful for parsing NAME/OBJNAME which may span multiple "fields" according to the MPS spec
+  // returns the rest of the line with leading/trailing spaces and tabs removed;
+  // ptr ends up on the last kept character's successor, before the line break
+  std::string_view read_rest_of_line_trimmed()
+  {
+    const char* begin    = ptr;
+    const char* line_end = begin;
+    while (line_end < end && *line_end != '\n' && *line_end != '\r') {
+      ++line_end;
+    }
+
+    while (begin < line_end && (*begin == ' ' || *begin == '\t')) {
+      ++begin;
+    }
+    while (line_end > begin && (line_end[-1] == ' ' || line_end[-1] == '\t')) {
+      --line_end;
+    }
+    ptr = line_end;
+    return std::string_view(begin, (std::size_t)(line_end - begin));
+  }
+
+  // Returns the field starting at ptr (all bytes up to the next whitespace byte) and advances
+  // ptr past the field and the whitespace that follows it, stopping at '\n'.
+  // Returns an empty view when the buffer is exhausted.
+  inline __attribute__((always_inline)) std::string_view read_field()
+  {
+    if (UNLIKELY(done())) { return {}; }
+
+    const char* field_start = ptr;
+    if (UNLIKELY(end - ptr < 32)) {
+      ptr                   = scalar_scan<until_whitespace>(ptr, end);
+      const char* field_end = ptr;
+      if (ptr < end) { skip_ws(); }
+      return std::string_view(field_start, field_end - field_start);
+    }
+
+    const simde__m256i vnl = simde_mm256_set1_epi8('\n');
+
+    // all input streams provide trailing padding, so this 32B load is valid
+    // whenever end - ptr >= 32
+    simde__m256i data    = simde_mm256_loadu_si256((const simde__m256i*)ptr);
+    simde__m256i gt32    = gt32_unsigned(data);
+    unsigned int ws_mask = ~(unsigned int)simde_mm256_movemask_epi8(gt32);
+
+    // the field is longer than the 32-byte window: keep scanning past it
+    if (UNLIKELY(ws_mask == 0)) {
+      ptr                   = simd_scan<until_whitespace>(ptr + 32, end);
+      const char* field_end = ptr;
+      if (ptr < end) { skip_ws(); }
+      return std::string_view(field_start, field_end - field_start);
+    }
+
+    int field_end_off     = __builtin_ctz(ws_mask);
+    const char* field_end = ptr + field_end_off;
+
+    // next stop position (non-whitespace byte or '\n') at or after the end of the field
+    simde__m256i is_nl = simde_mm256_cmpeq_epi8(data, vnl);
+    unsigned int stop_mask =
+      (unsigned int)simde_mm256_movemask_epi8(simde_mm256_or_si256(gt32, is_nl));
+    unsigned int after_field = stop_mask & ~((1u << field_end_off) - 1);
+
+    if (LIKELY(after_field != 0)) {
+      ptr = ptr + __builtin_ctz(after_field);
+    } else {
+      ptr = field_end;
+      if (ptr < end) { skip_ws(); }
+    }
+
+    return std::string_view(field_start, field_end - field_start);
+  }
+
+  // read but do not consume
+  inline __attribute__((always_inline)) std::string_view peek_field()
+  {
+    if (UNLIKELY(done())) { return {}; }
+    const char* field_end = simd_scan<until_whitespace>(ptr, end);
+    return std::string_view(ptr, field_end - ptr);
+  }
+
+  // returns the first field found in [line_start, section_end) after skipping leading
+  // whitespace, using a temporary cursor
+  static inline std::string_view peek_field_at(const char* line_start, const char* section_end)
+  {
+    cursor_t cursor(line_start, (std::size_t)(section_end - line_start));
+    cursor.skip_ws();
+    return cursor.peek_field();
+  }
+
+  // usually in MPS fields go in pair. these can usually be extracted in a single 32B load
+  // Falls back to two read_field() calls whenever both fields and the stop byte after them
+  // cannot be resolved inside one 32-byte window, or the first field is followed by a line break.
+  inline __attribute__((always_inline)) std::pair<std::string_view, std::string_view>
+  read_two_fields()
+  {
+    auto slow = [&] {
+      auto f1 = read_field();
+      auto f2 = read_field();
+      return std::pair<std::string_view, std::string_view>{f1, f2};
+    };
+
+    if (UNLIKELY(end - ptr < 32)) { return slow(); }
+
+    const char* field1_start = ptr;
+    const simde__m256i vnl   = simde_mm256_set1_epi8('\n');
+
+    // Same padded-buffer contract as read_field().
+    simde__m256i data = simde_mm256_loadu_si256((const simde__m256i*)ptr);
+    simde__m256i gt32 = gt32_unsigned(data);
+
+    unsigned int printable_mask = (unsigned int)simde_mm256_movemask_epi8(gt32);
+    unsigned int ws_mask        = ~printable_mask;
+
+    if (UNLIKELY(ws_mask == 0)) { return slow(); }
+    int field1_end_off = __builtin_ctz(ws_mask);
+
+    simde__m256i is_nl                = simde_mm256_cmpeq_epi8(data, vnl);
+    unsigned int nl_mask              = (unsigned int)simde_mm256_movemask_epi8(is_nl);
+    unsigned int barrier_after_field1 = (printable_mask | nl_mask) >> field1_end_off;
+    if (UNLIKELY(barrier_after_field1 == 0)) { return slow(); }
+    int field2_rel_off = __builtin_ctz(barrier_after_field1);
+    if (UNLIKELY(ptr[field1_end_off + field2_rel_off] == '\n' ||
+                 ptr[field1_end_off + field2_rel_off] == '\r')) {
+      return slow();
+    }
+    int field2_start_off = field1_end_off + field2_rel_off;
+
+    unsigned int ws_after_field2_start = ws_mask >> field2_start_off;
+    if (UNLIKELY(ws_after_field2_start == 0)) { return slow(); }
+    int field2_end_off = field2_start_off + __builtin_ctz(ws_after_field2_start);
+
+    unsigned int stop_mask         = printable_mask | nl_mask;
+    unsigned int stop_after_field2 = stop_mask >> field2_end_off;
+    if (LIKELY(stop_after_field2 != 0)) {
+      ptr = ptr + field2_end_off + __builtin_ctz(stop_after_field2);
+    } else {
+      ptr = ptr + field2_end_off;
+      skip_ws();
+    }
+
+    return {std::string_view(field1_start, field1_end_off),
+            std::string_view(field1_start + field2_start_off, field2_end_off - field2_start_off)};
+  }
+};
+
+static inline void expect(cursor_t& cursor, const char* field)
+{
+  auto id = cursor.read_field();
+  if (UNLIKELY(id != field)) {
+    cursor.error("expected '%s', got '%.*s'", field, (int)id.size(), id.data());
+  }
+}
+
+// Skips any run of line breaks and whole comment lines (lines whose first byte is '*' or '$').
+// Returns with the cursor at the end of the buffer or on the first byte of a non-comment line.
+static inline void accept_comment_line(cursor_t& cursor)
+{
+  while (true) {
+    // eol() is already false at the end of the buffer, so no separate done() test is needed
+    while (cursor.eol()) {
+      cursor.consume_eol();
+    }
+    if (cursor.done()) { return; }
+
+    const char lead = *cursor.ptr;
+    if (lead != '*' && lead != '$') { return; }
+    cursor.skip_comment_line();
+  }
+}
+
+// Requires the cursor to be on a line break, then moves to the start of the next line that
+// carries content: line breaks, comment lines ('*' or '$' in the first column) and lines that
+// hold only whitespace are skipped. A single leading blank is stepped over; when the byte that
+// follows is not an ASCII letter, the remaining leading whitespace is skipped as well.
+static inline void expect_eol(cursor_t& cursor)
+{
+  if (UNLIKELY(!cursor.eol())) {
+    const std::string_view got = cursor.peek_field();
+    cursor.error("expected end of line, got '%.*s'", (int)got.size(), got.data());
+  }
+
+  while (true) {
+    while (cursor.eol()) {
+      cursor.consume_eol();
+    }
+    if (UNLIKELY(cursor.done())) { return; }
+
+    const char lead = *cursor.ptr;
+    if (UNLIKELY(lead == '*' || lead == '$')) {
+      cursor.skip_comment_line();
+      continue;
+    }
+
+    // step over one leading blank, but never onto the end of the buffer
+    if (LIKELY(lead == ' ') && LIKELY(cursor.ptr + 1 < cursor.end)) { ++cursor.ptr; }
+
+    const char c         = *cursor.ptr;
+    const bool is_letter = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+    if (LIKELY(is_letter)) { return; }
+
+    cursor.skip_ws();
+    // a line holding only whitespace: go round again and consume its line break
+    if (!cursor.eol()) { return; }
+  }
+}
+
+// Free-function form of cursor.peek_field(): returns the field at the cursor without consuming it.
+static inline std::string_view peek(cursor_t& cursor) { return cursor.peek_field(); }
+
+// Consumes the next field only if it equals `field`; returns whether it was consumed.
+static inline bool accept(cursor_t& cursor, const char* field)
+{
+  if (peek(cursor) != field) { return false; }
+  expect(cursor, field);
+  return true;
+}
+
+// Requires the next field to be the section keyword `section`, followed by the end of the
+// line. Both checks raise a parse error on mismatch (see expect / expect_eol).
+static inline void expect_section(cursor_t& cursor, const char* section)
+{
+  expect(cursor, section);
+  expect_eol(cursor);
+}
+
+static inline double expect_number(cursor_t& cursor)
+{
+  auto num = cursor.read_field();
+  if (num.empty()) { cursor.error("expected number, got empty field"); }
+  const char* p = num.data();
+  return fp64::parse_fp64_advance(p, p + num.size());
+}
+
+// Like expect_number(), with a shortcut for the literal tokens "1" and "-1": when one of them
+// sits at the cursor and is followed by a whitespace byte (<= ' '), it is consumed together
+// with the whitespace after it and +/-1.0 is returned without running the number parser.
+// Everything else is delegated to expect_number().
+static inline double expect_number_fast_pm_one(cursor_t& cursor)
+{
+  const char* p = cursor.ptr;
+  // The terminator is compared as unsigned char: plain char is signed on some targets, where
+  // a byte >= 0x80 would compare below ' ' and a token such as "1\xC3\xA9" would be accepted
+  // as 1.0 instead of reaching the full parser and its validation.
+  if (cursor.end - p >= 3 && p[0] == '-' && p[1] == '1' && (unsigned char)p[2] <= ' ') {
+    cursor.ptr = p + 2;
+    cursor.skip_ws();
+    return -1.0;
+  }
+  if (cursor.end - p >= 2 && p[0] == '1' && (unsigned char)p[1] <= ' ') {
+    cursor.ptr = p + 1;
+    cursor.skip_ws();
+    return 1.0;
+  }
+  return expect_number(cursor);
+}
+
+// If the next field is the section keyword `section`, consumes it together with the end of
+// its line and returns true; otherwise leaves the cursor untouched and returns false.
+static inline bool accept_section(cursor_t& cursor, const char* section)
+{
+  if (!accept(cursor, section)) { return false; }
+  expect_eol(cursor);
+  return true;
+}
+
+// If the cursor sits on '$', skips to the end of the line (the line break itself is left in
+// place) and returns true. Returns false, without moving, in every other case.
+static inline bool accept_comment(cursor_t& cursor)
+{
+  const bool at_dollar = !cursor.done() && *cursor.ptr == '$';
+  if (LIKELY(!at_dollar)) { return false; }
+  cursor.skip_to_eol();
+  return true;
+}
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.cpp b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
new file mode 100644
index 0000000000..4b74943a1d
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.cpp
@@ -0,0 +1,3219 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#include "fast_parser.hpp"
+#include "fast_parse_primitives.hpp"
+#include "file_reader.hpp"
+#include "hash_table_smallstr.hpp"
+#include "mmap_region.hpp"
+#include "mps_section_scanner.hpp"
+#include "nvtx_ranges.hpp"
+
+#include <cuda/cmath>
+#if defined(MPS_FAST_PERF_COUNTERS) || defined(MPS_FAST_TIMERS)
+#include <utilities/perf_counters.hpp>
+#endif
+
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <omp.h>
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cerrno>
+#include <charconv>
+#include <climits>
+#include <concepts>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <limits>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <file_to_string.hpp>
+
+#define MPS_FAST_COMPACT_ROW_HASH
+#define MPS_FAST_THP_PREFAULT
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+static constexpr size_t KiB = 1024;
+static constexpr size_t MiB = 1024 * KiB;
+static constexpr size_t GiB = 1024 * MiB;
+
+// per-chunk row-count scratch tile for the column parsing workers
+// small enough to remain warm in L1
+static constexpr size_t COLUMN_ROW_COUNT_BLOCK_ROWS = 4096;
+static constexpr int MPS_ROWS_THREAD_CAP            = 16;
+static constexpr int MPS_COLUMNS_THREAD_CAP         = 32;
+static constexpr int MPS_BOUNDS_THREAD_CAP          = 32;
+static constexpr int MPS_NAMES_THREAD_CAP           = 16;
+// avoid openmp setup for small bounds sections
+static constexpr size_t MPS_BOUNDS_PARALLEL_MIN_BYTES = 256 * MiB;
+// ordered-name fallback is cheap enough to parallelize on smaller bounds sections
+static constexpr size_t MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES = 8 * MiB;
+// lower bound on columns chunk size to avoid tiny parser tasks
+static constexpr size_t MPS_COLUMNS_MIN_CHUNK_BYTES = 1 * MiB;
+// parser-wide thread cap switch; very small files lose to scheduling overhead
+static constexpr size_t MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES = 100ull * 1000ull * 1000ull;
+// thread caps for small and large files
+static constexpr int MPS_SMALL_FILE_THREAD_CAP = 16;
+static constexpr int MPS_LARGE_FILE_THREAD_CAP = 32;
+
+// Parser-wide thread cap for an input of `bytes` bytes: the small-file cap below
+// MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES, the large-file cap otherwise, limited by
+// omp_get_max_threads() and never below 1.
+static int parser_thread_cap_for_size(size_t bytes)
+{
+  int size_cap = MPS_LARGE_FILE_THREAD_CAP;
+  if (bytes < MPS_MEDIUM_FILE_THREAD_THRESHOLD_BYTES) { size_cap = MPS_SMALL_FILE_THREAD_CAP; }
+  const int bounded = std::min(size_cap, omp_get_max_threads());
+  return std::max(1, bounded);
+}
+
+// Thread count for one parsing phase: `phase_cap` limited by the threads OpenMP reports as
+// available (the current team size inside a parallel region, the maximum otherwise), and at
+// least 1.
+static int phase_thread_count(int phase_cap)
+{
+  int available_threads;
+  if (omp_in_parallel()) {
+    available_threads = omp_get_num_threads();
+  } else {
+    available_threads = omp_get_max_threads();
+  }
+  const int bounded = std::min(phase_cap, available_threads);
+  return std::max(1, bounded);
+}
+
+// Arena allocator for the strings (row names, column names) to avoid the dreadful overheads of
+// glibc's malloc and std::vector<std::string>
+//
+// Names are packed back to back into slabs. A slab's buffer is sized once when the slab is
+// created and never resized afterwards; each new slab is at least twice the previous slab size.
+class chunk_name_arena_t {
+ public:
+  // Raises the size of the next slab to at least `bytes`. Nothing is allocated here.
+  void reserve(size_t bytes)
+  {
+    if (bytes > next_slab_size_) { next_slab_size_ = bytes; }
+  }
+
+  // Copies `name` into the arena, appends a '\0' terminator, and returns a view of the copy
+  // (the terminator is not part of the returned view).
+  std::string_view copy(std::string_view name)
+  {
+    char* dst = allocate(name.size() + 1);
+    // a default-constructed string_view has data() == nullptr, and passing a null pointer to
+    // memcpy is undefined behaviour even when the length is zero
+    if (!name.empty()) { std::memcpy(dst, name.data(), name.size()); }
+    dst[name.size()] = '\0';
+    return std::string_view(dst, name.size());
+  }
+
+ private:
+  struct slab_t {
+    std::vector<char> data;  // backing storage, sized once at creation
+    size_t used = 0;         // bytes already handed out from `data`
+  };
+
+  // Returns `bytes` contiguous bytes from the current slab, opening a new slab when the
+  // current one cannot hold the request.
+  char* allocate(size_t bytes)
+  {
+    if (slabs_.empty() || slabs_.back().used + bytes > slabs_.back().data.size()) {
+      size_t capacity = std::max(bytes, next_slab_size_);
+      slab_t slab;
+      slab.data.resize(capacity);
+      slabs_.push_back(std::move(slab));
+      next_slab_size_ = std::max(next_slab_size_ * 2, capacity);
+    }
+    slab_t& slab = slabs_.back();
+    char* ptr    = slab.data.data() + slab.used;
+    slab.used += bytes;
+    return ptr;
+  }
+
+  std::vector<slab_t> slabs_;
+  size_t next_slab_size_ = 64 * KiB;
+};
+
+// One record of the timer buffer; appended by scoped_timer_t and printed by flush_timers().
+struct timer_entry_t {
+  const char* name;   // timer label (pointer is stored, not copied)
+  double elapsed_ms;  // measured duration in milliseconds
+  size_t rss_kb;      // first value returned by current_process_rss_kb(); printed as rss_GB
+  size_t hwm_kb;      // second value; printed as hwm_GB (presumably the RSS high-water mark)
+};
+
+// Returns the process-wide buffer of timer records. The buffer is created on first use with
+// room for 100 entries; the reservation is part of the one-time initialization instead of
+// being repeated on every call. Callers in this file take get_timer_mutex() before touching it.
+static std::vector<timer_entry_t>& get_timer_buffer()
+{
+  static std::vector<timer_entry_t> buffer = [] {
+    std::vector<timer_entry_t> initial;
+    initial.reserve(100);
+    return initial;
+  }();
+  return buffer;
+}
+
+// Returns the function-local static mutex that flush_timers() and ~scoped_timer_t() lock
+// around their accesses to the timer buffer.
+static std::mutex& get_timer_mutex()
+{
+  static std::mutex mutex;
+  return mutex;
+}
+
+// With MPS_FAST_TIMERS defined: prints every buffered timer record to stderr (duration in ms,
+// the two memory figures converted from KiB to GiB) and empties the buffer, all under the
+// timer mutex. Without it the function does nothing.
+static void flush_timers()
+{
+#ifdef MPS_FAST_TIMERS
+  constexpr double kib_per_gib = (double)(GiB / KiB);
+  std::lock_guard<std::mutex> guard(get_timer_mutex());
+  std::vector<timer_entry_t>& entries = get_timer_buffer();
+  for (const timer_entry_t& record : entries) {
+    const double rss_gb = (double)record.rss_kb / kib_per_gib;
+    const double hwm_gb = (double)record.hwm_kb / kib_per_gib;
+    std::fprintf(stderr,
+                 "[TIMER] %s: %.3f ms rss_GB=%.3f hwm_GB=%.3f\n",
+                 record.name,
+                 record.elapsed_ms,
+                 rss_gb,
+                 hwm_gb);
+  }
+  entries.clear();
+#endif
+}
+
+// Distance between the bytes that materialize_hugepages() touches.
+enum class materialize_touch_t {
+  // touch one byte every 2 MiB
+  write_2mb,
+  // touch one byte per system page (system_page_size(), not a hard-coded 4 KiB)
+  write_4kb,
+};
+
+// Marks the pages covering [data, data + bytes) with MADV_HUGEPAGE, then materializes them by
+// touching the range (read + write-back of one byte per step, plus the last byte) to nudge
+// the kernel into invoking its THP mechanism. The madvise result is not checked.
+static void materialize_hugepages([[maybe_unused]] const char* label,
+                                  void* data,
+                                  size_t bytes,
+                                  materialize_touch_t touch)
+{
+  if (data == nullptr || bytes == 0) return;
+
+  // madvise works on whole pages: widen the range outward to page boundaries
+  const size_t page_size     = system_page_size();
+  const uintptr_t page_mask  = ~(uintptr_t)(page_size - 1);
+  const uintptr_t first_byte = reinterpret_cast<uintptr_t>(data);
+  const uintptr_t range_lo   = first_byte & page_mask;
+  const uintptr_t range_hi   = (first_byte + bytes + page_size - 1) & page_mask;
+
+  errno = 0;
+  madvise((void*)(range_lo), (size_t)(range_hi - range_lo), MADV_HUGEPAGE);
+
+  const size_t stride      = touch == materialize_touch_t::write_2mb ? (2 * MiB) : page_size;
+  volatile char* touch_ptr = (volatile char*)(data);
+  size_t offset            = 0;
+  while (offset < bytes) {
+    // volatile self-assignment: a write that leaves the contents unchanged
+    touch_ptr[offset] = touch_ptr[offset];
+    offset += stride;
+  }
+  touch_ptr[bytes - 1] = touch_ptr[bytes - 1];
+}
+
+// Convenience overload: applies materialize_hugepages() to the element storage of `values`.
+template <typename T>
+static void materialize_vector_hugepages(const char* label,
+                                         std::vector<T>& values,
+                                         materialize_touch_t touch)
+{
+  T* const storage         = values.data();
+  const size_t storage_len = values.size() * sizeof(T);
+  materialize_hugepages(label, storage, storage_len, touch);
+}
+
+// Scope timer. With MPS_FAST_TIMERS defined, construction opens an NVTX range and records the
+// start time; destruction closes the range, adds the elapsed milliseconds to *accumulator
+// (when non-null) and appends a {name, elapsed, rss, hwm} record to the shared timer buffer
+// under the timer mutex. Without MPS_FAST_TIMERS the object only stores the accumulator
+// pointer and does no work.
+class scoped_timer_t {
+ public:
+#ifdef MPS_FAST_TIMERS
+  scoped_timer_t(const char* name, double* accumulator = nullptr)
+    : name_(name),
+      accumulator_(accumulator),
+      nvtx_(name, nvtx::color_for_name(name)),
+      start_(std::chrono::high_resolution_clock::now())
+  {
+  }
+#else
+  scoped_timer_t([[maybe_unused]] const char* name, double* accumulator = nullptr)
+    : accumulator_(accumulator)
+  {
+  }
+#endif
+
+  ~scoped_timer_t()
+  {
+#ifdef MPS_FAST_TIMERS
+    auto stop_time          = std::chrono::high_resolution_clock::now();
+    const double elapsed_ms = std::chrono::duration<double, std::milli>(stop_time - start_).count();
+    nvtx_.end();
+    if (accumulator_) { *accumulator_ += elapsed_ms; }
+    auto [rss_kb, hwm_kb] = current_process_rss_kb();
+    std::lock_guard<std::mutex> guard(get_timer_mutex());
+    get_timer_buffer().push_back({name_, elapsed_ms, rss_kb, hwm_kb});
+#endif
+  }
+
+  // non-copyable: each instance represents exactly one timed scope
+  scoped_timer_t(const scoped_timer_t&)            = delete;
+  scoped_timer_t& operator=(const scoped_timer_t&) = delete;
+
+ private:
+  // declaration order matters: it is the member initialization order of the constructor
+#ifdef MPS_FAST_TIMERS
+  const char* name_;
+#endif
+  double* accumulator_;
+#ifdef MPS_FAST_TIMERS
+  nvtx::scoped_range_t nvtx_;
+  std::chrono::high_resolution_clock::time_point start_;
+#endif
+};
+
+// Scope guard for OpenMP's max-active-levels setting: the constructor saves the current value
+// and installs `value`; the destructor puts the saved value back.
+class omp_max_active_levels_guard_t {
+ public:
+  explicit omp_max_active_levels_guard_t(int value) : saved_levels_(omp_get_max_active_levels())
+  {
+    omp_set_max_active_levels(value);
+  }
+
+  ~omp_max_active_levels_guard_t() { omp_set_max_active_levels(saved_levels_); }
+
+  // non-copyable: a copy would restore the saved value twice
+  omp_max_active_levels_guard_t(const omp_max_active_levels_guard_t&)            = delete;
+  omp_max_active_levels_guard_t& operator=(const omp_max_active_levels_guard_t&) = delete;
+
+ private:
+  int saved_levels_ = 0;  // value that was in effect before this guard was constructed
+};
+
+static inline void error_unknown_row(cursor_t& cursor, const char* row_start, const char* section)
+{
+  const char* row_end = row_start;
+  while (row_end < cursor.end && *row_end > ' ') {
+    row_end++;
+  }
+  cursor.error("unknown row name in %s: %.*s", section, (int)(row_end - row_start), row_start);
+}
+
+// Two modes for row/column name lookup:
+// - hash: arbitrary names via hash table (rows) or var_names_map (columns)
+// - dense_ordered: sequential numeric suffixes like R0001/R0002 or V0/V1
+enum class index_mode_t {
+  // general case: the name is looked up in a table
+  hash,
+  // the index is derived from the numeric suffix of the name (see dense_name_index_t::lookup)
+  dense_ordered,
+};
+
+// Every 19-digit decimal string fits in uint64_t; 20+ digits may not and are wildly unlikely in the
+// context of dense MPS rows/cols
+static constexpr size_t dense_suffix_max_digits = 19;
+
+// Number of decimal digits needed to print `value` (0 counts as one digit).
+static inline size_t decimal_digits_u64(uint64_t value)
+{
+  size_t count = 1;
+  for (; value >= 10; value /= 10) {
+    ++count;
+  }
+  return count;
+}
+
+// Splits `name` into a prefix and a trailing run of decimal digits.
+//
+// On success returns true and sets `prefix` (everything before the digits, possibly empty),
+// `value` (the digits as an integer) and `suffix_width` (how many digits there were).
+// Returns false when the name does not end in a digit, or when the digit run is longer than
+// dense_suffix_max_digits; in the latter case `suffix_width` has already been written.
+static inline bool parse_trailing_u64(std::string_view name,
+                                      std::string_view& prefix,
+                                      uint64_t& value,
+                                      size_t& suffix_width)
+{
+  const size_t name_len = name.size();
+  size_t digits_begin   = name_len;
+  while (digits_begin > 0 && fp64::is_digit(name[digits_begin - 1])) {
+    --digits_begin;
+  }
+  if (digits_begin == name_len) { return false; }
+
+  suffix_width = name_len - digits_begin;
+  if (suffix_width > dense_suffix_max_digits) { return false; }
+
+  uint64_t accumulated = 0;
+  for (const char digit : name.substr(digits_begin)) {
+    accumulated = accumulated * 10 + (uint64_t)(digit - '0');
+  }
+
+  prefix = name.substr(0, digits_begin);
+  value  = accumulated;
+  return true;
+}
+
+// necessary to handle cases like R0001, ..., R2000, ...
+// True when the numeric suffix (the last `suffix_width` characters of `name`) has more than
+// one digit and starts with '0'.
+static inline bool dense_suffix_is_zero_padded(std::string_view name, size_t suffix_width)
+{
+  if (suffix_width <= 1) { return false; }
+  const char leading_digit = name[name.size() - suffix_width];
+  return leading_digit == '0';
+}
+
+// Pad width implied by a name: its suffix width when the suffix is zero-padded, 0 otherwise.
+static inline size_t dense_initial_pad_width(std::string_view name, size_t suffix_width)
+{
+  if (dense_suffix_is_zero_padded(name, suffix_width)) { return suffix_width; }
+  return 0;
+}
+
+// True when `suffix_width` is exactly the width `value` would be printed with under
+// `pad_width`: the larger of the pad width and the value's own digit count.
+static inline bool dense_suffix_width_ok(uint64_t value, size_t suffix_width, size_t pad_width)
+{
+  const size_t natural_width = decimal_digits_u64(value);
+  return suffix_width == std::max(pad_width, natural_width);
+}
+
+// Describes a family of names "<prefix><number>" whose numbers lie in [min_id, max_id], and
+// converts between such a name and its zero-based index (number - min_id).
+struct dense_name_index_t {
+  std::string prefix;      // text shared by every name, before the number
+  uint64_t min_id  = 0;    // number carried by index 0
+  uint64_t max_id  = 0;    // largest number accepted by lookup()
+  size_t pad_width = 0;    // zero-padded suffix width; 0 when the numbers are not padded
+
+  // back to the default-constructed state
+  void reset()
+  {
+    prefix.clear();
+    min_id = max_id = 0;
+    pad_width       = 0;
+  }
+
+  // whether a suffix of `suffix_width` digits is how `value` is written under pad_width
+  bool suffix_width_ok(uint64_t value, size_t suffix_width) const
+  {
+    return dense_suffix_width_ok(value, suffix_width, pad_width);
+  }
+
+  // Index of `name`, or SIZE_MAX when the name has no numeric suffix, a different prefix,
+  // an unexpected suffix width, or a number outside [min_id, max_id].
+  size_t lookup(std::string_view name) const
+  {
+    std::string_view name_prefix;
+    uint64_t number = 0;
+    size_t width    = 0;
+    const bool matches = parse_trailing_u64(name, name_prefix, number, width) &&
+                         name_prefix == prefix && suffix_width_ok(number, width) &&
+                         number >= min_id && number <= max_id;
+    return matches ? (size_t)(number - min_id) : SIZE_MAX;
+  }
+
+  // Writes the name for index `idx` into `out`: the prefix, then min_id + idx left-padded
+  // with '0' up to pad_width. If the number cannot be formatted, `out` is just the prefix.
+  void format_name(size_t idx, std::string& out) const
+  {
+    const uint64_t number = min_id + idx;
+    char digits_buf[32];
+    auto [digits_end, ec] = std::to_chars(digits_buf, digits_buf + sizeof(digits_buf), number);
+    out.assign(prefix);
+    if (ec != std::errc()) { return; }
+
+    const size_t digits_len = (size_t)(digits_end - digits_buf);
+    if (pad_width > digits_len) { out.append(pad_width - digits_len, '0'); }
+    out.append(digits_buf, digits_len);
+  }
+};
+
+// Groups the three pieces of state that observe_dense_name() takes by reference.
+// NOTE(review): presumably one instance is kept per scanned section or chunk — confirm at the
+// call sites.
+struct dense_observe_state_t {
+  // still eligible for dense_ordered lookup (observe_dense_name clears it on a violation)
+  bool candidate = true;
+  // prefix, id range and pad width gathered so far
+  dense_name_index_t index;
+  // number of names observed so far
+  size_t count = 0;
+};
+
+// Feeds one name into the dense-ordered detection state.
+//
+// The first observed name fixes the prefix, min_id/max_id and the pad width. Each later name
+// must share the prefix, carry the number `expected_id` (unless expected_id is left at its
+// default, which disables that check) and have a suffix width consistent with the pad width;
+// it then becomes the new max_id. The first name that breaks a rule clears `candidate`, after
+// which further calls return immediately.
+static inline void observe_dense_name(bool& candidate,
+                                      dense_name_index_t& index,
+                                      size_t& observed_count,
+                                      std::string_view name,
+                                      uint64_t expected_id = std::numeric_limits<uint64_t>::max())
+{
+  if (!candidate) { return; }
+
+  std::string_view name_prefix;
+  uint64_t number = 0;
+  size_t width    = 0;
+  if (!parse_trailing_u64(name, name_prefix, number, width)) {
+    candidate = false;
+    return;
+  }
+
+  if (observed_count == 0) {
+    // first name: it defines the family
+    index.prefix.assign(name_prefix);
+    index.min_id    = number;
+    index.max_id    = number;
+    index.pad_width = dense_initial_pad_width(name, width);
+    observed_count  = 1;
+    return;
+  }
+
+  constexpr uint64_t no_expectation = std::numeric_limits<uint64_t>::max();
+  const bool prefix_ok              = name_prefix == index.prefix;
+  const bool id_ok                  = expected_id == no_expectation || number == expected_id;
+  if (!prefix_ok || !id_ok || !index.suffix_width_ok(number, width)) {
+    candidate = false;
+    return;
+  }
+
+  index.max_id = number;
+  ++observed_count;
+}
+
+// Maps MPS row/column names to indices via one of two strategies, chosen per problem:
+//
+//   * dense_ordered - when every name in a section is a shared prefix followed by a
+//     contiguous run of integers (e.g. R0001, R0002, ... or x1, x2, ...). The index is
+//     then computed straight from the parsed integer (value - min_id), so no hash table
+//     is built or probed. This is the common, fast case for solver-generated models.
+//   * hash          - the general fallback (smallstr_hash_table_t) for arbitrary names.
+//
+// Each section decides its own mode while scanning: it stays a dense_ordered "candidate"
+// as long as names keep matching the prefix + consecutive-integer + zero-pad-width rule
+// (see observe_dense_name), and the first violation drops it to the hash path. The chosen
+// mode lives in row_index_mode / col_index_mode, and every lookup branches on it
+// (row_lookup / read_row_lookup vs the dense_ordered variants below). Holding this in mind
+// explains most of the paired/dual code paths throughout this file.
+template <typename i_t, typename f_t>
+struct parse_state_t {
+  mps_data_model_t<i_t, f_t>& problem;
+  cursor_t& cursor;
+
+  // backed by the input buffer
+  std::vector<std::string_view> row_names_sv;
+  // backed by the arena allocator
+  std::vector<std::string_view> var_names_sv;
+  std::vector<chunk_name_arena_t> var_name_arenas;
+  std::string_view problem_name_sv;
+  std::string_view objective_name_sv;
+  // secondary 'N' rows in ROWS — rare; membership distinguishes them from unknown row names
+  std::unordered_set<std::string_view> ignored_objective_names;
+
+  // Column name lookup for labels like V0, V1, ...
+  index_mode_t col_index_mode = index_mode_t::hash;
+  dense_name_index_t col_dense;
+
+  smallstr_hash_table_t row_hash_;
+
+  // Row name lookup for labels like R0001, R0002, ...
+  index_mode_t row_index_mode = index_mode_t::hash;
+  bool row_dense_candidate    = true;
+  dense_name_index_t row_dense;
+
+  // var_names still uses STL (only used in parse_bounds, not as hot)
+  std::unordered_map<std::string_view, size_t> var_names_map;
+
+  mmap_region_t temp_A_region;
+  mmap_region_t temp_A_indices_region;
+  f_t* temp_A                = nullptr;
+  i_t* temp_A_indices        = nullptr;
+  size_t temp_csr_nnz        = 0;
+  bool temp_csr_materialized = false;
+
+  struct bounds_only_var_t {
+    f_t lb    = f_t{0};
+    f_t ub    = std::numeric_limits<f_t>::infinity();
+    char type = 'C';
+  };
+
+  // some writers introduce zero-column variables only in BOUNDS.
+  std::map<std::string_view, bounds_only_var_t> bounds_only_vars;
+
+  struct qcmatrix_block_t {
+    size_t row_idx = SIZE_MAX;
+    std::string_view row_name;
+    std::vector<std::tuple<i_t, i_t, f_t>> entries;
+  };
+
+  std::vector<qcmatrix_block_t> qcmatrix_blocks;
+
+  parse_state_t(mps_data_model_t<i_t, f_t>& p, cursor_t& c) : problem(p), cursor(c) {}
+
+  void init_row_hash_table()
+  {
+    if (init_row_dense_ordered_table()) { return; }
+    init_row_hash_table_impl();
+  }
+
+  void observe_objective_row_name(std::string_view name)
+  {
+    if (objective_name_sv.empty()) {
+      objective_name_sv = name;
+    } else if (name != objective_name_sv) {
+      ignored_objective_names.insert(name);
+    }
+  }
+
+  bool init_row_dense_ordered_table()
+  {
+    scoped_timer_t timer("row_dense_finalize");
+    size_t n_rows = row_names_sv.size();
+    if (!row_dense_candidate || n_rows == 0) { return false; }
+    if (row_dense.max_id < row_dense.min_id) { return false; }
+    uint64_t dense_count = row_dense.max_id - row_dense.min_id + 1;
+    if (dense_count != n_rows) { return false; }
+
+    row_index_mode = index_mode_t::dense_ordered;
+    return true;
+  }
+
+  // Insert all rows into the hash table. The perf-counter instrumentation is isolated in
+  // these two helpers so its #ifdefs do not fragment init_row_hash_table_impl's setup flow;
+  // both compile down to a bare insert loop when MPS_FAST_PERF_COUNTERS is off.
+  void insert_rows_partitioned(
+    int num_threads,
+    const std::array<size_t, MPS_ROW_HASH_PARTITIONS + 1>& partition_offsets,
+    const std::vector<size_t>& row_order,
+    const std::vector<uint32_t>& row_hashes)
+  {
+    scoped_timer_t timer("row_hash_insert_partitioned");
+#ifdef MPS_FAST_PERF_COUNTERS
+    std::vector<perf_counter_snapshot_t> perf_snapshots(MPS_ROW_HASH_PARTITIONS);
+#endif
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+    for (int part_id = 0; part_id < (int)MPS_ROW_HASH_PARTITIONS; ++part_id) {
+      size_t p = (size_t)part_id;
+#ifdef MPS_FAST_PERF_COUNTERS
+      thread_perf_counters_t perf_counters;
+#endif
+      for (size_t pos = partition_offsets[p]; pos < partition_offsets[p + 1]; ++pos) {
+        size_t idx = row_order[pos];
+        row_hash_.insert_partition(p, row_names_sv[idx], row_hashes[idx], idx);
+      }
+#ifdef MPS_FAST_PERF_COUNTERS
+      perf_snapshots[p] = perf_counters.stop();
+#endif
+    }
+#ifdef MPS_FAST_PERF_COUNTERS
+    print_perf_totals("row_hash_insert_partitioned", perf_snapshots);
+#endif
+  }
+
+  void insert_rows_serial(size_t n_rows)
+  {
+#ifdef MPS_FAST_PERF_COUNTERS
+    thread_perf_counters_t perf_counters;
+#endif
+    for (size_t idx = 0; idx < n_rows; ++idx) {
+      row_hash_.insert_serial(row_names_sv[idx], idx);
+    }
+#ifdef MPS_FAST_PERF_COUNTERS
+    print_perf_totals("row_hash_insert_all", {perf_counters.stop()});
+#endif
+  }
+
+  // Builds row_hash_ from row_names_sv.
+  //
+  // Two build strategies are selected up front:
+  //  - partitioned: used when there are at least MPS_ROW_HASH_PARTITIONED_MIN_ROWS rows and more
+  //    than one thread is available; rows are bucketed by hash partition first and then inserted
+  //    via insert_rows_partitioned.
+  //  - serial: everything else; rows are inserted one by one via insert_rows_serial.
+  //
+  // Sequence: (optional) partition metadata -> bucket configuration -> mmap allocation ->
+  // (optional, MPS_FAST_THP_PREFAULT) hugepage prefault -> insertion -> (optional,
+  // MPS_FAST_MADV_COLLAPSE) MADV_COLLAPSE advice on the table region.
+  void init_row_hash_table_impl()
+  {
+    scoped_timer_t timer("row_hash_init_total");
+    size_t n_rows              = row_names_sv.size();
+    const int num_threads      = phase_thread_count(MPS_ROWS_THREAD_CAP);
+    const bool use_partitioned = n_rows >= MPS_ROW_HASH_PARTITIONED_MIN_ROWS && num_threads > 1;
+#ifdef MPS_FAST_COMPACT_ROW_HASH
+    constexpr bool compact_row_hash = true;
+#else
+    constexpr bool compact_row_hash = false;
+#endif
+    // Only populated on the partitioned path.
+    std::vector<uint32_t> row_hashes;
+    std::vector<size_t> row_order;
+    std::array<size_t, MPS_ROW_HASH_PARTITIONS> partition_counts      = {};
+    std::array<size_t, MPS_ROW_HASH_PARTITIONS + 1> partition_offsets = {};
+
+    if (use_partitioned) {
+      scoped_timer_t timer("row_hash_partition_metadata");
+      row_hashes.resize(n_rows);
+      size_t inline_rows = 0;
+      // Pass 1: hash every name that fits in HASH_KEY_BYTES and count rows per partition.
+      // Longer names are handed to note_long_name and take no part in the partitioned insert;
+      // their row_hashes slot keeps its value-initialized 0 and is never read below.
+      for (size_t idx = 0; idx < n_rows; ++idx) {
+        std::string_view name = row_names_sv[idx];
+        if (UNLIKELY(name.size() > HASH_KEY_BYTES)) {
+          row_hash_.note_long_name(name, idx);
+          continue;
+        }
+        uint32_t hash   = fnv1a_hash(name.data(), name.size());
+        row_hashes[idx] = hash;
+        ++partition_counts[hash_partition_for(hash)];
+        ++inline_rows;
+      }
+
+      // Exclusive prefix sum: partition p owns row_order[partition_offsets[p] ..
+      // partition_offsets[p + 1]).
+      for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
+        partition_offsets[p + 1] = partition_offsets[p] + partition_counts[p];
+      }
+
+      // Pass 2: scatter row indices into row_order grouped by partition. Rows are visited in
+      // ascending index order, so each partition's slice is itself in ascending index order.
+      row_order.resize(inline_rows);
+      auto next_offsets = partition_offsets;
+      for (size_t idx = 0; idx < n_rows; ++idx) {
+        if (UNLIKELY(row_names_sv[idx].size() > HASH_KEY_BYTES)) { continue; }
+        size_t part                     = hash_partition_for(row_hashes[idx]);
+        row_order[next_offsets[part]++] = idx;
+      }
+    }
+
+    // Bucket layout must be configured before the backing memory is allocated below.
+    if (use_partitioned) {
+      row_hash_.configure_partitioned_buckets(partition_counts, compact_row_hash);
+    } else {
+      row_hash_.configure_serial_buckets(n_rows, compact_row_hash);
+    }
+
+    {
+      scoped_timer_t timer("row_hash_mmap");
+      row_hash_.allocate_mmap("row hash table");
+    }
+
+#ifdef MPS_FAST_THP_PREFAULT
+    {
+      scoped_timer_t timer("row_hash_thp_prefault");
+      materialize_hugepages("row_names_ht",
+                            row_hash_.slots(),
+                            row_hash_.region().size(),
+                            materialize_touch_t::write_2mb);
+    }
+#endif
+
+    {
+      scoped_timer_t timer("row_hash_insert_all");
+      row_hash_.reset_build_probe_stats();
+      if (use_partitioned) {
+        insert_rows_partitioned(num_threads, partition_offsets, row_order, row_hashes);
+      } else {
+        insert_rows_serial(n_rows);
+      }
+      row_hash_.print_build_probe_report(n_rows);
+    }
+
+#ifdef MPS_FAST_MADV_COLLAPSE
+    {
+      scoped_timer_t timer("row_hash_madv_collapse");
+      row_hash_.region().advise(MADV_COLLAPSE);
+    }
+#endif
+  }
+
+  // Maps a row name to its row index. The dense-ordered index is consulted when it is the active
+  // mode (the expected case); otherwise the lookup goes through the row hash table.
+  size_t row_lookup(std::string_view name) const
+  {
+    if (UNLIKELY(row_index_mode != index_mode_t::dense_ordered)) { return row_hash_.lookup(name); }
+    return row_dense.lookup(name);
+  }
+
+  // Reads one row-name field at the cursor and resolves it directly through the dense naming
+  // scheme (fixed prefix followed by a decimal id) without a hash lookup.
+  //
+  // On success the cursor ends up after the field and any following whitespace, and the returned
+  // index is id - row_dense.min_id. On any mismatch the field is consumed with the generic
+  // read_field() and SIZE_MAX is returned.
+  size_t read_row_lookup_dense_ordered(cursor_t& cursor) const
+  {
+    const char* const field_begin = cursor.ptr;
+
+    // Common miss path: rewind to the field start, skip the field generically, report "no row".
+    auto reject = [&cursor, field_begin]() -> size_t {
+      cursor.ptr = field_begin;
+      cursor.read_field();
+      return SIZE_MAX;
+    };
+
+    const char* scan        = field_begin;
+    const size_t prefix_len = row_dense.prefix.size();
+    if (prefix_len > 0) {
+      const bool prefix_fits = (size_t)(cursor.end - scan) >= prefix_len;
+      if (!prefix_fits || std::memcmp(scan, row_dense.prefix.data(), prefix_len) != 0) {
+        return reject();
+      }
+      scan += prefix_len;
+    }
+
+    const char* const digits_begin = scan;
+    uint64_t id                    = 0;
+    fp64::parse_u64_digits_advance(scan, cursor.end, id);
+
+    // The checks below run in a fixed order; the dereference of `scan` is only reached once it is
+    // known to be inside the buffer.
+    const size_t digit_count = (size_t)(scan - digits_begin);
+    if (digit_count == 0 || digit_count > dense_suffix_max_digits) { return reject(); }
+    if (scan >= cursor.end || *scan > ' ') { return reject(); }
+    if (!row_dense.suffix_width_ok(id, digit_count)) { return reject(); }
+    if (id < row_dense.min_id || id > row_dense.max_id) { return reject(); }
+
+    cursor.ptr = scan;
+    cursor.skip_ws();
+    return (size_t)(id - row_dense.min_id);
+  }
+
+  // Reads the next field at the cursor as a row name and returns its row index. Uses the
+  // dense-ordered fast path when that index mode is active, otherwise reads the field and looks
+  // it up in the row hash table.
+  size_t read_row_lookup(cursor_t& cursor) const
+  {
+    if (UNLIKELY(row_index_mode != index_mode_t::dense_ordered)) {
+      return row_hash_.lookup(cursor.read_field());
+    }
+    return read_row_lookup_dense_ordered(cursor);
+  }
+};
+
+// =============================================================================
+// Section parsers
+// =============================================================================
+
+// Parses the optional NAME record. If the input starts directly with ROWS nothing is consumed.
+// Otherwise the NAME keyword is required; any remaining text on the line (trimmed) becomes the
+// problem name, and the line must then end.
+template <typename i_t, typename f_t>
+static void parse_name_section(parse_state_t<i_t, f_t>& state)
+{
+  scoped_timer_t timer("parse_name");
+  auto& cur               = state.cursor;
+  const bool name_omitted = peek(cur) == "ROWS";
+  if (!name_omitted) {
+    expect(cur, "NAME");
+    if (!cur.eol()) { state.problem_name_sv = cur.read_rest_of_line_trimmed(); }
+    expect_eol(cur);
+  }
+}
+
+// Parses the optional OBJSENSE section and records the optimization direction in
+// state.problem.maximize_.
+//
+// The sense keyword may follow OBJSENSE on the same line or sit on the next line. Accepted
+// keywords are MIN / MINIMIZE and MAX / MAXIMIZE; anything else is reported through
+// cursor.error(). A trailing comment is tolerated before the end of line.
+template <typename i_t, typename f_t>
+static void parse_objsense_section(parse_state_t<i_t, f_t>& state)
+{
+  scoped_timer_t timer("parse_objsense");
+  if (accept(state.cursor, "OBJSENSE")) {
+    // Keyword alone on its line: step over the line break so the sense is read from the next line.
+    if (state.cursor.eol()) { expect_eol(state.cursor); }
+    auto sense = state.cursor.read_field();
+    if (sense == "MIN" || sense == "MINIMIZE") {
+      state.problem.maximize_ = false;
+    } else if (sense == "MAX" || sense == "MAXIMIZE") {
+      state.problem.maximize_ = true;
+    } else {
+      // `sense` is a view into the input buffer and is not NUL-terminated, so it must be printed
+      // with an explicit length rather than a bare %s.
+      state.cursor.error("expected MIN/MAX or MINIMIZE/MAXIMIZE, got '%.*s'",
+                         (int)sense.size(),
+                         sense.data());
+    }
+    accept_comment(state.cursor);
+    expect_eol(state.cursor);
+  }
+}
+
+// Parses the optional OBJNAME section. When present, the objective row name is taken from the
+// same line as the keyword or from the following line; a trailing comment is tolerated.
+template <typename i_t, typename f_t>
+static void parse_objname_section(parse_state_t<i_t, f_t>& state)
+{
+  scoped_timer_t timer("parse_objname");
+  auto& cur = state.cursor;
+  if (!accept(cur, "OBJNAME")) { return; }
+
+  // Keyword alone on its line: the name is on the next line.
+  if (cur.eol()) { expect_eol(cur); }
+  state.objective_name_sv = cur.read_field();
+  accept_comment(cur);
+  expect_eol(cur);
+}
+
+// Half-open byte range [start, end) of the ROWS section assigned to one worker thread.
+struct row_chunk_boundary_t {
+  const char* start;
+  const char* end;
+};
+
+// Per-chunk summary produced by the counting pass of the parallel ROWS parser.
+struct row_chunk_info_t {
+  // Number of non-'N' rows found in the chunk.
+  size_t constraints = 0;
+  // Set when a line in the chunk could not be consumed by the fast line parser.
+  bool malformed     = false;
+  // Names of all 'N' (objective) rows found in the chunk, in file order.
+  std::vector<std::string_view> objective_names;
+  // Whether first_constraint_name is valid, i.e. the chunk contains at least one non-'N' row.
+  bool has_first_constraint = false;
+  // Name of the first non-'N' row in the chunk.
+  std::string_view first_constraint_name;
+};
+
+// Returns the position just past the first '\n' in [p, end), or `end` when the range holds no
+// newline. A position at or beyond `end` is returned unchanged.
+static const char* rows_find_next_line(const char* p, const char* end)
+{
+  if (p >= end) { return p; }
+  const char* newline = std::find(p, end, '\n');
+  return newline == end ? end : newline + 1;
+}
+
+// Parses one ROWS record starting at `p` (within [p, end)).
+//
+// Returns true and fills `row_type` / `row_name` when the line holds a row definition; `p` is
+// then at the start of the next line.
+//
+// Returns false in two distinguishable situations:
+//  - nothing to record (trailing whitespace, blank line, '*' or '$' comment line): `p` has been
+//    advanced past the skipped input, so the caller can simply continue;
+//  - malformed record (a row-type character with no row name after it): `p` is restored to its
+//    value on entry. Callers detect this as "returned false without making progress" and treat
+//    the chunk as malformed. `row_type` may have been overwritten in this case.
+static bool parse_rows_line_fast(const char*& p,
+                                 const char* end,
+                                 char& row_type,
+                                 std::string_view& row_name)
+{
+  const char* const record_start = p;
+
+  p = cursor_t::simd_scan<skip_whitespace>(p, end);
+  if (p >= end) { return false; }
+  if (*p == '\n') {
+    p++;
+    return false;
+  }
+  if (*p == '*' || *p == '$') {
+    p = rows_find_next_line(p, end);
+    return false;
+  }
+
+  row_type = *p++;
+  p        = cursor_t::simd_scan<skip_whitespace>(p, end);
+
+  const char* name_start = p;
+  p                      = cursor_t::simd_scan<until_whitespace>(p, end);
+  if (name_start == p) {
+    // Row type without a name. Rewind so the caller sees no progress and flags the line as
+    // malformed instead of silently dropping it.
+    p = record_start;
+    return false;
+  }
+  row_name = std::string_view(name_start, (size_t)(p - name_start));
+
+  // ROWS only uses fields 1-2. Fields 3-6 are ignored by the MPS spec, and
+  // field 3 may start with '$' to comment the rest of the record.
+  // This could be SIMD'd, but in practice the newline is right after the row name.
+  p = rows_find_next_line(p, end);
+  return true;
+}
+
+// Splits [rows_start, rows_end) into num_threads contiguous chunks of roughly equal byte size.
+// A byte-based cut can land in the middle of a record, so every interior cut is moved forward to
+// the start of the next line; the last chunk always ends at rows_end.
+static std::vector<row_chunk_boundary_t> compute_row_chunk_boundaries(const char* rows_start,
+                                                                      const char* rows_end,
+                                                                      int num_threads)
+{
+  scoped_timer_t timer("rows_compute_chunk_boundaries");
+
+  const size_t n_chunks = (size_t)num_threads;
+  std::vector<row_chunk_boundary_t> boundaries(n_chunks);
+  const size_t span_bytes  = (size_t)(rows_end - rows_start);
+  const size_t chunk_bytes = span_bytes / n_chunks;
+
+  boundaries[0].start = rows_start;
+  for (size_t c = 0; c + 1 < n_chunks; ++c) {
+    // Each interior cut closes chunk c and opens chunk c + 1.
+    const char* cut = rows_find_next_line(rows_start + (c + 1) * chunk_bytes, rows_end);
+    boundaries[c].end       = cut;
+    boundaries[c + 1].start = cut;
+  }
+  boundaries[n_chunks - 1].end = rows_end;
+
+  return boundaries;
+}
+
+// Reads the ROWS section in per-thread chunks and fills state.row_names_sv,
+// state.problem.row_types_, the objective-name bookkeeping and the dense-name metadata. It does
+// not touch the row hash table.
+// Parallel ROWS parser: count constraints per chunk, prefix-sum, then fill the output arrays
+// in parallel (with per-chunk dense-name reconciliation at the end). Must keep the same line
+// grammar as its serial twin parse_rows_section_serial_impl; parse_rows_section chooses between
+// them by size. Returns false if a chunk hit a malformed line (nothing committed for the fill
+// pass), so the caller can reset and retry serially for clean error reporting.
+template <typename i_t, typename f_t>
+static bool parse_rows_section_parallel_impl(parse_state_t<i_t, f_t>& state,
+                                             const char* rows_start,
+                                             const char* rows_end,
+                                             int num_threads)
+{
+  scoped_timer_t timer("parse_rows_parallel");
+
+  auto boundaries = compute_row_chunk_boundaries(rows_start, rows_end, num_threads);
+  std::vector<row_chunk_info_t> infos((size_t)num_threads);
+
+  // Pass 1 (parallel): each thread scans its own chunk and records how many constraint rows it
+  // holds, which objective ('N') rows it saw, and the name of its first constraint row.
+  {
+    scoped_timer_t timer("rows_count_parallel");
+#pragma omp parallel for num_threads(num_threads)
+    for (int t = 0; t < num_threads; ++t) {
+      MPS_NVTX_RANGE(std::string("rows_count_chunk ") + std::to_string(t), nvtx::colors::rows);
+      const char* p   = boundaries[(size_t)t].start;
+      const char* end = boundaries[(size_t)t].end;
+      row_chunk_info_t info;
+
+      while (p < end) {
+        char row_type = 0;
+        std::string_view row_name;
+        const char* before = p;
+        if (!parse_rows_line_fast(p, end, row_type, row_name)) {
+          // A failed parse that did not advance p cannot be skipped over: flag the chunk.
+          if (p == before) {
+            info.malformed = true;
+            break;
+          }
+          continue;
+        }
+
+        if (row_type == 'N') {
+          info.objective_names.push_back(row_name);
+        } else {
+          if (!info.has_first_constraint) {
+            info.first_constraint_name = row_name;
+            info.has_first_constraint  = true;
+          }
+          info.constraints++;
+        }
+      }
+
+      // Each thread writes only its own slot.
+      infos[(size_t)t] = info;
+    }
+  }
+
+  // Bail out before any parser state is modified so the caller can retry serially.
+  if (std::any_of(
+        infos.begin(), infos.end(), [](const row_chunk_info_t& info) { return info.malformed; })) {
+    return false;
+  }
+
+  // Prefix sum over the per-chunk counts: offsets[t] is where chunk t starts writing in the
+  // global output arrays, which enables the parallel scatter in pass 2.
+  std::vector<size_t> offsets((size_t)num_threads + 1, 0);
+  {
+    scoped_timer_t timer("rows_prefix_sum");
+    for (int t = 0; t < num_threads; ++t) {
+      offsets[(size_t)t + 1] = offsets[(size_t)t] + infos[(size_t)t].constraints;
+    }
+  }
+
+  size_t total_rows = offsets[(size_t)num_threads];
+  if (UNLIKELY(total_rows > (size_t)INT_MAX)) {
+    state.cursor.error("fast MPS parser requires <= INT_MAX rows, got %zu", total_rows);
+  }
+  {
+    scoped_timer_t timer("rows_resize_outputs");
+    state.row_names_sv.resize(total_rows);
+    state.problem.row_types_.resize(total_rows);
+  }
+
+  // If no objective name was set earlier, the first 'N' row in file order (chunks are in file
+  // order) becomes the objective. Every other 'N' row name is recorded as ignored.
+  if (state.objective_name_sv.empty()) {
+    for (const auto& info : infos) {
+      if (!info.objective_names.empty()) {
+        state.objective_name_sv = info.objective_names.front();
+        break;
+      }
+    }
+  }
+  for (const auto& info : infos) {
+    for (std::string_view name : info.objective_names) {
+      if (name != state.objective_name_sv) { state.ignored_objective_names.insert(name); }
+    }
+  }
+
+  // Dense-name detection: derive prefix / base id / pad width from the first constraint row of
+  // the whole section; each chunk then validates its own rows against that scheme in pass 2.
+  bool dense_candidate = total_rows > 0;
+  std::string_view dense_prefix;
+  uint64_t dense_base_id = 0;
+  size_t dense_pad_width = 0;
+
+  if (dense_candidate) {
+    std::string_view first_name;
+    for (const auto& info : infos) {
+      if (info.has_first_constraint) {
+        first_name = info.first_constraint_name;
+        break;
+      }
+    }
+
+    uint64_t first_value      = 0;
+    size_t first_suffix_width = 0;
+    if (!parse_trailing_u64(first_name, dense_prefix, first_value, first_suffix_width)) {
+      dense_candidate = false;
+    } else {
+      dense_base_id   = first_value;
+      dense_pad_width = dense_initial_pad_width(first_name, first_suffix_width);
+    }
+  }
+
+  // One flag per chunk (1 = chunk is consistent with the dense scheme), written by its owner.
+  std::vector<uint8_t> dense_ok_by_chunk((size_t)num_threads, 1);
+
+  // Pass 2 (parallel): re-scan each chunk and write names/types at the chunk's global offset.
+  {
+    scoped_timer_t timer("rows_fill_parallel");
+#pragma omp parallel for num_threads(num_threads)
+    for (int t = 0; t < num_threads; ++t) {
+      MPS_NVTX_RANGE(std::string("rows_fill_chunk ") + std::to_string(t), nvtx::colors::rows);
+      const char* p   = boundaries[(size_t)t].start;
+      const char* end = boundaries[(size_t)t].end;
+      size_t out      = offsets[(size_t)t];
+
+      // Thread-local copy of the dense scheme, so validation does not share mutable state.
+      bool local_dense_ok = dense_candidate;
+      dense_name_index_t dense_index;
+      if (local_dense_ok) {
+        dense_index.prefix.assign(dense_prefix);
+        dense_index.min_id    = dense_base_id;
+        dense_index.max_id    = dense_base_id;
+        dense_index.pad_width = dense_pad_width;
+      }
+
+      while (p < end) {
+        char row_type = 0;
+        std::string_view row_name;
+        const char* before = p;
+        if (!parse_rows_line_fast(p, end, row_type, row_name)) {
+          if (p == before) {
+            local_dense_ok = false;
+            break;
+          }
+          continue;
+        }
+
+        // Objective rows were handled after pass 1 and take no slot in the output arrays.
+        if (row_type == 'N') { continue; }
+
+        state.row_names_sv[out]       = row_name;
+        state.problem.row_types_[out] = row_type;
+
+        if (local_dense_ok) {
+          // observe_dense_name receives a scratch copy of the global row position as its
+          // count argument; the expected id for this row is dense_base_id + out.
+          size_t observed_count = out;
+          observe_dense_name(
+            local_dense_ok, dense_index, observed_count, row_name, dense_base_id + out);
+        }
+        out++;
+      }
+
+      dense_ok_by_chunk[(size_t)t] = local_dense_ok ? 1 : 0;
+    }
+  }
+
+  // The section is dense only if every chunk agreed; publish the final metadata in that case.
+  {
+    scoped_timer_t timer("rows_dense_metadata");
+    for (uint8_t ok : dense_ok_by_chunk) {
+      dense_candidate = dense_candidate && ok;
+    }
+    state.row_dense_candidate = dense_candidate;
+    if (dense_candidate) {
+      state.row_dense.prefix.assign(dense_prefix);
+      state.row_dense.min_id    = dense_base_id;
+      state.row_dense.max_id    = dense_base_id + total_rows - 1;
+      state.row_dense.pad_width = dense_pad_width;
+    }
+  }
+
+  return true;
+}
+
+// Serial ROWS parser: walks the cursor record by record until rows_end. Each record is a one
+// character row type followed by the row name; anything after the name is skipped. 'N' rows are
+// reported as objective rows; every other row is appended to row_names_sv / row_types_ and fed
+// to the dense-name observer.
+template <typename i_t, typename f_t>
+static void parse_rows_section_serial_impl(parse_state_t<i_t, f_t>& state, const char* rows_end)
+{
+  scoped_timer_t timer("parse_rows_serial");
+
+  auto& cur = state.cursor;
+  while (cur.ptr < rows_end) {
+    const auto kind = cur.ptr[0];
+    cur.advance(1);
+    cur.skip_ws();
+
+    const auto name = cur.read_field();
+    // Fields after the row name carry no meaning in ROWS; annotations/comments are tolerated.
+    cur.skip_to_eol();
+
+    if (kind != 'N') {
+      size_t row_idx = state.row_names_sv.size();
+      state.row_names_sv.push_back(name);
+      // The first row has no predecessor to derive an expected id from, hence the sentinel.
+      const auto expected_id =
+        row_idx == 0 ? std::numeric_limits<uint64_t>::max() : state.row_dense.min_id + row_idx;
+      observe_dense_name(state.row_dense_candidate, state.row_dense, row_idx, name, expected_id);
+      state.problem.row_types_.push_back(kind);
+    } else {
+      // Objective row: remember its name, but it is not a constraint.
+      state.observe_objective_row_name(name);
+    }
+    expect_eol(cur);
+  }
+
+  const size_t n_rows = state.row_names_sv.size();
+  if (UNLIKELY(n_rows > (size_t)INT_MAX)) {
+    cur.error("fast MPS parser requires <= INT_MAX rows, got %zu", n_rows);
+  }
+}
+
+// Parses the ROWS section up to rows_end, then sizes the right-hand-side vector and builds the
+// row-name index. Sections of at least 512 MiB are parsed in parallel when more than one thread
+// is available; if the parallel parser reports a malformed chunk, the row state is reset and the
+// section is parsed again serially so that errors are reported from the serial path.
+template <typename i_t, typename f_t>
+static void parse_rows_section(parse_state_t<i_t, f_t>& state, const char* rows_end)
+{
+  scoped_timer_t timer("parse_rows");
+  expect_section(state.cursor, "ROWS");
+
+  {
+    scoped_timer_t timer("parse_rows_scan");
+    const char* rows_start = state.cursor.ptr;
+
+    const size_t rows_bytes = (size_t)(rows_end - rows_start);
+    const int num_threads   = phase_thread_count(MPS_ROWS_THREAD_CAP);
+    const bool try_parallel = rows_bytes >= 512 * MiB && num_threads > 1;
+
+    bool parsed_parallel = false;
+    if (try_parallel) {
+      parsed_parallel =
+        parse_rows_section_parallel_impl<i_t, f_t>(state, rows_start, rows_end, num_threads);
+    }
+
+    if (!parsed_parallel) {
+      if (try_parallel) {
+        // The parallel attempt gave up: restore a clean slate before the serial retry.
+        state.row_names_sv.clear();
+        state.problem.row_types_.clear();
+        state.row_dense_candidate = true;
+        state.row_dense.reset();
+        state.cursor.ptr = rows_start;
+      }
+      parse_rows_section_serial_impl(state, rows_end);
+    }
+    state.cursor.ptr = rows_end;
+  }
+
+  state.problem.n_constraints_ = (i_t)state.row_names_sv.size();
+  state.problem.b_.resize((size_t)state.problem.n_constraints_);
+
+  {
+    scoped_timer_t timer("parse_rows_hash_init");
+    state.init_row_hash_table();
+  }
+}
+
+// Columns parser
+
+// Integer variable marker ('MARKER' record with 'INTORG' / 'INTEND') seen while parsing a
+// COLUMNS chunk, together with its position relative to the chunk's variables.
+struct marker_info_t {
+  enum Type { INTORG, INTEND };
+  Type type;
+  // Chunk-local index of the last variable seen before the marker.
+  size_t after_local_var_idx;  // SIZE_MAX means "before first variable"
+};
+
+// Descriptor of one allocated block of per-row counters inside chunk_result_t.
+struct row_count_block_t {
+  // Index of the row block (row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS) this descriptor covers.
+  size_t block_id       = 0;
+  // Position of the block's first counter inside chunk_result_t::row_count_storage.
+  size_t storage_offset = 0;
+};
+
+// Each column parsing worker owns chunks of the global CSC which are parsed in parallel and then
+// later scattered into the final CSR
+struct chunk_result_t {
+  // Nonzero coefficients and their row indices, appended together in parse order.
+  std::vector<double> values;
+  std::vector<uint32_t> row_indices;
+  // Chunk-local column start offsets into values / row_indices, closed by a final end offset.
+  std::vector<size_t> col_offsets;
+  // Variable names in order of first appearance; the views point into var_name_arena.
+  std::vector<std::string_view> var_names;
+  chunk_name_arena_t var_name_arena;
+  // Integer markers encountered in the chunk, in parse order.
+  std::vector<marker_info_t> markers;
+  std::vector<std::pair<size_t, double>> objective_entries;  // local_col_idx -> coefficient
+  // COLUMNS is parsed as chunk-local CSC. To build the global CSR, each chunk needs row counts
+  // first, then row-local write cursors for scatter. Store those counts only for touched
+  // 4096-row blocks instead of allocating a dense chunks*n_rows matrix
+  // The same slots are rewritten as write cursors after the global CSR row offsets are known
+  std::vector<int64_t> row_count_storage;
+  // Descriptors of the allocated blocks, in allocation order.
+  std::vector<row_count_block_t> row_count_blocks;
+  // Indexed by block id: position in row_count_blocks, or a negative value if not allocated yet.
+  std::vector<int32_t> row_count_block_dir;
+  // Dense-name observation state for the chunk's variable names.
+  dense_observe_state_t dense_col_stats;
+};
+
+// Half-open byte range [start, end) of the COLUMNS section assigned to one worker thread.
+struct chunk_boundary_t {
+  const char* start;
+  const char* end;
+};
+
+// Half-open byte range [start, end) of the BOUNDS section assigned to one worker thread.
+struct bounds_chunk_boundary_t {
+  const char* start;
+  const char* end;
+};
+
+// Returns a reference to the per-row counter for row_idx within this chunk. Counters are kept
+// sparsely, in blocks of COLUMN_ROW_COUNT_BLOCK_ROWS rows: a block is allocated (zero-filled) the
+// first time one of its rows is touched. This works well because nonzeros tend to cluster in the
+// same matrix blocks.
+static inline int64_t& column_row_count_slot(chunk_result_t& result, size_t row_idx)
+{
+  const size_t block_id        = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS;
+  const size_t offset_in_block = row_idx % COLUMN_ROW_COUNT_BLOCK_ROWS;
+
+  // The directory itself is never resized here, so this reference stays valid throughout.
+  int32_t& dir_entry = result.row_count_block_dir[block_id];
+  if (UNLIKELY(dir_entry < 0)) {
+    row_count_block_t fresh;
+    fresh.block_id       = block_id;
+    fresh.storage_offset = result.row_count_storage.size();
+
+    dir_entry = (int32_t)result.row_count_blocks.size();
+    result.row_count_storage.resize(fresh.storage_offset + COLUMN_ROW_COUNT_BLOCK_ROWS, 0);
+    result.row_count_blocks.push_back(fresh);
+  }
+
+  const row_count_block_t& block = result.row_count_blocks[(size_t)dir_entry];
+  return result.row_count_storage[block.storage_offset + offset_in_block];
+}
+
+// Decides whether a chunk's observed pad width is consistent with the global one.
+//  - Global width 0: the chunk must have pad width 0 as well.
+//  - Global width > 0: the chunk must either use the same width, or have pad width 0 while its
+//    smallest id already has at least that many decimal digits.
+static bool dense_col_chunk_padding_compatible(const dense_observe_state_t& stats,
+                                               size_t global_pad_width)
+{
+  const auto chunk_pad = stats.index.pad_width;
+  if (global_pad_width == 0) { return chunk_pad == 0; }
+  if (chunk_pad == global_pad_width) { return true; }
+  return chunk_pad == 0 && decimal_digits_u64(stats.index.min_id) >= global_pad_width;
+}
+
+// Returns the position just past the first '\n' in [p, end), or `end` when no newline remains.
+// A position at or beyond `end` is returned unchanged.
+static const char* find_next_line(const char* p, const char* end)
+{
+  if (p >= end) { return p; }
+  const void* hit = std::memchr(p, '\n', (size_t)(end - p));
+  if (hit == nullptr) { return end; }
+  return static_cast<const char*>(hit) + 1;
+}
+
+// Returns the third whitespace-separated field of the line starting at line_start (the variable
+// name of a BOUNDS record) without consuming anything. Scanning never crosses a '\n' and never
+// goes past `end`; if the line has fewer than three fields the result is empty.
+static std::string_view peek_bounds_line_var_name(const char* line_start, const char* end)
+{
+  // Skips blanks (anything <= ' ' except the newline).
+  const auto skip_blanks = [end](const char* it) {
+    while (it < end && *it <= ' ' && *it != '\n') {
+      ++it;
+    }
+    return it;
+  };
+  // Skips one run of non-blank characters.
+  const auto skip_token = [end](const char* it) {
+    while (it < end && *it > ' ') {
+      ++it;
+    }
+    return it;
+  };
+
+  const char* pos = line_start;
+  for (int skipped = 0; skipped < 2; ++skipped) {
+    pos = skip_token(skip_blanks(pos));
+  }
+
+  const char* name_begin = skip_blanks(pos);
+  const char* name_end   = skip_token(name_begin);
+  return std::string_view(name_begin, (size_t)(name_end - name_begin));
+}
+
+// Walks backwards from p to the start of the line containing it: the position right after the
+// closest preceding '\n', or section_start if there is none. A p at or before section_start is
+// returned unchanged.
+static const char* find_line_start(const char* section_start, const char* p)
+{
+  const char* it = p;
+  for (; it > section_start; --it) {
+    if (*(it - 1) == '\n') { break; }
+  }
+  return it;
+}
+
+// Splits the BOUNDS section [section_start, section_end) into num_threads contiguous chunks of
+// roughly equal byte size. Interior cuts are aligned to line starts and then pushed forward while
+// the records on both sides of the cut name the same variable, so that consecutive records for
+// one variable stay within a single chunk and a thread can apply them in file order locally.
+static std::vector<bounds_chunk_boundary_t> compute_bounds_chunk_boundaries(
+  const char* section_start, const char* section_end, int num_threads)
+{
+  scoped_timer_t timer("bounds_compute_chunk_boundaries");
+
+  const size_t n_chunks    = (size_t)num_threads;
+  const size_t chunk_bytes = (size_t)(section_end - section_start) / n_chunks;
+
+  std::vector<bounds_chunk_boundary_t> boundaries(n_chunks);
+  boundaries[0].start = section_start;
+
+  // True when the record ending at `cut` and the record starting at `cut` both carry a variable
+  // name and those names are equal.
+  const auto splits_variable_group = [&](const char* cut) {
+    const char* prev_line = find_line_start(section_start, cut - 1);
+    const auto before     = peek_bounds_line_var_name(prev_line, section_end);
+    const auto after      = peek_bounds_line_var_name(cut, section_end);
+    return !before.empty() && !after.empty() && before == after;
+  };
+
+  for (size_t c = 0; c + 1 < n_chunks; ++c) {
+    const char* cut = find_next_line(section_start + (c + 1) * chunk_bytes, section_end);
+    while (cut < section_end && splits_variable_group(cut)) {
+      cut = find_next_line(cut, section_end);
+    }
+    boundaries[c].end       = cut;
+    boundaries[c + 1].start = cut;
+  }
+  boundaries[n_chunks - 1].end = section_end;
+
+  return boundaries;
+}
+
+// Splits the COLUMNS section [columns_start, columns_end) into num_threads contiguous chunks of
+// roughly equal byte size without splitting a column across chunks.
+//
+// For every interior cut the byte estimate is first aligned to the next line start; the cut is
+// then moved forward to the first following line whose leading field differs from the column
+// name found there. Lines whose leading field is empty or starts with a quote are not treated as
+// a column change. If no change is found before columns_end, the chunk takes the rest of the
+// section (later chunks are then empty) so the trailing column is never split.
+static std::vector<chunk_boundary_t> compute_chunk_boundaries(const char* columns_start,
+                                                              const char* columns_end,
+                                                              int num_threads)
+{
+  scoped_timer_t timer("compute_chunk_boundaries");
+
+  size_t total_size = (size_t)(columns_end - columns_start);
+  size_t chunk_size = total_size / (size_t)num_threads;
+
+  std::vector<chunk_boundary_t> boundaries((size_t)num_threads);
+
+  for (int t = 0; t < num_threads; t++) {
+    if (t == 0) { boundaries[(size_t)t].start = columns_start; }
+
+    if (t == num_threads - 1) {
+      boundaries[(size_t)t].end = columns_end;
+    } else {
+      // Find estimated position and align to line boundary
+      const char* estimated_end = columns_start + (size_t)(t + 1) * chunk_size;
+      const char* line_start    = find_next_line(estimated_end, columns_end);
+
+      // Read column name at this line
+      std::string_view col_name = cursor_t::peek_field_at(line_start, columns_end);
+
+      // Scan forward until column name changes (to avoid splitting a column)
+      const char* boundary = line_start;
+      while (boundary < columns_end) {
+        const char* next_line = find_next_line(boundary, columns_end);
+        if (next_line >= columns_end) {
+          // Everything from line_start to the end belongs to the same column: keep it together
+          // in this chunk instead of cutting in front of its last line.
+          boundary = columns_end;
+          break;
+        }
+
+        std::string_view next_col = cursor_t::peek_field_at(next_line, columns_end);
+        boundary                  = next_line;
+        if (next_col != col_name && !next_col.empty() && next_col[0] != '\'') {
+          // Found a column transition. Marker-state fixup later handles any split near markers.
+          break;
+        }
+      }
+      boundaries[(size_t)t].end = boundary;
+    }
+  }
+
+  // Fix up start pointers (each start is previous end)
+  for (int t = 1; t < num_threads; t++) {
+    boundaries[(size_t)t].start = boundaries[(size_t)t - 1].end;
+  }
+
+  return boundaries;
+}
+
+// Parses one chunk [chunk_start, chunk_end) of the COLUMNS section into a chunk-local CSC
+// (values / row_indices / col_offsets), together with the chunk's variable names, integer
+// markers, objective coefficients and sparse per-row nonzero counts.
+//
+// `state` is only read: row names are resolved through state.row_lookup, and entries on the
+// objective row or on ignored objective rows are routed accordingly. An unknown row name or an
+// unknown marker type is reported through cursor.error(). Parsing stops at the end of the chunk
+// or at a line starting with the RHS keyword. An empty chunk yields a result whose col_offsets
+// holds the single value 0.
+template <typename i_t, typename f_t>
+static chunk_result_t parse_columns_chunk(const char* chunk_start,
+                                          const char* chunk_end,
+                                          const parse_state_t<i_t, f_t>& state)
+{
+  chunk_result_t result;
+
+  if (chunk_start >= chunk_end) {
+    result.col_offsets.push_back(0);
+    return result;
+  }
+
+  // Capacity guesses derived from the chunk's byte size; they only affect reservations.
+  size_t chunk_size     = (size_t)(chunk_end - chunk_start);
+  size_t estimated_nnz  = chunk_size / 100;
+  size_t estimated_cols = estimated_nnz / 10;
+  if (UNLIKELY(state.problem.n_constraints_ > (i_t)std::numeric_limits<int32_t>::max())) {
+    state.cursor.error("fast COLUMNS path requires <= INT32_MAX rows for chunk row indices");
+  }
+  result.values.reserve(estimated_nnz);
+  result.row_indices.reserve(estimated_nnz);
+  result.col_offsets.reserve(estimated_cols + 1);
+  result.var_names.reserve(estimated_cols);
+  result.var_name_arena.reserve(std::max<size_t>(4096, estimated_cols * 16));
+  result.objective_entries.reserve(estimated_cols);
+  // One directory entry per block of rows; -1 marks a block with no counters allocated yet.
+  size_t n_row_blocks =
+    cuda::ceil_div((size_t)state.problem.n_constraints_, COLUMN_ROW_COUNT_BLOCK_ROWS);
+  result.row_count_block_dir.resize(n_row_blocks, -1);
+  size_t estimated_touched_blocks = std::min(n_row_blocks, std::max<size_t>(16, estimated_nnz));
+  result.row_count_blocks.reserve(estimated_touched_blocks);
+  result.row_count_storage.reserve(estimated_touched_blocks * COLUMN_ROW_COUNT_BLOCK_ROWS);
+
+  cursor_t cursor(chunk_start, (size_t)(chunk_end - chunk_start));
+  std::string_view prev_var_name = "";
+
+  cursor.skip_ws();
+
+  while (!cursor.done()) {
+    if (UNLIKELY(*cursor.ptr == 'R')) {
+      auto next = cursor.peek_field();
+      // RHS section is mandatory right after COLUMNS section
+      if (next == "RHS") { break; }
+    }
+
+    auto [var_name, field2] = cursor.read_two_fields();
+    // A '$' in the second field comments out the rest of the record.
+    if (UNLIKELY(!field2.empty() && field2[0] == '$')) {
+      cursor.skip_to_eol();
+      expect_eol(cursor);
+      continue;
+    }
+
+    // Check for integer marker. field2 may be empty, so guard the first-character fast check.
+    if (UNLIKELY(!field2.empty() && field2[0] == '\'' && field2 == "'MARKER'")) {
+      auto marker_type = cursor.read_field();
+
+      marker_info_t marker;
+      marker.after_local_var_idx =
+        result.var_names.empty() ? SIZE_MAX : result.var_names.size() - 1;
+
+      if (marker_type == "'INTORG'") {
+        marker.type = marker_info_t::INTORG;
+      } else if (marker_type == "'INTEND'") {
+        marker.type = marker_info_t::INTEND;
+      } else {
+        cursor.error("unknown integer marker type in COLUMNS: %.*s",
+                     (int)marker_type.size(),
+                     marker_type.data());
+      }
+      result.markers.push_back(marker);
+
+      // Skip whatever remains of the marker line, then the line break itself.
+      while (!cursor.done() && !cursor.eol())
+        cursor.ptr++;
+      if (!cursor.done()) cursor.ptr++;
+      cursor.skip_ws();
+      continue;
+    }
+
+    auto row_name = field2;
+    // quite often in MIPs the coefficient is just a single-digit integer
+    double value;
+    double sign = 1.0;
+    // Only look at the sign character if the cursor is still inside the chunk.
+    if (cursor.ptr < cursor.end && cursor.ptr[0] == '-') {
+      sign = -1.0;
+      cursor.advance(1);
+    }
+    if (cursor.ptr + 1 < cursor.end && fp64::is_digit(cursor.ptr[0]) &&
+        (cursor.ptr[1] == '\n' || cursor.ptr[1] == '\r')) {
+      // Fast path: exactly one digit directly followed by the line break.
+      value = sign * (cursor.ptr[0] - '0');
+      cursor.advance(1);
+    } else {
+      value = sign * fp64::parse_fp64_advance(cursor.ptr, cursor.end);
+    }
+    // usually EOL directly follows
+    if (UNLIKELY(!cursor.eol())) { cursor.skip_ws(); }
+    accept_comment(cursor);
+
+    // A change of variable name opens a new chunk-local column.
+    if (prev_var_name != var_name) {
+      // Copy the name into the chunk's arena and keep referring to that copy from here on.
+      std::string_view owned_var_name = result.var_name_arena.copy(var_name);
+      result.var_names.push_back(owned_var_name);
+      observe_dense_name(result.dense_col_stats.candidate,
+                         result.dense_col_stats.index,
+                         result.dense_col_stats.count,
+                         owned_var_name);
+      result.col_offsets.push_back(result.values.size());
+      prev_var_name = owned_var_name;
+    }
+
+    // Routes one (row name, coefficient) pair: constraint rows go into the CSC arrays and bump
+    // the per-row counter, the objective row goes into objective_entries, ignored objective rows
+    // are dropped, and anything else is an error.
+    auto add_entry = [&](std::string_view rn, double val) {
+      size_t row_idx = state.row_lookup(rn);
+      if (LIKELY(row_idx != SIZE_MAX)) {
+        assert(row_idx <= (size_t)std::numeric_limits<int32_t>::max());
+        result.values.push_back(val);
+        result.row_indices.push_back((uint32_t)row_idx);
+        column_row_count_slot(result, row_idx)++;
+      } else if (LIKELY(rn == state.objective_name_sv)) {
+        result.objective_entries.push_back({result.var_names.size() - 1, val});
+      } else if (state.ignored_objective_names.count(rn)) {
+        return;
+      } else {
+        cursor.error("unknown row name in COLUMNS: %.*s", (int)rn.size(), rn.data());
+      }
+    };
+
+    add_entry(row_name, value);
+
+    // Optional second entry on same line
+    if (!cursor.eol()) {
+      auto row_name2 = cursor.read_field();
+      if (UNLIKELY(!row_name2.empty() && row_name2[0] == '$')) {
+        cursor.skip_to_eol();
+        expect_eol(cursor);
+        continue;
+      }
+      double value2 = fp64::parse_fp64_advance(cursor.ptr, cursor.end);
+      cursor.skip_ws();
+      accept_comment(cursor);
+
+      add_entry(row_name2, value2);
+    }
+
+    expect_eol(cursor);
+  }
+
+  // Closing offset, so that column c spans [col_offsets[c], col_offsets[c + 1]).
+  result.col_offsets.push_back(result.values.size());
+
+  return result;
+}
+
+// Fused merge + CSR construction: directly builds CSR from chunks without intermediate global CSC
+// Global dimensions derived from the per-chunk COLUMNS results, used when merging them.
+template <typename i_t>
+struct column_merge_shape_t {
+  // Number of parsed chunks.
+  int num_chunks = 0;
+  // Number of constraint rows.
+  i_t n_rows     = 0;
+  // Prefix sum of per-chunk variable counts: entry t is the global index of chunk t's first
+  // column; the last entry equals total_cols.
+  std::vector<size_t> global_col_offset;
+  // Total number of columns across all chunks.
+  size_t total_cols = 0;
+  // Total number of stored nonzeros across all chunks.
+  size_t total_nnz  = 0;
+};
+
+// Computes the global column offsets of every chunk plus the total column and nonzero counts.
+// When i_t is narrower than 64 bits, fails through mps_parser_fail if the nonzero count, the
+// column count or the row count does not fit into i_t.
+template <typename i_t>
+static column_merge_shape_t<i_t> compute_column_merge_shape(
+  const std::vector<chunk_result_t>& chunks, i_t n_rows)
+{
+  column_merge_shape_t<i_t> shape;
+  shape.num_chunks = (int)chunks.size();
+  shape.n_rows     = n_rows;
+
+  auto& col_offsets = shape.global_col_offset;
+  col_offsets.resize((size_t)shape.num_chunks + 1);
+  {
+    scoped_timer_t timer("columns_global_offsets");
+    for (int t = 0; t < shape.num_chunks; t++) {
+      const chunk_result_t& chunk = chunks[(size_t)t];
+      col_offsets[(size_t)t + 1]  = col_offsets[(size_t)t] + chunk.var_names.size();
+      shape.total_nnz += chunk.values.size();
+    }
+  }
+  shape.total_cols = col_offsets[(size_t)shape.num_chunks];
+
+  if constexpr (std::numeric_limits<i_t>::max() < std::numeric_limits<int64_t>::max()) {
+    const size_t index_max = (size_t)std::numeric_limits<i_t>::max();
+    const bool nnz_fits    = shape.total_nnz <= index_max;
+    if (!nnz_fits) {
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "fast MPS parser requires 64-bit indices: nnz=%zu exceeds index max=%zu",
+                      shape.total_nnz,
+                      index_max);
+    }
+    const bool dims_fit = shape.total_cols <= index_max && (size_t)n_rows <= index_max;
+    if (!dims_fit) {
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "fast MPS parser requires 64-bit indices: rows=%zu cols=%zu exceed index "
+                      "max=%zu",
+                      (size_t)n_rows,
+                      shape.total_cols,
+                      index_max);
+    }
+  }
+  return shape;
+}
+
+// Decides whether the column names of all chunks together form one dense, ordered sequence
+// (common prefix + consecutive ids) and sets state.col_index_mode accordingly. On success the
+// prefix, id range and pad width are stored in state.col_dense; otherwise the mode is `hash`.
+template <typename i_t, typename f_t>
+static void detect_dense_column_metadata(parse_state_t<i_t, f_t>& state,
+                                         const std::vector<chunk_result_t>& chunks,
+                                         const column_merge_shape_t<i_t>& shape)
+{
+  scoped_timer_t timer("columns_dense_metadata");
+  bool dense_ok   = shape.total_cols > 0;
+  bool have_first = false;
+  std::string_view dense_prefix;
+  uint64_t expected_next_id = 0;
+  uint64_t dense_min_id     = 0;
+  uint64_t dense_max_id     = 0;
+  size_t dense_pad_width    = 0;
+
+  // Walk the chunks in order and check that each one continues the sequence of the previous.
+  for (int t = 0; t < shape.num_chunks && dense_ok; ++t) {
+    const auto& stats = chunks[(size_t)t].dense_col_stats;
+    // Chunks without observed names do not affect the sequence.
+    if (stats.count == 0) { continue; }
+    // The chunk itself must be dense and must have observed every one of its variables.
+    if (!stats.candidate || stats.count != chunks[(size_t)t].var_names.size()) {
+      dense_ok = false;
+      break;
+    }
+    // The first non-empty chunk defines the global prefix, starting id and pad width.
+    if (!have_first) {
+      have_first       = true;
+      dense_prefix     = stats.index.prefix;
+      expected_next_id = stats.index.min_id;
+      dense_min_id     = stats.index.min_id;
+      dense_pad_width  = stats.index.pad_width;
+    }
+    // Same prefix, starts exactly where the previous chunk ended, compatible padding.
+    if (stats.index.prefix != dense_prefix || stats.index.min_id != expected_next_id ||
+        !dense_col_chunk_padding_compatible(stats, dense_pad_width)) {
+      dense_ok = false;
+      break;
+    }
+    // The chunk's id range must be non-inverted and contain exactly `count` ids.
+    if (stats.index.max_id < stats.index.min_id ||
+        stats.index.max_id - stats.index.min_id + 1 != stats.count) {
+      dense_ok = false;
+      break;
+    }
+    dense_max_id = stats.index.max_id;
+    // Guard the increment below against wrapping around.
+    if (stats.index.max_id == std::numeric_limits<uint64_t>::max()) {
+      dense_ok = false;
+      break;
+    }
+    expected_next_id = stats.index.max_id + 1;
+  }
+
+  // The overall id range has to account for every column.
+  if (!have_first || dense_max_id < dense_min_id ||
+      dense_max_id - dense_min_id + 1 != shape.total_cols) {
+    dense_ok = false;
+  }
+
+  state.col_index_mode = dense_ok ? index_mode_t::dense_ordered : index_mode_t::hash;
+  if (dense_ok) {
+    state.col_dense.prefix.assign(dense_prefix);
+    state.col_dense.min_id    = dense_min_id;
+    state.col_dense.max_id    = dense_max_id;
+    state.col_dense.pad_width = dense_pad_width;
+  }
+}
+
+// Sums the per-chunk, per-block row counts into one count per row, then writes the CSR row
+// offsets (exclusive prefix sum of those counts, n_rows + 1 entries) into
+// state.problem.A_offsets_. Returns the per-row totals.
+template <typename i_t, typename f_t>
+static std::vector<i_t> build_csr_row_offsets(parse_state_t<i_t, f_t>& state,
+                                              const std::vector<chunk_result_t>& chunks,
+                                              const column_merge_shape_t<i_t>& shape)
+{
+  const size_t row_count = (size_t)shape.n_rows;
+  std::vector<i_t> global_row_counts(row_count, 0);
+  {
+    scoped_timer_t timer("columns_sum_row_counts");
+    for (int t = 0; t < shape.num_chunks; t++) {
+      const chunk_result_t& chunk = chunks[(size_t)t];
+      for (const auto& block : chunk.row_count_blocks) {
+        const int64_t* counts = chunk.row_count_storage.data() + block.storage_offset;
+        const size_t first_row = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+        // The last block may be only partially populated.
+        const size_t rows_in_block = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, row_count - first_row);
+        for (size_t k = 0; k < rows_in_block; ++k) {
+          global_row_counts[first_row + k] += (i_t)counts[k];
+        }
+      }
+    }
+  }
+  {
+    scoped_timer_t timer("columns_build_row_offsets");
+    auto& offsets = state.problem.A_offsets_;
+    offsets.resize(row_count + 1);
+    offsets[0] = 0;
+    for (size_t row = 0; row < row_count; ++row) {
+      offsets[row + 1] = offsets[row] + global_row_counts[row];
+    }
+  }
+  return global_row_counts;
+}
+
+// Rewrites every non-zero per-chunk row count in place as a write position: the row's offset
+// from `row_offsets` plus the number of entries already assigned to that row by earlier chunks
+// and blocks. `global_row_counts` is zeroed and reused as that running per-row tally, so on
+// return it holds the per-row totals again.
+template <typename i_t>
+static void convert_counts_to_write_positions(std::vector<chunk_result_t>& chunks,
+                                              const column_merge_shape_t<i_t>& shape,
+                                              const std::vector<i_t>& row_offsets,
+                                              std::vector<i_t>& global_row_counts)
+{
+  scoped_timer_t timer("columns_counts_to_write_positions");
+  std::fill(global_row_counts.begin(), global_row_counts.end(), i_t{0});
+
+  const size_t row_count = (size_t)shape.n_rows;
+  for (int t = 0; t < shape.num_chunks; t++) {
+    chunk_result_t& chunk = chunks[(size_t)t];
+    for (const auto& block : chunk.row_count_blocks) {
+      int64_t* slots             = chunk.row_count_storage.data() + block.storage_offset;
+      const size_t first_row     = block.block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+      const size_t rows_in_block = std::min(COLUMN_ROW_COUNT_BLOCK_ROWS, row_count - first_row);
+      for (size_t k = 0; k < rows_in_block; ++k) {
+        const int64_t entries = slots[k];
+        if (entries != 0) {
+          const size_t row    = first_row + k;
+          const i_t write_pos = row_offsets[row] + global_row_counts[row];
+          slots[k]            = (int64_t)write_pos;
+          global_row_counts[row] += (i_t)entries;
+        }
+      }
+    }
+  }
+}
+
+// Runs materialize_vector_hugepages (write_2mb touch mode) over every chunk's row_count_storage,
+// one chunk per OpenMP loop iteration, using up to `num_threads` threads.
+static void materialize_chunk_row_count_storage(std::vector<chunk_result_t>& chunks,
+                                                int num_threads)
+{
+  scoped_timer_t timer("columns_row_count_storage_hugepages");
+  const int chunk_count = (int)chunks.size();
+#pragma omp parallel for num_threads(num_threads)
+  for (int t = 0; t < chunk_count; ++t) {
+    auto& storage = chunks[(size_t)t].row_count_storage;
+    materialize_vector_hugepages(
+      "column_row_count_storage", storage, materialize_touch_t::write_2mb);
+  }
+}
+
+// Allocates the outputs that the scatter phase writes into, using four OpenMP sections:
+//   * an anonymous mapping for the temporary CSR values (state.temp_A),
+//   * an anonymous mapping for the temporary CSR column indices (state.temp_A_indices),
+//   * the per-chunk name arenas and state.var_names_sv (skipped in dense_ordered mode),
+//   * state.problem.var_types_.
+// Both mappings are at least one byte long and are advised with MADV_HUGEPAGE.
+//
+// An exception must not leave an OpenMP section, so each section body runs under a guard that
+// records the first exception (for example an allocation failure in one of the resizes); it is
+// rethrown on the calling thread once the parallel region has finished.
+template <typename i_t, typename f_t>
+static void allocate_column_outputs(parse_state_t<i_t, f_t>& state,
+                                    const column_merge_shape_t<i_t>& shape)
+{
+  scoped_timer_t timer("allocate_temp_csr_arrays");
+  size_t values_bytes  = shape.total_nnz * sizeof(f_t);
+  size_t indices_bytes = shape.total_nnz * sizeof(i_t);
+  state.temp_csr_nnz   = shape.total_nnz;
+
+  std::exception_ptr first_error = nullptr;
+  std::mutex error_mutex;
+  // Runs `body` and keeps the first exception instead of letting it escape the section.
+  auto run_section = [&](auto&& body) {
+    try {
+      body();
+    } catch (...) {
+      std::lock_guard<std::mutex> lock(error_mutex);
+      if (!first_error) { first_error = std::current_exception(); }
+    }
+  };
+
+#pragma omp parallel sections num_threads(4)
+  {
+#pragma omp section
+    {
+      run_section([&] {
+        state.temp_A_region = mmap_region_t::anonymous(std::max<size_t>(values_bytes, 1),
+                                                       PROT_READ | PROT_WRITE,
+                                                       MAP_PRIVATE,
+                                                       "temp CSR values");
+        state.temp_A        = (f_t*)state.temp_A_region.data();
+        state.temp_A_region.advise(MADV_HUGEPAGE);
+      });
+    }
+#pragma omp section
+    {
+      run_section([&] {
+        state.temp_A_indices_region = mmap_region_t::anonymous(std::max<size_t>(indices_bytes, 1),
+                                                               PROT_READ | PROT_WRITE,
+                                                               MAP_PRIVATE,
+                                                               "temp CSR column indices");
+        state.temp_A_indices        = (i_t*)state.temp_A_indices_region.data();
+        state.temp_A_indices_region.advise(MADV_HUGEPAGE);
+      });
+    }
+#pragma omp section
+    {
+      run_section([&] {
+        // Dense-ordered column names are not stored individually.
+        if (state.col_index_mode != index_mode_t::dense_ordered) {
+          state.var_name_arenas.clear();
+          state.var_name_arenas.resize((size_t)shape.num_chunks);
+          state.var_names_sv.resize(shape.total_cols);
+        }
+      });
+    }
+#pragma omp section
+    {
+      run_section([&] { state.problem.var_types_.resize(shape.total_cols); });
+    }
+  }
+
+  if (first_error) { std::rethrow_exception(first_error); }
+}
+
+// Scatters the per-chunk, column-ordered matrix entries into the temporary CSR arrays
+// (state.temp_A / state.temp_A_indices) and, unless the column index mode is dense_ordered,
+// copies the chunk variable names into state.var_names_sv.
+//
+// For each entry the destination index is read from the chunk's row_count_storage slot of that
+// row and the slot is post-incremented. merge_chunk_results_to_csr runs
+// convert_counts_to_write_positions before this function, so the slots hold write positions.
+// NOTE(review): chunks run concurrently and write to temp_A without synchronization; this looks
+// safe only if the per-chunk write positions are disjoint -- confirm.
+template <typename i_t, typename f_t>
+static void scatter_column_chunks_to_csr(parse_state_t<i_t, f_t>& state,
+                                         std::vector<chunk_result_t>& chunks,
+                                         const column_merge_shape_t<i_t>& shape,
+                                         int num_threads)
+{
+  scoped_timer_t timer("scatter_into_csr");
+  {
+    scoped_timer_t matrix_timer("scatter_matrix_entries");
+#ifdef MPS_FAST_PERF_COUNTERS
+    std::vector<perf_counter_snapshot_t> perf_snapshots((size_t)shape.num_chunks);
+#endif
+#pragma omp parallel for num_threads(num_threads)
+    for (int t = 0; t < shape.num_chunks; t++) {
+#ifdef MPS_FAST_PERF_COUNTERS
+      thread_perf_counters_t perf_counters;
+#endif
+      auto& chunk = chunks[(size_t)t];
+      for (size_t local_col = 0; local_col < chunk.var_names.size(); local_col++) {
+        // Chunk-local column -> merged column index.
+        i_t global_col   = (i_t)(shape.global_col_offset[(size_t)t] + local_col);
+        size_t col_start = chunk.col_offsets[local_col];
+        size_t col_end   = chunk.col_offsets[local_col + 1];
+        for (size_t idx = col_start; idx < col_end; idx++) {
+          i_t row                    = (i_t)chunk.row_indices[idx];
+          size_t row_idx             = (size_t)row;
+          // Locate the slot for this row: block directory -> block -> offset inside the block.
+          size_t block_id            = row_idx / COLUMN_ROW_COUNT_BLOCK_ROWS;
+          size_t local               = row_idx - block_id * COLUMN_ROW_COUNT_BLOCK_ROWS;
+          int32_t block_pos          = chunk.row_count_block_dir[block_id];
+          row_count_block_t& block   = chunk.row_count_blocks[(size_t)block_pos];
+          int64_t& write_pos         = chunk.row_count_storage[block.storage_offset + local];
+          // Take the current position and advance the slot for the next entry of this row.
+          i_t dest                   = (i_t)write_pos++;
+          state.temp_A[dest]         = (f_t)chunk.values[idx];
+          state.temp_A_indices[dest] = global_col;
+        }
+      }
+#ifdef MPS_FAST_PERF_COUNTERS
+      perf_snapshots[(size_t)t] = perf_counters.stop();
+#endif
+    }
+#ifdef MPS_FAST_PERF_COUNTERS
+    print_perf_totals("scatter_matrix_entries", perf_snapshots);
+#endif
+  }
+
+  if (state.col_index_mode != index_mode_t::dense_ordered) {
+    scoped_timer_t names_timer("scatter_var_names");
+#pragma omp parallel for num_threads(num_threads)
+    for (int t = 0; t < shape.num_chunks; t++) {
+      // Each chunk copies its names into its own arena and fills its own slice of var_names_sv.
+      chunk_name_arena_t& arena = state.var_name_arenas[(size_t)t];
+      arena.reserve(std::max<size_t>(4096, chunks[(size_t)t].var_names.size() * 16));
+      for (size_t i = 0; i < chunks[(size_t)t].var_names.size(); i++) {
+        state.var_names_sv[shape.global_col_offset[(size_t)t] + i] =
+          arena.copy(chunks[(size_t)t].var_names[i]);
+      }
+    }
+  } else {
+    // Nothing to copy in dense mode; presumably the timer is still created so that the
+    // "scatter_var_names" phase shows up in the timing output either way.
+    scoped_timer_t names_timer("scatter_var_names");
+  }
+}
+
+// A chunk-local marker translated to merged column numbering; built and consumed by
+// apply_column_integer_markers.
+struct global_marker_t {
+  // Marker kind copied from the chunk marker (compared against marker_info_t::INTORG).
+  marker_info_t::Type type;
+  // Merged index of the column the marker follows, or SIZE_MAX when it precedes every column.
+  size_t global_var_idx;
+};
+
+// Fills state.problem.var_types_ with 'I' or 'C' for every merged column. Chunk markers are
+// translated to merged column numbering, ordered by the first column they affect, and swept
+// together with the columns: a column is integer when the most recent marker at or before it is
+// an INTORG marker.
+template <typename i_t, typename f_t>
+static void apply_column_integer_markers(parse_state_t<i_t, f_t>& state,
+                                         const std::vector<chunk_result_t>& chunks,
+                                         const column_merge_shape_t<i_t>& shape)
+{
+  scoped_timer_t timer("columns_apply_markers");
+
+  std::vector<global_marker_t> all_markers;
+  for (int t = 0; t < shape.num_chunks; t++) {
+    const size_t chunk_base = shape.global_col_offset[(size_t)t];
+    for (const auto& marker : chunks[(size_t)t].markers) {
+      size_t anchor;
+      if (marker.after_local_var_idx != SIZE_MAX) {
+        anchor = chunk_base + marker.after_local_var_idx;
+      } else if (chunk_base > 0) {
+        // Marker before the chunk's first column: it follows the previous chunk's last column.
+        anchor = chunk_base - 1;
+      } else {
+        // Marker before the very first column of the section.
+        anchor = SIZE_MAX;
+      }
+      all_markers.push_back(global_marker_t{marker.type, anchor});
+    }
+  }
+
+  // First column a marker applies to: 0 for "before everything", otherwise the column right
+  // after its anchor.
+  const auto first_affected = [](const global_marker_t& marker) {
+    return marker.global_var_idx == SIZE_MAX ? size_t{0} : marker.global_var_idx + 1;
+  };
+
+  std::stable_sort(all_markers.begin(),
+                   all_markers.end(),
+                   [&](const global_marker_t& lhs, const global_marker_t& rhs) {
+                     return first_affected(lhs) < first_affected(rhs);
+                   });
+
+  bool in_integer_block = false;
+  size_t next_marker    = 0;
+  for (size_t col = 0; col < shape.total_cols; col++) {
+    // Consume every marker that takes effect at or before this column; the last one wins.
+    for (; next_marker < all_markers.size() && first_affected(all_markers[next_marker]) <= col;
+         next_marker++) {
+      in_integer_block = all_markers[next_marker].type == marker_info_t::INTORG;
+    }
+    state.problem.var_types_[col] = in_integer_block ? 'I' : 'C';
+  }
+}
+
+// Sizes the objective vector state.problem.c_ to total_cols (new entries are zero) and stores
+// each chunk's objective coefficients at their merged column index. Entries whose merged index
+// falls outside [0, total_cols) are ignored.
+template <typename i_t, typename f_t>
+static void assign_column_objective_entries(parse_state_t<i_t, f_t>& state,
+                                            const std::vector<chunk_result_t>& chunks,
+                                            const column_merge_shape_t<i_t>& shape)
+{
+  scoped_timer_t timer("columns_objective_entries");
+  auto& objective = state.problem.c_;
+  objective.resize(shape.total_cols, f_t{0});
+
+  for (int t = 0; t < shape.num_chunks; t++) {
+    const auto& entries = chunks[(size_t)t].objective_entries;
+    for (const auto& [local_col, coeff] : entries) {
+      const size_t merged_col = shape.global_col_offset[(size_t)t] + local_col;
+      if (merged_col >= shape.total_cols) { continue; }
+      objective[merged_col] = (f_t)coeff;
+    }
+  }
+}
+
+// Stitches the per-chunk COLUMNS results into the problem: computes the merged shape, picks the
+// column index mode, builds the CSR row offsets, turns per-chunk row counts into write positions,
+// allocates the outputs, scatters entries and names, and finally applies integer markers and
+// objective coefficients. Does nothing when `chunks` is empty.
+template <typename i_t, typename f_t>
+static void merge_chunk_results_to_csr(parse_state_t<i_t, f_t>& state,
+                                       std::vector<chunk_result_t>& chunks,
+                                       int num_threads)
+{
+  scoped_timer_t timer("merge_chunks_to_csr");
+  if (chunks.empty()) { return; }
+
+  const auto merge_shape = compute_column_merge_shape<i_t>(chunks, state.problem.n_constraints_);
+  detect_dense_column_metadata(state, chunks, merge_shape);
+
+  // Row totals are first used to build the offsets and then reused as a scratch tally while the
+  // per-chunk counts are rewritten as write positions.
+  auto row_totals = build_csr_row_offsets(state, chunks, merge_shape);
+  convert_counts_to_write_positions(chunks, merge_shape, state.problem.A_offsets_, row_totals);
+
+  materialize_chunk_row_count_storage(chunks, num_threads);
+  allocate_column_outputs(state, merge_shape);
+  scatter_column_chunks_to_csr(state, chunks, merge_shape, num_threads);
+
+  apply_column_integer_markers(state, chunks, merge_shape);
+  assign_column_objective_entries(state, chunks, merge_shape);
+
+  state.problem.n_vars_ = (i_t)merge_shape.total_cols;
+  state.problem.nnz_    = (i_t)merge_shape.total_nnz;
+}
+
+// Moves the temporary CSR arrays (state.temp_A / state.temp_A_indices) into the problem's A_ and
+// A_indices_ vectors and releases the temporary regions.
+//
+// The two destination vectors are resized in separate OpenMP sections, then the values and the
+// indices are copied as one logical byte range split evenly across `copy_threads` threads.
+//
+// An exception must not leave an OpenMP section, so a failing resize (for example
+// std::bad_alloc) is captured inside its section and rethrown on the calling thread before
+// anything is copied. In that case the temporary regions are left untouched.
+template <typename i_t, typename f_t>
+static void materialize_problem_csr(parse_state_t<i_t, f_t>& state)
+{
+  scoped_timer_t timer("materialize_problem_csr");
+  size_t nnz       = state.temp_csr_nnz;
+  int copy_threads = 2;
+  copy_threads     = std::max(1, std::min(copy_threads, MPS_LARGE_FILE_THREAD_CAP));
+
+  int resize_threads = copy_threads > 1 ? 2 : 1;
+
+  std::exception_ptr first_error = nullptr;
+  std::mutex error_mutex;
+  // Runs `body` and keeps the first exception instead of letting it escape the section.
+  auto run_section = [&](auto&& body) {
+    try {
+      body();
+    } catch (...) {
+      std::lock_guard<std::mutex> lock(error_mutex);
+      if (!first_error) { first_error = std::current_exception(); }
+    }
+  };
+
+#pragma omp parallel sections num_threads(resize_threads)
+  {
+#pragma omp section
+    {
+      run_section([&] { state.problem.A_.resize(nnz); });
+    }
+#pragma omp section
+    {
+      run_section([&] { state.problem.A_indices_.resize(nnz); });
+    }
+  }
+  if (first_error) { std::rethrow_exception(first_error); }
+
+  size_t value_bytes = nnz * sizeof(f_t);
+  size_t index_bytes = nnz * sizeof(i_t);
+  size_t total_bytes = value_bytes + index_bytes;
+  // Copy A_ and A_indices_ so that this work overlaps with the other phases: this hides the
+  // latency of the heap allocation and default initialization behind other parsing/IO instead
+  // of making it blocking for the column parse.
+  // TODO: have A_ and A_indices_ be anonymous mmap allocations directly in mps_data_model_t;
+  // that requires care to avoid breaking changes in the API, especially the Cython bindings.
+  if (total_bytes != 0) {
+#pragma omp parallel for num_threads(copy_threads) schedule(static)
+    for (int t = 0; t < copy_threads; ++t) {
+      // Thread t owns bytes [begin, end) of the concatenation "values followed by indices".
+      size_t begin = (total_bytes * (size_t)t) / (size_t)copy_threads;
+      size_t end   = (total_bytes * (size_t)(t + 1)) / (size_t)copy_threads;
+      if (begin < value_bytes) {
+        // Part of the slice that falls inside the values array.
+        size_t value_end = std::min(end, value_bytes);
+        if (value_end > begin) {
+          std::memcpy((char*)state.problem.A_.data() + begin,
+                      (const char*)state.temp_A + begin,
+                      value_end - begin);
+        }
+      }
+      if (end > value_bytes) {
+        // Part of the slice that falls inside the indices array, rebased to its own start.
+        size_t index_begin = begin > value_bytes ? begin - value_bytes : 0;
+        size_t index_end   = end - value_bytes;
+        std::memcpy((char*)state.problem.A_indices_.data() + index_begin,
+                    (const char*)state.temp_A_indices + index_begin,
+                    index_end - index_begin);
+      }
+    }
+  }
+
+  state.temp_A                = nullptr;
+  state.temp_A_indices        = nullptr;
+  state.temp_csr_materialized = true;
+  state.temp_A_region.reset();
+  state.temp_A_indices_region.reset();
+}
+
+// COLUMNS is always parsed chunk-parallel: each chunk is counted/parsed by parse_columns_chunk
+// and the per-chunk results are stitched together by merge_chunk_results_to_csr. There is no
+// separate serial implementation -- a single thread just runs one chunk through the same path.
+//
+// state:       parser state; its cursor must be positioned at the "COLUMNS" header.
+// num_threads: requested thread count; values <= 0 select
+//              phase_thread_count(MPS_COLUMNS_THREAD_CAP). The count is then lowered to at most
+//              columns_bytes / MPS_COLUMNS_MIN_CHUNK_BYTES (but never below 1).
+// columns_end: end of the COLUMNS body; the cursor is moved here (then past whitespace) on
+//              return.
+// The first exception thrown by any chunk is captured inside the parallel loop and rethrown on
+// the calling thread after the loop.
+template <typename i_t, typename f_t>
+static void parse_columns_section_parallel(parse_state_t<i_t, f_t>& state,
+                                           int num_threads,
+                                           const char* columns_end)
+{
+  scoped_timer_t timer("parse_columns_parallel");
+
+  if (num_threads <= 0) { num_threads = phase_thread_count(MPS_COLUMNS_THREAD_CAP); }
+
+  // Skip the "COLUMNS" header
+  expect_section(state.cursor, "COLUMNS");
+
+  const char* columns_start    = state.cursor.ptr;
+  size_t columns_bytes         = (size_t)(columns_end - columns_start);
+  // Do not use more threads than there are minimum-sized chunks of input.
+  size_t chunk_limited_threads = std::max<size_t>(1, columns_bytes / MPS_COLUMNS_MIN_CHUNK_BYTES);
+  num_threads = std::max(1, std::min<int>(num_threads, (int)chunk_limited_threads));
+
+  auto chunk_bounds = compute_chunk_boundaries(columns_start, columns_end, num_threads);
+
+  // Parse chunks in parallel
+  std::vector<chunk_result_t> results(num_threads);
+
+  {
+    scoped_timer_t timer("parse_columns_chunk_parallel");
+#ifdef MPS_FAST_PERF_COUNTERS
+    std::vector<perf_counter_snapshot_t> perf_snapshots((size_t)num_threads);
+#endif
+    // Exceptions must not leave the OpenMP loop body: remember the first one and rethrow it
+    // after the parallel region.
+    std::exception_ptr first_error = nullptr;
+    std::mutex error_mutex;
+    {
+#pragma omp parallel for num_threads(num_threads)
+      for (int t = 0; t < num_threads; t++) {
+        try {
+          MPS_NVTX_RANGE(std::string("columns_chunk ") + std::to_string(t), nvtx::colors::columns);
+#ifdef MPS_FAST_PERF_COUNTERS
+          thread_perf_counters_t perf_counters;
+#endif
+          results[t] =
+            parse_columns_chunk<i_t, f_t>(chunk_bounds[t].start, chunk_bounds[t].end, state);
+#ifdef MPS_FAST_PERF_COUNTERS
+          perf_snapshots[(size_t)t] = perf_counters.stop();
+#endif
+        } catch (...) {
+          std::lock_guard<std::mutex> lock(error_mutex);
+          if (!first_error) { first_error = std::current_exception(); }
+        }
+      }
+    }
+    if (first_error) { std::rethrow_exception(first_error); }
+#ifdef MPS_FAST_PERF_COUNTERS
+    print_perf_totals("parse_columns_chunk_parallel", perf_snapshots);
+#endif
+  }
+
+  // Merge results directly into CSR format
+  merge_chunk_results_to_csr(state, results, num_threads);
+
+  // Update cursor to RHS section
+  state.cursor.ptr = columns_end;
+  state.cursor.skip_ws();
+}
+
+// Parses the RHS section. Every record carries one or two (row, value) entries. A value for a
+// constraint row is stored in state.problem.b_; a value for the selected objective row sets
+// state.problem.objective_offset_ to its negation; values for ignored objective rows are dropped;
+// any other unresolved row name is reported through error_unknown_row.
+template <typename i_t, typename f_t>
+static void parse_rhs_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
+{
+  scoped_timer_t timer("parse_rhs");
+  expect_section(cursor, "RHS");
+
+  // Re-reads the name starting at `start`. Only needed on the cold path, because the hot path
+  // reads and looks up the row in a single step without keeping the name.
+  const auto name_at = [](const char* start, const char* end) {
+    const char* stop = start;
+    while (stop < end && *stop > ' ') {
+      ++stop;
+    }
+    return std::string_view(start, (size_t)(stop - start));
+  };
+
+  const auto store_rhs = [&](const char* row_start, size_t row_idx, f_t value) {
+    if (row_idx != SIZE_MAX) {
+      // Regular constraint row.
+      state.problem.b_[row_idx] = value;
+    } else {
+      // Row lookup failed: it has to be an objective row, otherwise it is an error.
+      const std::string_view row_name = name_at(row_start, cursor.end);
+      if (row_name == state.objective_name_sv) {
+        state.problem.objective_offset_ = -value;
+      } else if (state.ignored_objective_names.count(row_name)) {
+        // Other objectives are currently ignored.
+      } else {
+        error_unknown_row(cursor, row_start, "RHS");
+      }
+    }
+  };
+
+  // Reads one "<row> <value>" entry, applies it, then skips an optional trailing comment.
+  const auto consume_entry = [&] {
+    const char* row_start = cursor.ptr;
+    const size_t row_idx  = state.read_row_lookup(cursor);
+    const auto value      = expect_number_fast_pm_one(cursor);
+    store_rhs(row_start, row_idx, (f_t)value);
+    accept_comment(cursor);
+  };
+
+  while (cursor.ptr < cursor.end) {
+    [[maybe_unused]] auto rhs_name = cursor.read_field();
+    if (accept_comment(cursor)) {
+      expect_eol(cursor);
+      continue;
+    }
+    consume_entry();
+    // Optional second entry on the same line.
+    if (!cursor.eol()) { consume_entry(); }
+    expect_eol(cursor);
+  }
+}
+
+// Finds `var_name` in `var_names`, probing near `hint_idx` first. This does the job on 99% of
+// instances: in the vast majority of cases bound names are sequential with occasional sparsity.
+// Probe order: hint_idx + 1, hint_idx, then everything after hint_idx + 1, then everything
+// before hint_idx. Returns the index of the first match in that order, or SIZE_MAX if absent.
+static size_t find_var_after_hint(const std::vector<std::string_view>& var_names,
+                                  std::string_view var_name,
+                                  size_t hint_idx)
+{
+  const size_t count = var_names.size();
+  const size_t next  = hint_idx + 1;
+
+  // Fast paths: the name right after the hint, then the hint itself.
+  if (next < count && var_names[next] == var_name) { return next; }
+  if (hint_idx < count && var_names[hint_idx] == var_name) { return hint_idx; }
+
+  const auto first = var_names.begin();
+  const auto last  = var_names.end();
+
+  // Forward scan over the names that follow the two fast-path slots.
+  const auto tail_begin = first + (std::ptrdiff_t)std::min(hint_idx + 2, count);
+  const auto tail_hit   = std::find(tail_begin, last, var_name);
+  if (tail_hit != last) { return (size_t)(tail_hit - first); }
+
+  // Wrap around: scan the names that precede the hint.
+  const auto head_end = first + (std::ptrdiff_t)std::min(hint_idx, count);
+  const auto head_hit = std::find(first, head_end, var_name);
+  if (head_hit != head_end) { return (size_t)(head_hit - first); }
+
+  return SIZE_MAX;
+}
+
+// Applies one BOUNDS record through the supplied callbacks.
+//   bound_type          two-letter bound code (LO, UP, FX, FR, MI, PL, BV, LI, UI, SC)
+//   value / has_value   numeric field of the record and whether it was present
+//   first_bound_for_var true when no earlier bound was recorded for the variable; only then does
+//                       a negative UP/UI value also push the lower bound to -infinity
+//   set_lb/set_ub       receive the new lower / upper bound
+//   set_type            receives 'I' (BV, LI, UI) or 'S' (SC)
+//   error               receives (message, bound_type) for an SC record without a value and for
+//                       an unknown bound type
+// Returns false exactly when `error` was invoked, true otherwise.
+template <typename f_t, typename SetLb, typename SetUb, typename SetType, typename Error>
+static bool apply_bound_record(std::string_view bound_type,
+                               f_t value,
+                               bool has_value,
+                               bool first_bound_for_var,
+                               SetLb&& set_lb,
+                               SetUb&& set_ub,
+                               SetType&& set_type,
+                               Error&& error)
+{
+  const f_t pos_inf = std::numeric_limits<f_t>::infinity();
+  const f_t neg_inf = -std::numeric_limits<f_t>::infinity();
+
+  // Shared by UP and UI: set the upper bound and, for the first bound of a variable, open the
+  // lower bound when the upper bound is negative.
+  const auto set_upper_and_relax_lower = [&] {
+    set_ub(value);
+    if (first_bound_for_var && value < f_t{0}) { set_lb(neg_inf); }
+  };
+
+  if (bound_type == "LO") {
+    set_lb(value);
+    return true;
+  }
+  if (bound_type == "UP") {
+    set_upper_and_relax_lower();
+    return true;
+  }
+  if (bound_type == "FX") {
+    set_lb(value);
+    set_ub(value);
+    return true;
+  }
+  if (bound_type == "FR") {
+    set_lb(neg_inf);
+    set_ub(pos_inf);
+    return true;
+  }
+  if (bound_type == "MI") {
+    set_lb(neg_inf);
+    return true;
+  }
+  if (bound_type == "PL") {
+    set_ub(pos_inf);
+    return true;
+  }
+  if (bound_type == "BV") {
+    set_lb(f_t{0});
+    set_ub(f_t{1});
+    set_type('I');
+    return true;
+  }
+  if (bound_type == "LI") {
+    set_lb(value);
+    set_type('I');
+    return true;
+  }
+  if (bound_type == "UI") {
+    set_upper_and_relax_lower();
+    set_type('I');
+    return true;
+  }
+  if (bound_type == "SC") {
+    if (UNLIKELY(!has_value)) {
+      error("SC bound requires an upper bound value", bound_type);
+      return false;
+    }
+    set_ub(value);
+    set_type('S');
+    return true;
+  }
+
+  error("unknown bound type", bound_type);
+  return false;
+}
+
+// Parallel BOUNDS parser for the common dense/ordered-name case. Returns false when the section
+// is too small or not safely parallelizable, so parse_bounds_section resets and falls back to its
+// serial path. Bound-type semantics (LO/UP/FX/...) are shared with the serial path through
+// apply_bound_record, so the two cannot drift.
+//
+// state:             parser state; bounds and variable types are written directly into
+//                    state.problem while parsing.
+// cursor:            section cursor; set to bounds_body_end on success, to bounds_body_start when
+//                    the order check fails, and to the failing position before an error is
+//                    reported. It is not touched by the early "too small" return.
+// bounds_body_start: first byte after the BOUNDS header.
+// bounds_body_end:   end of the BOUNDS body.
+// n_vars:            number of variables; size of the per-variable "bound seen" array.
+//
+// NOTE(review): when this returns false after the parallel pass, lower/upper bounds and
+// var_types_ may already have been modified; the caller resets the bounds before the serial pass.
+template <typename i_t, typename f_t>
+static bool parse_bounds_section_parallel_dense(parse_state_t<i_t, f_t>& state,
+                                                cursor_t& cursor,
+                                                const char* bounds_body_start,
+                                                const char* bounds_body_end,
+                                                size_t n_vars)
+{
+  const size_t bounds_bytes   = (size_t)(bounds_body_end - bounds_body_start);
+  const int num_threads       = phase_thread_count(MPS_BOUNDS_THREAD_CAP);
+  const bool use_dense_lookup = state.col_index_mode == index_mode_t::dense_ordered;
+  // The ordered-hint lookup has its own size threshold for going parallel.
+  const size_t min_parallel_bytes =
+    use_dense_lookup ? MPS_BOUNDS_PARALLEL_MIN_BYTES : MPS_BOUNDS_ORDERED_HINT_PARALLEL_MIN_BYTES;
+  if (bounds_bytes < min_parallel_bytes || num_threads < 2) { return false; }
+
+  MPS_NVTX_RANGE(
+    use_dense_lookup ? "parse_bounds_parallel_dense" : "parse_bounds_parallel_ordered_hint",
+    nvtx::colors::bounds);
+
+  // Per-chunk summary filled by the worker that parsed the chunk and inspected afterwards on the
+  // calling thread.
+  struct bounds_parallel_stats_t {
+    size_t lines            = 0;
+    size_t dense_hits       = 0;
+    size_t dense_misses     = 0;
+    size_t comments         = 0;
+    size_t min_var          = SIZE_MAX;
+    size_t max_var          = 0;
+    size_t decreasing_order = 0;
+    const char* error_ptr   = nullptr;
+    char error_msg[192]     = {};
+  };
+
+  std::vector<bounds_parallel_stats_t> stats((size_t)num_threads);
+  auto boundaries =
+    compute_bounds_chunk_boundaries(bounds_body_start, bounds_body_end, num_threads);
+
+  // One byte per variable: non-zero once any bound record for the variable has been applied.
+  std::vector<uint8_t> bound_seen;
+  {
+    scoped_timer_t timer("bounds_parallel_seen_alloc");
+    bound_seen.resize(n_vars, 0);
+  }
+
+  {
+    scoped_timer_t timer(use_dense_lookup ? "parse_bounds_parallel_dense"
+                                          : "parse_bounds_parallel_ordered_hint");
+    // Repeated BOUNDS for the same variable are safe inside a group-owned chunk.
+    // Parse optimistically, then accept only if chunk summaries prove no backward jumps.
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+    for (int t = 0; t < num_threads; ++t) {
+      auto& local = stats[(size_t)t];
+      // Chunk-local cursor; it shadows the function parameter of the same name.
+      cursor_t cursor(boundaries[(size_t)t].start,
+                      (size_t)(boundaries[(size_t)t].end - boundaries[(size_t)t].start));
+      cursor.skip_ws();
+      size_t prev_var = SIZE_MAX;
+      size_t hint_idx = 0;
+      auto lookup_var = [&](std::string_view var_name) {
+        if (use_dense_lookup) { return state.col_dense.lookup(var_name); }
+        // quite often variables are in order, so a cheap lookup trick is to look for the variable
+        // right after this one
+        return find_var_after_hint(state.var_names_sv, var_name, hint_idx);
+      };
+      try {
+        while (cursor.ptr < cursor.end) {
+          // Whole-line comment.
+          if (UNLIKELY(*cursor.ptr == '$')) {
+            cursor.skip_to_eol();
+            expect_eol(cursor);
+            local.comments++;
+            continue;
+          }
+
+          auto bound_type = cursor.read_field();
+          // An empty first field ends this chunk.
+          if (UNLIKELY(bound_type.empty())) { break; }
+          if (UNLIKELY(bound_type[0] == '$')) {
+            cursor.skip_to_eol();
+            expect_eol(cursor);
+            local.comments++;
+            continue;
+          }
+
+          [[maybe_unused]] auto bound_name = cursor.read_field();
+          auto var_name                    = cursor.read_field();
+          if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) {
+            cursor.skip_to_eol();
+            expect_eol(cursor);
+            local.comments++;
+            continue;
+          }
+
+          size_t var_idx = lookup_var(var_name);
+          // Unknown variable: stop this chunk; the recorded miss forces the serial fallback.
+          if (UNLIKELY(var_idx == SIZE_MAX)) {
+            local.dense_misses++;
+            break;
+          }
+          hint_idx = var_idx;
+          local.dense_hits++;
+          local.lines++;
+          local.min_var = std::min(local.min_var, var_idx);
+          local.max_var = std::max(local.max_var, var_idx);
+          // A backward jump inside the chunk makes the optimistic result unusable.
+          if (prev_var != SIZE_MAX && var_idx < prev_var) { local.decreasing_order++; }
+          prev_var = var_idx;
+
+          bool first_bound_for_var = bound_seen[var_idx] == 0;
+          bound_seen[var_idx]      = 1;
+
+          // The numeric field is optional; `value` stays 0 when it is absent.
+          f_t value      = 0;
+          bool has_value = false;
+          accept_comment(cursor);
+          if (!cursor.eol()) {
+            value     = (f_t)expect_number_fast_pm_one(cursor);
+            has_value = true;
+            accept_comment(cursor);
+          }
+
+          auto set_lb    = [&](f_t x) { state.problem.variable_lower_bounds_[var_idx] = x; };
+          auto set_ub    = [&](f_t x) { state.problem.variable_upper_bounds_[var_idx] = x; };
+          auto set_type  = [&](char t) { state.problem.var_types_[var_idx] = t; };
+          // Records the error locally; it is reported from the calling thread after the loop.
+          // Only the "unknown bound type" message gets the offending type appended.
+          auto set_error = [&](const char* msg, std::string_view type) {
+            if (type.empty() || std::strcmp(msg, "unknown bound type") != 0) {
+              std::snprintf(local.error_msg, sizeof(local.error_msg), "%s", msg);
+            } else {
+              std::snprintf(local.error_msg,
+                            sizeof(local.error_msg),
+                            "%s: %.*s",
+                            msg,
+                            (int)type.size(),
+                            type.data());
+            }
+            local.error_ptr = cursor.ptr;
+          };
+          if (!apply_bound_record(bound_type,
+                                  value,
+                                  has_value,
+                                  first_bound_for_var,
+                                  set_lb,
+                                  set_ub,
+                                  set_type,
+                                  set_error)) {
+            break;
+          }
+
+          expect_eol(cursor);
+        }
+      } catch (const std::exception& e) {
+        // Keep the message and position so the error can be raised outside the parallel loop.
+        std::snprintf(local.error_msg, sizeof(local.error_msg), "%s", e.what());
+        local.error_ptr = cursor.ptr;
+      }
+    }
+  }
+
+  // Combine the chunk summaries in chunk order.
+  size_t dense_misses     = 0;
+  size_t decreasing_order = 0;
+  size_t overlap_chunks   = 0;
+  size_t prev_max         = SIZE_MAX;
+  for (int t = 0; t < num_threads; ++t) {
+    const auto& local = stats[(size_t)t];
+    if (local.error_ptr != nullptr) {
+      // Report the first recorded error at its position (presumably cursor.error throws).
+      cursor.ptr = local.error_ptr;
+      cursor.error("%s", local.error_msg);
+    }
+    dense_misses += local.dense_misses;
+    decreasing_order += local.decreasing_order;
+    if (local.lines > 0) {
+      // A chunk must start strictly after the largest variable index of the previous chunk.
+      if (prev_max != SIZE_MAX && local.min_var <= prev_max) { overlap_chunks++; }
+      prev_max = local.max_var;
+    }
+  }
+
+  const bool order_safe = dense_misses == 0 && decreasing_order == 0 && overlap_chunks == 0;
+
+  if (!order_safe) {
+    std::fprintf(stderr,
+                 "[WARN] parallel BOUNDS fallback to serial: lookup_misses=%zu "
+                 "decreasing_order=%zu overlap_chunks=%zu\n",
+                 dense_misses,
+                 decreasing_order,
+                 overlap_chunks);
+    // Rewind so the serial parser starts from the beginning of the section body.
+    cursor.ptr = bounds_body_start;
+    return false;
+  }
+
+  {
+    scoped_timer_t timer("bounds_integer_defaults");
+    // Integer variables that never appeared in BOUNDS get the default [0, 1] range.
+    for (size_t i = 0; i < n_vars; ++i) {
+      if (!bound_seen[i] && state.problem.var_types_[i] == 'I') {
+        state.problem.variable_lower_bounds_[i] = f_t{0};
+        state.problem.variable_upper_bounds_[i] = f_t{1};
+      }
+    }
+  }
+
+  cursor.ptr = bounds_body_end;
+  return true;
+}
+
+// Sizes the lower/upper bound vectors to n_vars_ (new entries default to 0 and +infinity) and
+// then passes both vectors to materialize_vector_hugepages with the write_4kb touch mode.
+template <typename i_t, typename f_t>
+static void init_variable_bounds_defaults(parse_state_t<i_t, f_t>& state)
+{
+  const size_t var_count = (size_t)state.problem.n_vars_;
+  auto& lower_bounds     = state.problem.variable_lower_bounds_;
+  auto& upper_bounds     = state.problem.variable_upper_bounds_;
+  {
+    scoped_timer_t timer("bounds_init_defaults");
+    lower_bounds.resize(var_count, f_t{0});
+    upper_bounds.resize(var_count, std::numeric_limits<f_t>::infinity());
+  }
+  {
+    scoped_timer_t timer("bounds_madvise_pretouch");
+    materialize_vector_hugepages(
+      "variable_lower_bounds", lower_bounds, materialize_touch_t::write_4kb);
+    materialize_vector_hugepages(
+      "variable_upper_bounds", upper_bounds, materialize_touch_t::write_4kb);
+  }
+}
+
+// Gives every integer ('I') variable for which has_bound(index) is false the default bounds
+// [0, 1]. Variables with a recorded bound and non-integer variables are left unchanged.
+template <typename i_t, typename f_t, typename HasBound>
+static void apply_unspecified_integer_bounds(parse_state_t<i_t, f_t>& state, HasBound&& has_bound)
+{
+  scoped_timer_t timer("bounds_integer_defaults");
+  const size_t var_count = (size_t)state.problem.n_vars_;
+  for (size_t var = 0; var < var_count; ++var) {
+    if (has_bound(var)) { continue; }
+    if (state.problem.var_types_[var] != 'I') { continue; }
+    state.problem.variable_lower_bounds_[var] = f_t{0};
+    state.problem.variable_upper_bounds_[var] = f_t{1};
+  }
+}
+
+// Bounds setup for the case where no bound record is applied: initialize the default bounds and
+// then give every integer variable the default [0, 1] range.
+template <typename i_t, typename f_t>
+static void init_variable_bounds_without_bounds_section(parse_state_t<i_t, f_t>& state)
+{
+  init_variable_bounds_defaults(state);
+  // No variable has an explicit bound here.
+  const auto no_variable_bounded = [](size_t) { return false; };
+  apply_unspecified_integer_bounds(state, no_variable_bounded);
+}
+
+template <typename i_t, typename f_t>  // Parses the optional BOUNDS section into variable bounds and types.
+static void parse_bounds_section(parse_state_t<i_t, f_t>& state,
+                                 cursor_t& cursor,
+                                 bool allow_parallel_dense = false)  // true: try the parallel dense parser first
+{
+  size_t n_vars = (size_t)state.problem.n_vars_;
+  init_variable_bounds_defaults(state);
+
+  std::vector<uint64_t> bound_seen((n_vars + 63) / 64, 0);  // bitset, one bit per variable index
+  auto has_bound = [&](size_t var_idx) {
+    return (bound_seen[var_idx >> 6] & (uint64_t{1} << (var_idx & 63))) != 0;
+  };
+  auto mark_bound = [&](size_t var_idx) {
+    bound_seen[var_idx >> 6] |= uint64_t{1} << (var_idx & 63);
+  };
+
+  if (!accept_section(cursor, "BOUNDS")) {  // no BOUNDS header: only the integer defaults apply
+    apply_unspecified_integer_bounds(state, has_bound);
+    return;
+  }
+
+  const char* bounds_body_start = cursor.ptr;  // first byte after the section header
+  const char* bounds_body_end   = cursor.end;
+  if (allow_parallel_dense) {
+    if (parse_bounds_section_parallel_dense(
+          state, cursor, bounds_body_start, bounds_body_end, n_vars)) {
+      return;  // a true result is treated as "section fully handled"
+    }
+    {
+      scoped_timer_t timer("bounds_parallel_fallback_reset");  // reset bounds to [0, +inf) before the serial pass
+      std::fill(state.problem.variable_lower_bounds_.begin(),
+                state.problem.variable_lower_bounds_.end(),
+                f_t{0});
+      std::fill(state.problem.variable_upper_bounds_.begin(),
+                state.problem.variable_upper_bounds_.end(),
+                std::numeric_limits<f_t>::infinity());
+    }
+  }
+
+  size_t hint_idx = 0;  // index of the most recently matched variable
+  {
+    scoped_timer_t timer("parse_bounds");
+    while (!cursor.done()) {
+      auto bound_type                  = cursor.read_field();
+      [[maybe_unused]] auto bound_name = cursor.read_field();  // read to advance the cursor; value unused
+      auto var_name                    = cursor.read_field();
+      if (UNLIKELY(!var_name.empty() && var_name[0] == '$')) {  // '$' here: rest of the line is skipped
+        cursor.skip_to_eol();
+        expect_eol(cursor);
+        continue;
+      }
+
+      // optimized lookup using hint (bounds often in same order as columns)
+      size_t var_idx = SIZE_MAX;
+      // handle annoying bounds-only vars that weren't declared in COLUMNS
+      typename parse_state_t<i_t, f_t>::bounds_only_var_t* aux_var = nullptr;
+      if (LIKELY(state.col_index_mode == index_mode_t::dense_ordered)) {
+        var_idx = state.col_dense.lookup(var_name);
+        if (var_idx == SIZE_MAX) { aux_var = &state.bounds_only_vars[var_name]; }  // lookup failed: use the bounds-only entry
+      } else {
+        var_idx = find_var_after_hint(state.var_names_sv, var_name, hint_idx);
+        if (var_idx == SIZE_MAX) { aux_var = &state.bounds_only_vars[var_name]; }
+      }
+      if (var_idx != SIZE_MAX) { hint_idx = var_idx; }
+      bool first_bound_for_var = aux_var == nullptr && !has_bound(var_idx);  // always false for bounds-only variables
+
+      f_t value      = 0;
+      bool has_value = false;  // the numeric field is optional
+      accept_comment(cursor);
+      if (!cursor.eol()) {
+        value     = (f_t)expect_number(cursor);
+        has_value = true;
+        accept_comment(cursor);
+      }
+
+      auto set_lb = [&](f_t x) {  // the three setters write to the bounds-only entry or to the problem arrays
+        if (aux_var) {
+          aux_var->lb = x;
+        } else {
+          state.problem.variable_lower_bounds_[var_idx] = x;
+        }
+      };
+      auto set_ub = [&](f_t x) {
+        if (aux_var) {
+          aux_var->ub = x;
+        } else {
+          state.problem.variable_upper_bounds_[var_idx] = x;
+        }
+      };
+      auto set_type = [&](char t) {
+        if (aux_var) {
+          aux_var->type = t;
+        } else {
+          state.problem.var_types_[var_idx] = t;
+        }
+      };
+
+      auto set_error = [&](const char* msg, std::string_view type) {  // NOTE(review): no return after the first cursor.error; assumes it does not return - confirm
+        if (std::strcmp(msg, "unknown bound type") == 0) {
+          cursor.error("%s: %.*s", msg, (int)type.size(), type.data());
+        }
+        cursor.error("%s", msg);
+      };
+      [[maybe_unused]] bool bound_applied = apply_bound_record(
+        bound_type, value, has_value, first_bound_for_var, set_lb, set_ub, set_type, set_error);
+      if (aux_var == nullptr) { mark_bound(var_idx); }  // bounds-only variables are not tracked in bound_seen
+
+      expect_eol(cursor);
+    }
+  }
+  apply_unspecified_integer_bounds(state, has_bound);  // 'I' variables with no bound record get [0, 1]
+}
+
+// Sizes the constraint bound arrays to n_constraints_ and fills them from each row's type and
+// right-hand side b: 'E' -> [b, b], 'L' -> [-inf, b], 'G' -> [b, +inf]. Entries for rows of any
+// other type are not written.
+template <typename i_t, typename f_t>
+static void init_constraint_bounds_from_rows(parse_state_t<i_t, f_t>& state)
+{
+  auto& problem = state.problem;
+  auto& lower   = problem.constraint_lower_bounds_;
+  auto& upper   = problem.constraint_upper_bounds_;
+  lower.resize((size_t)problem.n_constraints_);
+  upper.resize((size_t)problem.n_constraints_);
+
+  for (i_t row = 0; row < problem.n_constraints_; ++row) {
+    const char type = problem.row_types_[row];
+    const f_t rhs   = problem.b_[row];
+    switch (type) {
+      case 'E':
+        lower[row] = rhs;
+        upper[row] = rhs;
+        break;
+      case 'L':
+        lower[row] = -std::numeric_limits<f_t>::infinity();
+        upper[row] = rhs;
+        break;
+      case 'G':
+        lower[row] = rhs;
+        upper[row] = std::numeric_limits<f_t>::infinity();
+        break;
+      default: break;
+    }
+  }
+}
+
+// Initializes the constraint bounds from the row types, then applies the optional RANGES section.
+// Each record carries one or two (row, range) pairs. With R the range value:
+//   'E' rows: R >= 0 sets upper = lower + |R|, otherwise lower = upper - |R|
+//   'L' rows: lower = upper - |R|
+//   'G' rows: upper = lower + |R|
+// Rows of any other type are left unchanged; an unknown row name is reported via cursor.error.
+template <typename i_t, typename f_t>
+static void parse_ranges_section(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
+{
+  scoped_timer_t timer("parse_ranges");
+  init_constraint_bounds_from_rows(state);
+
+  if (!accept_section(cursor, "RANGES")) { return; }
+
+  auto apply_range = [&](std::string_view row_name, f_t range_val) {
+    const size_t row = state.row_lookup(row_name);
+    if (row == SIZE_MAX) {
+      cursor.error("unknown row name in RANGES: %.*s", (int)row_name.size(), row_name.data());
+    }
+    const char type = state.problem.row_types_[row];
+    const f_t width = std::abs(range_val);
+    f_t& lower      = state.problem.constraint_lower_bounds_[row];
+    f_t& upper      = state.problem.constraint_upper_bounds_[row];
+
+    const bool is_equality = type == 'E';
+    if (type == 'G' || (is_equality && range_val >= 0)) {
+      upper = lower + width;
+    } else if (type == 'L' || is_equality) {
+      lower = upper - width;
+    }
+  };
+
+  while (cursor.ptr < cursor.end) {
+    [[maybe_unused]] const auto set_name = cursor.read_field();
+    if (accept_comment(cursor)) {
+      expect_eol(cursor);
+      continue;
+    }
+
+    // First (row, range) pair.
+    const auto first_row   = cursor.read_field();
+    const auto first_value = (f_t)expect_number(cursor);
+    apply_range(first_row, first_value);
+    accept_comment(cursor);
+
+    // Optional second pair on the same line; a field starting with '$' ends the record instead.
+    if (!cursor.eol()) {
+      const auto second_row  = cursor.read_field();
+      const bool starts_note = !second_row.empty() && second_row[0] == '$';
+      if (UNLIKELY(starts_note)) {
+        cursor.skip_to_eol();
+      } else {
+        const auto second_value = (f_t)expect_number(cursor);
+        apply_range(second_row, second_value);
+        accept_comment(cursor);
+      }
+    }
+    expect_eol(cursor);
+  }
+}
+
+// quadratic stuff is bare bones for now, optimize if needed
+
+// Fills state.var_names_map (name -> index) on first use. Nothing is built in dense_ordered mode
+// or when the map already has entries.
+template <typename i_t, typename f_t>
+static void build_var_name_map_if_needed(parse_state_t<i_t, f_t>& state)
+{
+  const bool uses_dense_index = state.col_index_mode == index_mode_t::dense_ordered;
+  if (uses_dense_index || !state.var_names_map.empty()) { return; }
+
+  scoped_timer_t timer("quadratic_build_var_name_map");
+  auto& name_to_index = state.var_names_map;
+  const auto& names   = state.var_names_sv;
+  name_to_index.reserve((size_t)state.problem.n_vars_ * 2);
+  for (size_t idx = 0; idx < names.size(); ++idx) {
+    name_to_index.emplace(names[idx], idx);
+  }
+}
+
+// Resolves a variable name to its index for the quadratic sections: through col_dense in
+// dense_ordered mode, through var_names_map otherwise. Returns SIZE_MAX when the map has no entry.
+template <typename i_t, typename f_t>
+static size_t lookup_quadratic_var(parse_state_t<i_t, f_t>& state, std::string_view name)
+{
+  if (state.col_index_mode != index_mode_t::dense_ordered) {
+    const auto found = state.var_names_map.find(name);
+    if (found == state.var_names_map.end()) { return SIZE_MAX; }
+    return found->second;
+  }
+  return state.col_dense.lookup(name);
+}
+
+// Builds the CSR arrays of the quadratic objective from (row, col, value) triplets. When
+// symmetric_upper_triangular is true, every off-diagonal triplet is also mirrored to (col, row).
+// Entries are ordered by row, then column, then input order; duplicates are kept, and each stored
+// value is the parsed value times 0.5. An empty triplet list leaves the problem untouched.
+template <typename i_t, typename f_t>
+static void build_quadratic_csr(parse_state_t<i_t, f_t>& state,
+                                const std::vector<std::tuple<i_t, i_t, f_t>>& entries,
+                                bool symmetric_upper_triangular)
+{
+  scoped_timer_t timer("build_quadratic_csr");
+  const size_t n_vars = (size_t)state.problem.n_vars_;
+  if (entries.empty()) { return; }
+
+  struct triplet_t {
+    size_t row;
+    size_t col;
+    size_t order;  // insertion rank, the last sort key
+    f_t value;
+  };
+
+  const size_t copies_per_entry = symmetric_upper_triangular ? 2 : 1;
+  std::vector<triplet_t> triplets;
+  triplets.reserve(entries.size() * copies_per_entry);
+  for (const auto& item : entries) {
+    const size_t r = (size_t)std::get<0>(item);
+    const size_t c = (size_t)std::get<1>(item);
+    const f_t v    = std::get<2>(item);
+    triplets.push_back({r, c, triplets.size(), v});
+    if (symmetric_upper_triangular && r != c) { triplets.push_back({c, r, triplets.size(), v}); }
+  }
+
+  std::stable_sort(
+    triplets.begin(), triplets.end(), [](const triplet_t& lhs, const triplet_t& rhs) {
+      return std::tie(lhs.row, lhs.col, lhs.order) < std::tie(rhs.row, rhs.col, rhs.order);
+    });
+
+  auto& q_values  = state.problem.Q_objective_values_;
+  auto& q_indices = state.problem.Q_objective_indices_;
+  auto& q_offsets = state.problem.Q_objective_offsets_;
+  q_values.clear();
+  q_indices.clear();
+  q_offsets.assign(n_vars + 1, i_t{0});
+  q_values.reserve(triplets.size());
+  q_indices.reserve(triplets.size());
+
+  // q_offsets[r + 1] receives the running entry count once row r is complete.
+  size_t row = 0;
+  for (const auto& t : triplets) {
+    for (; row < t.row; ++row) {
+      q_offsets[row + 1] = (i_t)q_values.size();
+    }
+    q_values.push_back(t.value * f_t{0.5});
+    q_indices.push_back((i_t)t.col);
+  }
+  for (; row < n_vars; ++row) {
+    q_offsets[row + 1] = (i_t)q_values.size();
+  }
+}
+
+template <typename i_t, typename f_t>  // Parses QUADOBJ / QMATRIX / QCMATRIX records from `cursor`.
+static void parse_quadratic_sections(parse_state_t<i_t, f_t>& state, cursor_t& cursor)
+{
+  scoped_timer_t timer("parse_quadratic_sections");
+  if (cursor.done()) { return; }
+
+  build_var_name_map_if_needed(state);
+  std::vector<std::tuple<i_t, i_t, f_t>> quadobj_entries;
+  std::vector<std::tuple<i_t, i_t, f_t>> qmatrix_entries;
+  std::vector<std::tuple<i_t, i_t, f_t>>* active_entries = nullptr;  // destination for data records of the current section
+
+  auto add_entry = [&](std::string_view var1, std::string_view var2, f_t value) {  // resolves both names, then appends the triplet
+    size_t var1_idx = lookup_quadratic_var(state, var1);
+    if (var1_idx == SIZE_MAX) {
+      cursor.error(
+        "unknown variable name in quadratic section: %.*s", (int)var1.size(), var1.data());
+    }
+    size_t var2_idx = lookup_quadratic_var(state, var2);
+    if (var2_idx == SIZE_MAX) {
+      cursor.error(
+        "unknown variable name in quadratic section: %.*s", (int)var2.size(), var2.data());
+    }
+    active_entries->emplace_back((i_t)var1_idx, (i_t)var2_idx, value);
+  };
+
+  while (cursor.ptr < cursor.end) {
+    if (accept_section(cursor, "QUADOBJ")) {
+      active_entries = &quadobj_entries;
+      continue;
+    }
+    if (accept_section(cursor, "QMATRIX")) {
+      active_entries = &qmatrix_entries;
+      continue;
+    }
+    if (accept(cursor, "QCMATRIX")) {  // this header is followed by a constraint row name on the same line
+      auto row_name = cursor.read_field();
+      if (row_name.empty()) { cursor.error("QCMATRIX missing constraint row name"); }
+      size_t row_idx = state.row_lookup(row_name);
+      if (row_idx == SIZE_MAX) {
+        cursor.error(
+          "unknown constraint row name in QCMATRIX: %.*s", (int)row_name.size(), row_name.data());
+      }
+      char row_type = state.problem.row_types_[row_idx];
+      if (row_type != 'L' && row_type != 'G') {  // only 'L' and 'G' rows are accepted here
+        cursor.error(
+          "QCMATRIX row must have ROWS type L or G: %.*s", (int)row_name.size(), row_name.data());
+      }
+      expect_eol(cursor);
+      typename parse_state_t<i_t, f_t>::qcmatrix_block_t block;
+      block.row_idx  = row_idx;
+      block.row_name = row_name;
+      state.qcmatrix_blocks.push_back(std::move(block));
+      active_entries = &state.qcmatrix_blocks.back().entries;  // re-pointed after every push_back, so it always targets the newest block
+      continue;
+    }
+    if (active_entries == nullptr) { break; }  // data before any recognized header: stop parsing
+
+    const char* field_start = cursor.ptr;  // cursor position before the first field is read
+    auto var1               = cursor.read_field();
+    if (UNLIKELY(var1.empty())) { break; }
+    if (UNLIKELY(var1[0] == '$' || var1[0] == '*')) {  // first field starts with '$' or '*': skip the line
+      cursor.skip_to_eol();
+      expect_eol(cursor);
+      continue;
+    }
+    const bool starts_column_one =
+      field_start == cursor.start || field_start[-1] == '\n' || field_start[-1] == '\r';  // cursor.start test comes first so [-1] is never read at the buffer start
+    if (UNLIKELY(starts_column_one)) {
+      cursor.error("unknown quadratic section record: %.*s", (int)var1.size(), var1.data());
+    }
+    auto var2 = cursor.read_field();
+    if (UNLIKELY(!var2.empty() && var2[0] == '$')) {  // second field starts with '$': skip the line, no entry added
+      cursor.skip_to_eol();
+      expect_eol(cursor);
+      continue;
+    }
+    f_t value = (f_t)expect_number(cursor);
+    add_entry(var1, var2, value);
+    accept_comment(cursor);
+    expect_eol(cursor);
+  }
+
+  if (!quadobj_entries.empty()) {  // QUADOBJ wins when both sections produced entries
+    build_quadratic_csr(state, quadobj_entries, true);  // true: mirror off-diagonal entries
+  } else if (!qmatrix_entries.empty()) {
+    build_quadratic_csr(state, qmatrix_entries, false);  // false: use the entries as given
+  }
+}
+
+// Points state.cursor at the byte range delimited by range.begin and range.end.
+template <typename i_t, typename f_t>
+static void set_cursor_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  auto& shared_cursor = state.cursor;
+  shared_cursor.ptr   = range.begin;
+  shared_cursor.end   = range.end;
+}
+
+// Parses the header phase: positions state.cursor on `range`, runs accept_comment_line, and, if
+// input remains, calls the NAME, OBJSENSE and OBJNAME section parsers in that order.
+template <typename i_t, typename f_t>
+static void parse_header_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  set_cursor_range(state, range);
+  accept_comment_line(state.cursor);
+  if (!state.cursor.done()) {
+    parse_name_section(state);
+    parse_objsense_section(state);
+    parse_objname_section(state);
+  }
+}
+
+// Parses the ROWS phase: positions state.cursor on `range` and runs the ROWS parser up to
+// range.end.
+template <typename i_t, typename f_t>
+static void parse_rows_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  state.cursor.ptr = range.begin;
+  state.cursor.end = range.end;
+  parse_rows_section(state, range.end);
+}
+
+// Parses the COLUMNS phase: positions state.cursor on `range` and forwards num_threads and
+// range.end to the parallel COLUMNS parser.
+template <typename i_t, typename f_t>
+static void parse_columns_range(parse_state_t<i_t, f_t>& state,
+                                mps_phase_range_t range,
+                                int num_threads = 0)
+{
+  const auto section_end = range.end;
+  set_cursor_range(state, range);
+  parse_columns_section_parallel(state, num_threads, section_end);
+}
+
+// Parses the RHS phase through a private cursor over `range`; does nothing when the range is
+// marked as not present.
+template <typename i_t, typename f_t>
+static void parse_rhs_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  if (range.present) {
+    const size_t section_bytes = (size_t)(range.end - range.begin);
+    cursor_t section_cursor(range.begin, section_bytes);
+    parse_rhs_section(state, section_cursor);
+  }
+}
+
+// Applies the BOUNDS phase. A range that is not present gets the no-BOUNDS initialization;
+// otherwise the section is parsed through a private cursor with the parallel dense path allowed.
+template <typename i_t, typename f_t>
+static void parse_bounds_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  if (range.present) {
+    const size_t section_bytes = (size_t)(range.end - range.begin);
+    cursor_t section_cursor(range.begin, section_bytes);
+    parse_bounds_section(state, section_cursor, /*allow_parallel_dense=*/true);
+    return;
+  }
+  init_variable_bounds_without_bounds_section(state);
+}
+
+// Applies the RANGES phase. A range that is not present only initializes the constraint bounds
+// from the row types; otherwise the section is parsed through a private cursor.
+template <typename i_t, typename f_t>
+static void parse_ranges_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  if (range.present) {
+    const size_t section_bytes = (size_t)(range.end - range.begin);
+    cursor_t section_cursor(range.begin, section_bytes);
+    parse_ranges_section(state, section_cursor);
+    return;
+  }
+  init_constraint_bounds_from_rows(state);
+}
+
+// Parses the quadratic phase through a private cursor over `range`; does nothing when the range
+// is marked as not present.
+template <typename i_t, typename f_t>
+static void parse_quadratic_range(parse_state_t<i_t, f_t>& state, mps_phase_range_t range)
+{
+  if (range.present) {
+    const size_t section_bytes = (size_t)(range.end - range.begin);
+    cursor_t section_cursor(range.begin, section_bytes);
+    parse_quadratic_sections(state, section_cursor);
+  }
+}
+
+// Moves every constraint row that has a non-empty QCMATRIX block out of the linear system and
+// into state.problem.quadratic_constraints_, then rebuilds A_ (CSR), b_, the constraint bounds and
+// the row names/types without those rows. Blocks with no entries are ignored. Reports an error
+// through state.cursor if a block's row index is out of range or if two non-empty blocks name the
+// same row.
+template <typename i_t, typename f_t>
+static void finalize_qcmatrix_constraints(parse_state_t<i_t, f_t>& state)
+{
+  if (state.qcmatrix_blocks.empty()) { return; }
+  scoped_timer_t timer("finalize_qcmatrix_constraints");
+  const size_t original_rows = (size_t)state.problem.n_constraints_;
+  // quadratic_rows[r] != 0 marks row r as owned by a non-empty QCMATRIX block. The same flag
+  // detects a second block for the same row, so no separate "seen" array is needed.
+  std::vector<uint8_t> quadratic_rows(original_rows, 0);
+  size_t active_blocks = 0;
+
+  for (const auto& block : state.qcmatrix_blocks) {
+    if (block.entries.empty()) { continue; }
+    if (block.row_idx >= original_rows) {
+      state.cursor.error("QCMATRIX row index is out of range");
+    }
+    if (quadratic_rows[block.row_idx]) {
+      state.cursor.error("duplicate QCMATRIX block for constraint row: %.*s",
+                         (int)block.row_name.size(),
+                         block.row_name.data());
+    }
+    quadratic_rows[block.row_idx] = 1;
+    ++active_blocks;
+  }
+
+  if (active_blocks == 0) { return; }
+
+  // rebuild the A_ matrix. fairly ugly and brute force, could do better if we parsed the QCMATRIX
+  // entries before building the CSR in COLUMNS but unclear if worth it
+  for (const auto& block : state.qcmatrix_blocks) {
+    if (block.entries.empty()) { continue; }
+
+    // Snapshot the row's linear part and metadata before the row is dropped below.
+    size_t linear_begin = (size_t)state.problem.A_offsets_[block.row_idx];
+    size_t linear_end   = (size_t)state.problem.A_offsets_[block.row_idx + 1];
+    typename mps_data_model_t<i_t, f_t>::quadratic_constraint_t qc;
+    qc.constraint_row_index = (i_t)block.row_idx;
+    qc.constraint_row_name  = state.problem.row_names_[block.row_idx];
+    qc.constraint_row_type  = state.problem.row_types_[block.row_idx];
+    qc.rhs_value            = state.problem.b_[block.row_idx];
+    qc.linear_values.assign(state.problem.A_.begin() + linear_begin,
+                            state.problem.A_.begin() + linear_end);
+    qc.linear_indices.assign(state.problem.A_indices_.begin() + linear_begin,
+                             state.problem.A_indices_.begin() + linear_end);
+
+    std::vector<size_t> perm(block.entries.size());
+    for (size_t i = 0; i < perm.size(); ++i) {
+      perm[i] = i;
+    }
+    // Stable sort by (row, col): repeated coordinates keep their order of appearance in the
+    // input, matching the ordering rule build_quadratic_csr uses for the objective.
+    std::stable_sort(perm.begin(), perm.end(), [&](size_t a, size_t b) {
+      const auto& ea = block.entries[a];
+      const auto& eb = block.entries[b];
+      if (std::get<0>(ea) != std::get<0>(eb)) { return std::get<0>(ea) < std::get<0>(eb); }
+      return std::get<1>(ea) < std::get<1>(eb);
+    });
+
+    qc.rows.reserve(block.entries.size());
+    qc.cols.reserve(block.entries.size());
+    qc.vals.reserve(block.entries.size());
+    for (size_t idx : perm) {
+      const auto& [row, col, val] = block.entries[idx];
+      qc.rows.push_back(row);
+      qc.cols.push_back(col);
+      qc.vals.push_back(val);
+    }
+    state.problem.quadratic_constraints_.push_back(std::move(qc));
+  }
+
+  std::vector<f_t> new_A;
+  std::vector<i_t> new_A_indices;
+  std::vector<i_t> new_A_offsets;
+  std::vector<f_t> new_b;
+  std::vector<f_t> new_clb;
+  std::vector<f_t> new_cub;
+  std::vector<std::string> new_row_names;
+  std::vector<char> new_row_types;
+
+  new_A.reserve(state.problem.A_.size());
+  new_A_indices.reserve(state.problem.A_indices_.size());
+  new_A_offsets.reserve(original_rows + 1 - active_blocks);
+  new_b.reserve(original_rows - active_blocks);
+  new_clb.reserve(original_rows - active_blocks);
+  new_cub.reserve(original_rows - active_blocks);
+  new_row_names.reserve(original_rows - active_blocks);
+  new_row_types.reserve(original_rows - active_blocks);
+  new_A_offsets.push_back(0);
+
+  // Copy every remaining (purely linear) row, compacting the CSR as we go.
+  for (size_t row = 0; row < original_rows; ++row) {
+    if (quadratic_rows[row]) { continue; }
+    size_t begin = (size_t)state.problem.A_offsets_[row];
+    size_t end   = (size_t)state.problem.A_offsets_[row + 1];
+    new_A.insert(new_A.end(), state.problem.A_.begin() + begin, state.problem.A_.begin() + end);
+    new_A_indices.insert(new_A_indices.end(),
+                         state.problem.A_indices_.begin() + begin,
+                         state.problem.A_indices_.begin() + end);
+    new_A_offsets.push_back((i_t)new_A.size());
+    new_b.push_back(state.problem.b_[row]);
+    new_clb.push_back(state.problem.constraint_lower_bounds_[row]);
+    new_cub.push_back(state.problem.constraint_upper_bounds_[row]);
+    new_row_names.push_back(std::move(state.problem.row_names_[row]));
+    new_row_types.push_back(state.problem.row_types_[row]);
+  }
+
+  state.problem.A_                       = std::move(new_A);
+  state.problem.A_indices_               = std::move(new_A_indices);
+  state.problem.A_offsets_               = std::move(new_A_offsets);
+  state.problem.b_                       = std::move(new_b);
+  state.problem.constraint_lower_bounds_ = std::move(new_clb);
+  state.problem.constraint_upper_bounds_ = std::move(new_cub);
+  state.problem.row_names_               = std::move(new_row_names);
+  state.problem.row_types_               = std::move(new_row_types);
+  state.problem.n_constraints_           = (i_t)state.problem.b_.size();
+  state.problem.nnz_                     = (i_t)state.problem.A_.size();
+}
+
+template <typename i_t, typename f_t>  // Copies the parsed name views into the problem's owning std::string fields.
+static void materialize_problem_names(parse_state_t<i_t, f_t>& state)
+{
+  scoped_timer_t timer("materialize_problem_names");
+  int num_threads = phase_thread_count(MPS_NAMES_THREAD_CAP);
+  // Copy string_views to actual strings (this is where allocation happens)
+  {
+    scoped_timer_t timer("materialize_problem_scalar_names");
+    state.problem.problem_name_   = std::string(state.problem_name_sv);
+    state.problem.objective_name_ = std::string(state.objective_name_sv);
+  }
+
+  {
+    scoped_timer_t timer("materialize_problem_row_names");
+    size_t n = state.row_names_sv.size();
+    state.problem.row_names_.resize(n);
+    // row names are usually small enough for SSO, so parallel assigns mostly don't touch the heap
+    // and may help a lot. Ideally we could just allocate an arena and store non-owning string
+    // views, but that'd require a refactor of the problem representation
+    if (n >= 1'000'000 && num_threads > 1) {  // go parallel only for at least one million names
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+      for (size_t i = 0; i < n; ++i) {
+        state.problem.row_names_[i].assign(state.row_names_sv[i]);
+      }
+    } else {
+      for (size_t i = 0; i < n; ++i) {
+        state.problem.row_names_[i].assign(state.row_names_sv[i]);
+      }
+    }
+  }
+
+  {
+    scoped_timer_t timer("materialize_problem_var_names");
+    const bool col_dense_ordered = state.col_index_mode == index_mode_t::dense_ordered;
+    size_t n = col_dense_ordered ? (size_t)state.problem.n_vars_ : state.var_names_sv.size();  // dense mode takes the count from n_vars_
+    state.problem.var_names_.resize(n);
+    if (col_dense_ordered && n >= 1'000'000 && num_threads > 1) {  // dense mode: names come from col_dense.format_name
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+      for (size_t i = 0; i < n; ++i) {
+        state.col_dense.format_name(i, state.problem.var_names_[i]);
+      }
+    } else if (col_dense_ordered) {
+      for (size_t i = 0; i < n; ++i) {
+        state.col_dense.format_name(i, state.problem.var_names_[i]);
+      }
+    } else if (n >= 1'000'000 && num_threads > 1) {  // otherwise: names are copied from var_names_sv
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+      for (size_t i = 0; i < n; ++i) {
+        state.problem.var_names_[i].assign(state.var_names_sv[i]);
+      }
+    } else {
+      for (size_t i = 0; i < n; ++i) {
+        state.problem.var_names_[i].assign(state.var_names_sv[i]);
+      }
+    }
+  }
+}
+
+// Appends the variables recorded in state.bounds_only_vars (named in BOUNDS but not found among
+// the COLUMNS variables) to the end of the problem's variable arrays with a zero objective
+// coefficient, then updates n_vars_. If quadratic objective offsets exist, they are padded so they
+// still hold n_vars_ + 1 entries.
+template <typename i_t, typename f_t>
+static void append_bounds_only_variables(parse_state_t<i_t, f_t>& state)
+{
+  if (state.bounds_only_vars.empty()) { return; }
+  scoped_timer_t timer("append_bounds_only_variables");
+
+  // BOUNDS-only variables have no matrix entries; append after COLUMNS vars.
+  for (const auto& [name, aux] : state.bounds_only_vars) {
+    state.problem.var_names_.emplace_back(name);
+    state.problem.var_types_.push_back(aux.type);
+    state.problem.c_.push_back(f_t{0});
+    state.problem.variable_lower_bounds_.push_back(aux.lb);
+    state.problem.variable_upper_bounds_.push_back(aux.ub);
+  }
+  state.problem.n_vars_ = (i_t)state.problem.var_names_.size();
+
+  // build_quadratic_csr sized Q_objective_offsets_ from the variable count before this append.
+  // The appended variables have no quadratic terms, so each gets an empty CSR row by repeating
+  // the final offset. An empty offsets vector (no quadratic objective) is left untouched.
+  auto& q_offsets               = state.problem.Q_objective_offsets_;
+  const size_t required_offsets = (size_t)state.problem.n_vars_ + 1;
+  if (!q_offsets.empty() && q_offsets.size() < required_offsets) {
+    const auto last_offset = q_offsets.back();  // copied first: resize may reallocate
+    q_offsets.resize(required_offsets, last_offset);
+  }
+}
+
+// Resets the scalar fields of `problem` and pre-reserves its per-row and per-variable vectors.
+// The reserve dimension is max(1000, max(reserve_hint, 1 * MiB) / 1000); it is returned so the
+// caller can size related containers the same way.
+template <typename i_t, typename f_t>
+static std::size_t init_problem_storage(mps_data_model_t<i_t, f_t>& problem,
+                                        std::size_t reserve_hint)
+{
+  problem.n_vars_                   = 0;
+  problem.n_constraints_            = 0;
+  problem.nnz_                      = 0;
+  problem.maximize_                 = false;
+  problem.objective_scaling_factor_ = f_t{1};
+  problem.objective_offset_         = f_t{0};
+
+  const std::size_t reserve_bytes = std::max<std::size_t>(reserve_hint, 1 * MiB);
+  const std::size_t reserve_dim   = std::max<std::size_t>(1000, reserve_bytes / 1000);
+
+  // Reserve the same capacity in every listed container, left to right.
+  const auto reserve_each = [reserve_dim](auto&... containers) {
+    (containers.reserve(reserve_dim), ...);
+  };
+  reserve_each(problem.A_offsets_,
+               problem.b_,
+               problem.variable_lower_bounds_,
+               problem.variable_upper_bounds_,
+               problem.var_types_,
+               problem.row_types_,
+               problem.row_names_,
+               problem.var_names_,
+               problem.constraint_lower_bounds_,
+               problem.constraint_upper_bounds_);
+  return reserve_dim;
+}
+
+// Contract every input stream fed to parse_mps_fast_stream must satisfy.
+template <typename Stream>
+concept InputStream = requires(Stream stream)
+{
+  {stream.data()}->std::convertible_to<const char*>;
+  {stream.mutable_data()}->std::convertible_to<char*>;
+  {stream.size()}->std::convertible_to<std::size_t>;  // read by parse_mps_fast_stream after the parallel region
+  {stream.compressed_size()}->std::convertible_to<std::size_t>;
+  {stream.reserve_size_hint()}->std::convertible_to<std::size_t>;  // feeds init_problem_storage and the thread cap
+  {stream.registry()}->std::same_as<mps_phase_registry_t&>;  // exact type required, not merely convertible
+  {stream.view()}->std::same_as<input_stream_view_t>;  // exact type required
+  {stream.run_decode_tasks()}->std::same_as<void>;  // invoked from the producer task
+};
+
+template <InputStream Stream, typename i_t, typename f_t>  // Runs the whole parse of `stream` as an OpenMP task graph.
+static mps_data_model_t<i_t, f_t> parse_mps_fast_stream(Stream& stream,
+                                                        const char* total_timer_name,  // label of the end-to-end timer
+                                                        const char* producer_task_name)  // NVTX label of the producer task
+{
+  omp_max_active_levels_guard_t omp_active_levels(2);  // presumably allows nested parallel regions inside phase tasks - confirm
+
+  input_stream_view_t input = stream.view();
+  auto total_timer          = std::make_unique<scoped_timer_t>(total_timer_name);  // heap-held so reset() below can end it early
+  mps_data_model_t<i_t, f_t> problem;
+  std::size_t reserve_dim = init_problem_storage(problem, stream.reserve_size_hint());
+
+  cursor_t cursor(input.data, 0);  // created with length 0; its end is set after the parallel region
+  parse_state_t<i_t, f_t> state(problem, cursor);
+  state.row_names_sv.reserve(reserve_dim);
+
+  auto phase_end = [](const char*) { flush_timers(); };  // the phase name argument is ignored
+
+  parallel_error_latch_t parser_tasks;  // holds exceptions captured inside tasks; rethrown after the region
+
+  auto run_parser_task = [&](auto&& fn) {  // runs fn unless the latch reports stopped; exceptions are captured, not propagated
+    if (parser_tasks.stopped()) { return; }
+    try {
+      fn();
+    } catch (...) {
+      parser_tasks.capture(std::current_exception());
+    }
+  };
+
+  auto unblock_phase_waiters_after_error = [&]() {  // publishes an empty, not-present range for every phase
+    mps_phase_range_t empty{input.data, input.data, false};
+    input.registry->publish(mps_phase_kind::header, empty);
+    input.registry->publish(mps_phase_kind::rows, empty);
+    input.registry->publish(mps_phase_kind::columns, empty);
+    input.registry->publish(mps_phase_kind::rhs, empty);
+    input.registry->publish(mps_phase_kind::bounds, empty);
+    input.registry->publish(mps_phase_kind::ranges, empty);
+    input.registry->publish(mps_phase_kind::quadratic, empty);
+  };
+
+  // These ints carry no data; they exist only as OpenMP task-dependency tokens. A task's
+  // depend(out: X) "produces" X and depend(in: X) waits on it, so the phase ordering in the
+  // task graph below (e.g. bounds after columns_done, because bounds reference variable names)
+  // is expressed purely through which tokens each task depends on.
+  int header_ready = 0, rows_ready = 0, columns_ready = 0;
+  int rhs_ready = 0, bounds_ready = 0, ranges_ready = 0, quadratic_ready = 0;
+  int header_done = 0, rows_done = 0, columns_done = 0;
+  int rhs_done = 0, bounds_done = 0, ranges_done = 0, quadratic_done = 0, names_done = 0;
+  int csr_done = 0;
+
+  const std::size_t parser_size = std::max(stream.reserve_size_hint(), input.compressed_size);  // larger of the two size estimates
+  const int parser_threads      = parser_thread_cap_for_size(parser_size);
+
+#pragma omp parallel num_threads(parser_threads)
+  {
+    std::string thread_name = "omp-parser-" + std::to_string(omp_get_thread_num());  // per-thread label handed to NVTX
+    nvtx::name_current_thread(thread_name.c_str());
+
+#pragma omp single
+    {
+      // Bridge between the producer and the parse tasks: each detached task below blocks
+      // until run_decode_tasks() publishes that phase's byte range into the registry, then
+      // completes its event and fulfills depend(out: <phase>_ready) -- releasing the matching
+      // parse task. This is what lets ROWS parsing start the instant the ROWS bytes are
+      // decoded, overlapping with the decode of later sections.
+      omp_event_handle_t ev_header;
+#pragma omp task detach(ev_header) depend(out : header_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::header, ev_header);
+      }
+      omp_event_handle_t ev_rows;
+#pragma omp task detach(ev_rows) depend(out : rows_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::rows, ev_rows);
+      }
+      omp_event_handle_t ev_columns;
+#pragma omp task detach(ev_columns) depend(out : columns_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::columns, ev_columns);
+      }
+      omp_event_handle_t ev_rhs;
+#pragma omp task detach(ev_rhs) depend(out : rhs_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::rhs, ev_rhs);
+      }
+      omp_event_handle_t ev_bounds;
+#pragma omp task detach(ev_bounds) depend(out : bounds_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::bounds, ev_bounds);
+      }
+      omp_event_handle_t ev_ranges;
+#pragma omp task detach(ev_ranges) depend(out : ranges_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::ranges, ev_ranges);
+      }
+      omp_event_handle_t ev_quadratic;
+#pragma omp task detach(ev_quadratic) depend(out : quadratic_ready)
+      {
+        input.registry->attach_event(mps_phase_kind::quadratic, ev_quadratic);
+      }
+
+      // We intentionally keep LZ4/raw input as a stable full-buffer producer here. The
+      // progressive decoded-page lifetime prototype saved RSS, but made COLUMNS/merge slower
+      // and really wants a separate memory-limited parser pipeline instead of this fast path.
+#pragma omp task
+      {
+        MPS_NVTX_RANGE(producer_task_name, nvtx::colors::io);
+        try {
+          stream.run_decode_tasks();
+        } catch (...) {  // a failed decode is latched, then every phase is published as empty
+          parser_tasks.capture(std::current_exception());
+          unblock_phase_waiters_after_error();
+        }
+      }
+
+#pragma omp task depend(in : header_ready) depend(out : header_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_header", nvtx::colors::generic);
+          parse_header_range(state, input.registry->range(mps_phase_kind::header));
+          phase_end("header");
+        });
+      }
+
+#pragma omp task depend(in : rows_ready, header_done) depend(out : rows_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_rows", nvtx::colors::rows);
+          parse_rows_range(state, input.registry->range(mps_phase_kind::rows));
+          phase_end("rows");
+        });
+      }
+
+#pragma omp task depend(in : rows_done, columns_ready) depend(out : columns_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_columns", nvtx::colors::columns);
+          parse_columns_range(state, input.registry->range(mps_phase_kind::columns));  // num_threads left at its default of 0
+          phase_end("columns");
+        });
+      }
+
+#pragma omp task depend(in : columns_done) depend(out : names_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_materialize_names", nvtx::colors::names);
+          scoped_timer_t timer("materialize_problem_names_task");
+          materialize_problem_names(state);
+        });
+      }
+
+#pragma omp task depend(in : columns_done) depend(out : csr_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_materialize_csr", nvtx::colors::alloc);
+          materialize_problem_csr(state);
+        });
+      }
+
+#pragma omp task depend(in : rhs_ready, columns_done) depend(out : rhs_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_rhs", nvtx::colors::rhs);
+          parse_rhs_range(state, input.registry->range(mps_phase_kind::rhs));
+          phase_end("rhs");
+        });
+      }
+
+#pragma omp task depend(in : ranges_ready, rhs_done) depend(out : ranges_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_ranges", nvtx::colors::ranges);
+          parse_ranges_range(state, input.registry->range(mps_phase_kind::ranges));  // ordered after rhs_done: range bounds are derived from b_
+          phase_end("ranges");
+        });
+      }
+
+#pragma omp task depend(in : bounds_ready, columns_done) depend(out : bounds_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_bounds", nvtx::colors::bounds);
+          parse_bounds_range(state, input.registry->range(mps_phase_kind::bounds));
+          phase_end("bounds");
+        });
+      }
+
+#pragma omp task depend(in : quadratic_ready, columns_done) depend(out : quadratic_done)
+      {
+        run_parser_task([&] {
+          MPS_NVTX_RANGE("task_quadratic", nvtx::colors::generic);
+          parse_quadratic_range(state, input.registry->range(mps_phase_kind::quadratic));
+          phase_end("quadratic");
+        });
+      }
+    }
+  }
+
+  parser_tasks.rethrow_if_error();  // surface a captured task exception on the calling thread
+
+  finalize_qcmatrix_constraints(state);  // serial post-processing, after the parallel region has ended
+  append_bounds_only_variables(state);
+
+  input.size = stream.size();
+  cursor.end = input.data + input.size;
+  if (!input.registry->endata_ready()) {
+    cursor.ptr = input.data + input.size;  // position the cursor at end of input before reporting
+    cursor.error("input ended before ENDATA boundary was resolved");
+  }
+  if (input.registry->endata_present()) {  // when ENDATA was found, verify the keyword at its recorded position
+    cursor.ptr = input.registry->endata_begin();
+    expect(cursor, "ENDATA");
+  }
+
+  total_timer.reset();  // destroy the total timer before the final flush
+  flush_timers();
+  return problem;
+}
+
+struct padded_memory_input_t {  // result bundle returned by read_compressed_mps_file
+  std::vector<char> buffer;  // bytes from file_to_string, after ensure_input_buffer_padding
+  std::size_t input_size      = 0;  // buffer.size() - 1, measured before padding
+  std::size_t compressed_size = 0;  // get_file_size(path)
+};
+
+// Loads `path` through file_to_string and packages the bytes for the in-memory stream.
+// input_size is the buffer length minus one (an empty buffer first receives a single '\0'),
+// the buffer is then passed to ensure_input_buffer_padding, and compressed_size is the on-disk
+// file size.
+static padded_memory_input_t read_compressed_mps_file(const std::string& path)
+{
+  padded_memory_input_t loaded;
+  loaded.buffer = file_to_string(path);
+  if (loaded.buffer.empty()) { loaded.buffer.push_back('\0'); }
+
+  loaded.input_size = loaded.buffer.size() - 1;
+  ensure_input_buffer_padding(loaded.buffer, loaded.input_size);
+  loaded.compressed_size = get_file_size(path);
+  return loaded;
+}
+
+// Entry point of the fast MPS parser. Resolves the effective read method for `path`, builds the
+// matching input stream (raw, LZ4, or fully loaded in memory for gzip/bzip2) and runs
+// parse_mps_fast_stream on it.
+template <typename i_t, typename f_t>
+mps_data_model_t<i_t, f_t> parse_mps_fast_file(const std::string& path, FileReadMethod read_method)
+{
+  const FileReadMethod method = effective_file_read_method(path, read_method);
+  switch (method) {
+    case FileReadMethod::Read: {
+      raw_input_stream_t raw(path);
+      return parse_mps_fast_stream<raw_input_stream_t, i_t, f_t>(
+        raw, "parse_mps_fast_file_raw (total)", "task_raw_read");
+    }
+    case FileReadMethod::Lz4: {
+      lz4_input_stream_t lz4(path);
+      return parse_mps_fast_stream<lz4_input_stream_t, i_t, f_t>(
+        lz4, "parse_mps_fast_file_lz4 (total)", "task_lz4_read_decode");
+    }
+    case FileReadMethod::Gzip:
+    case FileReadMethod::Bzip2: {
+      padded_memory_input_t loaded = read_compressed_mps_file(path);
+      memory_input_stream_t in_memory(
+        std::move(loaded.buffer), loaded.input_size, loaded.compressed_size);
+      // Both formats share this path; only the timer label differs.
+      const char* total_label = "parse_mps_fast_file_bzip2 (total)";
+      if (method == FileReadMethod::Gzip) { total_label = "parse_mps_fast_file_gzip (total)"; }
+      return parse_mps_fast_stream<memory_input_stream_t, i_t, f_t>(
+        in_memory, total_label, "task_memory_scan");
+    }
+  }
+  __builtin_unreachable();
+}
+
+template mps_data_model_t<int, float> parse_mps_fast_file(const std::string& path,  // explicit instantiations: {int, int64_t} x {float, double}
+                                                          FileReadMethod read_method);
+template mps_data_model_t<int, double> parse_mps_fast_file(const std::string& path,
+                                                           FileReadMethod read_method);
+template mps_data_model_t<int64_t, float> parse_mps_fast_file(const std::string& path,
+                                                              FileReadMethod read_method);
+template mps_data_model_t<int64_t, double> parse_mps_fast_file(const std::string& path,
+                                                               FileReadMethod read_method);
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/fast_parser.hpp b/cpp/src/io/experimental_mps_fast/fast_parser.hpp
new file mode 100644
index 0000000000..ea0ec6fd99
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/fast_parser.hpp
@@ -0,0 +1,22 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "file_reader.hpp"
+
+#include <cuopt/mathematical_optimization/io/mps_data_model.hpp>
+
+#include <cstddef>
+#include <string>
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+template <typename i_t, typename f_t>
+using parser_model_t = mps_data_model_t<i_t, f_t>;
+
+template <typename i_t, typename f_t>
+parser_model_t<i_t, f_t> parse_mps_fast_file(const std::string& path,
+                                             FileReadMethod read_method = FileReadMethod::Read);
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.cpp b/cpp/src/io/experimental_mps_fast/file_reader.cpp
new file mode 100644
index 0000000000..2d1fa083b6
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/file_reader.cpp
@@ -0,0 +1,371 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "file_reader.hpp"
+#include "nvtx_ranges.hpp"
+
+#include <utilities/error.hpp>
+#include <utilities/scope_guard.hpp>
+
+#include <cuda/cmath>
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cctype>
+#include <cerrno>
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+using cuopt::mathematical_optimization::io::error_type_t;
+using cuopt::mathematical_optimization::io::mps_parser_fail;
+
+namespace {
+
+constexpr std::size_t raw_input_window_bytes              = 64ull * 1024ull * 1024ull;
+constexpr std::size_t raw_input_max_read_threads          = 8;
+constexpr std::size_t raw_input_direct_io_threshold_bytes = 1ull * 1024ull * 1024ull * 1024ull;
+constexpr long nfs_super_magic                            = 0x6969;
+
+// Case-insensitive test that `path` ends with `suffix`; `suffix` must already be lower-case.
+bool path_has_suffix(const std::string& path, const char* suffix) noexcept
+{
+  const std::size_t n = std::strlen(suffix);
+  if (n > path.size()) { return false; }
+  // Go through unsigned char so std::tolower never sees a negative value.
+  return std::equal(suffix, suffix + n, path.end() - n, [](char want, unsigned char have) {
+    return std::tolower(have) == want;
+  });
+}
+
+std::size_t add_input_padding(std::size_t size)
+{
+  if (size > std::numeric_limits<std::size_t>::max() - input_buffer_padding_bytes) {
+    mps_parser_fail(error_type_t::OutOfMemoryError, "input padding size overflow");
+  }
+  return size + input_buffer_padding_bytes;
+}
+
+bool is_nfs_backed_path(const std::string& path) noexcept
+{
+  struct statfs fs;
+  return ::statfs(path.c_str(), &fs) == 0 && fs.f_type == nfs_super_magic;
+}
+
+}  // namespace
+
+void ensure_input_buffer_padding(std::vector<char>& buffer, std::size_t input_size)
+{
+  if (input_size > buffer.size()) {
+    mps_parser_fail(error_type_t::ValidationError,
+                    "input_size %zu exceeds buffer size %zu",
+                    input_size,
+                    buffer.size());
+  }
+  std::size_t required = add_input_padding(input_size);
+  if (buffer.size() < required) { buffer.resize(required, '\0'); }
+}
+
+std::size_t get_file_size(int fd, const std::string& path)
+{
+  struct stat st;
+  if (::fstat(fd, &st) != 0) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to stat file '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
+  }
+  if (st.st_size < 0) {
+    mps_parser_fail(error_type_t::RuntimeError, "Negative file size for '%s'", path.c_str());
+  }
+  return (std::size_t)st.st_size;
+}
+
+std::size_t get_file_size(const std::string& path)
+{
+  int fd = ::open(path.c_str(), O_RDONLY);
+  if (fd < 0) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to open file '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
+  }
+  // close_fd is the only owner of fd. An extra explicit ::close(fd) here would close the same
+  // descriptor number twice and could hit a file opened meanwhile by another thread.
+  cuopt::scope_guard close_fd([&] {
+    if (fd >= 0) { ::close(fd); }
+  });
+
+  return get_file_size(fd, path);
+}
+
+std::size_t system_page_size()
+{
+  // sysconf() runs on the first call only; a non-positive answer falls back to 4096.
+  static const std::size_t cached = [](long reported) {
+    return reported > 0 ? (std::size_t)reported : (std::size_t)4096;
+  }(::sysconf(_SC_PAGESIZE));
+  return cached;
+}
+
+// Reads exactly `bytes` bytes at `offset` into `dst`, looping over short reads and EINTR.
+bool pread_full(int fd, char* dst, std::size_t bytes, std::size_t offset)
+{
+  // pread() takes a size_t count but reports progress as ssize_t, so cap each request.
+  constexpr std::size_t max_request = (std::size_t)std::numeric_limits<ssize_t>::max();
+  for (std::size_t filled = 0; filled < bytes;) {
+    const std::size_t request = std::min<std::size_t>(bytes - filled, max_request);
+    const ssize_t result      = ::pread(fd, dst + filled, request, (off_t)(offset + filled));
+    if (result > 0) {
+      filled += (std::size_t)result;  // short reads are fine; continue from the new offset
+    } else if (result == 0) {
+      errno = EIO;  // EOF before `bytes` were read: report it as an I/O error
+      return false;
+    } else if (errno != EINTR) {
+      return false;  // real failure; errno is left as set by pread()
+    }
+    // result < 0 with EINTR: retry the same request
+  }
+  return true;
+}
+
+raw_input_stream_t::raw_input_stream_t(const std::string& path) : path_(path)
+{
+  MPS_NVTX_RANGE("raw_input_construct", nvtx::colors::io);
+  int buffered_fd = ::open(path.c_str(), O_RDONLY);
+  cuopt::scope_guard close_buffered([&] {
+    if (buffered_fd >= 0) { ::close(buffered_fd); }
+  });
+  if (buffered_fd < 0) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to open raw MPS file '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
+  }
+
+  int direct_fd = -1;
+  cuopt::scope_guard close_direct([&] {
+    if (direct_fd >= 0) { ::close(direct_fd); }
+  });
+
+  file_size_                   = get_file_size(buffered_fd, path);
+  int read_fd                  = buffered_fd;
+  bool large_enough_for_direct = file_size_ > raw_input_direct_io_threshold_bytes;
+  bool nfs_backed              = is_nfs_backed_path(path);
+  // Buffered reads are consistently faster than O_DIRECT on our NFS mounts;
+  // keep direct I/O for large local files where it wins.
+  if (large_enough_for_direct && !nfs_backed) {
+#ifdef O_DIRECT
+    direct_fd = ::open(path.c_str(), O_RDONLY | O_DIRECT);
+    if (direct_fd >= 0) {
+      read_fd    = direct_fd;
+      direct_io_ = true;
+    }
+#endif
+  }
+  window_bytes_ = raw_input_window_bytes;
+  window_count_ = std::max<std::size_t>(1, (file_size_ + window_bytes_ - 1) / window_bytes_);
+#ifdef MPS_FAST_TIMERS
+  read_window_ms_.assign(window_count_, 0);
+#endif
+
+  output_mapped_size_ =
+    cuda::round_up(std::max<std::size_t>(add_input_padding(file_size_), 1), system_page_size());
+  output_region_ = mmap_region_t::anonymous(
+    output_mapped_size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, "raw input buffer");
+  output_data_ = output_region_.char_data();
+  output_region_.advise(MADV_HUGEPAGE);
+
+  section_scanner_ =
+    std::make_unique<mps_section_block_scanner_t>(output_data_, window_count_, registry_);
+
+  buffered_fd_ = buffered_fd;
+  buffered_fd  = -1;
+  fd_          = read_fd;
+  if (read_fd == direct_fd) { direct_fd = -1; }
+}
+
+raw_input_stream_t::~raw_input_stream_t()
+{
+  if (fd_ >= 0) { ::close(fd_); }
+  if (buffered_fd_ >= 0 && buffered_fd_ != fd_) { ::close(buffered_fd_); }
+}
+
+const char* raw_input_stream_t::data() const noexcept { return output_data_; }
+char* raw_input_stream_t::mutable_data() noexcept { return output_data_; }
+std::size_t raw_input_stream_t::size() const noexcept { return output_view_size_; }
+std::size_t raw_input_stream_t::compressed_size() const noexcept { return file_size_; }
+std::size_t raw_input_stream_t::reserve_size_hint() const noexcept { return file_size_; }
+
+void raw_input_stream_t::read_window_payload(std::size_t offset, std::size_t size)
+{
+  if (pread_full(fd_, output_data_ + offset, size, offset)) { return; }
+  // O_DIRECT can reject an unaligned request with EINVAL; fall back to the
+  // buffered descriptor for this window when that happens.
+  if (direct_io_ && errno == EINVAL && buffered_fd_ >= 0 &&
+      pread_full(buffered_fd_, output_data_ + offset, size, offset)) {
+    return;
+  }
+  mps_parser_fail(error_type_t::RuntimeError,
+                  "Failed to pread raw MPS file '%s': %s",
+                  path_.c_str(),
+                  std::strerror(errno));
+}
+
+void raw_input_stream_t::run_decode_tasks()
+{
+  MPS_NVTX_RANGE("raw_input_run_read_tasks", nvtx::colors::io);
+  if (file_size_ == 0) {
+    output_view_size_ = 0;
+    section_scanner_->publish_ready(0);
+    return;
+  }
+
+  std::size_t hw_threads =
+    std::max<std::size_t>(1, (std::size_t)std::thread::hardware_concurrency());
+  std::size_t thread_count = std::min(raw_input_max_read_threads, hw_threads);
+  thread_count             = std::max<std::size_t>(1, std::min(thread_count, window_count_));
+
+  // Each window is read independently and handed to the scanner, which owns the
+  // contiguous decoded-byte frontier and the parallel section publication.
+  parallel_error_latch_t latch;
+#ifdef MPS_FAST_TIMERS
+  auto read_wall_start = std::chrono::steady_clock::now();
+#endif
+  parallel_for_indexed(
+    window_count_, thread_count, latch, "raw-input-read-", [&](std::size_t index) {
+      MPS_NVTX_RANGE("raw_window_read", nvtx::colors::io);
+      std::size_t offset = index * window_bytes_;
+      std::size_t size   = std::min(window_bytes_, file_size_ - offset);
+      {
+        MPS_NVTX_RANGE("raw_window_pread", nvtx::colors::io);
+#ifdef MPS_FAST_TIMERS
+        auto start = std::chrono::steady_clock::now();
+#endif
+        read_window_payload(offset, size);
+#ifdef MPS_FAST_TIMERS
+        auto end     = std::chrono::steady_clock::now();
+        auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+        read_window_ms_[index] =
+          (uint32_t)std::min<long long>(elapsed.count(), std::numeric_limits<uint32_t>::max());
+#endif
+      }
+      MPS_NVTX_RANGE("raw_window_scan_publish", nvtx::colors::io);
+      section_scanner_->observe_block(index, output_data_ + offset, output_data_ + offset + size);
+    });
+#ifdef MPS_FAST_TIMERS
+  auto read_wall_end = std::chrono::steady_clock::now();
+#endif
+  latch.rethrow_if_error();
+
+#ifdef MPS_FAST_TIMERS
+  if (!read_window_ms_.empty()) {
+    std::vector<uint32_t> sorted = read_window_ms_;
+    std::sort(sorted.begin(), sorted.end());
+    auto percentile = [&](double pct) {
+      std::size_t idx = (std::size_t)std::min<double>((double)(sorted.size() - 1),
+                                                      pct * (double)(sorted.size() - 1));
+      return sorted[idx];
+    };
+    uint64_t total_ms = 0;
+    for (uint32_t value : read_window_ms_) {
+      total_ms += value;
+    }
+    std::fprintf(
+      stderr,
+      "[RAW_READ_LATENCY] windows=%zu wall_ms=%lld total_window_ms=%llu avg_ms=%.3f min_ms=%u "
+      "p50_ms=%u p90_ms=%u p99_ms=%u max_ms=%u\n",
+      read_window_ms_.size(),
+      (long long)std::chrono::duration_cast<std::chrono::milliseconds>(read_wall_end -
+                                                                       read_wall_start)
+        .count(),
+      (unsigned long long)total_ms,
+      (double)total_ms / (double)read_window_ms_.size(),
+      sorted.front(),
+      percentile(0.50),
+      percentile(0.90),
+      percentile(0.99),
+      sorted.back());
+  }
+#endif
+
+  output_view_size_ = section_scanner_->ready_bytes();
+  section_scanner_->publish_ready(output_view_size_);
+}
+
+memory_input_stream_t::memory_input_stream_t(std::vector<char> buffer,
+                                             std::size_t input_size,
+                                             std::size_t compressed_size)
+  : buffer_(std::move(buffer)), input_size_(input_size), compressed_size_(compressed_size)
+{
+  ensure_input_buffer_padding(buffer_, input_size_);
+  section_scanner_ = std::make_unique<mps_section_block_scanner_t>(buffer_.data(), 1, registry_);
+}
+
+const char* memory_input_stream_t::data() const noexcept { return buffer_.data(); }
+char* memory_input_stream_t::mutable_data() noexcept { return buffer_.data(); }
+std::size_t memory_input_stream_t::size() const noexcept { return input_size_; }
+std::size_t memory_input_stream_t::compressed_size() const noexcept { return compressed_size_; }
+std::size_t memory_input_stream_t::reserve_size_hint() const noexcept { return input_size_; }
+
+void memory_input_stream_t::run_decode_tasks()
+{
+  MPS_NVTX_RANGE("memory_input_scan", nvtx::colors::io);
+  // Single block: observe_block advances the frontier and publishes.
+  section_scanner_->observe_block(0, buffer_.data(), buffer_.data() + input_size_);
+}
+
+bool has_lz4_extension(const std::string& path) noexcept { return path_has_suffix(path, ".lz4"); }
+bool has_gzip_extension(const std::string& path) noexcept { return path_has_suffix(path, ".gz"); }
+bool has_bzip2_extension(const std::string& path) noexcept { return path_has_suffix(path, ".bz2"); }
+
+void drop_file_cache(const std::string& path)
+{
+  MPS_NVTX_RANGE("drop_file_cache", nvtx::colors::io);
+  int fd = ::open(path.c_str(), O_RDONLY);
+  if (fd < 0) { return; }
+  ::posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
+  ::close(fd);
+}
+
+FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method)
+{
+  if (has_lz4_extension(path)) { return FileReadMethod::Lz4; }
+  if (has_gzip_extension(path)) { return FileReadMethod::Gzip; }
+  if (has_bzip2_extension(path)) { return FileReadMethod::Bzip2; }
+  if (method == FileReadMethod::Lz4) {
+    mps_parser_fail(
+      error_type_t::ValidationError, "lz4 read method requires a .lz4 input: %s", path.c_str());
+  }
+  return method;
+}
+
+const char* file_read_method_name(FileReadMethod method) noexcept
+{
+  switch (method) {
+    case FileReadMethod::Read: return "read";
+    case FileReadMethod::Lz4: return "lz4";
+    case FileReadMethod::Gzip: return "gzip";
+    case FileReadMethod::Bzip2: return "bzip2";
+    default: return "unknown";
+  }
+}
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/file_reader.hpp b/cpp/src/io/experimental_mps_fast/file_reader.hpp
new file mode 100644
index 0000000000..c2df168ac6
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/file_reader.hpp
@@ -0,0 +1,319 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Input layer for the fast MPS parser: turns on-disk bytes (plain or .lz4) into one
+// contiguous parse buffer and publishes MPS section boundaries as data becomes available.
+//
+// Model:
+//   - Output is an anonymous mmap'd buffer (THP-hinted, tail-padded for SIMD/cursor safety).
+//     Raw inputs pread directly into fixed slots; LZ4 decodes into the same layout.
+//   - Work is split into windows (fixed spans of compressed/raw file bytes). Workers use
+//     parallel_for_indexed() — std::thread + shared-index dispatch, not OpenMP — because
+//     blocking pread()/decode does not compose cleanly with OMP team barriers.
+//   - Each completed window/block is handed to mps_section_block_scanner_t::observe_block().
+//     Blocks may finish out of order; the scanner advances a contiguous ready_bytes_
+//     frontier and publishes section ranges into mps_phase_registry_t only once the prefix
+//     up to a section title is contiguous and scannable.
+//   - The parser runs as OpenMP tasks on those published phases while run_decode_tasks()
+//     (raw parallel pread, or the LZ4 reader → metadata scanner → decoder pipeline) fills
+//     the buffer on separate threads. parallel_error_latch_t propagates the first worker
+//     failure and stops the rest.
+//
+// LZ4 adds a resident-window pool (parallel pread of compressed spans), block metadata
+// scanning with ptr_if_contiguous()/copy_to for window-boundary payloads, parallel decode
+// workers, window ref-counting/release, and lazy commit_up_to() of decoded output pages.
+
+#pragma once
+
+#include "mmap_region.hpp"
+#include "mps_section_scanner.hpp"
+#include "nvtx_ranges.hpp"
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+inline constexpr std::size_t input_buffer_padding_bytes = 64;
+
+void ensure_input_buffer_padding(std::vector<char>& buffer, std::size_t input_size);
+
+struct lz4_pipeline_t;
+
+/**
+ * @brief File reading method selection
+ */
+enum class FileReadMethod { Read, Lz4, Gzip, Bzip2 };
+
+/**
+ * @brief Return the effective method for a path; a .lz4/.gz/.bz2 extension overrides @p method.
+ *
+ * Any other path keeps @p method as given, except that Lz4 without a .lz4 extension is rejected.
+ */
+FileReadMethod effective_file_read_method(const std::string& path, FileReadMethod method);
+
+/**
+ * @brief Human-readable method name.
+ */
+const char* file_read_method_name(FileReadMethod method) noexcept;
+
+/**
+ * @brief True when the file name ends in .lz4, .gz, or .bz2 respectively (case-insensitive).
+ */
+bool has_lz4_extension(const std::string& path) noexcept;
+bool has_gzip_extension(const std::string& path) noexcept;
+bool has_bzip2_extension(const std::string& path) noexcept;
+
+/**
+ * @brief Ask the OS to evict clean cached pages for this file.
+ *
+ * This is advisory and affects the local client page cache only.
+ */
+void drop_file_cache(const std::string& path);
+
+/**
+ * @brief OS memory page size, queried once and cached.
+ */
+std::size_t system_page_size();
+
+/**
+ * @brief File size in bytes; fails with a parser error if it cannot be determined.
+ */
+std::size_t get_file_size(int fd, const std::string& path);
+std::size_t get_file_size(const std::string& path);
+
+/**
+ * @brief Read exactly @p bytes at @p offset into @p dst, retrying on EINTR.
+ *
+ * Returns false and leaves errno set on error or unexpected EOF.
+ */
+bool pread_full(int fd, char* dst, std::size_t bytes, std::size_t offset);
+
+// First-error-wins latch shared by the parallel reader/decoder pipelines. The
+// first captured exception is retained and a stop flag is raised so cooperating
+// workers can unwind promptly. The retained exception is rethrown by the
+// orchestrating thread once all workers have joined.
+class parallel_error_latch_t {
+ public:
+  void capture(std::exception_ptr eptr)
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (!first_error_) {
+      first_error_ = eptr;
+      stopped_.store(true, std::memory_order_release);
+    }
+  }
+
+  bool stopped() const noexcept { return stopped_.load(std::memory_order_acquire); }
+
+  void rethrow_if_error() const
+  {
+    if (first_error_) { std::rethrow_exception(first_error_); }
+  }
+
+ private:
+  std::mutex mutex_;
+  std::exception_ptr first_error_ = nullptr;
+  std::atomic_bool stopped_{false};
+};
+
+class scoped_thread_group {
+ public:
+  void reserve(std::size_t count) { threads_.reserve(count); }
+
+  template <typename F>
+  void emplace(F&& f)
+  {
+    threads_.emplace_back(std::forward<F>(f));
+  }
+
+  ~scoped_thread_group()
+  {
+    for (auto& thread : threads_) {
+      if (thread.joinable()) { thread.join(); }
+    }
+  }
+
+ private:
+  std::vector<std::thread> threads_;
+};
+
+// Dynamically scheduled parallel loop over [0, count). Each of thread_count workers
+// claims the next index from a shared atomic counter and invokes body(index). An
+// exception escaping body is captured into the latch and stops the loop; the caller
+// must call latch.rethrow_if_error() after this returns (all workers are joined by
+// then). Workers are named "<thread_name_prefix><worker-id>" when a prefix is given.
+// Plain std::thread is used because blocking pread() does not compose well with OpenMP.
+template <typename Body>
+void parallel_for_indexed(std::size_t count,
+                          std::size_t thread_count,
+                          parallel_error_latch_t& latch,
+                          const char* thread_name_prefix,
+                          Body body)
+{
+  // Zero workers would silently skip every index, so always run at least one.
+  if (thread_count == 0) { thread_count = 1; }
+  std::atomic_size_t next{0};
+  scoped_thread_group workers;
+  workers.reserve(thread_count);
+  for (std::size_t t = 0; t < thread_count; ++t) {
+    workers.emplace([&, t] {
+      if (thread_name_prefix != nullptr) {
+        std::string name = thread_name_prefix + std::to_string(t);
+        nvtx::name_current_thread(name.c_str());
+      }
+      while (!latch.stopped()) {
+        std::size_t index = next.fetch_add(1, std::memory_order_relaxed);
+        if (index >= count) { break; }
+        try {
+          body(index);
+        } catch (...) {
+          latch.capture(std::current_exception());
+          return;
+        }
+      }
+    });
+  }
+}
+
+struct input_stream_view_t {
+  const char* data               = nullptr;
+  char* mutable_data             = nullptr;
+  std::size_t size               = 0;
+  std::size_t compressed_size    = 0;
+  mps_phase_registry_t* registry = nullptr;
+};
+
+/**
+ * @brief CRTP base supplying the registry and view() shared by every input
+ * stream. Derived classes provide data()/mutable_data()/size()/compressed_size().
+ */
+template <typename Derived>
+class input_stream_base_t {
+ public:
+  mps_phase_registry_t& registry() noexcept { return registry_; }
+
+  input_stream_view_t view() noexcept
+  {
+    auto* self = static_cast<Derived*>(this);
+    return {self->data(), self->mutable_data(), self->size(), self->compressed_size(), &registry_};
+  }
+
+ protected:
+  mps_phase_registry_t registry_;
+};
+
+// Handles lz4 compressed files (useful since lz4 is very fast, works well for MPS, and makes
+// parallel decompression trivial)
+class lz4_input_stream_t : public input_stream_base_t<lz4_input_stream_t> {
+ public:
+  explicit lz4_input_stream_t(const std::string& path);
+  ~lz4_input_stream_t();
+
+  lz4_input_stream_t(const lz4_input_stream_t&)            = delete;
+  lz4_input_stream_t& operator=(const lz4_input_stream_t&) = delete;
+
+  const char* data() const noexcept;
+  char* mutable_data() noexcept;
+  std::size_t size() const noexcept;
+  std::size_t compressed_size() const noexcept;
+  std::size_t reserve_size_hint() const noexcept;
+
+  void run_decode_tasks();
+
+ private:
+  friend struct lz4_pipeline_t;
+
+  void commit_up_to(std::size_t bytes);
+
+  std::string path_;
+  int fd_ = -1;
+  mmap_region_t output_region_;
+  std::size_t compressed_size_       = 0;
+  char* output_data_                 = nullptr;
+  std::size_t output_mapped_size_    = 0;
+  std::size_t output_view_size_      = 0;
+  std::size_t output_committed_size_ = 0;
+  std::size_t block_max_size_        = 0;
+  std::size_t content_size_          = 0;
+  std::size_t header_size_           = 0;
+  bool content_size_present_         = false;
+  bool block_checksum_               = false;
+  bool content_checksum_             = false;
+  bool dict_id_                      = false;
+  std::mutex commit_mutex_;
+  std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
+  std::size_t block_slot_count_ = 0;
+};
+
+// Takes a file path
+class raw_input_stream_t : public input_stream_base_t<raw_input_stream_t> {
+ public:
+  explicit raw_input_stream_t(const std::string& path);
+  ~raw_input_stream_t();
+
+  raw_input_stream_t(const raw_input_stream_t&)            = delete;
+  raw_input_stream_t& operator=(const raw_input_stream_t&) = delete;
+
+  const char* data() const noexcept;
+  char* mutable_data() noexcept;
+  std::size_t size() const noexcept;
+  std::size_t compressed_size() const noexcept;
+  std::size_t reserve_size_hint() const noexcept;
+
+  void run_decode_tasks();
+
+ private:
+  void read_window_payload(std::size_t offset, std::size_t size);
+
+  std::string path_;
+  int fd_          = -1;
+  int buffered_fd_ = -1;
+  bool direct_io_  = false;
+  mmap_region_t output_region_;
+  char* output_data_              = nullptr;
+  std::size_t output_mapped_size_ = 0;
+  std::size_t output_view_size_   = 0;
+  std::size_t file_size_          = 0;
+  std::size_t window_bytes_       = 0;
+  std::size_t window_count_       = 0;
+#ifdef MPS_FAST_TIMERS
+  std::vector<uint32_t> read_window_ms_;
+#endif
+  std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
+};
+
+// Takes an in-memory buffer
+class memory_input_stream_t : public input_stream_base_t<memory_input_stream_t> {
+ public:
+  memory_input_stream_t(std::vector<char> buffer,
+                        std::size_t input_size,
+                        std::size_t compressed_size);
+
+  memory_input_stream_t(const memory_input_stream_t&)            = delete;
+  memory_input_stream_t& operator=(const memory_input_stream_t&) = delete;
+
+  const char* data() const noexcept;
+  char* mutable_data() noexcept;
+  std::size_t size() const noexcept;
+  std::size_t compressed_size() const noexcept;
+  std::size_t reserve_size_hint() const noexcept;
+
+  void run_decode_tasks();
+
+ private:
+  std::vector<char> buffer_;
+  std::size_t input_size_      = 0;
+  std::size_t compressed_size_ = 0;
+  std::unique_ptr<mps_section_block_scanner_t> section_scanner_;
+};
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
new file mode 100644
index 0000000000..bf33d7e895
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/hash_table_smallstr.hpp
@@ -0,0 +1,304 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "mmap_region.hpp"
+
+#include <cuda/cmath>
+
+#include <simde/x86/avx2.h>
+
+#include <sys/mman.h>
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#ifdef MPS_FAST_PERF_COUNTERS
+#include <cstdio>
+#endif
+#include <limits>
+#include <string_view>
+#include <unordered_map>
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+// below this threshold, the serial row-hash build is usually cheaper than partition setup
+inline constexpr size_t MPS_ROW_HASH_PARTITIONED_MIN_ROWS = 64 * 1024;
+inline constexpr int MPS_ROW_HASH_PARTITION_BITS          = 5;
+inline constexpr size_t MPS_ROW_HASH_PARTITIONS           = (1 << MPS_ROW_HASH_PARTITION_BITS);
+
+// 32-bit FNV-1a applied to the bytes in reverse order (last byte first); row names
+// commonly share long prefixes.
+static inline uint32_t fnv1a_hash(const char* ptr, std::size_t len)
+{
+  constexpr uint32_t offset_basis = 2166136261u;
+  constexpr uint32_t prime        = 16777619u;
+
+  uint32_t state = offset_basis;
+  for (std::size_t remaining = len; remaining != 0; --remaining) {
+    // xor-then-multiply is the FNV-1a step order
+    const auto byte = static_cast<uint8_t>(ptr[remaining - 1]);
+    state           = (state ^ byte) * prime;
+  }
+  return state;
+}
+
+// 28-byte inline key + uint32 payload: two slots per 64-byte cache line.
+struct alignas(32) hash_slot_28_t {
+  char key[28];
+  uint32_t count;
+};
+
+using hash_key_t                     = simde__m256i;
+using hash_slot_var_t                = hash_slot_28_t;
+constexpr std::size_t HASH_KEY_BYTES = 28;
+
+static_assert(sizeof(hash_slot_28_t) == 32);
+static_assert(alignof(hash_slot_28_t) == 32);
+static_assert(offsetof(hash_slot_28_t, count) == HASH_KEY_BYTES);
+
+static inline hash_key_t make_key(const char* ptr, std::size_t len)
+{
+  alignas(32) char buf[32] = {};
+  std::memcpy(buf, ptr, len < HASH_KEY_BYTES ? len : HASH_KEY_BYTES);
+  return simde_mm256_load_si256(reinterpret_cast<const simde__m256i*>(buf));
+}
+
+static inline bool key_cmpeq(const char* slot_key, hash_key_t key)
+{
+  simde__m256i slot_vec = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(slot_key));
+  int mask              = simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(slot_vec, key));
+  return (mask & 0x0fffffff) == 0x0fffffff;
+}
+
+static inline void key_store(char* slot_key, hash_key_t key)  // slot_key must be 32-byte aligned
+{
+  simde_mm256_store_si256(reinterpret_cast<simde__m256i*>(slot_key), key);  // writes 32 bytes
+}
+
+struct hash_partition_t {
+  hash_slot_var_t* slots = nullptr;
+  size_t buckets         = 0;
+  size_t mask            = 0;
+};
+
+static inline size_t hash_partition_for(uint32_t hash)
+{
+  return (size_t)(hash >> (32 - MPS_ROW_HASH_PARTITION_BITS));
+}
+
+static inline size_t hash_bucket_count_for(size_t n_rows, bool compact)
+{
+  if (compact) { return cuda::next_power_of_two(std::max(n_rows + n_rows / 2, (size_t)64)); }
+  return cuda::next_power_of_two(std::max(n_rows * 2, (size_t)64));
+}
+
+static inline size_t hash_lookup_in(
+  const hash_slot_var_t* slots, size_t buckets, size_t mask, hash_key_t key, uint32_t hash)
+{
+  const hash_slot_var_t* slot = &slots[hash & (uint32_t)mask];
+  for (size_t i = 0; i < buckets; ++i, ++slot) {
+    if (slot >= &slots[buckets]) { slot = &slots[0]; }
+    if (slot->count == 0) { return std::numeric_limits<size_t>::max(); }
+    if (key_cmpeq(slot->key, key)) { return slot->count - 1; }
+  }
+  return std::numeric_limits<size_t>::max();
+}
+
+static inline size_t hash_insert_into(hash_slot_var_t* slots,
+                                      size_t buckets,
+                                      size_t mask,
+                                      std::string_view name,
+                                      uint32_t hash,
+                                      size_t index)
+{
+  hash_key_t key        = make_key(name.data(), name.size());
+  hash_slot_var_t* slot = &slots[hash & (uint32_t)mask];
+  for (size_t i = 0; i < buckets; ++i, ++slot) {
+    if (slot >= &slots[buckets]) { slot = &slots[0]; }
+    if (slot->count == 0) {
+      key_store(slot->key, key);
+      slot->count = (uint32_t)(index + 1);
+      return i + 1;
+    }
+    if (key_cmpeq(slot->key, key)) {
+      slot->count = (uint32_t)(index + 1);
+      return i + 1;
+    }
+  }
+  __builtin_unreachable();
+}
+
+#ifdef MPS_FAST_PERF_COUNTERS
+struct hash_build_probe_stats_t {
+  size_t total_probes = 0;
+  size_t max_probes   = 0;
+  size_t long_names   = 0;
+
+  void seed_long_names(size_t n) { long_names = n; }
+
+  void record_insert(size_t probes)
+  {
+    if (probes == 0) {
+      ++long_names;
+    } else {
+      total_probes += probes;
+      max_probes = std::max(max_probes, probes);
+    }
+  }
+
+  void merge(const hash_build_probe_stats_t& other)
+  {
+    total_probes += other.total_probes;
+    max_probes = std::max(max_probes, other.max_probes);
+    long_names += other.long_names;
+  }
+};
+#endif
+
+class smallstr_hash_table_t {
+ public:
+  void note_long_name(std::string_view name, size_t index) { long_names_[name] = index; }
+
+  size_t long_name_count() const { return long_names_.size(); }
+
+  void reset_build_probe_stats()
+  {
+#ifdef MPS_FAST_PERF_COUNTERS
+    build_probe_stats_ = {};
+    build_probe_stats_.seed_long_names(long_names_.size());
+    partition_probe_stats_ = {};
+#endif
+  }
+
+  void print_build_probe_report(size_t n_rows) const
+  {
+#ifdef MPS_FAST_PERF_COUNTERS
+    hash_build_probe_stats_t stats = build_probe_stats_;
+    if (partition_count_ != 0) {
+      for (size_t p = 0; p < partition_count_; ++p) {
+        stats.merge(partition_probe_stats_[p]);
+      }
+    }
+    size_t probed_rows = n_rows - stats.long_names;
+    double mean_probes = probed_rows == 0 ? 0.0 : (double)stats.total_probes / (double)probed_rows;
+    double load_factor = buckets_ == 0 ? 0.0 : (double)n_rows / (double)buckets_;
+    std::fprintf(stderr,
+                 "[ROW_HASH_PROBES] rows=%zu buckets=%zu load=%.3f long=%zu mean=%.3f max=%zu\n",
+                 n_rows,
+                 buckets_,
+                 load_factor,
+                 stats.long_names,
+                 mean_probes,
+                 stats.max_probes);
+#endif
+  }
+
+  void configure_serial_buckets(size_t n_rows, bool compact)
+  {
+    partition_count_ = 0;
+    buckets_         = hash_bucket_count_for(n_rows, compact);
+    mask_            = buckets_ - 1;
+  }
+
+  void configure_partitioned_buckets(
+    const std::array<size_t, MPS_ROW_HASH_PARTITIONS>& partition_counts, bool compact)
+  {
+    partition_count_ = MPS_ROW_HASH_PARTITIONS;
+    buckets_         = 0;
+    for (size_t p = 0; p < MPS_ROW_HASH_PARTITIONS; ++p) {
+      partitions_[p].buckets = hash_bucket_count_for(partition_counts[p], compact);
+      partitions_[p].mask    = partitions_[p].buckets - 1;
+      buckets_ += partitions_[p].buckets;
+    }
+    mask_ = buckets_ - 1;
+  }
+
+  void allocate_mmap(const char* label)
+  {
+    size_t mmap_size = buckets_ * sizeof(hash_slot_var_t);
+    region_ = mmap_region_t::anonymous(mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, label);
+    slots_  = (hash_slot_var_t*)region_.data();
+    if (partition_count_ != 0) {
+      hash_slot_var_t* next_slots = slots_;
+      for (size_t p = 0; p < partition_count_; ++p) {
+        partitions_[p].slots = next_slots;
+        next_slots += partitions_[p].buckets;
+      }
+    }
+    region_.advise(MADV_HUGEPAGE);
+  }
+
+  mmap_region_t& region() noexcept { return region_; }
+  const mmap_region_t& region() const noexcept { return region_; }
+
+  hash_slot_var_t* slots() noexcept { return slots_; }
+  const hash_slot_var_t* slots() const noexcept { return slots_; }
+
+  size_t buckets() const noexcept { return buckets_; }
+  size_t mask() const noexcept { return mask_; }
+  size_t partition_count() const noexcept { return partition_count_; }
+
+  const hash_partition_t& partition(size_t p) const noexcept { return partitions_[p]; }
+
+  size_t lookup(std::string_view name) const
+  {
+    if (name.size() > HASH_KEY_BYTES) {
+      auto it = long_names_.find(name);
+      return it != long_names_.end() ? it->second : std::numeric_limits<size_t>::max();
+    }
+    hash_key_t key = make_key(name.data(), name.size());
+    uint32_t hash  = fnv1a_hash(name.data(), name.size());
+    if (partition_count_ != 0) {
+      const auto& part = partitions_[hash_partition_for(hash)];
+      return hash_lookup_in(part.slots, part.buckets, part.mask, key, hash);
+    }
+    return hash_lookup_in(slots_, buckets_, mask_, key, hash);
+  }
+
+  size_t insert_serial(std::string_view name, size_t index)
+  {
+    size_t probes;
+    if (name.size() > HASH_KEY_BYTES) {
+      note_long_name(name, index);
+      probes = 0;
+    } else {
+      probes = hash_insert_into(
+        slots_, buckets_, mask_, name, fnv1a_hash(name.data(), name.size()), index);
+    }
+#ifdef MPS_FAST_PERF_COUNTERS
+    build_probe_stats_.record_insert(probes);
+#endif
+    return probes;
+  }
+
+  size_t insert_partition(size_t partition, std::string_view name, uint32_t hash, size_t index)
+  {
+    const auto& part = partitions_[partition];
+    size_t probes    = hash_insert_into(part.slots, part.buckets, part.mask, name, hash, index);
+#ifdef MPS_FAST_PERF_COUNTERS
+    partition_probe_stats_[partition].record_insert(probes);
+#endif
+    return probes;
+  }
+
+ private:
+  mmap_region_t region_;
+  hash_slot_var_t* slots_ = nullptr;
+  size_t buckets_         = 0;
+  size_t mask_            = 0;
+  size_t partition_count_ = 0;
+  std::array<hash_partition_t, MPS_ROW_HASH_PARTITIONS> partitions_{};
+  std::unordered_map<std::string_view, size_t> long_names_{};
+#ifdef MPS_FAST_PERF_COUNTERS
+  hash_build_probe_stats_t build_probe_stats_{};
+  std::array<hash_build_probe_stats_t, MPS_ROW_HASH_PARTITIONS> partition_probe_stats_{};
+#endif
+};
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
new file mode 100644
index 0000000000..ed0e6d5404
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/lz4_file_reader.cpp
@@ -0,0 +1,920 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#include "file_reader.hpp"
+#include "mps_section_scanner.hpp"
+#include "nvtx_ranges.hpp"
+
+#include <utilities/error.hpp>
+#include <utilities/scope_guard.hpp>
+
+#include <cuda/cmath>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cerrno>
+#include <condition_variable>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+using cuopt::mathematical_optimization::io::error_type_t;
+using cuopt::mathematical_optimization::io::mps_parser_expects;
+using cuopt::mathematical_optimization::io::mps_parser_fail;
+
+namespace {
+
+// Magic number expected in the first four bytes of the frame header.
+constexpr uint32_t lz4_frame_magic                        = 0x184D2204u;
+// High bit of a block-size word: set when the block payload is stored uncompressed.
+constexpr uint32_t lz4_uncompressed_block                 = 0x80000000u;
+// Low 31 bits of a block-size word: the payload size in bytes.
+constexpr uint32_t lz4_block_size_mask                    = 0x7FFFFFFFu;
+// Size of each compressed-file window read by the pipeline (64 MiB).
+constexpr std::size_t lz4_pipeline_batch_bytes            = 64ull * 1024ull * 1024ull;
+// Decompressed-byte threshold at which the metadata scanner hands a batch to the decoders.
+constexpr std::size_t lz4_decode_batch_decompressed_bytes = 256ull * 1024ull * 1024ull;
+// Upper bound on the pipeline's io_threads count.
+constexpr std::size_t lz4_input_max_io_threads            = 8;
+// Multiplier applied to the compressed size to size the output reservation when the frame
+// header does not declare a content size.
+constexpr std::size_t lz4_no_content_size_reserve_ratio   = 128;
+
+// Signature of the LZ4_decompress_safe entry point that is resolved at runtime via dlsym.
+using LZ4_decompress_safe_t = int (*)(const char*, char*, int, int);
+
+// Estimates how many decompressed bytes to reserve when the frame header carries no content
+// size: compressed_size * lz4_no_content_size_reserve_ratio, saturating at
+// SIZE_MAX - input_buffer_padding_bytes when the product would overflow.
+std::size_t estimate_lz4_no_content_size(std::size_t compressed_size)
+{
+  constexpr std::size_t size_limit = std::numeric_limits<std::size_t>::max();
+  const bool would_overflow = compressed_size > size_limit / lz4_no_content_size_reserve_ratio;
+  if (would_overflow) { return size_limit - input_buffer_padding_bytes; }
+  return compressed_size * lz4_no_content_size_reserve_ratio;
+}
+
+#if defined(MPS_PARSER_WITH_LZ4)
+// Owns a dlopen()ed liblz4 handle together with the LZ4_decompress_safe entry point resolved
+// from it. Construction reports failure through mps_parser_fail when neither soname can be
+// opened or the symbol is missing. Non-copyable: the handle is closed exactly once, in the
+// destructor.
+struct lz4_runtime_t {
+  void* handle                          = nullptr;
+  LZ4_decompress_safe_t decompress_safe = nullptr;
+
+  lz4_runtime_t()
+  {
+    // Prefer the versioned soname; fall back to the unversioned development name.
+    for (const char* soname : {"liblz4.so.1", "liblz4.so"}) {
+      handle = ::dlopen(soname, RTLD_LAZY);
+      if (handle != nullptr) { break; }
+    }
+    if (handle == nullptr) {
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "Could not open .mps.lz4 file since liblz4 was not found "
+                      "(tried liblz4.so.1, liblz4.so). Decompress the .lz4 file manually "
+                      "or install liblz4.");
+    }
+
+    decompress_safe =
+      reinterpret_cast<LZ4_decompress_safe_t>(::dlsym(handle, "LZ4_decompress_safe"));
+    if (decompress_safe == nullptr) {
+      // A destructor does not run for an object whose constructor exits by exception, so the
+      // library reference must be dropped here. Otherwise every failed construction attempt
+      // (the function-local static is retried on the next call) would leak a dlopen handle.
+      ::dlclose(handle);
+      handle = nullptr;
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "Error loading LZ4_decompress_safe from liblz4. Decompress the .lz4 file "
+                      "manually or install a compatible liblz4.");
+    }
+  }
+
+  ~lz4_runtime_t()
+  {
+    if (handle != nullptr) { ::dlclose(handle); }
+  }
+
+  lz4_runtime_t(const lz4_runtime_t&)            = delete;
+  lz4_runtime_t& operator=(const lz4_runtime_t&) = delete;
+};
+
+// Returns the shared liblz4 binding, constructing it on the first call.
+const lz4_runtime_t& lz4_runtime()
+{
+  static const lz4_runtime_t instance{};
+  return instance;
+}
+#endif
+
+// Forwards to the runtime-loaded LZ4_decompress_safe and returns its result. In builds without
+// MPS_PARSER_WITH_LZ4 the call always fails through mps_parser_fail.
+int lz4_decompress_safe_runtime([[maybe_unused]] const char* src,
+                                [[maybe_unused]] char* dst,
+                                [[maybe_unused]] int compressed_size,
+                                [[maybe_unused]] int dst_capacity)
+{
+#if defined(MPS_PARSER_WITH_LZ4)
+  const lz4_runtime_t& lz4 = lz4_runtime();
+  return lz4.decompress_safe(src, dst, compressed_size, dst_capacity);
+#else
+  mps_parser_fail(
+    error_type_t::RuntimeError,
+    "Experimental fast MPS parser was built without LZ4 decompression support. "
+    "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually.");
+#endif
+}
+
+// Forces the liblz4 binding to be loaded now, so a missing library is reported before any
+// decoding work starts. In builds without MPS_PARSER_WITH_LZ4 this always fails.
+void ensure_lz4_runtime_available()
+{
+#if defined(MPS_PARSER_WITH_LZ4)
+  static_cast<void>(lz4_runtime());
+#else
+  mps_parser_fail(
+    error_type_t::RuntimeError,
+    "Experimental fast MPS parser was built without LZ4 decompression support. "
+    "Reconfigure with CUOPT_PARSER_WITH_LZ4=ON or decompress the .lz4 file manually.");
+#endif
+}
+
+// Opens `path` read-only and returns the file descriptor; the caller owns it and must close it.
+// Fails through mps_parser_fail (RuntimeError, with the strerror text) when open() fails.
+int open_lz4_fd(const std::string& path)
+{
+  // O_CLOEXEC keeps the descriptor from being inherited by any child process the host
+  // application execs while the stream is alive.
+  int fd = ::open(path.c_str(), O_RDONLY | O_CLOEXEC);
+  if (fd < 0) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to open LZ4 file '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
+  }
+  return fd;
+}
+
+// Assembles a 32-bit unsigned value from the four bytes at `ptr`, least significant byte first.
+uint32_t read_le32(const char* ptr)
+{
+  const auto* bytes = reinterpret_cast<const unsigned char*>(ptr);
+  uint32_t value    = 0;
+  for (int i = 3; i >= 0; --i) {
+    value = (value << 8) | bytes[i];
+  }
+  return value;
+}
+
+// Assembles a 64-bit unsigned value from the eight bytes at `ptr`, least significant byte first.
+uint64_t read_le64(const char* ptr)
+{
+  const auto* bytes = reinterpret_cast<const unsigned char*>(ptr);
+  uint64_t value    = 0;
+  for (unsigned i = 0; i < 8; ++i) {
+    value |= static_cast<uint64_t>(bytes[i]) << (8u * i);
+  }
+  return value;
+}
+
+// Maps the block-size ID held in bits 4..6 of the BD byte to a maximum block size in bytes:
+// 4 -> 64 KiB, 5 -> 256 KiB, 6 -> 1 MiB, 7 -> 4 MiB. Any other ID is a validation error.
+std::size_t block_max_size_from_bd(unsigned char bd)
+{
+  const unsigned size_id = (bd >> 4) & 0x7u;
+  if (size_id < 4) {
+    mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame block size ID");
+  }
+  // Each ID step multiplies the size by four, i.e. shifts it left by two bits.
+  return (64ull * 1024ull) << (2u * (size_id - 4u));
+}
+
+// One fixed-size span of the compressed file held in memory.
+struct lz4_resident_window_t {
+  std::size_t index       = 0;  // position of this window in the window vector
+  std::size_t file_offset = 0;  // offset of the window's first byte in the compressed file
+  std::size_t size        = 0;  // number of file bytes the window covers
+  std::unique_ptr<char[]> data;  // window bytes; null until read and again once released
+};
+
+// Read-only, file-offset-addressed view over the vector of resident compressed windows.
+// Every accessor fails through mps_parser_fail when the requested offset is not covered.
+class lz4_resident_windows_t {
+ public:
+  explicit lz4_resident_windows_t(std::vector<lz4_resident_window_t>& windows) : windows_(windows)
+  {
+  }
+
+  // Block payloads can straddle a window boundary. When [offset, offset + size) lies entirely
+  // inside one window, returns a pointer straight into that window's buffer; returns nullptr
+  // for an empty range or a straddling one, in which case the caller stages it with copy_to.
+  const char* ptr_if_contiguous(std::size_t offset, std::size_t size) const
+  {
+    if (size == 0) { return nullptr; }
+    const lz4_resident_window_t& window = window_for_offset(offset);
+    const std::size_t in_window         = offset - window.file_offset;
+    const bool fits = in_window <= window.size && size <= window.size - in_window;
+    return fits ? window.data.get() + in_window : nullptr;
+  }
+
+  // Copies `size` bytes starting at file offset `offset` into `dst`, crossing window
+  // boundaries as needed.
+  void copy_to(std::size_t offset, char* dst, std::size_t size) const
+  {
+    for (std::size_t done = 0; done < size;) {
+      const lz4_resident_window_t& window = window_for_offset(offset + done);
+      const std::size_t in_window         = offset + done - window.file_offset;
+      const std::size_t chunk             = std::min(window.size - in_window, size - done);
+      std::memcpy(dst + done, window.data.get() + in_window, chunk);
+      done += chunk;
+    }
+  }
+
+  // Reads the single byte at `offset`.
+  uint8_t read_u8(std::size_t offset) const
+  {
+    char byte = 0;
+    copy_to(offset, &byte, sizeof(byte));
+    return static_cast<uint8_t>(byte);
+  }
+
+  // Reads a little-endian 32-bit value starting at `offset`.
+  uint32_t read_u32(std::size_t offset) const
+  {
+    char raw[4];
+    copy_to(offset, raw, sizeof(raw));
+    return read_le32(raw);
+  }
+
+  // Reads a little-endian 64-bit value starting at `offset`.
+  uint64_t read_u64(std::size_t offset) const
+  {
+    char raw[8];
+    copy_to(offset, raw, sizeof(raw));
+    return read_le64(raw);
+  }
+
+ private:
+  // Finds the window that contains file offset `offset`. The stride between windows is taken
+  // from the second window's file offset, or from the only window's size.
+  const lz4_resident_window_t& window_for_offset(std::size_t offset) const
+  {
+    if (windows_.empty()) {
+      mps_parser_fail(error_type_t::RuntimeError, "LZ4 resident window lookup with no windows");
+    }
+    const std::size_t stride = windows_.size() > 1 ? windows_[1].file_offset : windows_[0].size;
+    const std::size_t slot   = offset / stride;
+    // The second test only runs when `slot` is a valid index (short-circuit).
+    if (slot >= windows_.size() || offset >= windows_[slot].file_offset + windows_[slot].size) {
+      mps_parser_fail(error_type_t::RuntimeError, "LZ4 offset outside resident windows");
+    }
+    return windows_[slot];
+  }
+
+  std::vector<lz4_resident_window_t>& windows_;
+};
+
+// Parsed fields of the leading LZ4 frame descriptor: magic, FLG, BD, and the optional
+// content size / dictionary id, followed by the header checksum byte.
+struct lz4_frame_header_t {
+  std::size_t block_max_size = 0;      // maximum block size selected by the BD byte
+  std::size_t content_size   = 0;      // declared decompressed size; valid only if present
+  std::size_t header_size    = 0;      // descriptor length in bytes = offset of the first block
+  bool content_size_present  = false;  // FLG bit 0x08
+  bool block_checksum        = false;  // FLG bit 0x10: a 4-byte checksum follows each block
+  bool content_checksum      = false;  // FLG bit 0x04: a 4-byte checksum follows the end mark
+  bool dict_id               = false;  // FLG bit 0x01: a 4-byte dictionary id is in the header
+};
+
+// Reads and validates the LZ4 frame descriptor at the start of `fd`.
+// `path` is used only in error messages; `compressed_size` is the total file size in bytes.
+// Returns the decoded flags plus the descriptor length in `header_size`. Fails through
+// mps_parser_fail on a too-short file, a read error, a wrong magic or version, or a frame
+// whose blocks are not independent.
+lz4_frame_header_t parse_lz4_frame_header(int fd,
+                                          const std::string& path,
+                                          std::size_t compressed_size)
+{
+  // Smallest descriptor parsed below: magic (4) + FLG (1) + BD (1) + header checksum (1).
+  if (compressed_size < 7) {
+    mps_parser_fail(error_type_t::ValidationError,
+                    "LZ4 input is too small to contain a frame header");
+  }
+  // Largest descriptor parsed below is 4 + 1 + 1 + 8 + 4 + 1 = 19 bytes, so 32 is enough.
+  char header[32];
+  std::size_t header_bytes = std::min<std::size_t>(sizeof(header), compressed_size);
+  if (!pread_full(fd, header, header_bytes, 0)) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "Failed to read LZ4 frame header '%s': %s",
+                    path.c_str(),
+                    std::strerror(errno));
+  }
+
+  std::size_t offset = 0;
+  uint32_t magic     = read_le32(header + offset);
+  if (magic != lz4_frame_magic) {
+    mps_parser_fail(error_type_t::ValidationError,
+                    "unsupported LZ4 input: expected standard LZ4 frame magic");
+  }
+  offset += 4;
+  unsigned char flg = (unsigned char)header[offset++];
+  unsigned char bd  = (unsigned char)header[offset++];
+  // The top two FLG bits carry the frame version; only version 1 is accepted.
+  unsigned version  = (flg >> 6) & 0x3u;
+  if (version != 1) {
+    mps_parser_fail(error_type_t::ValidationError, "unsupported LZ4 frame version");
+  }
+  // Blocks are decoded independently and in parallel, so linked blocks are rejected.
+  bool block_independent = (flg & 0x20u) != 0;
+  if (!block_independent) {
+    mps_parser_fail(error_type_t::ValidationError,
+                    "parallel LZ4 reader requires independent blocks; compress with -BI");
+  }
+
+  lz4_frame_header_t info;
+  info.block_checksum       = (flg & 0x10u) != 0;
+  info.content_size_present = (flg & 0x08u) != 0;
+  info.content_checksum     = (flg & 0x04u) != 0;
+  info.dict_id              = (flg & 0x01u) != 0;
+  info.block_max_size       = block_max_size_from_bd(bd);
+  if (info.content_size_present) {
+    if (offset + 8 > header_bytes) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading content size");
+    }
+    info.content_size = (std::size_t)read_le64(header + offset);
+    offset += 8;
+  }
+  if (info.dict_id) {
+    if (offset + 4 > header_bytes) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading dictionary id");
+    }
+    // The dictionary id is skipped; its value is not stored.
+    offset += 4;
+  }
+  if (offset + 1 > header_bytes) {
+    mps_parser_fail(error_type_t::ValidationError,
+                    "truncated LZ4 frame while reading header checksum");
+  }
+  // NOTE(review): the header checksum byte is stepped over without being verified.
+  offset += 1;
+  info.header_size = offset;
+  return info;
+}
+
+}  // namespace
+
+// Opens `path`, parses its LZ4 frame header, reserves (but does not commit) address space for
+// the decompressed output, and creates the section scanner. No block is decoded here.
+lz4_input_stream_t::lz4_input_stream_t(const std::string& path) : path_(path)
+{
+  MPS_NVTX_RANGE("lz4_input_constructor", nvtx::colors::io);
+
+  ensure_lz4_runtime_available();
+
+  int fd = open_lz4_fd(path);
+  // Closes the descriptor if anything below fails; disarmed at the end by setting fd to -1.
+  cuopt::scope_guard close_fd([&] {
+    if (fd >= 0) { ::close(fd); }
+  });
+  // Advisory hint only; the return value is deliberately ignored.
+  ::posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+
+  compressed_size_ = get_file_size(fd, path);
+
+  lz4_frame_header_t header = parse_lz4_frame_header(fd, path, compressed_size_);
+  block_max_size_           = header.block_max_size;
+  content_size_             = header.content_size;
+  header_size_              = header.header_size;
+  content_size_present_     = header.content_size_present;
+  block_checksum_           = header.block_checksum;
+  content_checksum_         = header.content_checksum;
+  dict_id_                  = header.dict_id;
+
+  // Use the declared content size when present; otherwise fall back to an estimate that is
+  // at least one full block.
+  std::size_t reserve_size = content_size_;
+  if (!content_size_present_) {
+    reserve_size = estimate_lz4_no_content_size(compressed_size_);
+    reserve_size = std::max(reserve_size, block_max_size_);
+  }
+  reserve_size += input_buffer_padding_bytes;
+
+  // The region is mapped PROT_NONE with MAP_NORESERVE; commit_up_to() later remaps ranges of
+  // it as readable and writable.
+  constexpr std::size_t huge_alignment = 2 * 1024 * 1024;  // 2MiB
+  output_mapped_size_                  = cuda::round_up(reserve_size, system_page_size());
+  output_region_                       = mmap_region_t::anonymous_aligned(output_mapped_size_,
+                                                    huge_alignment,
+                                                    PROT_NONE,
+                                                    MAP_PRIVATE | MAP_NORESERVE,
+                                                    "LZ4 output buffer");
+  output_data_                         = output_region_.char_data();
+
+  // Upper bound on the number of blocks the metadata scan will accept.
+  block_slot_count_ = std::max<std::size_t>(1, cuda::ceil_div(reserve_size, block_max_size_) + 1);
+
+  section_scanner_ =
+    std::make_unique<mps_section_block_scanner_t>(output_data_, block_slot_count_, registry_);
+
+  // Success: hand the descriptor to the object and disarm the scope guard.
+  fd_ = fd;
+  fd  = -1;
+}
+
+// Closes the compressed-file descriptor if one is held.
+lz4_input_stream_t::~lz4_input_stream_t()
+{
+  if (fd_ < 0) { return; }
+  ::close(fd_);
+}
+
+// Start of the decompressed output buffer (read-only view).
+const char* lz4_input_stream_t::data() const noexcept { return output_data_; }
+// Start of the decompressed output buffer (writable view).
+char* lz4_input_stream_t::mutable_data() noexcept { return output_data_; }
+// Decompressed byte count recorded in output_view_size_.
+std::size_t lz4_input_stream_t::size() const noexcept { return output_view_size_; }
+// Size of the compressed input file in bytes.
+std::size_t lz4_input_stream_t::compressed_size() const noexcept { return compressed_size_; }
+// Size hint for the decompressed data: the declared content size when the frame header
+// carries one, otherwise the no-content-size estimate with a floor of 1 MiB.
+std::size_t lz4_input_stream_t::reserve_size_hint() const noexcept
+{
+  if (content_size_present_) { return content_size_; }
+  constexpr std::size_t min_hint = 1024 * 1024;
+  return std::max<std::size_t>(estimate_lz4_no_content_size(compressed_size_), min_hint);
+}
+
+// Makes the first `bytes` bytes of the reserved output region readable and writable.
+// Growth only: a request at or below the committed size is a no-op. Calls are serialized by
+// commit_mutex_. Fails with OutOfMemoryError when `bytes` exceeds the reserved mapping.
+void lz4_input_stream_t::commit_up_to(std::size_t bytes)
+{
+  MPS_NVTX_RANGE("lz4_commit_output", nvtx::colors::alloc);
+  std::lock_guard<std::mutex> lock(commit_mutex_);
+  if (bytes <= output_committed_size_) return;
+  if (bytes > output_mapped_size_) {
+    mps_parser_fail(error_type_t::OutOfMemoryError, "LZ4 output exceeded reserved virtual mapping");
+  }
+  // Commit whole pages, clamped to the end of the reservation.
+  std::size_t new_committed = cuda::round_up(bytes, system_page_size());
+  if (new_committed > output_mapped_size_) new_committed = output_mapped_size_;
+  // Only the range beyond what is already committed is remapped.
+  std::size_t add = new_committed - output_committed_size_;
+  void* target    = output_data_ + output_committed_size_;
+  mmap_region_t::map_fixed_or_throw(
+    target, add, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, "LZ4 output commit");
+  // Advisory hint only; the return value is ignored.
+  ::madvise(target, add, MADV_HUGEPAGE);
+  output_committed_size_ = new_committed;
+}
+
+// Describes one LZ4 data block handed from the metadata scanner to a decoder.
+// The field order is significant: scan_one_block brace-initializes this struct positionally.
+struct resident_block_desc_t {
+  const char* src                 = nullptr;  // block payload (in a window or a staged copy)
+  std::size_t compressed_size     = 0;        // payload size in bytes
+  std::size_t decompressed_offset = 0;        // destination offset in the output buffer
+  std::size_t decompressed_size   = 0;        // destination capacity reserved for this block
+  std::size_t index               = 0;        // block ordinal within the frame
+  // Window pinned through decode_refs while `src` points into it; SIZE_MAX means none.
+  std::size_t window_index        = std::numeric_limits<std::size_t>::max();
+  bool uncompressed               = false;    // payload is stored raw and is copied verbatim
+};
+
+// Per-window bookkeeping used to decide when a window's buffer can be freed.
+struct window_state_t {
+  // Number of queued or in-flight blocks whose `src` points into this window.
+  std::atomic<uint32_t> decode_refs{0};
+  // Flipped from 0 to 1 by the one caller that wins the right to free the buffer.
+  std::atomic<uint8_t> released{0};
+};
+
+// Two distinct units flow through this pipeline:
+//   * window  - a fixed-size span of the compressed file read by the I/O stage.
+//   * block   - a single independent LZ4 data block (decompressed unit) that the
+//               metadata scanner discovers inside the resident windows.
+// Windows feed blocks; the decoded blocks are handed to the section scanner,
+// which owns the contiguous decoded-byte frontier and section publication.
+//
+// Locking (the grouped members below repeat each guard in context):
+//   * window_mutex          - guards window_done[]   (reader -> scanner readiness)
+//   * desc_mutex            - guards desc_queue + scanner_done (scanner -> decoders)
+//   * window_release_mutex  - serializes freeing a window buffer + RSS accounting
+//   * window_state_[].decode_refs/.released, scanned_through_, blocks_scanned,
+//     compressed_resident_bytes - lock-free atomics
+// Locks are never nested. The scanner thread is the sole writer of the frame walk,
+// so offset / decompressed_offset are mutated without locking.
+struct lz4_pipeline_t {
+  // Splits the compressed file into fixed `window_bytes` windows and records each window's
+  // index, file offset, and size (the last window may be shorter). No data is read here.
+  explicit lz4_pipeline_t(lz4_input_stream_t& input_)
+    : input(input_),
+      window_count(cuda::ceil_div(input.compressed_size_, window_bytes)),
+      windows(window_count),
+      window_state_(std::make_unique<window_state_t[]>(window_count)),
+      io_threads(std::min(lz4_input_max_io_threads, window_count)),
+      window_done(window_count, 0)
+  {
+    std::size_t slot = 0;
+    for (lz4_resident_window_t& window : windows) {
+      window.index       = slot;
+      window.file_offset = slot * window_bytes;
+      window.size        = std::min(window_bytes, input.compressed_size_ - window.file_offset);
+      ++slot;
+    }
+  }
+
+  // Runs the three-stage pipeline to completion:
+  //
+  //   readers --window_done/window_cv--> scanner --desc_queue/desc_cv--> decoders
+  //
+  //   * readers  (io_threads): pread fixed compressed windows into RAM, mark ready.
+  //   * scanner  (1 thread)  : walk the LZ4 frame in order, slice it into block
+  //                            descriptors, push them to decoders in batches.
+  //   * decoders (io_threads): decompress blocks into the output buffer and hand
+  //                            each to the section scanner, which advances the
+  //                            decoded-byte frontier and publishes section ranges.
+  //
+  // Consumers are spawned first so they are parked waiting before the readers (which
+  // run on this thread) start producing. scoped_thread_group joins the background
+  // threads on scope exit; any stage's failure is captured in `latch` and rethrown here.
+  // Starts the scanner thread and `io_threads` decoder threads, runs the window readers on
+  // the calling thread, waits for the background threads, then rethrows the first error.
+  void run()
+  {
+    std::exception_ptr startup_error;
+    {
+      // Background threads are joined when `background` leaves this scope.
+      scoped_thread_group background;
+      try {
+        background.reserve(io_threads + 1);
+        background.emplace([this] { run_scanner_stage(); });
+        for (std::size_t t = 0; t < io_threads; ++t) {
+          background.emplace([this, t] { run_decoder_stage(t); });
+        }
+        run_readers();  // produce on the calling thread, now that consumers are parked
+      } catch (...) {
+        // Remember the failure and wake already-started stages so the join cannot hang.
+        startup_error = std::current_exception();
+        fail_and_notify(startup_error);
+      }
+    }
+    // An exception thrown on this thread takes precedence over one captured by a stage.
+    if (startup_error) { std::rethrow_exception(startup_error); }
+    latch.rethrow_if_error();
+  }
+
+  // Records the final decoded size, commits output pages up to that size plus the input
+  // padding, and tells the section scanner how many bytes are ready.
+  void finalize()
+  {
+    const std::size_t decoded_bytes = input.section_scanner_->ready_bytes();
+    input.output_view_size_         = decoded_bytes;
+    input.commit_up_to(decoded_bytes + input_buffer_padding_bytes);
+    input.section_scanner_->publish_ready(decoded_bytes);
+  }
+
+  void fail_and_notify(std::exception_ptr eptr)
+  {
+    latch.capture(eptr);
+    window_cv.notify_all();
+    desc_cv.notify_all();
+  }
+
+  // Adds `bytes` to the running count of compressed bytes currently held in window buffers.
+  void add_compressed_resident(std::size_t bytes)
+  {
+    compressed_resident_bytes.fetch_add(bytes, std::memory_order_relaxed);
+  }
+
+  // Frees window `index`'s buffer when it is safe to do so: the metadata scan must have moved
+  // past it (index < scanned_through_) and no block may still reference it (decode_refs == 0).
+  // The `released` compare-exchange lets exactly one caller perform the free.
+  // NOTE(review): the scanner (store scanned_through_, then load decode_refs) and a decoder
+  // (decrement decode_refs, then load scanned_through_) each do a store followed by a load of
+  // the other atomic with acquire/release ordering. It looks possible for both to read the
+  // stale value, leaving the window resident until the pipeline is destroyed - confirm whether
+  // seq_cst ordering is wanted here.
+  void try_release_window(std::size_t index)
+  {
+    if (index >= window_count) { return; }
+    if (index >= scanned_through_.load(std::memory_order_acquire)) { return; }
+    window_state_t& state = window_state_[index];
+    if (state.decode_refs.load(std::memory_order_acquire) != 0) { return; }
+    uint8_t expected = 0;
+    if (!state.released.compare_exchange_strong(expected, 1, std::memory_order_acq_rel)) { return; }
+    std::lock_guard<std::mutex> lock(window_release_mutex);
+    if (windows[index].data) {
+      windows[index].data.reset();
+      compressed_resident_bytes.fetch_sub(windows[index].size, std::memory_order_relaxed);
+    }
+  }
+
+  // Declares every window that lies entirely before compressed offset `offset` as scanned and
+  // tries to free each newly scanned window. `offset` must not move backwards between calls.
+  void mark_windows_scanned_before(std::size_t offset)
+  {
+    assert(offset >= last_mark_offset_);
+    last_mark_offset_               = offset;
+    // Windows [0, offset / window_bytes) end at or before `offset`.
+    std::size_t new_scanned_through = std::min(window_count, offset / window_bytes);
+    std::size_t prev                = scanned_through_.load(std::memory_order_relaxed);
+    if (new_scanned_through <= prev) { return; }
+    // Publish the new bound first; try_release_window reads it.
+    scanned_through_.store(new_scanned_through, std::memory_order_release);
+    for (std::size_t wi = prev; wi < new_scanned_through; ++wi) {
+      try_release_window(wi);
+    }
+  }
+
+  // Reads every window by calling read_window(index) for each index in [0, window_count),
+  // dispatched through parallel_for_indexed with `io_threads` and the shared error latch.
+  void run_readers()
+  {
+    parallel_for_indexed(
+      window_count, io_threads, latch, "lz4-window-read-", [this](std::size_t index) {
+        read_window(index);
+      });
+  }
+
+  // Allocates window `index`'s buffer, fills it from the file with pread_full, then marks the
+  // window ready and wakes the waiters on window_cv. Any exception is routed to
+  // fail_and_notify instead of escaping.
+  void read_window(std::size_t index)
+  {
+    try {
+      auto& w = windows[index];
+      w.data.reset(new char[w.size]);
+      add_compressed_resident(w.size);
+      bool ok = false;
+      {
+        MPS_NVTX_RANGE("lz4_window_pread", nvtx::colors::io);
+        ok = pread_full(input.fd_, w.data.get(), w.size, w.file_offset);
+      }
+      if (!ok) {
+        mps_parser_fail(error_type_t::RuntimeError,
+                        "Failed to pread LZ4 resident window: %s",
+                        std::strerror(errno));
+      }
+      {
+        MPS_NVTX_RANGE("lz4_window_publish", nvtx::colors::generic);
+        // The ready flag is set under window_mutex, the mutex used by wait_range_ready.
+        std::lock_guard<std::mutex> lock(window_mutex);
+        window_done[index] = 1;
+      }
+      window_cv.notify_all();
+    } catch (...) {
+      fail_and_notify(std::current_exception());
+    }
+  }
+
+  // Decoder thread body: names the thread after `tid`, then decodes batches until
+  // wait_for_decode_batch returns an empty one. Any exception is routed to fail_and_notify.
+  void run_decoder_stage(std::size_t tid)
+  {
+    try {
+      const std::string label = "lz4-window-decode-" + std::to_string(tid);
+      nvtx::name_current_thread(label.c_str());
+      for (auto pending = wait_for_decode_batch(); !pending.empty();
+           pending      = wait_for_decode_batch()) {
+        decode_batch(pending);
+      }
+    } catch (...) {
+      fail_and_notify(std::current_exception());
+    }
+  }
+
+  // Blocks until a batch is queued, the scanner has finished, or the pipeline has stopped.
+  // Returns the oldest queued batch, or an empty vector to tell the decoder to exit.
+  std::vector<resident_block_desc_t> wait_for_decode_batch()
+  {
+    MPS_NVTX_RANGE("lz4_decode_wait_batch", nvtx::colors::io);
+    std::unique_lock<std::mutex> lock(desc_mutex);
+    desc_cv.wait(lock, [&] { return latch.stopped() || scanner_done || !desc_queue.empty(); });
+    // A stopped pipeline abandons queued work; an empty queue here means the scanner is done.
+    if (latch.stopped() || desc_queue.empty()) { return {}; }
+    std::vector<resident_block_desc_t> batch = std::move(desc_queue.front());
+    desc_queue.pop_front();
+    return batch;
+  }
+
+  // Decodes the blocks of `batch` one after another, in batch order.
+  void decode_batch(const std::vector<resident_block_desc_t>& batch)
+  {
+    MPS_NVTX_RANGE("lz4_decode_batch", nvtx::colors::decode);
+    std::for_each(
+      batch.begin(), batch.end(), [this](const resident_block_desc_t& item) { decode_block(item); });
+  }
+
+  // Decodes one block into its slot of the output buffer, drops the block's window reference,
+  // and reports the decoded bytes to the section scanner. Fails with ValidationError when the
+  // decoder reports an error or a size larger than the slot.
+  void decode_block(const resident_block_desc_t& block)
+  {
+    char* dst  = input.output_data_ + block.decompressed_offset;
+    int actual = 0;
+    {
+      MPS_NVTX_RANGE("lz4_decode_block_payload", nvtx::colors::decode);
+      if (block.uncompressed) {
+        // Stored block: the payload is the data.
+        std::memcpy(dst, block.src, block.decompressed_size);
+        actual = (int)block.decompressed_size;
+      } else if (block.compressed_size > (std::size_t)std::numeric_limits<int>::max() ||
+                 block.decompressed_size > (std::size_t)std::numeric_limits<int>::max()) {
+        // The decoder takes int sizes; anything larger is reported as a decode error.
+        actual = -1;
+      } else {
+        actual = lz4_decompress_safe_runtime(
+          block.src, dst, (int)block.compressed_size, (int)block.decompressed_size);
+      }
+    }
+    if (actual < 0 || (std::size_t)actual > block.decompressed_size) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 input block decompressed to invalid size");
+    }
+    // block.src is not read after this point, so the window reference can be dropped.
+    release_block_window_ref(block);
+    publish_decoded_block(block, dst, (std::size_t)actual);
+  }
+
+  // Drops the decode reference `block` holds on its window and, if that was the last one,
+  // tries to free the window. Blocks with window_index == SIZE_MAX hold no reference.
+  void release_block_window_ref(const resident_block_desc_t& block)
+  {
+    if (block.window_index == std::numeric_limits<std::size_t>::max()) { return; }
+    // fetch_sub returns the value before the decrement.
+    uint32_t old =
+      window_state_[block.window_index].decode_refs.fetch_sub(1, std::memory_order_acq_rel);
+    assert(old > 0);
+    if (old == 1) { try_release_window(block.window_index); }
+  }
+
+  // Hands the decoded byte range [dst, dst + actual_size) of block `block.index` to the
+  // section scanner.
+  void publish_decoded_block(const resident_block_desc_t& block, char* dst, std::size_t actual_size)
+  {
+    MPS_NVTX_RANGE("lz4_section_scan_block", nvtx::colors::generic);
+    // The scanner advances the contiguous decoded-byte frontier and publishes
+    // section ranges as blocks complete, regardless of decode order.
+    input.section_scanner_->observe_block(block.index, dst, dst + actual_size);
+  }
+
+  // Blocks until every window overlapping compressed bytes [begin, begin + size) is resident.
+  // Fails with ValidationError when the range extends past the end of the file, and with
+  // RuntimeError when the pipeline stops before a required window became ready.
+  void wait_range_ready(std::size_t begin, std::size_t size)
+  {
+    if (size == 0) return;
+    // Written so that begin + size cannot overflow.
+    if (begin > input.compressed_size_ || size > input.compressed_size_ - begin) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading resident window");
+    }
+    std::size_t first = begin / window_bytes;
+    std::size_t last  = (begin + size - 1) / window_bytes;
+    if (last >= window_done.size()) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading resident window");
+    }
+    for (std::size_t wi = first; wi <= last; ++wi) {
+      MPS_NVTX_RANGE("lz4_metadata_wait_window", nvtx::colors::io);
+      std::unique_lock<std::mutex> lock(window_mutex);
+      window_cv.wait(lock, [&] { return latch.stopped() || window_done[wi] != 0; });
+      // A window that became ready is still usable even if the pipeline has since stopped.
+      if (latch.stopped() && window_done[wi] == 0) {
+        mps_parser_fail(error_type_t::RuntimeError,
+                        "LZ4 metadata scanner stopped before required window was ready");
+      }
+    }
+  }
+
+  // Commits output pages covering every block in `batch`, moves the batch onto desc_queue, and
+  // wakes one decoder. `batch` is left empty for reuse. An empty batch is ignored.
+  void push_batch(std::vector<resident_block_desc_t>& batch)
+  {
+    if (batch.empty()) return;
+    {
+      MPS_NVTX_RANGE("lz4_metadata_commit_batch", nvtx::colors::alloc);
+      // Committing up to the end of the last block is done before the batch becomes visible to
+      // decoders. Presumably blocks are appended in increasing output order - see caller.
+      input.commit_up_to(batch.back().decompressed_offset + batch.back().decompressed_size);
+    }
+    {
+      MPS_NVTX_RANGE("lz4_metadata_enqueue_batch", nvtx::colors::generic);
+      std::lock_guard<std::mutex> lock(desc_mutex);
+      desc_queue.push_back(std::move(batch));
+    }
+    // Puts the moved-from vector back into a known (empty) state.
+    batch.clear();
+    desc_cv.notify_one();
+  }
+
+  // Scanner thread body: walks the frame metadata, then sets scanner_done and wakes all
+  // decoders. On failure scanner_done is still set before the error is routed to
+  // fail_and_notify.
+  void run_scanner_stage()
+  {
+    try {
+      nvtx::name_current_thread("lz4-metadata-scan");
+      scan_lz4_metadata();
+      {
+        std::lock_guard<std::mutex> lock(desc_mutex);
+        scanner_done = true;
+      }
+      desc_cv.notify_all();
+    } catch (...) {
+      {
+        std::lock_guard<std::mutex> lock(desc_mutex);
+        scanner_done = true;
+      }
+      // desc_mutex has been released again by the time fail_and_notify runs.
+      fail_and_notify(std::current_exception());
+    }
+  }
+
+  // Walks the frame from the first block header to the end mark, turning each block into a
+  // descriptor and handing descriptors to the decoders in batches. Afterwards validates the
+  // frame footer, flushes the final partial batch, and marks every window as scanned.
+  void scan_lz4_metadata()
+  {
+    lz4_resident_windows_t resident(windows);
+    std::vector<resident_block_desc_t> batch;
+    batch.reserve(lz4_decode_batch_decompressed_bytes / input.block_max_size_ + 1);
+    std::size_t batch_decoded_bytes = 0;
+    std::size_t offset              = input.header_size_;  // position in the compressed file
+    std::size_t decompressed_offset = 0;                   // position in the output buffer
+    blocks_scanned.store(0, std::memory_order_relaxed);
+
+    while (true) {
+      MPS_NVTX_RANGE("lz4_metadata_scan_block", nvtx::colors::generic);
+      wait_range_ready(offset, 4);
+      if (offset + 4 > input.compressed_size_) {
+        mps_parser_fail(error_type_t::ValidationError,
+                        "truncated LZ4 frame while reading block header");
+      }
+      uint32_t raw_block_size = resident.read_u32(offset);
+      offset += 4;
+      // A zero block-size word is the end mark.
+      if (raw_block_size == 0) { break; }
+
+      resident_block_desc_t block =
+        scan_one_block(resident, raw_block_size, offset, decompressed_offset);
+      batch_decoded_bytes += block.decompressed_size;
+      batch.push_back(block);
+      blocks_scanned.fetch_add(1, std::memory_order_relaxed);
+      if (blocks_scanned.load(std::memory_order_relaxed) > input.block_slot_count_) {
+        mps_parser_fail(error_type_t::OutOfMemoryError,
+                        "LZ4 input block count exceeded reserved metadata slots");
+      }
+      if (batch_decoded_bytes >= lz4_decode_batch_decompressed_bytes) {
+        push_batch(batch);
+        batch_decoded_bytes = 0;
+      }
+    }
+
+    // The footer is validated before the last batch is queued, so a malformed footer fails
+    // the scan without queuing that batch.
+    scan_frame_footer(offset, decompressed_offset);
+    push_batch(batch);
+    mark_windows_scanned_before(input.compressed_size_);
+  }
+
+  // Builds the descriptor for the block whose size word `raw_block_size` was just read.
+  // On entry `offset` points at the block payload; on return it points past the payload and
+  // the optional block checksum, and `decompressed_offset` has advanced by the block's
+  // reserved output size. Fails with ValidationError on malformed or truncated input.
+  resident_block_desc_t scan_one_block(lz4_resident_windows_t& resident,
+                                       uint32_t raw_block_size,
+                                       std::size_t& offset,
+                                       std::size_t& decompressed_offset)
+  {
+    // --- Decode the block-size word and validate it ---------------------------
+    bool uncompressed              = (raw_block_size & lz4_uncompressed_block) != 0;
+    std::size_t block_payload_size = raw_block_size & lz4_block_size_mask;
+    if (block_payload_size == 0) {
+      mps_parser_fail(error_type_t::ValidationError, "invalid zero-sized LZ4 data block");
+    }
+    // Only uncompressed payloads are checked against the frame block maximum here.
+    if (block_payload_size > input.block_max_size_ && uncompressed) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 uncompressed block exceeds frame block maximum");
+    }
+    if (input.content_size_present_ && decompressed_offset >= input.content_size_) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 frame contains more blocks than content size allows");
+    }
+
+    // --- Wait until the payload bytes are resident ----------------------------
+    wait_range_ready(offset, block_payload_size);
+    if (offset + block_payload_size > input.compressed_size_) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "truncated LZ4 frame while reading block payload");
+    }
+
+    // --- Determine the decompressed size --------------------------------------
+    // Compressed blocks expand to block_max_size_ (or the content-size remainder
+    // for the final block); uncompressed blocks keep their payload size.
+    std::size_t decompressed_size = block_payload_size;
+    if (!uncompressed) {
+      decompressed_size =
+        input.content_size_present_
+          ? std::min(input.block_max_size_, input.content_size_ - decompressed_offset)
+          : input.block_max_size_;
+    }
+    if (input.content_size_present_ &&
+        decompressed_size > input.content_size_ - decompressed_offset) {
+      mps_parser_fail(error_type_t::ValidationError, "LZ4 block exceeds declared content size");
+    }
+
+    // --- Stage the payload for the decoder ------------------------------------
+    // Fast path: the whole payload lives in one window, so point the decoder
+    // straight at it (zero copy) and pin that window with a decode_refs bump until
+    // the decode completes. Otherwise it straddles a window boundary: copy it out
+    // into crossing_payloads, which stays alive for the whole run, so no window pin
+    // is needed (and the source window can be released as soon as it is scanned).
+    const char* src          = resident.ptr_if_contiguous(offset, block_payload_size);
+    std::size_t window_index = std::numeric_limits<std::size_t>::max();
+    if (src == nullptr) {
+      crossing_payloads.emplace_back(block_payload_size);
+      resident.copy_to(offset, crossing_payloads.back().data(), block_payload_size);
+      src = crossing_payloads.back().data();
+    } else {
+      window_index = offset / window_bytes;
+      window_state_[window_index].decode_refs.fetch_add(1, std::memory_order_acq_rel);
+    }
+
+    // --- Record the descriptor and advance past the block (+ optional checksum) -
+    // Positional initialization: the order must match resident_block_desc_t's fields.
+    resident_block_desc_t block{src,
+                                block_payload_size,
+                                decompressed_offset,
+                                decompressed_size,
+                                blocks_scanned.load(std::memory_order_relaxed),
+                                window_index,
+                                uncompressed};
+    decompressed_offset += decompressed_size;
+    offset += block_payload_size;
+    mark_windows_scanned_before(offset);
+    if (input.block_checksum_) {
+      wait_range_ready(offset, 4);
+      if (offset + 4 > input.compressed_size_) {
+        mps_parser_fail(error_type_t::ValidationError,
+                        "truncated LZ4 frame while reading block checksum");
+      }
+      // The block checksum is stepped over; its value is not read.
+      offset += 4;
+      mark_windows_scanned_before(offset);
+    }
+    return block;
+  }
+
+  void scan_frame_footer(std::size_t& offset, std::size_t decompressed_offset)
+  {  // Validates the frame tail; on success `offset` equals input.compressed_size_.
+    if (input.content_checksum_) {  // 4 checksum bytes follow: stepped over, not verified here
+      wait_range_ready(offset, 4);  // presumably blocks until those bytes are resident -- confirm
+      if (offset + 4 > input.compressed_size_) {
+        mps_parser_fail(error_type_t::ValidationError,
+                        "truncated LZ4 frame while reading content checksum");
+      }
+      offset += 4;
+      mark_windows_scanned_before(offset);
+    }
+    if (input.content_size_present_ && decompressed_offset != input.content_size_) {
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 frame ended before declared content size was reached");
+    }
+    if (offset != input.compressed_size_) {  // only a single frame is accepted
+      mps_parser_fail(error_type_t::ValidationError,
+                      "LZ4 input contains trailing data after the first frame");
+    }
+  }
+
+  // ---- Input + chunking (immutable after construction) ------------------------
+  // The compressed file is split into fixed-size `windows`; `io_threads` reader
+  // threads pull them by index.
+  lz4_input_stream_t& input;
+  const std::size_t window_bytes = lz4_pipeline_batch_bytes;
+  const std::size_t window_count;
+  std::vector<lz4_resident_window_t> windows;
+  const std::size_t io_threads;
+
+  // First-error-wins latch shared by all three stages: stops the pipeline and
+  // retains the first exception for run() to rethrow after the threads join.
+  parallel_error_latch_t latch;
+
+  // ---- Reader -> scanner readiness  (guarded by window_mutex) -----------------
+  // A reader sets window_done[i]=1 once window i is resident; the scanner blocks
+  // on window_cv until every window covering the bytes it needs is ready.
+  std::vector<unsigned char> window_done;
+  std::mutex window_mutex;
+  std::condition_variable window_cv;
+
+  // ---- Window lifecycle / early release ---------------------------------------
+  // windows[i].data is freed exactly once, when the metadata scan has passed window i
+  // (scanned_through_ > i) AND no decoder still pins it (window_state_[i].decode_refs == 0).
+  // scanned_through_ advances monotonically in mark_windows_scanned_before (last_mark_offset_
+  // asserts that monotonicity); decode_refs bumps in scan_one_block and drops in
+  // release_block_window_ref; the per-window `released` CAS makes the free exactly-once.
+  // window_release_mutex serializes the data.reset() + compressed_resident_bytes accounting.
+  std::unique_ptr<window_state_t[]> window_state_;
+  std::atomic_size_t scanned_through_{0};
+  std::size_t last_mark_offset_{0};
+  std::mutex window_release_mutex;
+  std::atomic_size_t compressed_resident_bytes{0};
+
+  // ---- Scanner -> decoder queue  (guarded by desc_mutex) ----------------------
+  // The scanner pushes batches of block descriptors; decoders pop them via desc_cv.
+  // scanner_done signals the scanner has emitted its final batch.
+  std::deque<std::vector<resident_block_desc_t>> desc_queue;
+  bool scanner_done = false;
+  std::mutex desc_mutex;
+  std::condition_variable desc_cv;
+
+  // ---- Scanner scratch / progress ---------------------------------------------
+  // blocks_scanned doubles as the running block index; crossing_payloads holds staged
+  // copies of blocks that straddle a window boundary (see scan_one_block).
+  std::atomic_size_t blocks_scanned{0};
+  std::vector<std::vector<char>> crossing_payloads;
+};
+
+void lz4_input_stream_t::run_decode_tasks()
+{  // Builds an lz4_pipeline_t over this stream, then calls run() followed by finalize().
+  MPS_NVTX_RANGE("lz4_input_run_decode_tasks", nvtx::colors::io);
+  lz4_pipeline_t pipeline(*this);
+  pipeline.run();
+  pipeline.finalize();
+}
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/mmap_region.hpp b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
new file mode 100644
index 0000000000..48f2c1e1d1
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/mmap_region.hpp
@@ -0,0 +1,151 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <sys/mman.h>
+#include <sys/types.h>
+
+#include <cuda/cmath>
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include <utilities/error.hpp>
+
+#include <limits>
+#include <stdexcept>
+#include <string>
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+using cuopt::mathematical_optimization::io::error_type_t;
+using cuopt::mathematical_optimization::io::mps_parser_expects;
+using cuopt::mathematical_optimization::io::mps_parser_fail;
+
+// Move-only owner for a Linux mmap range. Fixed sub-maps inside a reserved range
+// are still released by unmapping the owning outer range.
+class mmap_region_t {
+ public:
+  mmap_region_t() = default;
+  mmap_region_t(void* ptr, std::size_t size) noexcept : ptr_(ptr), size_(size) {}  // adopts range
+
+  mmap_region_t(const mmap_region_t&)            = delete;
+  mmap_region_t& operator=(const mmap_region_t&) = delete;
+
+  mmap_region_t(mmap_region_t&& other) noexcept
+    : ptr_(other.ptr_),
+      size_(other.size_),
+      unmap_ptr_(other.unmap_ptr_),
+      unmap_size_(other.unmap_size_)
+  {  // the moved-from object is emptied so its destructor unmaps nothing
+    other.ptr_        = nullptr;
+    other.size_       = 0;
+    other.unmap_ptr_  = nullptr;
+    other.unmap_size_ = 0;
+  }
+
+  mmap_region_t& operator=(mmap_region_t&& other) noexcept
+  {
+    if (this != &other) {
+      reset();  // release the mapping currently held before taking over `other`
+      ptr_              = other.ptr_;
+      size_             = other.size_;
+      unmap_ptr_        = other.unmap_ptr_;
+      unmap_size_       = other.unmap_size_;
+      other.ptr_        = nullptr;
+      other.size_       = 0;
+      other.unmap_ptr_  = nullptr;
+      other.unmap_size_ = 0;
+    }
+    return *this;
+  }
+
+  ~mmap_region_t() { reset(); }
+
+ private:
+  static mmap_region_t map(
+    void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context)
+  {  // thin ::mmap wrapper: fails through mps_parser_fail, otherwise returns an owning region
+    void* ptr = ::mmap(address, size, prot, flags, fd, offset);
+    if (ptr == MAP_FAILED) {
+      mps_parser_fail(
+        error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno));
+    }
+    return mmap_region_t(ptr, size);
+  }
+
+ public:
+  static mmap_region_t anonymous(std::size_t size, int prot, int flags, const char* context)
+  {  // anonymous mapping: MAP_ANONYMOUS is added to `flags`, fd is -1 and offset is 0
+    return map(nullptr, size, prot, flags | MAP_ANONYMOUS, -1, 0, context);
+  }
+
+  static mmap_region_t anonymous_aligned(
+    std::size_t size, std::size_t alignment, int prot, int flags, const char* context)
+  {  // anonymous mapping whose data() is aligned to `alignment`, which must be a power of two
+    if (!cuda::is_power_of_two(alignment)) {
+      mps_parser_fail(error_type_t::RuntimeError,
+                      "mmap aligned allocation requires power-of-two alignment");
+    }
+    if (size > std::numeric_limits<std::size_t>::max() - alignment) {
+      mps_parser_fail(error_type_t::OutOfMemoryError, "mmap aligned allocation size overflow");
+    }
+
+    std::size_t raw_size = size + alignment;  // slack so an aligned start always fits inside
+    void* raw            = ::mmap(nullptr, raw_size, prot, flags | MAP_ANONYMOUS, -1, 0);
+    if (raw == MAP_FAILED) {
+      mps_parser_fail(
+        error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno));
+    }
+
+    uintptr_t raw_addr     = reinterpret_cast<uintptr_t>(raw);
+    uintptr_t aligned_addr = (raw_addr + alignment - 1) & ~(uintptr_t)(alignment - 1);
+    return mmap_region_t(reinterpret_cast<void*>(aligned_addr), size, raw, raw_size);
+  }
+
+  static void map_fixed_or_throw(
+    void* address, std::size_t size, int prot, int flags, int fd, off_t offset, const char* context)
+  {  // maps at exactly `address` (MAP_FIXED is added); no owning object is returned
+    void* ptr = ::mmap(address, size, prot, flags | MAP_FIXED, fd, offset);
+    if (ptr == MAP_FAILED) {
+      mps_parser_fail(
+        error_type_t::RuntimeError, "mmap failed for %s: %s", context, std::strerror(errno));
+    }
+  }
+
+  void reset() noexcept
+  {
+    void* base      = unmap_ptr_ != nullptr ? unmap_ptr_ : ptr_;  // prefer the raw outer range
+    std::size_t len = unmap_ptr_ != nullptr ? unmap_size_ : size_;
+    if (base != nullptr && len != 0) { ::munmap(base, len); }  // munmap result is ignored
+    ptr_        = nullptr;
+    size_       = 0;
+    unmap_ptr_  = nullptr;
+    unmap_size_ = 0;
+  }
+
+  void advise(int advice) const noexcept
+  {  // forwards `advice` to ::madvise over [ptr_, ptr_ + size_); the result is ignored
+    if (ptr_ != nullptr && size_ != 0) { ::madvise(ptr_, size_, advice); }
+  }
+
+  void* data() noexcept { return ptr_; }
+  char* char_data() noexcept { return (char*)ptr_; }
+  std::size_t size() const noexcept { return size_; }
+
+ private:
+  mmap_region_t(void* ptr, std::size_t size, void* unmap_ptr, std::size_t unmap_size) noexcept
+    : ptr_(ptr), size_(size), unmap_ptr_(unmap_ptr), unmap_size_(unmap_size)
+  {
+  }
+
+  void* ptr_              = nullptr;  // start of the usable view returned by data()
+  std::size_t size_       = 0;        // length of the usable view
+  void* unmap_ptr_        = nullptr;  // raw mapping start; set only by anonymous_aligned
+  std::size_t unmap_size_ = 0;        // raw mapping length that pairs with unmap_ptr_
+};
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
new file mode 100644
index 0000000000..32cbf3a105
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.cpp
@@ -0,0 +1,478 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#include "mps_section_scanner.hpp"
+
+#include <utilities/error.hpp>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <initializer_list>
+#include <stdexcept>
+
+#include <simde/x86/avx2.h>
+#include <simde/x86/sse4.2.h>
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+using cuopt::mathematical_optimization::io::error_type_t;
+using cuopt::mathematical_optimization::io::mps_parser_expects;
+using cuopt::mathematical_optimization::io::mps_parser_fail;
+
+namespace {
+
+struct section_record_t {
+  mps_section_kind kind;
+  const char* name;
+  std::size_t len;
+};
+
+constexpr section_record_t section_records[] = {
+  {mps_section_kind::rows, "ROWS", 4},
+  {mps_section_kind::columns, "COLUMNS", 7},
+  {mps_section_kind::rhs, "RHS", 3},
+  {mps_section_kind::bounds, "BOUNDS", 6},
+  {mps_section_kind::ranges, "RANGES", 6},
+  {mps_section_kind::quadobj, "QUADOBJ", 7},
+  {mps_section_kind::qmatrix, "QMATRIX", 7},
+  {mps_section_kind::qcmatrix, "QCMATRIX", 8},
+  {mps_section_kind::endata, "ENDATA", 6},
+};
+
+constexpr const char* header_records[] = {"NAME", "OBJSENSE", "OBJNAME"};
+
+constexpr std::size_t kSimdWidth = sizeof(simde__m256i);
+static_assert(kSimdWidth == 32);
+static_assert((std::size_t)mps_section_kind::rows == 0);
+static_assert((std::size_t)mps_section_kind::endata + 1 == std::size(section_records));
+static_assert((std::size_t)mps_phase_kind::header == 0);
+static_assert((std::size_t)mps_phase_kind::quadratic + 1 == 7);
+
+bool is_nonblank_column1(unsigned char c) noexcept { return c > ' '; }
+
+simde__m256i nonblank_column1_mask(simde__m256i bytes)
+{  // Unsigned b > ' ' (max_epu8(b, '!') == b), like is_nonblank_column1; cmpgt_epi8 is signed.
+  return simde_mm256_cmpeq_epi8(simde_mm256_max_epu8(bytes, simde_mm256_set1_epi8('!')), bytes);
+}
+
+enum class section_record_match_t { invalid, header, section };
+
+bool line_has_record_prefix(const char* line_start, const char* line_end, const char* name)
+{
+  std::size_t len = std::strlen(name);
+  if ((std::size_t)(line_end - line_start) < len) { return false; }
+  if (std::memcmp(line_start, name, len) != 0) { return false; }
+  // `name` must be followed by end-of-line or a blank/control byte. Compare as unsigned: plain
+  // char is signed on x86, so a byte >= 0x80 would otherwise satisfy `<= ' '`.
+  return line_start + len == line_end || (unsigned char)line_start[len] <= ' ';
+}
+
+}  // namespace
+
+std::size_t mps_phase_registry_t::phase_index(mps_phase_kind phase) { return (std::size_t)phase; }
+
+void mps_phase_registry_t::publish(mps_phase_kind phase, mps_phase_range_t range)
+{  // Publishes `range` for `phase` once; later calls for the same phase change nothing.
+  std::size_t idx = phase_index(phase);
+  omp_event_handle_t event{};
+  bool fulfill = false;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (ready_[idx].load(std::memory_order_acquire)) { return; }  // first publication wins
+    ranges_[idx] = range;
+    ready_[idx].store(true, std::memory_order_release);  // range is written before the flag
+    if (has_event_[idx] && !event_fulfilled_[idx]) {
+      event                 = events_[idx];
+      event_fulfilled_[idx] = true;  // claimed here so attach_event() cannot fulfill it again
+      fulfill               = true;
+    }
+  }
+  if (fulfill) { omp_fulfill_event(event); }  // invoked after mutex_ has been released
+}
+
+void mps_phase_registry_t::attach_event(mps_phase_kind phase, omp_event_handle_t event)
+{  // Registers `event` for `phase`; fulfills it right away if the phase is already published.
+  std::size_t idx = phase_index(phase);
+  bool fulfill    = false;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    events_[idx]    = event;
+    has_event_[idx] = true;
+    if (ready_[idx].load(std::memory_order_acquire) && !event_fulfilled_[idx]) {
+      event_fulfilled_[idx] = true;  // claimed here so publish() cannot fulfill it again
+      fulfill               = true;
+    }
+  }
+  if (fulfill) { omp_fulfill_event(event); }  // invoked after mutex_ has been released
+}
+
+bool mps_phase_registry_t::ready(mps_phase_kind phase) const
+{
+  return ready_[phase_index(phase)].load(std::memory_order_acquire);
+}
+
+mps_phase_range_t mps_phase_registry_t::range(mps_phase_kind phase) const
+{
+  std::size_t idx                = phase_index(phase);
+  [[maybe_unused]] bool is_ready = ready_[idx].load(std::memory_order_acquire);
+  assert(is_ready);  // compiled out under NDEBUG; the acquire load above must still run
+  return ranges_[idx];
+}
+
+void mps_phase_registry_t::publish_endata(const char* begin, bool present)
+{
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (endata_ready_.load(std::memory_order_acquire)) { return; }
+  endata_begin_   = begin;
+  endata_present_ = present;
+  endata_ready_.store(true, std::memory_order_release);
+}
+
+bool mps_phase_registry_t::endata_ready() const
+{
+  return endata_ready_.load(std::memory_order_acquire);
+}
+
+const char* mps_phase_registry_t::endata_begin() const
+{
+  assert(endata_ready());
+  return endata_begin_;
+}
+
+bool mps_phase_registry_t::endata_present() const
+{
+  assert(endata_ready());
+  return endata_present_;
+}
+
+static section_record_match_t is_section_record(const char* line_start,
+                                                const char* line_end,
+                                                mps_section_kind* kind)
+{
+  // Classifies a column-1 line as a header keyword, a section title (*kind is set), or invalid.
+  if (line_start >= line_end) { return section_record_match_t::invalid; }
+  const std::size_t line_len = (std::size_t)(line_end - line_start);
+
+  auto is_header = [&](const char* name) {
+    return line_has_record_prefix(line_start, line_end, name);
+  };
+  if (std::any_of(std::begin(header_records), std::end(header_records), is_header)) {
+    return section_record_match_t::header;
+  }
+
+  for (const section_record_t& record : section_records) {
+    const bool title_matches =
+      line_len >= record.len && std::memcmp(line_start, record.name, record.len) == 0;
+    if (!title_matches) { continue; }
+
+    // Skip blanks after the title; anything left over is the record's argument.
+    const char* rest = line_start + record.len;
+    while (rest < line_end && (*rest == ' ' || *rest == '\t' || *rest == '\r')) {
+      ++rest;
+    }
+    // "QCMATRIX <row>" needs an argument; every other title must stand alone.
+    const bool wants_argument = record.kind == mps_section_kind::qcmatrix;
+    if ((rest != line_end) != wants_argument) { return section_record_match_t::invalid; }
+    *kind = record.kind;
+    return section_record_match_t::section;
+  }
+  return section_record_match_t::invalid;
+}
+
+mps_section_block_scanner_t::mps_section_block_scanner_t(const char* data,
+                                                         std::size_t block_count,
+                                                         mps_phase_registry_t& registry)
+  : data_(data),
+    block_count_(block_count),
+    registry_(registry),
+    block_decoded_(std::make_unique<std::atomic<unsigned char>[]>(block_count)),
+    block_begin_offsets_(std::make_unique<std::atomic_size_t[]>(block_count)),
+    block_end_offsets_(std::make_unique<std::atomic_size_t[]>(block_count))
+{
+  for (std::size_t i = 0; i < block_count_; ++i) {
+    block_decoded_[i].store(0, std::memory_order_relaxed);
+    block_begin_offsets_[i].store(0, std::memory_order_relaxed);
+    block_end_offsets_[i].store(0, std::memory_order_relaxed);
+  }
+}
+
+std::size_t mps_section_block_scanner_t::section_hit_index(mps_section_kind kind)
+{
+  return (std::size_t)kind;
+}
+
+void mps_section_block_scanner_t::record_section_hit(mps_section_kind kind, const char* ptr)
+{
+  // Keep the lowest address per kind: blocks decode out of order and QCMATRIX titles repeat.
+  std::atomic<const char*>& slot = section_hits_[section_hit_index(kind)];
+  const char* seen               = slot.load(std::memory_order_acquire);
+  auto lower                     = [&] { return seen == nullptr || ptr < seen; };
+  while (lower() && !slot.compare_exchange_weak(seen, ptr, std::memory_order_acq_rel)) {}
+  if (lower()) { notify_ready_phases(); }  // still true only when our CAS installed `ptr`
+}
+
+void mps_section_block_scanner_t::scan_section_range(const char* begin, const char* end)
+{
+  if (begin >= end) return;
+  const char* p = begin;
+
+  // Interior scans that start inside a decoded block skip the leading partial
+  // line. A separate boundary scan covers section titles whose newline/title
+  // bytes straddle adjacent LZ4 blocks.
+  if (p != data_) {
+    const void* nl = __builtin_memchr(p, '\n', (std::size_t)(end - p));
+    if (nl == nullptr) { return; }  // no line starts inside [begin, end)
+    p = (const char*)nl + 1;
+  }
+
+  auto try_candidate = [&](const char* line_start) {  // classify one column-1 line
+    const void* nl       = __builtin_memchr(line_start, '\n', (std::size_t)(end - line_start));
+    const char* line_end = nullptr;
+    if (nl == nullptr) {
+      const char* ready_ptr = data_ + ready_bytes_.load(std::memory_order_acquire);
+      if (end != ready_ptr) { return; }  // unterminated line: only judged at the ready frontier
+      line_end = end;
+    } else {
+      line_end = (const char*)nl;
+    }
+    if (*line_start == '*' || *line_start == '$') { return; }  // ignored without classification
+    mps_section_kind kind;
+    section_record_match_t match = is_section_record(line_start, line_end, &kind);
+    if (match == section_record_match_t::section) {
+      record_section_hit(kind, line_start);
+      return;
+    }
+    if (match == section_record_match_t::invalid) {  // header matches fall through silently
+      mps_parser_fail(error_type_t::ValidationError,
+                      "unknown section record: %.*s",
+                      (int)(line_end - line_start),
+                      line_start);
+    }
+  };
+
+  // Handle the very first line of a file (NAME indicator, usually)
+  if (p == data_) {
+    if (p < end && is_nonblank_column1((unsigned char)*p)) { try_candidate(p); }
+    ++p;
+  }
+
+  // In compliant MPS, indicator records begin in column 1 while data records
+  // begin in column 2+. use "\n[nonblank]" as a needle for the SIMD scan
+  const simde__m256i newline = simde_mm256_set1_epi8('\n');
+  while ((std::size_t)(end - p) >= kSimdWidth) {
+    // The first-line path above increments p when p == data_, so p - 1 is
+    // in-bounds here. Loading the previous vector lets us test "\nX" for all
+    // 32 candidate column-1 bytes with one AVX2 mask.
+    // loadu is comparable to aligned reads on modern SSE/AVX.
+    // might warrant some checks on ARM though
+    simde__m256i current  = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(p));
+    simde__m256i previous = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(p - 1));
+    std::uint32_t mask    = (std::uint32_t)simde_mm256_movemask_epi8(simde_mm256_and_si256(
+      simde_mm256_cmpeq_epi8(previous, newline), nonblank_column1_mask(current)));
+    while (mask != 0) {
+      int bit = __builtin_ctz(mask);  // index of the lowest candidate byte in this vector
+      try_candidate(p + bit);
+      mask &= mask - 1;  // clear the lowest set bit
+    }
+    p += kSimdWidth;
+  }
+
+  // scalar tail
+  while (p < end) {
+    if (*(p - 1) == '\n' && is_nonblank_column1((unsigned char)*p)) { try_candidate(p); }
+    ++p;
+  }
+}
+
+void mps_section_block_scanner_t::scan_boundary(std::size_t left_index, std::size_t right_index)
+{
+  // Rescan the seam between two decoded blocks: at most boundary_overlap bytes on each side of
+  // it, clamped to the start of the left block and the end of the right block.
+  const std::size_t seam   = block_begin_offsets_[right_index].load(std::memory_order_acquire);
+  const std::size_t lo     = block_begin_offsets_[left_index].load(std::memory_order_acquire);
+  const std::size_t hi     = block_end_offsets_[right_index].load(std::memory_order_acquire);
+  const std::size_t before = seam - lo > boundary_overlap ? boundary_overlap : seam - lo;
+  const std::size_t after  = hi - seam > boundary_overlap ? boundary_overlap : hi - seam;
+  scan_section_range(data_ + (seam - before), data_ + (seam + after));
+}
+
+// scans a freshly decoded block for section titles, along with the start/end boundaries if a
+// section title straddles blocks
+void mps_section_block_scanner_t::observe_block(std::size_t block_index,
+                                                const char* begin,
+                                                const char* end)
+{
+  if (block_index >= block_count_) {
+    mps_parser_fail(error_type_t::RuntimeError,
+                    "MPS section scanner observed invalid LZ4 block index");
+  }
+
+  // --- Scan this block, then record its extent and mark it decoded. The release store on
+  //     block_decoded_ publishes the two relaxed offset stores that precede it.
+  scan_section_range(begin, end);
+  block_begin_offsets_[block_index].store((std::size_t)(begin - data_), std::memory_order_relaxed);
+  block_end_offsets_[block_index].store((std::size_t)(end - data_), std::memory_order_relaxed);
+  block_decoded_[block_index].store(1, std::memory_order_release);
+
+  // --- Rescan the seams with already-decoded neighbors, in case a title straddles the boundary.
+  if (block_index > 0 && block_decoded_[block_index - 1].load(std::memory_order_acquire)) {
+    scan_boundary(block_index - 1, block_index);  // seam with the previous block
+  }
+  if (block_index + 1 < block_count_ &&
+      block_decoded_[block_index + 1].load(std::memory_order_acquire)) {
+    scan_boundary(block_index, block_index + 1);  // seam with the next block
+  }
+
+  // --- Extend the contiguous decoded-byte frontier and publish any newly bounded phases.
+  advance_ready_frontier();
+}
+
+void mps_section_block_scanner_t::advance_ready_frontier()
+{
+  // Publish while still holding frontier_mutex_: releasing it first lets a thread carrying an
+  // older frontier store ready_bytes_ after a newer one, moving the frontier backwards.
+  std::lock_guard<std::mutex> lock(frontier_mutex_);
+  std::size_t new_ready = 0;
+  bool grew             = false;
+  while (next_block_ < block_count_ &&
+         block_decoded_[next_block_].load(std::memory_order_acquire)) {
+    new_ready = block_end_offsets_[next_block_].load(std::memory_order_acquire);
+    ++next_block_;
+    grew = true;
+  }
+  if (grew) { publish_ready(new_ready); }
+}
+
+void mps_section_block_scanner_t::publish_ready(std::size_t ready_bytes)
+{  // Stores the new frontier, rescans the last boundary_overlap bytes before it, then notifies.
+  ready_bytes_.store(ready_bytes, std::memory_order_release);  // unconditional overwrite
+  std::size_t begin = ready_bytes > boundary_overlap ? ready_bytes - boundary_overlap : 0;
+  scan_section_range(data_ + begin, data_ + ready_bytes);
+  notify_ready_phases();
+}
+
+std::size_t mps_section_block_scanner_t::ready_bytes() const noexcept
+{
+  return ready_bytes_.load(std::memory_order_acquire);
+}
+
+void mps_section_block_scanner_t::notify_ready_phases()
+{
+  // Publication model: each present phase runs from its own section header to
+  // the first later section header that has been discovered. Optional sections
+  // publish present=false once a later boundary proves they cannot still appear.
+  // ENDATA, or final ready bytes for truncated/non-newline files, is the final
+  // boundary for the trailing optional/quadratic phases.
+  std::lock_guard<std::mutex> lock(publish_mutex_);  // one publisher at a time
+  std::size_t ready     = ready_bytes_.load(std::memory_order_acquire);
+  const char* ready_ptr = data_ + ready;
+  const char* rows =
+    section_hits_[section_hit_index(mps_section_kind::rows)].load(std::memory_order_acquire);
+  const char* columns =
+    section_hits_[section_hit_index(mps_section_kind::columns)].load(std::memory_order_acquire);
+  const char* rhs =
+    section_hits_[section_hit_index(mps_section_kind::rhs)].load(std::memory_order_acquire);
+  const char* bounds =
+    section_hits_[section_hit_index(mps_section_kind::bounds)].load(std::memory_order_acquire);
+  const char* ranges =
+    section_hits_[section_hit_index(mps_section_kind::ranges)].load(std::memory_order_acquire);
+  const char* quadobj =
+    section_hits_[section_hit_index(mps_section_kind::quadobj)].load(std::memory_order_acquire);
+  const char* qmatrix =
+    section_hits_[section_hit_index(mps_section_kind::qmatrix)].load(std::memory_order_acquire);
+  const char* qcmatrix =
+    section_hits_[section_hit_index(mps_section_kind::qcmatrix)].load(std::memory_order_acquire);
+  const char* endata =
+    section_hits_[section_hit_index(mps_section_kind::endata)].load(std::memory_order_acquire);
+  auto available = [&](const char* p) { return p != nullptr && p <= ready_ptr; };
+  bool final_ready =
+    block_count_ == 0 ||
+    (block_decoded_[block_count_ - 1].load(std::memory_order_acquire) &&
+     ready == block_end_offsets_[block_count_ - 1].load(std::memory_order_acquire));
+  const char* final_boundary    = available(endata) ? endata : (final_ready ? ready_ptr : nullptr);
+  auto earliest_available_after = [&](const char* after,
+                                      std::initializer_list<const char*> candidates) {
+    const char* best = nullptr;  // lowest available candidate strictly after `after`, if any
+    for (const char* p : candidates) {
+      if (!available(p) || (after != nullptr && p <= after)) { continue; }
+      if (best == nullptr || p < best) { best = p; }
+    }
+    return best;
+  };
+  auto publish_optional = [&](mps_phase_kind phase,
+                              const char* self,
+                              const char* predecessor,
+                              std::initializer_list<const char*> later_candidates) {
+    if (registry_.ready(phase)) { return; }  // already published
+    if (available(self)) {
+      const char* end = earliest_available_after(self, later_candidates);
+      if (end != nullptr) { registry_.publish(phase, {self, end, true}); }
+      return;  // present but possibly not bounded yet: never reported as absent
+    }
+    if (predecessor != nullptr &&
+        earliest_available_after(predecessor, later_candidates) != nullptr) {
+      registry_.publish(phase, {nullptr, nullptr, false});  // a later boundary came first
+    }
+  };
+
+  // Three publication shapes follow:
+  //   (1) mandatory header/rows/columns -- each spans from its start to the next mandatory
+  //       section; published as soon as that bounding section is available.
+  //   (2) optional rhs/ranges/bounds via publish_optional -- present=true once bounded, or
+  //       present=false once a later section proves the optional one cannot still appear.
+  //   (3) quadratic -- starts at the earliest of the three quad markers (quadobj/qmatrix/qcmatrix).
+  // final_boundary (ENDATA, or the final ready frontier for truncated files) closes the tail.
+  if (available(rows) && !registry_.ready(mps_phase_kind::header)) {
+    registry_.publish(mps_phase_kind::header, {data_, rows, true});
+  }
+  if (available(rows) && available(columns) && !registry_.ready(mps_phase_kind::rows)) {
+    registry_.publish(mps_phase_kind::rows, {rows, columns, true});
+  }
+  if (available(columns) && !registry_.ready(mps_phase_kind::columns)) {
+    const char* columns_end = earliest_available_after(
+      columns, {rhs, ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+    if (columns_end != nullptr) {
+      registry_.publish(mps_phase_kind::columns, {columns, columns_end, true});
+    }
+  }
+
+  publish_optional(mps_phase_kind::rhs,
+                   rhs,
+                   columns,
+                   {ranges, bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+  publish_optional(mps_phase_kind::ranges,
+                   ranges,
+                   rhs ? rhs : columns,
+                   {bounds, quadobj, qmatrix, qcmatrix, final_boundary});
+  publish_optional(mps_phase_kind::bounds,
+                   bounds,
+                   ranges ? ranges : (rhs ? rhs : columns),
+                   {quadobj, qmatrix, qcmatrix, final_boundary});
+
+  if (!registry_.ready(mps_phase_kind::quadratic)) {
+    const char* quadratic_begin = nullptr;  // lowest available quad marker, if any
+    if (available(quadobj)) { quadratic_begin = quadobj; }
+    if (available(qmatrix) && (quadratic_begin == nullptr || qmatrix < quadratic_begin)) {
+      quadratic_begin = qmatrix;
+    }
+    if (available(qcmatrix) && (quadratic_begin == nullptr || qcmatrix < quadratic_begin)) {
+      quadratic_begin = qcmatrix;
+    }
+    if (quadratic_begin != nullptr && final_boundary != nullptr) {
+      registry_.publish(mps_phase_kind::quadratic, {quadratic_begin, final_boundary, true});
+    } else if (quadratic_begin == nullptr && final_boundary != nullptr) {
+      registry_.publish(mps_phase_kind::quadratic, {nullptr, nullptr, false});
+    }
+  }
+
+  if (available(endata)) {
+    registry_.publish_endata(endata, true);
+  } else if (final_ready && final_boundary != nullptr) {
+    registry_.publish_endata(final_boundary, false);  // no ENDATA seen: end of the decoded data
+  }
+}
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
new file mode 100644
index 0000000000..1c16196d68
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/mps_section_scanner.hpp
@@ -0,0 +1,146 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+
+#include <omp.h>
+
+// The section scanner handles freshly read/decoded blocks and scans them for section titles while
+// they're still warm in cache. It then publishes read/decoded input ranges to the parser workers,
+// which handle their respective sections in parallel.
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+enum class mps_section_kind {
+  rows,
+  columns,
+  rhs,
+  bounds,
+  ranges,
+  quadobj,
+  qmatrix,
+  qcmatrix,
+  endata,
+};
+
+enum class mps_phase_kind {
+  header,
+  rows,
+  columns,
+  rhs,
+  bounds,
+  ranges,
+  quadratic,
+};
+
+struct mps_phase_range_t {
+  const char* begin = nullptr;
+  const char* end   = nullptr;
+  bool present      = false;
+};
+
+class mps_phase_registry_t {
+ public:
+  void publish(mps_phase_kind phase, mps_phase_range_t range);
+  void attach_event(mps_phase_kind phase, omp_event_handle_t event);
+
+  bool ready(mps_phase_kind phase) const;
+  // range() acquire-loads ready_[phase] (pairs with publish()'s release store) before
+  // reading ranges_[phase]. Callers must not invoke range() until the phase is published.
+  mps_phase_range_t range(mps_phase_kind phase) const;
+
+  void publish_endata(const char* begin, bool present);
+  bool endata_ready() const;
+  const char* endata_begin() const;
+  bool endata_present() const;
+
+ private:
+  // mutex_ guards ranges_/events_/has_event_/event_fulfilled_ and the endata_* fields for writers.
+  // Readers observe ready_[phase] / endata_ready_ (release-stored under the lock on publish,
+  // acquire-loaded here) and may then read the matching range lock-free -- see range()'s contract.
+  static constexpr std::size_t phase_count = 7;
+
+  static std::size_t phase_index(mps_phase_kind phase);
+
+  mps_phase_range_t ranges_[phase_count]{};
+  std::atomic<bool> ready_[phase_count]{};
+  omp_event_handle_t events_[phase_count]{};
+  bool has_event_[phase_count]{};
+  bool event_fulfilled_[phase_count]{};
+  const char* endata_begin_ = nullptr;
+  bool endata_present_      = false;
+  std::atomic<bool> endata_ready_{false};
+  mutable std::mutex mutex_;
+};
+
+// Turns out-of-order decoded blocks into ordered section-range publications for the parser:
+//
+//   producer --observe_block(i,...)--> [SIMD-scan block i for section titles] --> section_hits_
+//                                       [advance contiguous decoded-byte frontier (ready_bytes_)]
+//                                       --> notify_ready_phases --> registry --> parser tasks
+//
+// Producers (the LZ4 decoders / raw readers) call observe_block for each block in any order.
+// Per block the scanner (1) SIMD-scans it for section titles starting in column 1 and records
+// the first byte of each section via a first-writer-wins CAS; (2) advances a contiguous
+// decoded-byte frontier across whatever leading blocks are now present; and (3) recomputes which
+// phases are fully bounded and publishes their [begin,end) ranges to the registry, unblocking the
+// matching parser task. A title can straddle two blocks, so adjacent decoded blocks are also
+// rescanned over a small overlap (boundary_overlap).
+class mps_section_block_scanner_t {
+ public:
+  mps_section_block_scanner_t(const char* data,
+                              std::size_t block_count,
+                              mps_phase_registry_t& registry);
+
+  // Records a freshly decoded block, scans it for section titles, advances the
+  // contiguous decoded-byte frontier across out-of-order completions, and
+  // publishes any newly available section ranges. Producers only need to feed
+  // blocks in any order; the frontier and publication live entirely here.
+  void observe_block(std::size_t block_index, const char* begin, const char* end);
+  void publish_ready(std::size_t ready_bytes);
+
+  // Current contiguous decoded-byte frontier; producers use this as the final
+  // view size once all blocks have been observed.
+  std::size_t ready_bytes() const noexcept;
+
+ private:
+  static constexpr std::size_t section_count = 9;
+  // Section titles are short; 128 bytes is enough to rescan around a decoded
+  // block boundary and catch a newline/title pair split across adjacent blocks.
+  static constexpr std::size_t boundary_overlap = 128;
+
+  static std::size_t section_hit_index(mps_section_kind kind);
+
+  void scan_section_range(const char* begin, const char* end);
+  void scan_boundary(std::size_t left_index, std::size_t right_index);
+  void record_section_hit(mps_section_kind kind, const char* ptr);
+  void notify_ready_phases();
+  void advance_ready_frontier();
+
+  // Concurrency: observe_block runs concurrently on many producer threads.
+  //   * frontier_mutex_ guards next_block_ and the ready_bytes_ frontier advance.
+  //   * publish_mutex_  serializes notify_ready_phases so each phase publishes once, in order.
+  //   * block_decoded_[i] is release-stored after block_begin/end_offsets_[i] (relaxed), so an
+  //     acquire-load of a set flag makes those offsets visible to the reader.
+  //   * section_hits_[k] is a first-writer-wins CAS holding the earliest byte of section k.
+  //   * registry_ carries its own internal lock.
+  const char* data_        = nullptr;
+  std::size_t block_count_ = 0;
+  mps_phase_registry_t& registry_;
+  std::mutex publish_mutex_;
+  std::unique_ptr<std::atomic<unsigned char>[]> block_decoded_;
+  std::unique_ptr<std::atomic_size_t[]> block_begin_offsets_;
+  std::unique_ptr<std::atomic_size_t[]> block_end_offsets_;
+  std::atomic_size_t ready_bytes_{0};
+  std::atomic<const char*> section_hits_[section_count]{};
+  std::mutex frontier_mutex_;
+  std::size_t next_block_ = 0;
+};
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
new file mode 100644
index 0000000000..d05d6d948d
--- /dev/null
+++ b/cpp/src/io/experimental_mps_fast/nvtx_ranges.hpp
@@ -0,0 +1,132 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#ifdef MPS_FAST_NVTX
+#include <nvtx3/nvToolsExt.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+namespace cuopt::mathematical_optimization::io::detail::nvtx {
+
+namespace colors {
+constexpr std::uint32_t generic  = 0xff8b949e;
+constexpr std::uint32_t io       = 0xff58a6ff;
+constexpr std::uint32_t decode   = 0xff3fb950;
+constexpr std::uint32_t rows     = 0xffd29922;
+constexpr std::uint32_t columns  = 0xffff7b72;
+constexpr std::uint32_t rhs      = 0xffa371f7;
+constexpr std::uint32_t bounds   = 0xfff0883e;
+constexpr std::uint32_t ranges   = 0xff79c0ff;
+constexpr std::uint32_t names    = 0xff56d364;
+constexpr std::uint32_t alloc    = 0xffdb61a2;
+constexpr std::uint32_t finalize = 0xffc9d1d9;
+}  // namespace colors
+
+inline std::uint32_t color_for_name(std::string_view name) noexcept
+{
+  // Picks a display colour from keywords found anywhere in `name`. Rules are tried from top to
+  // bottom and the first match wins, so a name containing both "read" and "row" maps to
+  // colors::io. Names matching no keyword fall back to colors::generic.
+  const auto has = [name](std::string_view key) noexcept {
+    return name.find(key) != std::string_view::npos;
+  };
+
+  // I/O and decompression.
+  if (has("lz4") || has("read")) { return colors::io; }
+  if (has("decode") || has("decompress")) { return colors::decode; }
+
+  // Per-section keywords.
+  if (has("row")) { return colors::rows; }
+  if (has("column") || has("csr")) { return colors::columns; }
+  if (has("rhs")) { return colors::rhs; }
+  if (has("bound")) { return colors::bounds; }
+  if (has("range")) { return colors::ranges; }
+  if (has("name") || has("materialize")) { return colors::names; }
+
+  // Memory management and wrap-up.
+  if (has("alloc") || has("resize") || has("mmap")) { return colors::alloc; }
+  if (has("finalize")) { return colors::finalize; }
+
+  // No keyword matched.
+  return colors::generic;
+}
+// RAII NVTX range: ctor pushes, end()/dtor pops once. Calls NVTX only if MPS_FAST_NVTX is defined.
+class scoped_range_t {
+ public:
+  explicit scoped_range_t(const char* name,
+                          std::uint32_t color    = colors::generic,
+                          std::uint32_t category = 0)
+  {
+    push(name, color, category);
+  }
+  // Owning overload: moves the name into owned_name_ and pushes its c_str().
+  explicit scoped_range_t(std::string name,
+                          std::uint32_t color    = colors::generic,
+                          std::uint32_t category = 0)
+    : owned_name_(std::move(name))
+  {
+    push(owned_name_.c_str(), color, category);
+  }
+  // Pops the range unless end() already did.
+  ~scoped_range_t() { end(); }
+  // Pops the range early. Later calls (and the destructor) do nothing: active_ is cleared here.
+  void end()
+  {
+#ifdef MPS_FAST_NVTX
+    if (active_) {
+      nvtxRangePop();
+      active_ = false;
+    }
+#endif
+  }
+  // Not copyable: a copy would pop the same range a second time.
+  scoped_range_t(const scoped_range_t&)            = delete;
+  scoped_range_t& operator=(const scoped_range_t&) = delete;
+
+ private:
+  void push([[maybe_unused]] const char* name,
+            [[maybe_unused]] std::uint32_t color,
+            [[maybe_unused]] std::uint32_t category)
+  {
+#ifdef MPS_FAST_NVTX
+    nvtxEventAttributes_t event{};
+    event.version       = NVTX_VERSION;
+    event.size          = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+    event.colorType     = NVTX_COLOR_ARGB;
+    event.color         = color;
+    event.messageType   = NVTX_MESSAGE_TYPE_ASCII;
+    event.message.ascii = name;
+    event.category      = category;
+    nvtxRangePushEx(&event);
+    active_ = true;
+#endif
+  }
+  // Backing storage for the std::string overload; left empty by the const char* overload.
+  std::string owned_name_;
+#ifdef MPS_FAST_NVTX
+  bool active_ = false;
+#endif
+};
+
+inline void name_current_thread([[maybe_unused]] const char* name)
+{
+#ifdef MPS_FAST_NVTX
+  nvtxNameOsThreadA((std::uint32_t)::syscall(SYS_gettid), name);
+#endif
+}
+
+}  // namespace cuopt::mathematical_optimization::io::detail::nvtx
+
+#define MPS_FAST_NVTX_CONCAT_INNER(a, b) a##b
+#define MPS_FAST_NVTX_CONCAT(a, b)       MPS_FAST_NVTX_CONCAT_INNER(a, b)
+#define MPS_NVTX_RANGE(name, color)                                                          \
+  ::cuopt::mathematical_optimization::io::detail::nvtx::scoped_range_t MPS_FAST_NVTX_CONCAT( \
+    _mps_nvtx_range_, __LINE__)(name, color)
diff --git a/cpp/src/io/file_to_string.cpp b/cpp/src/io/file_to_string.cpp
index 9c935485e6..a7c200b585 100644
--- a/cpp/src/io/file_to_string.cpp
+++ b/cpp/src/io/file_to_string.cpp
@@ -9,6 +9,8 @@
 
 #include <utilities/error.hpp>
 
+#include <algorithm>
+#include <cctype>
 #include <cstdio>
 #include <memory>
 #include <string>
@@ -22,9 +24,9 @@
 #include <zlib.h>
 #endif  // MPS_PARSER_WITH_ZLIB
 
-#if defined(MPS_PARSER_WITH_BZIP2) || defined(MPS_PARSER_WITH_ZLIB)
+#if defined(MPS_PARSER_WITH_BZIP2) || defined(MPS_PARSER_WITH_ZLIB) || defined(MPS_PARSER_WITH_LZ4)
 #include <dlfcn.h>
-#endif  // MPS_PARSER_WITH_BZIP2 || MPS_PARSER_WITH_ZLIB
+#endif  // MPS_PARSER_WITH_BZIP2 || MPS_PARSER_WITH_ZLIB || MPS_PARSER_WITH_LZ4
 
 namespace {
 using cuopt::mathematical_optimization::io::error_type_t;
@@ -207,22 +209,184 @@ std::vector<char> zlib_file_to_string(const std::string& file)
 }  // end namespace
 #endif  // MPS_PARSER_WITH_ZLIB
 
+#ifdef MPS_PARSER_WITH_LZ4
+namespace {
+// Minimal liblz4 frame ABI declarations; keep in sync with lz4frame.h.
+struct LZ4F_dctx;
+using LZ4F_errorCode_t = size_t;
+struct LZ4F_frameInfo_t {
+  int blockSizeID;
+  int blockMode;
+  int contentChecksumFlag;
+  int frameType;
+  unsigned long long contentSize;
+  unsigned dictID;
+  int blockChecksumFlag;
+};
+using LZ4F_createDecompressionContext_t = LZ4F_errorCode_t (*)(LZ4F_dctx**, unsigned);
+using LZ4F_freeDecompressionContext_t   = LZ4F_errorCode_t (*)(LZ4F_dctx*);
+using LZ4F_getFrameInfo_t               = LZ4F_errorCode_t (*)(LZ4F_dctx*,
+                                                 LZ4F_frameInfo_t*,
+                                                 const void*,
+                                                 size_t*);
+using LZ4F_decompress_t =
+  LZ4F_errorCode_t (*)(LZ4F_dctx*, void*, size_t*, const void*, size_t*, const void*);
+using LZ4F_isError_t      = unsigned (*)(LZ4F_errorCode_t);
+using LZ4F_getErrorName_t = const char* (*)(LZ4F_errorCode_t);
+
+// Reads `file`, decompresses its first LZ4 frame through a dlopen'd liblz4, and returns the bytes
+// followed by a '\0' terminator. Failures are reported through mps_parser_expects.
+std::vector<char> lz4_file_to_string(const std::string& file)
+{
+  struct DlCloseDeleter {
+    void operator()(void* fp)
+    {
+      mps_parser_expects_fatal(
+        dlclose(fp) == 0, error_type_t::ValidationError, "Error closing liblz4.so!");
+    }
+  };
+  struct Lz4DctxDeleter {
+    void operator()(LZ4F_dctx* f)
+    {
+      if (f != nullptr) {
+        const LZ4F_errorCode_t err = fptr(f);
+        mps_parser_expects_fatal(
+          !is_error(err), error_type_t::ValidationError, "Error closing lz4 file!");
+      }
+    }
+    LZ4F_freeDecompressionContext_t fptr = nullptr;
+    LZ4F_isError_t is_error              = nullptr;
+  };
+  // Try the versioned soname first, then the unversioned name.
+  void* raw_lz4handle = nullptr;
+  for (const char* soname : {"liblz4.so.1", "liblz4.so"}) {
+    raw_lz4handle = dlopen(soname, RTLD_LAZY);
+    if (raw_lz4handle != nullptr) break;
+  }
+  std::unique_ptr<void, DlCloseDeleter> lz4handle{raw_lz4handle};
+  mps_parser_expects(lz4handle != nullptr,
+                     error_type_t::ValidationError,
+                     "Could not open .lz4 file since liblz4 was not found "
+                     "(tried liblz4.so.1, liblz4.so). In order to open .lz4 files directly, "
+                     "please ensure liblz4 is installed. Alternatively, decompress the .lz4 file "
+                     "manually and open the uncompressed file. Given path: %s",
+                     file.c_str());
+  // Resolve every frame-API entry point used below; a missing symbol is reported as an error.
+  auto LZ4F_createDecompressionContext = reinterpret_cast<LZ4F_createDecompressionContext_t>(
+    dlsym(lz4handle.get(), "LZ4F_createDecompressionContext"));
+  auto LZ4F_freeDecompressionContext = reinterpret_cast<LZ4F_freeDecompressionContext_t>(
+    dlsym(lz4handle.get(), "LZ4F_freeDecompressionContext"));
+  auto LZ4F_getFrameInfo =
+    reinterpret_cast<LZ4F_getFrameInfo_t>(dlsym(lz4handle.get(), "LZ4F_getFrameInfo"));
+  auto LZ4F_decompress =
+    reinterpret_cast<LZ4F_decompress_t>(dlsym(lz4handle.get(), "LZ4F_decompress"));
+  auto LZ4F_isError = reinterpret_cast<LZ4F_isError_t>(dlsym(lz4handle.get(), "LZ4F_isError"));
+  auto LZ4F_getErrorName =
+    reinterpret_cast<LZ4F_getErrorName_t>(dlsym(lz4handle.get(), "LZ4F_getErrorName"));
+  mps_parser_expects(
+    LZ4F_createDecompressionContext != nullptr && LZ4F_freeDecompressionContext != nullptr &&
+      LZ4F_getFrameInfo != nullptr && LZ4F_decompress != nullptr && LZ4F_isError != nullptr &&
+      LZ4F_getErrorName != nullptr,
+    error_type_t::ValidationError,
+    "Error loading liblz4! Library version might be incompatible. Please decompress the .lz4 "
+    "file manually and open the uncompressed file. Given path: %s",
+    file.c_str());
+  // Read the whole compressed file into memory.
+  std::unique_ptr<FILE, FcloseDeleter> fp{fopen(file.c_str(), "rb")};
+  mps_parser_expects(fp != nullptr,
+                     error_type_t::ValidationError,
+                     "Error opening input file! Given path: %s",
+                     file.c_str());
+  mps_parser_expects(fseek(fp.get(), 0L, SEEK_END) == 0,
+                     error_type_t::ValidationError,
+                     "Error seeking input file! Given path: %s",
+                     file.c_str());
+  const long compressed_size = ftell(fp.get());
+  mps_parser_expects(compressed_size != -1L,
+                     error_type_t::ValidationError,
+                     "Error sizing input file! Given path: %s",
+                     file.c_str());
+  std::vector<char> compressed(compressed_size);
+  rewind(fp.get());
+  mps_parser_expects(fread(compressed.data(), sizeof(char), compressed_size, fp.get()) ==
+                       static_cast<size_t>(compressed_size),
+                     error_type_t::ValidationError,
+                     "Error reading input file! Given path: %s",
+                     file.c_str());
+  // 100 is LZ4F_VERSION. dctx is declared after lz4handle so it is freed before dlclose runs.
+  constexpr unsigned lz4f_version = 100;
+  LZ4F_dctx* raw_dctx             = nullptr;
+  LZ4F_errorCode_t lz4_status     = LZ4F_createDecompressionContext(&raw_dctx, lz4f_version);
+  mps_parser_expects(!LZ4F_isError(lz4_status),
+                     error_type_t::ValidationError,
+                     "Could not open lz4 compressed file '%s': %s",
+                     file.c_str(),
+                     LZ4F_getErrorName(lz4_status));
+  std::unique_ptr<LZ4F_dctx, Lz4DctxDeleter> dctx{raw_dctx,
+                                                  {LZ4F_freeDecompressionContext, LZ4F_isError}};
+  // Consume the frame header; LZ4F_getFrameInfo reports the bytes it used through src_used.
+  const char* src = compressed.data();
+  size_t src_size = compressed.size();
+  LZ4F_frameInfo_t frame_info{};
+  size_t src_used = src_size;
+  lz4_status      = LZ4F_getFrameInfo(dctx.get(), &frame_info, src, &src_used);
+  mps_parser_expects(!LZ4F_isError(lz4_status),
+                     error_type_t::ValidationError,
+                     "Error reading lz4 frame info for input file '%s': %s",
+                     file.c_str(),
+                     LZ4F_getErrorName(lz4_status));
+  src += src_used;
+  src_size -= src_used;
+  // contentSize is an optional, untrusted header field, so treat it purely as a reservation hint
+  // and cap it at roughly LZ4's maximum expansion (~255x) to avoid absurd allocations.
+  std::vector<char> buf;
+  const unsigned long long cap = 255ull * compressed.size();
+  if (frame_info.contentSize > 0) { buf.reserve(std::min(frame_info.contentSize, cap) + 1); }
+  std::vector<char> readbuf(size_t{1} << 24);  // 16MiB
+  while (lz4_status != 0) {
+    size_t dst_size = readbuf.size();
+    src_used        = src_size;
+    lz4_status = LZ4F_decompress(dctx.get(), readbuf.data(), &dst_size, src, &src_used, nullptr);
+    mps_parser_expects(!LZ4F_isError(lz4_status),
+                       error_type_t::ValidationError,
+                       "Error in lz4 decompression of input file '%s': %s",
+                       file.c_str(),
+                       LZ4F_getErrorName(lz4_status));
+    if (dst_size > 0) { buf.insert(buf.end(), begin(readbuf), begin(readbuf) + dst_size); }
+    src += src_used;
+    src_size -= src_used;
+    mps_parser_expects(src_used != 0 || dst_size != 0 || lz4_status == 0,
+                       error_type_t::ValidationError,
+                       "Stalled lz4 decompression of input file! Given path: %s",
+                       file.c_str());
+  }
+  buf.push_back('\0');
+  return buf;
+}
+}  // end namespace
+#endif  // MPS_PARSER_WITH_LZ4
+
 namespace cuopt::mathematical_optimization::io {
 
 std::vector<char> file_to_string(const std::string& file)
 {
+  std::string lower(file);
+  std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) {
+    return (char)std::tolower(c);
+  });
+
 #ifdef MPS_PARSER_WITH_BZIP2
-  if (file.size() > 4 && file.substr(file.size() - 4, 4) == ".bz2") {
-    return bz2_file_to_string(file);
-  }
+  if (lower.ends_with(".bz2")) { return bz2_file_to_string(file); }
 #endif  // MPS_PARSER_WITH_BZIP2
 
 #ifdef MPS_PARSER_WITH_ZLIB
-  if (file.size() > 3 && file.substr(file.size() - 3, 3) == ".gz") {
-    return zlib_file_to_string(file);
-  }
+  if (lower.ends_with(".gz")) { return zlib_file_to_string(file); }
 #endif  // MPS_PARSER_WITH_ZLIB
 
+#ifdef MPS_PARSER_WITH_LZ4
+  if (lower.ends_with(".lz4")) { return lz4_file_to_string(file); }
+#endif  // MPS_PARSER_WITH_LZ4
+
   // Faster than using C++ I/O
   std::unique_ptr<FILE, FcloseDeleter> fp{fopen(file.c_str(), "r")};
   mps_parser_expects(fp != nullptr,
diff --git a/cpp/src/io/file_to_string.hpp b/cpp/src/io/file_to_string.hpp
index da36961d2b..6ca2687da0 100644
--- a/cpp/src/io/file_to_string.hpp
+++ b/cpp/src/io/file_to_string.hpp
@@ -17,6 +17,7 @@ namespace cuopt::mathematical_optimization::io {
 // The dispatcher looks at the extension:
 //   - ".bz2" → libbz2 (dlopen'd at runtime), if MPS_PARSER_WITH_BZIP2.
 //   - ".gz"  → libz   (dlopen'd at runtime), if MPS_PARSER_WITH_ZLIB.
+//   - ".lz4" → liblz4 (dlopen'd at runtime), if MPS_PARSER_WITH_LZ4.
 //   - otherwise → plain fopen.
 // The returned buffer's size includes the null terminator.
 std::vector<char> file_to_string(const std::string& file);
diff --git a/cpp/src/io/mps_parser.cpp b/cpp/src/io/mps_parser.cpp
index 7d3a52ee48..7c145638f1 100644
--- a/cpp/src/io/mps_parser.cpp
+++ b/cpp/src/io/mps_parser.cpp
@@ -797,9 +797,9 @@ void mps_parser_t<i_t, f_t>::parse_rows(std::string_view line)
   }
   if (type == Objective) {
     // Keep only the first name or OBJNAME since it was set before
-    if (objective_name.empty())
-      objective_name = name;
-    else
+    if (objective_name.empty()) objective_name = name;
+    // aligns with CPLEX/SCIP behavior
+    else if (name != objective_name)
       ignored_objective_names.emplace(name);
     // If we wanted to strictly follow MPS definition: a new objective row ('N') should be treated
     // as an unbounded constraints, aka an extra contraints row with lower bound -infinity and upper
diff --git a/cpp/src/io/parser.cpp b/cpp/src/io/parser.cpp
index 20e80185b3..0a626bd26c 100644
--- a/cpp/src/io/parser.cpp
+++ b/cpp/src/io/parser.cpp
@@ -7,8 +7,13 @@
 
 #include <cuopt/mathematical_optimization/io/parser.hpp>
 
+#include <experimental_mps_fast/fast_parser.hpp>
 #include <mps_parser_internal.hpp>
 
+#include <utilities/logger.hpp>
+
+#include <cstdint>
+
 namespace cuopt::mathematical_optimization::io {
 
 template <typename i_t, typename f_t>
@@ -35,4 +40,18 @@ template mps_data_model_t<int, float> read_mps_from_string(std::string_view mps_
 template mps_data_model_t<int, double> read_mps_from_string(std::string_view mps_contents,
                                                             bool fixed_mps_format);
 
+template <typename i_t, typename f_t>
+mps_data_model_t<i_t, f_t> read_mps_fast_experimental(const std::string& mps_file_path)
+{
+  CUOPT_LOG_INFO("Using experimental fast MPS parser for '%s'", mps_file_path.c_str());
+  return detail::parse_mps_fast_file<i_t, f_t>(mps_file_path);
+}
+
+template mps_data_model_t<int, float> read_mps_fast_experimental(const std::string& mps_file_path);
+template mps_data_model_t<int, double> read_mps_fast_experimental(const std::string& mps_file_path);
+template mps_data_model_t<int64_t, float> read_mps_fast_experimental(
+  const std::string& mps_file_path);
+template mps_data_model_t<int64_t, double> read_mps_fast_experimental(
+  const std::string& mps_file_path);
+
 }  // namespace cuopt::mathematical_optimization::io
diff --git a/cpp/src/io/utilities/error.hpp b/cpp/src/io/utilities/error.hpp
index a0d1813856..77ede9c415 100644
--- a/cpp/src/io/utilities/error.hpp
+++ b/cpp/src/io/utilities/error.hpp
@@ -34,6 +34,30 @@ inline std::string error_to_string(error_type_t error)
   return std::string("UnAccountedError");
 }
 
+[[noreturn]] inline void mps_parser_throw(error_type_t error_type, const char* msg)
+{
+  throw std::logic_error("{\"MPS_PARSER_ERROR_TYPE\": \"" + error_to_string(error_type) +
+                         "\", \"msg\": " + "\"" + std::string(msg) + "\"}");
+}
+
+/**
+ * @brief Report an unrecoverable parser error.
+ *
+ * @param[error_type_t] error_type enum error type
+ * @param[const char *] fmt printf-style format string for the error message
+ * @param ... variable set of arguments used for fmt
+ * @throw std::logic_error always
+ */
+[[noreturn]] inline void mps_parser_fail(error_type_t error_type, const char* fmt, ...)
+{
+  va_list args;
+  va_start(args, fmt);
+  char msg[2048];
+  vsnprintf(msg, sizeof(msg), fmt, args);
+  va_end(args);
+  mps_parser_throw(error_type, msg);
+}
+
 /**
  * @brief Function for checking (pre-)conditions that throws an exception when a
  * condition is false
@@ -52,9 +76,7 @@ inline void mps_parser_expects(bool cond, error_type_t error_type, const char* f
     char msg[2048];
     vsnprintf(msg, sizeof(msg), fmt, args);
     va_end(args);
-
-    throw std::logic_error("{\"MPS_PARSER_ERROR_TYPE\": \"" + error_to_string(error_type) +
-                           "\", \"msg\": " + "\"" + std::string(msg) + "\"}");
+    mps_parser_throw(error_type, msg);
   }
 }
 
diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp
index 8080e53fc5..3fd31d169e 100644
--- a/cpp/src/utilities/omp_helpers.hpp
+++ b/cpp/src/utilities/omp_helpers.hpp
@@ -246,6 +246,6 @@ inline double fetch_max(omp_atomic_t<double>& atomic_var, double other)
   return old;
 }
 
-#endif
-
 }  // namespace cuopt
+
+#endif
diff --git a/cpp/src/utilities/perf_counters.hpp b/cpp/src/utilities/perf_counters.hpp
new file mode 100644
index 0000000000..3fd47c2437
--- /dev/null
+++ b/cpp/src/utilities/perf_counters.hpp
@@ -0,0 +1,197 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+// reserved. SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <linux/perf_event.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <array>
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <utility>
+#include <vector>
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+// Resident-set-size helper: returns the integer that follows `key` on one /proc/self/status line
+// (the kB count for VmRSS:/VmHWM:), or 0 when `line` does not start with `key` or has no digits.
+static size_t parse_status_kb_line(const char* line, const char* key)
+{
+  const size_t key_len = std::strlen(key);
+  if (std::strncmp(line, key, key_len) != 0) { return 0; }
+  const char* p = line + key_len;
+  while (*p == ' ' || *p == '\t') {
+    ++p;
+  }
+  // The field is an unsigned count, so parse it as unsigned; no end pointer is needed.
+  return static_cast<size_t>(std::strtoull(p, nullptr, 10));
+}
+
+static std::pair<size_t, size_t> current_process_rss_kb()
+{
+  // Returns {VmRSS, VmHWM} in kB from /proc/self/status. A field that is not found, or the whole
+  // pair when the file cannot be opened, is reported as 0.
+  size_t resident_kb = 0;
+  size_t peak_kb     = 0;
+  if (FILE* status = std::fopen("/proc/self/status", "r")) {
+    char text[256];
+    while ((resident_kb == 0 || peak_kb == 0) && std::fgets(text, sizeof(text), status)) {
+      if (resident_kb == 0) { resident_kb = parse_status_kb_line(text, "VmRSS:"); }
+      if (peak_kb == 0) { peak_kb = parse_status_kb_line(text, "VmHWM:"); }
+    }
+    std::fclose(status);
+  }
+  return {resident_kb, peak_kb};
+}
+
+struct perf_counter_spec_t {
+  const char* name;
+  uint32_t type;
+  uint64_t config;
+};
+
+static constexpr uint64_t perf_cache_config(uint64_t cache, uint64_t op, uint64_t result)
+{
+  return cache | (op << 8) | (result << 16);
+}
+
+// Small scoped Linux perf_event_open wrapper for coarse phase diagnostics.
+//
+// Important limitations:
+// - Counters are per-thread: construct one instance inside each worker whose
+//   work should be measured, then aggregate snapshots.
+// - These are generic perf events; exact mappings vary by CPU. Some events may
+//   be unavailable or unhelpful, e.g. store-side DTLB misses on this node.
+// - This deliberately does not use event groups or time_enabled/time_running
+//   scaling, so counts are approximate if the kernel multiplexes counters.
+static constexpr std::array<perf_counter_spec_t, 8> PERF_COUNTER_SPECS = {{
+  {"cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES},
+  {"instructions", PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS},
+  {"cache_refs", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES},
+  {"cache_misses", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES},
+  {"branch_misses", PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES},
+  {"backend_stall_cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND},
+  {"dtlb_load_misses",
+   PERF_TYPE_HW_CACHE,
+   perf_cache_config(
+     PERF_COUNT_HW_CACHE_DTLB, PERF_COUNT_HW_CACHE_OP_READ, PERF_COUNT_HW_CACHE_RESULT_MISS)},
+  {"dtlb_store_misses",
+   PERF_TYPE_HW_CACHE,
+   perf_cache_config(
+     PERF_COUNT_HW_CACHE_DTLB, PERF_COUNT_HW_CACHE_OP_WRITE, PERF_COUNT_HW_CACHE_RESULT_MISS)},
+}};
+
+struct perf_counter_snapshot_t {
+  bool active                                            = false;
+  int open_errno                                         = 0;
+  std::array<uint64_t, PERF_COUNTER_SPECS.size()> values = {};
+};
+
+class thread_perf_counters_t {
+ public:
+  thread_perf_counters_t()
+  {
+    fds_.fill(-1);
+    for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) {
+      perf_event_attr attr = {};
+      attr.type            = PERF_COUNTER_SPECS[i].type;
+      attr.size            = sizeof(attr);
+      attr.config          = PERF_COUNTER_SPECS[i].config;
+      attr.disabled        = 1;
+      attr.exclude_kernel  = 1;
+      attr.exclude_hv      = 1;
+      // pid=0, cpu=-1: this thread on any CPU. CLOEXEC keeps the fd out of exec'd children.
+      int fd = (int)syscall(__NR_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC);
+      if (fd < 0) {
+        if (first_errno_ == 0) { first_errno_ = errno; }
+        continue;
+      }
+      fds_[i] = fd;
+      active_ = true;
+    }
+    // Counters were opened disabled; zero and start each one that opened successfully.
+    if (active_) {
+      for (int fd : fds_) {
+        if (fd >= 0) {
+          ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+          ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+        }
+      }
+    }
+  }
+  // Not copyable: the object owns its perf file descriptors.
+  thread_perf_counters_t(const thread_perf_counters_t&)            = delete;
+  thread_perf_counters_t& operator=(const thread_perf_counters_t&) = delete;
+  // Closes any descriptors that stop() has not already closed.
+  ~thread_perf_counters_t() { close_all(); }
+  // Stops and reads each open counter, then closes them; unopened counters stay 0 in the snapshot.
+  perf_counter_snapshot_t stop()
+  {
+    perf_counter_snapshot_t snapshot;
+    snapshot.active     = active_;
+    snapshot.open_errno = first_errno_;
+
+    for (size_t i = 0; i < fds_.size(); ++i) {
+      int fd = fds_[i];
+      if (fd < 0) continue;
+      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+      uint64_t value = 0;
+      if (read(fd, &value, sizeof(value)) == (ssize_t)sizeof(value)) { snapshot.values[i] = value; }
+    }
+    close_all();
+    active_ = false;
+    return snapshot;
+  }
+
+ private:
+  void close_all()
+  {
+    for (int& fd : fds_) {
+      if (fd >= 0) {
+        close(fd);
+        fd = -1;
+      }
+    }
+  }
+
+  bool active_     = false;
+  int first_errno_ = 0;
+  std::array<int, PERF_COUNTER_SPECS.size()> fds_;
+};
+
+static inline void print_perf_totals(const char* label,
+                                     const std::vector<perf_counter_snapshot_t>& snapshots)
+{
+  std::array<unsigned long long, PERF_COUNTER_SPECS.size()> totals = {};
+  bool any_active                                                  = false;
+  int first_errno                                                  = 0;
+  for (const auto& snapshot : snapshots) {
+    if (snapshot.open_errno != 0 && first_errno == 0) { first_errno = snapshot.open_errno; }
+    if (!snapshot.active) continue;
+    any_active = true;
+    for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) {
+      totals[i] += snapshot.values[i];
+    }
+  }
+
+  if (!any_active) {
+    std::fprintf(stderr, "[PERF] %s unavailable errno=%d\n", label, first_errno);
+    return;
+  }
+
+  double ipc       = totals[0] == 0 ? 0.0 : (double)totals[1] / (double)totals[0];
+  double miss_rate = totals[2] == 0 ? 0.0 : (double)totals[3] / (double)totals[2];
+  std::fprintf(stderr, "[PERF] %s", label);
+  for (size_t i = 0; i < PERF_COUNTER_SPECS.size(); ++i) {
+    std::fprintf(stderr, " %s=%llu", PERF_COUNTER_SPECS[i].name, totals[i]);
+  }
+  std::fprintf(stderr, " ipc=%.3f cache_miss_rate=%.6f\n", ipc, miss_rate);
+}
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 19bb27d593..27f6c94983 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -23,6 +23,8 @@ if(BUILD_TESTS)
   target_link_libraries(cuopttestutils
       PUBLIC
       cuopt
+      OpenMP::OpenMP_CXX
+      OpenMP::OpenMP_CUDA
       GTest::gmock
       GTest::gtest
   )
@@ -56,6 +58,8 @@ function(ConfigureTest CMAKE_TEST_NAME)
        PRIVATE
         cuopt
         cuopttestutils
+        OpenMP::OpenMP_CXX
+        OpenMP::OpenMP_CUDA
         GTest::gmock
         GTest::gmock_main
         GTest::gtest
diff --git a/cpp/tests/linear_programming/CMakeLists.txt b/cpp/tests/linear_programming/CMakeLists.txt
index bc057db1e2..6db30755c3 100644
--- a/cpp/tests/linear_programming/CMakeLists.txt
+++ b/cpp/tests/linear_programming/CMakeLists.txt
@@ -21,6 +21,16 @@ ConfigureTest(MPS_PARSER_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/parser_test.cpp
     LABELS numopt)
 
+ConfigureTest(MPS_FAST_PARSER_TEST
+    ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_fp64_parser_test.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/experimental_mps_fast/fast_parser_edge_test.cpp
+    LABELS numopt)
+target_include_directories(MPS_FAST_PARSER_TEST
+    PRIVATE
+    "${CUOPT_TEST_DIR}/../src/io/experimental_mps_fast"
+)
+target_link_libraries(MPS_FAST_PARSER_TEST PRIVATE simde::simde)
+
 # ##################################################################################################
 # - C API Tests----------------------------------------------------------------------
 # The C API tests require a separate library to be linked against. So we don't use the ConfigureTest macro.
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
new file mode 100644
index 0000000000..192a707aa0
--- /dev/null
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_fp64_parser_test.cpp
@@ -0,0 +1,188 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "fast_fp64_parser.hpp"
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <bit>
+#include <cerrno>
+#include <clocale>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <random>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+namespace {
+
+uint64_t bits(double value) { return std::bit_cast<uint64_t>(value); }
+
+double reference_strtod(std::string_view token)
+{
+  // Reference value from libc: rewrite 'd'/'D' exponent markers to 'e', then call strtod.
+  std::string text(token);
+  std::replace_if(
+    text.begin(), text.end(), [](char ch) { return ch == 'd' || ch == 'D'; }, 'e');
+  errno          = 0;
+  char* stop_pos = nullptr;
+  return std::strtod(text.c_str(), &stop_pos);
+}
+
+double parse_token(std::string_view token)
+{
+  const char* p = token.data();
+  return fp64::parse_fp64_advance(p, token.data() + token.size());
+}
+
+void check_bitwise_strtod(std::string_view token)
+{
+  std::string normalized(token);
+  for (char& c : normalized) {
+    if (c == 'd' || c == 'D') { c = 'e'; }
+  }
+  char* end        = nullptr;
+  errno            = 0;
+  const double ref = std::strtod(normalized.c_str(), &end);
+  EXPECT_EQ(end, normalized.c_str() + normalized.size());
+
+  std::string padded(token);
+  padded.append(40, ' ');
+  const char* p             = padded.data();
+  const double padded_value = fp64::parse_fp64_advance(p, padded.data() + padded.size());
+  EXPECT_EQ(p, padded.data() + token.size());
+
+  const uint64_t ref_bits = bits(ref);
+  EXPECT_EQ(ref_bits, bits(parse_token(token))) << "token parse mismatch for '" << token << "'";
+  EXPECT_EQ(ref_bits, bits(padded_value)) << "padded parse mismatch for '" << token << "'";
+}
+
+std::string random_token(std::mt19937_64& rng)
+{
+  std::uniform_int_distribution<int> sign_dist(0, 4);
+  std::uniform_int_distribution<int> digit_dist(0, 9);
+  std::uniform_int_distribution<int> shape_dist(0, 5);
+  std::uniform_int_distribution<int> len_dist(1, 19);
+  std::uniform_int_distribution<int> exp_dist(-30, 30);
+
+  std::string token;
+  int sign = sign_dist(rng);
+  if (sign == 0) {
+    token.push_back('-');
+  } else if (sign == 1) {
+    token.push_back('+');
+  }
+
+  int shape = shape_dist(rng);
+  if (shape == 0) {
+    token.append("0.");
+    int frac_len = std::uniform_int_distribution<int>(1, 19)(rng);
+    for (int i = 0; i < frac_len; ++i) {
+      token.push_back(static_cast<char>('0' + digit_dist(rng)));
+    }
+  } else {
+    int int_len = len_dist(rng);
+    token.push_back(static_cast<char>('1' + std::uniform_int_distribution<int>(0, 8)(rng)));
+    for (int i = 1; i < int_len; ++i) {
+      token.push_back(static_cast<char>('0' + digit_dist(rng)));
+    }
+    if (shape >= 2) {
+      token.push_back('.');
+      int remaining = 24 - static_cast<int>(token.size());
+      int max_frac  = std::max(0, std::min(19, remaining));
+      int frac_len  = max_frac == 0 ? 0 : std::uniform_int_distribution<int>(0, max_frac)(rng);
+      for (int i = 0; i < frac_len; ++i) {
+        token.push_back(static_cast<char>('0' + digit_dist(rng)));
+      }
+    }
+  }
+
+  if (shape == 5) {
+    int exp            = exp_dist(rng);
+    std::string suffix = "e" + std::to_string(exp);
+    if (token.size() + suffix.size() <= 25) { token += suffix; }
+  }
+
+  if (token.size() > 25) { token.resize(25); }
+  return token;
+}
+
+}  // namespace
+
+TEST(FastFp64ParserTest, CommonTableMatchesStrtodBitwise)
+{
+  std::setlocale(LC_NUMERIC, "C");
+  const std::vector<std::string_view> cases = {
+    "0",
+    "-0",
+    "1",
+    "-1",
+    "+1",
+    "2",
+    "42",
+    "123456789",
+    "57.",
+    "-57.",
+    "0.1",
+    "0.01",
+    "0.12345678901234",
+    "0.1234567890123456",
+    "0.3333333333333333",
+    "0.6508282938248958",
+    "3.14159",
+    "3130000",
+    "8594600.16",
+    "2344.55",
+    "0.000000000000001",
+    "9999999999999999",
+    "1844674407370955161",
+    "1e0",
+    "1e-9",
+    "1E12",
+    "-2.5e3",
+    "3.125D-2",
+  };
+
+  for (std::string_view token : cases) {
+    check_bitwise_strtod(token);
+  }
+}
+
+TEST(FastFp64ParserTest, CursorAdvancesToTokenEnd)
+{
+  std::setlocale(LC_NUMERIC, "C");
+  std::string text = "123.45  ABC";
+  const char* p    = text.data();
+  double value     = fp64::parse_fp64_advance(p, text.data() + text.size());
+
+  EXPECT_EQ(bits(reference_strtod("123.45")), bits(value));
+  EXPECT_EQ(text.data() + 6, p);
+  EXPECT_EQ(std::string_view("  ABC"), std::string_view(p, 5));
+}
+
+TEST(FastFp64ParserTest, RejectsMalformedNumericSuffix)
+{
+  std::setlocale(LC_NUMERIC, "C");
+  for (const char* token : {"1x", "1e", "1d+", "1e+"}) {
+    SCOPED_TRACE(token);
+    EXPECT_THROW(parse_token(token), std::exception);
+  }
+}
+
+TEST(FastFp64ParserTest, FixedSeedRandomDifferential)
+{
+  std::setlocale(LC_NUMERIC, "C");
+  std::mt19937_64 rng(0x4d50535f46415354ULL);
+  for (int i = 0; i < 100000; ++i) {
+    std::string token = random_token(rng);
+    ASSERT_LE(token.size(), 25U);
+    check_bitwise_strtod(token);
+  }
+}
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
new file mode 100644
index 0000000000..69cc29a0d3
--- /dev/null
+++ b/cpp/tests/linear_programming/experimental_mps_fast/fast_parser_edge_test.cpp
@@ -0,0 +1,936 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "fast_parser.hpp"
+#include "mps_section_scanner.hpp"
+
+#include <cuopt/mathematical_optimization/io/parser.hpp>
+#include <mps_parser_internal.hpp>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <bit>
+#include <cerrno>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <vector>
+
+#include <unistd.h>
+
+namespace cuopt::mathematical_optimization::io::detail {
+
+namespace {
+
+struct TempMpsFile {  // RAII temp file: written with `contents` on construction, removed on destruction.
+  explicit TempMpsFile(std::string contents)  // throws std::runtime_error on any I/O failure
+  {
+    char path_template[128];
+    std::snprintf(path_template,
+                  sizeof(path_template),
+                  "/tmp/mps_fast_parser_edge_%ld_XXXXXX.mps",
+                  static_cast<long>(getpid()));
+    int fd = mkstemps(path_template, 4);  // 4 == strlen(".mps"), the suffix kept after XXXXXX
+    if (fd < 0) {
+      throw std::runtime_error(std::string("mkstemps failed: ") + std::strerror(errno));
+    }
+    path       = path_template;
+    FILE* file = fdopen(fd, "wb");
+    // A throwing constructor never runs ~TempMpsFile(), so every failure path below must
+    // release the descriptor/stream and unlink the file itself before throwing.
+    const auto fail = [&](const char* what, bool stream_open) {
+      const int saved_errno = errno;  // the cleanup calls below may overwrite errno
+      if (file == nullptr) { close(fd); } else if (stream_open) { std::fclose(file); }
+      std::remove(path.c_str());
+      throw std::runtime_error(std::string(what) + std::strerror(saved_errno));
+    };
+    if (file == nullptr) { fail("fdopen failed: ", false); }
+    if (!contents.empty() &&
+        std::fwrite(contents.data(), 1, contents.size(), file) != contents.size()) {
+      fail("failed to write temporary MPS file: ", true);
+    }
+    if (std::fclose(file) != 0) { fail("failed to close temporary MPS file: ", false); }
+  }
+
+  TempMpsFile(const TempMpsFile&)            = delete;
+  TempMpsFile& operator=(const TempMpsFile&) = delete;
+
+  ~TempMpsFile()
+  {
+    if (!path.empty()) { std::remove(path.c_str()); }
+  }
+
+  std::string path;  // path of the created file
+};
+
+struct TempOwnedPath {  // Holds a path and std::remove()s it on destruction; creates nothing.
+  explicit TempOwnedPath(std::string p) : path(std::move(p)) {}
+  TempOwnedPath(const TempOwnedPath&)            = delete;
+  TempOwnedPath& operator=(const TempOwnedPath&) = delete;
+
+  ~TempOwnedPath()
+  {
+    if (!path.empty()) { std::remove(path.c_str()); }  // result ignored: the file may not exist
+  }
+
+  std::string path;  // path removed by the destructor
+};
+
+std::string_view range_text(const mps_phase_range_t& range)
+{  // Views the bytes in [range.begin, range.end); a range that is not present yields an empty view.
+  if (!range.present) { return {}; }
+  return std::string_view(range.begin, static_cast<size_t>(range.end - range.begin));
+}
+
+uint64_t bits(double value) { return std::bit_cast<uint64_t>(value); }  // raw bit pattern of value
+
+template <typename T>
+void expect_vectors_bitwise_equal(const std::vector<T>& reference,
+                                  const std::vector<T>& fast,
+                                  std::string_view field,
+                                  std::string_view context)
+{  // Expects equal sizes and byte-identical storage; context and field label any failure.
+  static_assert(std::is_trivially_copyable_v<T>);  // memcmp on the storage requires this
+  SCOPED_TRACE(std::string(context) + " " + std::string(field));
+  ASSERT_EQ(reference.size(), fast.size()) << "size";  // on mismatch, returns from this helper only
+  if (reference.empty()) { return; }
+  EXPECT_EQ(0, std::memcmp(reference.data(), fast.data(), reference.size() * sizeof(T)));
+}
+
+void check_models_match_reference_bitwise(const parser_model_t<int, double>& fast,
+                                          const mps_data_model_t<int, double>& reference,
+                                          std::string_view context)
+{  // Compares the fast model to the reference field by field; doubles via bits() or memcmp.
+  EXPECT_EQ(reference.n_vars_, fast.n_vars_) << std::string(context) + " n_vars";
+  EXPECT_EQ(reference.n_constraints_, fast.n_constraints_)
+    << std::string(context) + " n_constraints";
+  EXPECT_EQ(reference.get_nnz(), fast.get_nnz()) << std::string(context) + " nnz";
+  EXPECT_EQ(reference.maximize_, fast.maximize_) << std::string(context) + " maximize";
+  EXPECT_EQ(reference.problem_name_, fast.problem_name_) << std::string(context) + " problem_name";
+  EXPECT_EQ(reference.objective_name_, fast.objective_name_)
+    << std::string(context) + " objective_name";
+
+  EXPECT_EQ(bits(reference.objective_scaling_factor_), bits(fast.objective_scaling_factor_))
+    << std::string(context) + " objective_scaling_factor";
+  EXPECT_EQ(bits(reference.objective_offset_), bits(fast.objective_offset_))
+    << std::string(context) + " objective_offset";
+
+  expect_vectors_bitwise_equal(reference.A_, fast.A_, "A", context);
+  EXPECT_EQ(reference.A_indices_, fast.A_indices_) << std::string(context) + " A_indices";
+  EXPECT_EQ(reference.A_offsets_, fast.A_offsets_) << std::string(context) + " A_offsets";
+  expect_vectors_bitwise_equal(reference.b_, fast.b_, "b", context);
+  expect_vectors_bitwise_equal(reference.c_, fast.c_, "c", context);
+  expect_vectors_bitwise_equal(reference.variable_lower_bounds_,
+                               fast.variable_lower_bounds_,
+                               "variable_lower_bounds",
+                               context);
+  expect_vectors_bitwise_equal(reference.variable_upper_bounds_,
+                               fast.variable_upper_bounds_,
+                               "variable_upper_bounds",
+                               context);
+  expect_vectors_bitwise_equal(reference.constraint_lower_bounds_,
+                               fast.constraint_lower_bounds_,
+                               "constraint_lower_bounds",
+                               context);
+  expect_vectors_bitwise_equal(reference.constraint_upper_bounds_,
+                               fast.constraint_upper_bounds_,
+                               "constraint_upper_bounds",
+                               context);
+  EXPECT_EQ(reference.var_types_, fast.var_types_) << std::string(context) + " var_types";
+  EXPECT_EQ(reference.row_types_, fast.row_types_) << std::string(context) + " row_types";
+  EXPECT_EQ(reference.var_names_, fast.var_names_) << std::string(context) + " var_names";
+  EXPECT_EQ(reference.row_names_, fast.row_names_) << std::string(context) + " row_names";
+
+  ASSERT_EQ(reference.quadratic_constraints_.size(), fast.quadratic_constraints_.size())
+    << std::string(context) + " quadratic_constraints size";  // fatal: the loop indexes both
+  for (size_t q = 0; q < reference.quadratic_constraints_.size(); ++q) {
+    const auto& ref_qc  = reference.quadratic_constraints_[q];
+    const auto& fast_qc = fast.quadratic_constraints_[q];
+    SCOPED_TRACE(std::string(context) + " quadratic_constraint " + std::to_string(q));
+    EXPECT_EQ(ref_qc.constraint_row_index, fast_qc.constraint_row_index);
+    EXPECT_EQ(ref_qc.constraint_row_name, fast_qc.constraint_row_name);
+    EXPECT_EQ(ref_qc.constraint_row_type, fast_qc.constraint_row_type);
+    EXPECT_EQ(bits(ref_qc.rhs_value), bits(fast_qc.rhs_value));
+    expect_vectors_bitwise_equal(
+      ref_qc.linear_values, fast_qc.linear_values, "linear_values", context);
+    EXPECT_EQ(ref_qc.linear_indices, fast_qc.linear_indices);
+    expect_vectors_bitwise_equal(ref_qc.vals, fast_qc.vals, "qc_vals", context);
+    EXPECT_EQ(ref_qc.rows, fast_qc.rows);
+    EXPECT_EQ(ref_qc.cols, fast_qc.cols);
+  }
+}
+
+mps_data_model_t<int, double> parse_reference_model(const std::string& path)
+{  // Constructs an mps_parser_t over `path`, targeting a fresh model, and returns that model.
+  mps_data_model_t<int, double> reference;
+  mps_parser_t<int, double> parser(reference, path, false);  // false: presumably fixed_format
+  return reference;
+}
+
+void verify_fixture_bitwise(std::string_view fixture_name, std::string contents)
+{  // Writes `contents` to a temp file, parses it with both parsers, and compares the two models.
+  TempMpsFile file(std::move(contents));
+  auto fast      = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  auto reference = parse_reference_model(file.path);
+  check_models_match_reference_bitwise(fast, reference, fixture_name);
+}
+
+std::string row_name(size_t i)
+{
+  // Builds 'R' followed by the index zero-padded to at least six digits, e.g. R000042.
+  char buffer[32];
+  std::snprintf(buffer, sizeof(buffer), "R%06zu", i);
+  return std::string(buffer);
+}
+
+int find_var_index(const parser_model_t<int, double>& model, std::string_view name)
+{
+  // Linear search of var_names_: position of the first entry equal to `name`, or -1 if none.
+  const auto& names = model.var_names_;
+  const auto hit    = std::find(names.begin(), names.end(), name);
+  return hit == names.end() ? -1 : static_cast<int>(std::distance(names.begin(), hit));
+}
+
+void check_model_shapes(
+  const parser_model_t<int, double>& model, int rows, int vars, int nnz, std::string_view context)
+{  // Expects the given counts plus matching array sizes: rows + 1 offsets, nnz values and indices.
+  EXPECT_EQ(rows, model.n_constraints_) << std::string(context) + " rows";
+  EXPECT_EQ(vars, model.n_vars_) << std::string(context) + " vars";
+  EXPECT_EQ(nnz, model.nnz_) << std::string(context) + " nnz";
+  EXPECT_EQ(static_cast<size_t>(rows + 1), model.A_offsets_.size())
+    << std::string(context) + " offsets";
+  EXPECT_EQ(static_cast<size_t>(nnz), model.A_.size()) << std::string(context) + " values";
+  EXPECT_EQ(static_cast<size_t>(nnz), model.A_indices_.size()) << std::string(context) + " indices";
+}
+
+std::string section_split_fixture()
+{  // Small MPS text with ROWS, COLUMNS, RHS, BOUNDS and ENDATA headers, one data record each.
+  return "NAME SPLITS\n"
+         "ROWS\n"
+         " N OBJ\n"
+         " L R1\n"
+         "COLUMNS\n"
+         " X1 OBJ 1 R1 2\n"
+         "RHS\n"
+         " RHS1 R1 3\n"
+         "BOUNDS\n"
+         " UP BND X1 4\n"
+         "ENDATA\n";
+}
+
+std::string to_crlf(std::string text)
+{
+  // Expands every LF into CRLF; all other bytes are copied through unchanged.
+  std::string out;
+  out.reserve(text.size() + text.size() / 8);  // headroom for the inserted '\r' bytes
+  for (const char ch : text) {
+    if (ch != '\n') {
+      out.push_back(ch);
+      continue;
+    }
+    out.append("\r\n");
+  }
+  return out;
+}
+
+}  // namespace
+
+TEST(FastMpsParserEdgeTest, ScannerFindsSectionSplitAcrossBlocks)
+{  // Splits the buffer inside the COLUMNS keyword and feeds the two blocks in reverse order.
+  const std::string mps =
+    "NAME EDGE\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L rowA\n"
+    "COLUMNS\n"
+    " x1 OBJ 1\n"
+    " x1 rowA 2\n"
+    "RHS\n"
+    " rhs rowA 3\n"
+    "ENDATA\n";
+
+  const size_t columns_pos = mps.find("COLUMNS");
+  ASSERT_TRUE(columns_pos != std::string::npos) << "failed to place COLUMNS split";  // fatal
+  const size_t split = columns_pos + 3;  // block boundary falls between "COL" and "UMNS"
+
+  mps_phase_registry_t registry;
+  mps_section_block_scanner_t scanner(mps.data(), 2, registry);
+
+  scanner.observe_block(1, mps.data() + split, mps.data() + mps.size());  // second block first
+  scanner.publish_ready(0);
+  scanner.observe_block(0, mps.data(), mps.data() + split);
+  scanner.publish_ready(mps.size());
+
+  EXPECT_TRUE(registry.ready(mps_phase_kind::header)) << "header not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::rows)) << "rows not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::columns)) << "columns not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::rhs)) << "rhs not ready";
+  EXPECT_TRUE(registry.ready(mps_phase_kind::quadratic)) << "quadratic sentinel not ready";
+
+  EXPECT_TRUE(range_text(registry.range(mps_phase_kind::columns)).starts_with("COLUMNS"))
+    << "columns range begins at wrong boundary";
+  EXPECT_TRUE(range_text(registry.range(mps_phase_kind::rhs)).starts_with("RHS"))
+    << "rhs range begins at wrong boundary";
+}
+
+TEST(FastMpsParserEdgeTest, ScannerFindsHeadersSplitAtEveryByte)
+{  // For each section header, places the block boundary at every interior byte of the keyword.
+  const std::string mps                       = section_split_fixture();
+  const std::vector<std::string_view> headers = {"ROWS", "COLUMNS", "RHS", "BOUNDS", "ENDATA"};
+
+  for (std::string_view header : headers) {
+    const size_t pos = mps.find(header);
+    ASSERT_TRUE(pos != std::string::npos) << "missing header in split fixture";  // fatal
+    for (size_t offset = 1; offset < header.size(); ++offset) {
+      const size_t split = pos + offset;  // pos was asserted valid, so split stays inside mps
+      mps_phase_registry_t registry;
+      mps_section_block_scanner_t scanner(mps.data(), 2, registry);
+
+      scanner.observe_block(1, mps.data() + split, mps.data() + mps.size());  // second block first
+      scanner.observe_block(0, mps.data(), mps.data() + split);
+      scanner.publish_ready(mps.size());
+
+      EXPECT_TRUE(registry.ready(mps_phase_kind::rows)) << "rows not ready after split";
+      EXPECT_TRUE(registry.ready(mps_phase_kind::columns)) << "columns not ready after split";
+      EXPECT_TRUE(registry.ready(mps_phase_kind::rhs)) << "rhs not ready after split";
+      EXPECT_TRUE(registry.ready(mps_phase_kind::bounds)) << "bounds not ready after split";
+      EXPECT_TRUE(registry.ready(mps_phase_kind::quadratic))
+        << "quadratic sentinel not ready after split";
+    }
+  }
+}
+
+TEST(FastMpsParserEdgeTest, ScannerRejectsUnknownColumnOneRecordsAfterRows)
+{  // An unrecognized word starting in column one (FOO) must make the scanner throw logic_error.
+  const std::string mps =
+    "NAME BAD\n"
+    "ROWS\n"
+    " N OBJ\n"
+    "FOO\n"
+    "COLUMNS\n"
+    " x OBJ 1\n"
+    "ENDATA\n";
+
+  EXPECT_THROW(
+    {
+      mps_phase_registry_t registry;
+      mps_section_block_scanner_t scanner(mps.data(), 1, registry);  // whole buffer as one block
+      scanner.observe_block(0, mps.data(), mps.data() + mps.size());
+      scanner.publish_ready(mps.size());
+    },
+    std::logic_error);
+}
+
+TEST(FastMpsParserEdgeTest, ParserRejectsUnknownSectionRecords)
+{  // A QSECTION record placed after BOUNDS must make the fast parser throw.
+  TempMpsFile file(
+    "NAME BAD_UNKNOWN_SECTION\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L R1\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 R1 2\n"
+    "RHS\n"
+    " RHS1 R1 3\n"
+    "BOUNDS\n"
+    " FR BND1 X1\n"
+    "QSECTION      R1\n"
+    " X1 X1 1\n"
+    "ENDATA\n");
+
+  EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+               std::exception);
+}
+
+TEST(FastMpsParserEdgeTest, BoundsDefaultsAndTypesMatchReference)
+{  // Fixture uses FR, UP (zero and negative), BV, FX, LI and UI bounds; both parsers must agree.
+  verify_fixture_bitwise("bounds_defaults_and_types",
+                         "NAME BOUNDS_EDGE\n"
+                         "ROWS\n"
+                         " N OBJ\n"
+                         " L rowA\n"
+                         "COLUMNS\n"
+                         " XFREE rowA 1\n"
+                         " XUP0 rowA 1\n"
+                         " XNEG rowA 1\n"
+                         " XBV rowA 1\n"
+                         " XFX rowA 1\n"
+                         " XLI rowA 1\n"
+                         "RHS\n"
+                         " RHS1 rowA 10\n"
+                         "BOUNDS\n"
+                         " FR BND XFREE\n"
+                         " UP BND XUP0 0\n"
+                         " UP BND XNEG -1\n"
+                         " BV BND XBV\n"
+                         " FX BND XFX 7\n"
+                         " LI BND XLI 2\n"
+                         " UI BND XLI 9\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, DuplicateBoundsLastStatementWins)
+{  // X1 gets two LO and two UP statements; the later value of each kind is the expected result.
+  const std::string contents =
+    "NAME BOUNDS_DUP\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L rowA\n"
+    "COLUMNS\n"
+    " X1 rowA 1\n"
+    "RHS\n"
+    " RHS1 rowA 10\n"
+    "BOUNDS\n"
+    " LO BND X1 0\n"
+    " UP BND X1 5\n"
+    " UP BND X1 3\n"
+    " LO BND X1 2\n"
+    "ENDATA\n";
+
+  verify_fixture_bitwise("duplicate_bounds_last_statement_wins", contents);
+  TempMpsFile file(contents);  // parsed again so the absolute bound values can be checked
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  EXPECT_EQ(1, model.n_vars_) << "n_vars";
+  EXPECT_EQ(2.0, model.variable_lower_bounds_.at(0)) << "duplicate lower bound";
+  EXPECT_EQ(3.0, model.variable_upper_bounds_.at(0)) << "duplicate upper bound";
+}
+
+TEST(FastMpsParserEdgeTest, NondenseRowAndColumnNamesUseHashPath)
+{  // Names contain '.', '-' and '_'. NOTE(review): the "hash path" is not visible here; confirm.
+  verify_fixture_bitwise("nondense_row_and_column_names",
+                         "NAME HASH_NAMES\n"
+                         "ROWS\n"
+                         " N obj.row\n"
+                         " G demand-east\n"
+                         " L capacity-west\n"
+                         " E balance.17\n"
+                         "COLUMNS\n"
+                         " alpha obj.row 4.5 demand-east 1\n"
+                         " beta_two capacity-west -2 balance.17 3\n"
+                         " z-last demand-east 7 balance.17 -1\n"
+                         "RHS\n"
+                         " rhs demand-east 2 capacity-west 9\n"
+                         " rhs balance.17 0\n"
+                         "BOUNDS\n"
+                         " LO b alpha -5\n"
+                         " UP b beta_two 6\n"
+                         " FR b z-last\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, MissingOptionalBoundsFastPath)
+{  // With no BOUNDS section, the single variable is expected to have bounds [0, +infinity).
+  TempMpsFile file(
+    "NAME OPTIONALS\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L rowA\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 rowA 2\n"
+    "RHS\n"
+    " RHS1 rowA 0\n"
+    "ENDATA\n");
+
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  EXPECT_EQ(1, model.n_vars_) << "missing optional n_vars";
+  EXPECT_EQ(1, model.n_constraints_) << "missing optional n_constraints";
+  EXPECT_EQ(0.0, model.variable_lower_bounds_.at(0)) << "missing BOUNDS lower default";
+  EXPECT_EQ(std::numeric_limits<double>::infinity(), model.variable_upper_bounds_.at(0));
+}
+
+TEST(FastMpsParserEdgeTest, BoundsOnlyVariablesAreAppendedDeterministically)
+{  // Variables named only in BOUNDS must follow XMAIN in name order: AUX_A, AUX_S, AUX_Z.
+  TempMpsFile file(
+    "NAME BOUNDS_ONLY\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L R1\n"
+    "COLUMNS\n"
+    " XMAIN OBJ 1 R1 2\n"
+    "RHS\n"
+    " RHS1 R1 0\n"
+    "BOUNDS\n"
+    " UP B AUX_Z 9\n"
+    " LO B AUX_Z -3\n"
+    " BV B AUX_A\n"
+    " SC B AUX_S 5\n"
+    "ENDATA\n");
+
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  check_model_shapes(model, 1, 4, 1, "bounds-only");  // 1 row, 4 variables, 1 nonzero
+  EXPECT_EQ(std::string("XMAIN"), model.var_names_.at(0)) << "main var name";
+  EXPECT_EQ(std::string("AUX_A"), model.var_names_.at(1)) << "bounds-only sorted name 1";
+  EXPECT_EQ(std::string("AUX_S"), model.var_names_.at(2)) << "bounds-only sorted name 2";
+  EXPECT_EQ(std::string("AUX_Z"), model.var_names_.at(3)) << "bounds-only sorted name 3";
+
+  const int aux_a = find_var_index(model, "AUX_A");
+  const int aux_s = find_var_index(model, "AUX_S");
+  const int aux_z = find_var_index(model, "AUX_Z");
+  ASSERT_GE(aux_a, 0);  // fatal: the indices are passed to .at() below
+  ASSERT_GE(aux_s, 0);
+  ASSERT_GE(aux_z, 0);
+  EXPECT_EQ('I', model.var_types_.at(aux_a)) << "bounds-only BV type";
+  EXPECT_EQ(0.0, model.variable_lower_bounds_.at(aux_a)) << "bounds-only BV lb";
+  EXPECT_EQ(1.0, model.variable_upper_bounds_.at(aux_a)) << "bounds-only BV ub";
+  EXPECT_EQ('S', model.var_types_.at(aux_s)) << "bounds-only SC type";
+  EXPECT_EQ(5.0, model.variable_upper_bounds_.at(aux_s)) << "bounds-only SC ub";
+  EXPECT_EQ(-3.0, model.variable_lower_bounds_.at(aux_z)) << "bounds-only duplicate lb";
+  EXPECT_EQ(9.0, model.variable_upper_bounds_.at(aux_z)) << "bounds-only duplicate ub";
+}
+
+TEST(FastMpsParserEdgeTest, IntegerMarkersAssignTypesAndDefaultBounds)
+{  // Columns between INTORG and INTEND markers are expected to be type 'I' with bounds [0, 1].
+  TempMpsFile file(
+    "NAME MARKERS\n"
+    "ROWS\n"
+    " N OBJ\n"
+    " L R1\n"
+    "COLUMNS\n"
+    " MARK000 'MARKER' 'INTORG'\n"
+    " XINT OBJ 1 R1 1\n"
+    " MARK001 'MARKER' 'INTEND'\n"
+    " XCONT OBJ 2 R1 2\n"
+    " MARK002 'MARKER' 'INTORG'\n"
+    " XBIN OBJ 3 R1 3\n"
+    " MARK003 'MARKER' 'INTEND'\n"
+    "RHS\n"
+    " RHS1 R1 10\n"
+    "ENDATA\n");
+
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  check_model_shapes(model, 1, 3, 3, "integer markers");  // marker records add no variables
+  const int xint  = find_var_index(model, "XINT");
+  const int xcont = find_var_index(model, "XCONT");
+  const int xbin  = find_var_index(model, "XBIN");
+  ASSERT_GE(xint, 0);  // fatal: the indices are passed to .at() below
+  ASSERT_GE(xcont, 0);
+  ASSERT_GE(xbin, 0);
+  EXPECT_EQ('I', model.var_types_.at(xint)) << "XINT type";
+  EXPECT_EQ('C', model.var_types_.at(xcont)) << "XCONT type";
+  EXPECT_EQ('I', model.var_types_.at(xbin)) << "XBIN type";
+  EXPECT_EQ(0.0, model.variable_lower_bounds_.at(xint)) << "XINT default lb";
+  EXPECT_EQ(1.0, model.variable_upper_bounds_.at(xint)) << "XINT default ub";
+  EXPECT_EQ(0.0, model.variable_lower_bounds_.at(xbin)) << "XBIN default lb";
+  EXPECT_EQ(1.0, model.variable_upper_bounds_.at(xbin)) << "XBIN default ub";
+}
+
+TEST(FastMpsParserEdgeTest, NumericParsingIntegrationMatchesReferenceBitwise)
+{  // Varied numeric spellings across COLUMNS, RHS, RANGES and BOUNDS; both parsers must agree.
+  verify_fixture_bitwise("numeric_parsing_integration",
+                         "NAME NUMBERS\n"
+                         "ROWS\n"
+                         " N OBJ\n"
+                         " L R1\n"
+                         " G R2\n"
+                         " E R3\n"
+                         "COLUMNS\n"
+                         " X0 OBJ 0.12345678901234 R1 1e-9\n"
+                         " X1 OBJ -2.5E3 R2 0.12345678901234567890123\n"
+                         " X2 R3 9999999999999999\n"
+                         "RHS\n"
+                         " RHS1 R1 3.14159 R2 -0.000000000000001\n"
+                         " RHS1 R3 42\n"
+                         "RANGES\n"
+                         " RNG R1 0.25 R2 1E2\n"
+                         "BOUNDS\n"
+                         " LO B X0 -123456789\n"
+                         " UP B X0 123456789\n"
+                         " FX B X1 0.3333333333333333\n"
+                         " FR B X2\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, CrlfLineEndingsMatchReferenceBitwise)
+{  // The fixture is converted to CRLF line endings with to_crlf before both parsers read it.
+  verify_fixture_bitwise("crlf_line_endings",
+                         to_crlf("NAME CRLF_EDGE\n"
+                                 "OBJSENSE\n"
+                                 " MAX\n"
+                                 "ROWS\n"
+                                 " N OBJ\n"
+                                 " L R1\n"
+                                 "COLUMNS\n"
+                                 " X1 OBJ 1 R1 2\n"
+                                 "RHS\n"
+                                 " RHS1 R1 3\n"
+                                 "BOUNDS\n"
+                                 " UP B X1 4\n"
+                                 "ENDATA\n"));
+}
+
+TEST(FastMpsParserEdgeTest, CommentPlacementSupportedCasesMatchReferenceBitwise)
+{  // '*' and '$' comment lines plus inline '$' comments after records; both parsers must agree.
+  verify_fixture_bitwise("comment_placement_supported_cases",
+                         "* leading star comment\n"
+                         "$ leading dollar comment\n"
+                         "NAME COMMENTS\n"
+                         "$ comment between NAME and ROWS\n"
+                         "ROWS\n"
+                         "* comment after ROWS header\n"
+                         " N OBJ $ row objective comment\n"
+                         "$ comment between ROW records\n"
+                         " L R1 $ row constraint comment\n"
+                         "COLUMNS\n"
+                         "* comment after COLUMNS header\n"
+                         " X1 OBJ 1 R1 2 $ inline column comment\n"
+                         "$ comment before next column\n"
+                         " X2 OBJ -1 R1 3\n"
+                         "RHS\n"
+                         "$ comment after RHS header\n"
+                         " RHS1 R1 5 $ inline rhs comment\n"
+                         "BOUNDS\n"
+                         "* comment after BOUNDS header\n"
+                         " LO B X1 0 $ inline bound comment\n"
+                         "$ comment before ENDATA\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, ObjectiveMetadataSelectsNamedObjective)
+{  // OBJSENSE MAX with OBJNAME COST, where COST is the second of two N rows; parsers must agree.
+  verify_fixture_bitwise("objective_metadata",
+                         "NAME OBJMETA\n"
+                         "OBJSENSE\n"
+                         " MAX\n"
+                         "OBJNAME\n"
+                         " COST\n"
+                         "ROWS\n"
+                         " N ALT\n"
+                         " N COST\n"
+                         " L R1\n"
+                         "COLUMNS\n"
+                         " X1 ALT 100 COST 5\n"
+                         " X1 R1 1\n"
+                         " X2 COST -2 R1 3\n"
+                         "RHS\n"
+                         " RHS1 COST 7 R1 11\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, MalformedInputsReportErrors)
+{  // Five independent malformed files; each must make the fast parser throw std::logic_error.
+  {  // unrecognized OBJSENSE value (SIDEWAYS)
+    TempMpsFile file(
+      "NAME BADOBJ\n"
+      "OBJSENSE\n"
+      " SIDEWAYS\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 R1 2\n"
+      "RHS\n"
+      " RHS1 R1 0\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
+  }
+
+  {  // COLUMNS entry names a row (MISSING) that ROWS never declared
+    TempMpsFile file(
+      "NAME BADCOLROW\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 MISSING 1\n"
+      "RHS\n"
+      " RHS1 R1 0\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
+  }
+
+  {  // RHS entry names a row (MISSING) that ROWS never declared
+    TempMpsFile file(
+      "NAME BADRHSROW\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 R1 2\n"
+      "RHS\n"
+      " RHS1 MISSING 1\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
+  }
+
+  {  // unknown bound type code (XX)
+    TempMpsFile file(
+      "NAME BADBOUND\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 R1 2\n"
+      "RHS\n"
+      " RHS1 R1 0\n"
+      "BOUNDS\n"
+      " XX B X1 1\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
+  }
+
+  {  // SC bound record with no value field
+    TempMpsFile file(
+      "NAME BADSC\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L R1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 R1 2\n"
+      "RHS\n"
+      " RHS1 R1 0\n"
+      "BOUNDS\n"
+      " SC B X1\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::logic_error);
+  }
+}
+
+TEST(FastMpsParserEdgeTest, LargeColumnsRepeatedColumnChunkBoundary)
+{  // 180000 rows with one XBIG entry per row, then one XTAIL entry; checks shapes and name order.
+  constexpr size_t row_count = 180000;
+  std::string mps;
+  mps.reserve(8 * 1024 * 1024);
+  mps += "NAME BIGCOLS\nROWS\n N OBJ\n";
+  for (size_t i = 1; i <= row_count; ++i) {
+    mps += " L ";
+    mps += row_name(i);
+    mps += '\n';
+  }
+  mps += "COLUMNS\n";
+  for (size_t i = 1; i <= row_count; ++i) {
+    mps += " XBIG ";  // the same column name repeats on every record
+    mps += row_name(i);
+    mps += " 1\n";
+  }
+  mps += " XTAIL ";
+  mps += row_name(1);
+  mps += " 2\nRHS\n RHS1 ";
+  mps += row_name(1);
+  mps += " 0\nENDATA\n";
+
+  TempMpsFile file(std::move(mps));
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  check_model_shapes(
+    model, static_cast<int>(row_count), 2, static_cast<int>(row_count + 1), "large columns");
+  EXPECT_EQ(std::string("XBIG"), model.var_names_.at(0)) << "large repeated column name";
+  EXPECT_EQ(std::string("XTAIL"), model.var_names_.at(1)) << "large tail column name";
+}
+
+TEST(FastMpsParserEdgeTest, LargeBoundsRepeatedVarStaysOrdered)
+{  // 700000 UP statements on one variable; the value from the final statement must be kept.
+  constexpr size_t repeat_count = 700000;
+  std::string mps;
+  mps.reserve(12 * 1024 * 1024);
+  mps +=
+    "NAME BIGBOUNDS\nROWS\n N OBJ\n L R1\nCOLUMNS\n alpha OBJ 1 R1 1\nRHS\n RHS1 R1 0\nBOUNDS\n";
+  for (size_t i = 0; i < repeat_count; ++i) {
+    mps += " UP B alpha ";
+    mps += std::to_string(i % 1000);  // bound values cycle through 0..999
+    mps += '\n';
+  }
+  mps += "ENDATA\n";
+
+  TempMpsFile file(std::move(mps));
+  auto model = parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read);
+  check_model_shapes(model, 1, 1, 1, "large bounds");
+  EXPECT_EQ(static_cast<double>((repeat_count - 1) % 1000), model.variable_upper_bounds_.at(0))
+    << "large repeated bounds last value";
+}
+
+TEST(FastMpsParserEdgeTest, Lz4AndRawPathsMatchOnMultiblockInput)
+{  // Parses one 70000-row model both raw and lz4-compressed and expects matching results.
+  constexpr size_t row_count = 70000;
+  std::string mps;
+  mps.reserve(4 * 1024 * 1024);
+  mps += "NAME LZ4PARITY\nROWS\n N OBJ\n";
+  for (size_t i = 1; i <= row_count; ++i) {
+    mps += " L ";
+    mps += row_name(i);
+    mps += '\n';
+  }
+  mps += "COLUMNS\n";
+  for (size_t i = 1; i <= row_count; ++i) {
+    mps += " X";
+    mps += std::to_string(i);
+    mps += ' ';
+    mps += row_name(i);
+    mps += " 0.125\n";
+  }
+  mps += "RHS\n RHS1 ";
+  mps += row_name(1);
+  mps += " 1\nENDATA\n";
+
+  TempMpsFile raw_file(std::move(mps));
+  TempOwnedPath lz4_file(raw_file.path + ".lz4");  // removes the compressed file on scope exit
+  const std::string cmd = "lz4 -f -q " + raw_file.path + " " + lz4_file.path;
+  if (std::system(cmd.c_str()) != 0) { GTEST_SKIP() << "lz4 CLI unavailable"; }  // any nonzero
+
+  auto raw = parse_mps_fast_file<int, double>(raw_file.path, FileReadMethod::Read);
+  auto lz4 = parse_mps_fast_file<int, double>(lz4_file.path, FileReadMethod::Read);
+
+  check_model_shapes(lz4, raw.n_constraints_, raw.n_vars_, raw.nnz_, "lz4 parity");
+  EXPECT_EQ(raw.var_names_.size(), lz4.var_names_.size()) << "lz4 var name count";
+  EXPECT_EQ(raw.row_names_.size(), lz4.row_names_.size()) << "lz4 row name count";
+  EXPECT_EQ(raw.A_, lz4.A_) << "lz4 A values";
+  EXPECT_EQ(raw.A_indices_, lz4.A_indices_) << "lz4 A indices";
+  EXPECT_EQ(raw.A_offsets_, lz4.A_offsets_) << "lz4 A offsets";
+  EXPECT_EQ(raw.c_, lz4.c_) << "lz4 objective";
+  EXPECT_EQ(raw.b_, lz4.b_) << "lz4 rhs";
+  EXPECT_EQ(raw.var_types_, lz4.var_types_) << "lz4 var types";
+  EXPECT_EQ(raw.variable_lower_bounds_, lz4.variable_lower_bounds_) << "lz4 lower bounds";
+  EXPECT_EQ(raw.variable_upper_bounds_, lz4.variable_upper_bounds_) << "lz4 upper bounds";
+}
+
+TEST(FastMpsParserEdgeTest, GzipBzip2AndRawPathsMatch)
+{  // Parses one small model raw, gzip-compressed and bzip2-compressed; results must match raw.
+  std::string mps;
+  mps += "NAME COMPRESSED\nROWS\n N OBJ\n L R1\n G R2\nCOLUMNS\n";
+  mps += " X1 OBJ 1 R1 2.5\n X2 R1 -3.25 R2 4\n";
+  mps += "RHS\n RHS1 R1 7 R2 8\nBOUNDS\n BV BND X1\n UP BND X2 10\nENDATA\n";
+
+  TempMpsFile raw_file(std::move(mps));
+  TempOwnedPath gzip_file(raw_file.path + ".gz");  // both compressed files are removed on exit
+  TempOwnedPath bzip2_file(raw_file.path + ".bz2");
+
+  const std::string gzip_cmd  = "gzip -c " + raw_file.path + " > " + gzip_file.path;
+  const std::string bzip2_cmd = "bzip2 -c " + raw_file.path + " > " + bzip2_file.path;
+  if (std::system(gzip_cmd.c_str()) != 0) { GTEST_SKIP() << "gzip CLI unavailable"; }
+  if (std::system(bzip2_cmd.c_str()) != 0) { GTEST_SKIP() << "bzip2 CLI unavailable"; }
+
+  auto raw   = parse_mps_fast_file<int, double>(raw_file.path, FileReadMethod::Read);
+  auto gzip  = parse_mps_fast_file<int, double>(gzip_file.path, FileReadMethod::Read);
+  auto bzip2 = parse_mps_fast_file<int, double>(bzip2_file.path, FileReadMethod::Read);
+
+  check_model_shapes(gzip, raw.n_constraints_, raw.n_vars_, raw.nnz_, "gzip parity");
+  check_model_shapes(bzip2, raw.n_constraints_, raw.n_vars_, raw.nnz_, "bzip2 parity");
+  EXPECT_EQ(raw.A_, gzip.A_) << "gzip A values";
+  EXPECT_EQ(raw.A_, bzip2.A_) << "bzip2 A values";
+  EXPECT_EQ(raw.A_indices_, gzip.A_indices_) << "gzip A indices";
+  EXPECT_EQ(raw.A_indices_, bzip2.A_indices_) << "bzip2 A indices";
+  EXPECT_EQ(raw.A_offsets_, gzip.A_offsets_) << "gzip A offsets";
+  EXPECT_EQ(raw.A_offsets_, bzip2.A_offsets_) << "bzip2 A offsets";
+  EXPECT_EQ(raw.c_, gzip.c_) << "gzip objective";
+  EXPECT_EQ(raw.c_, bzip2.c_) << "bzip2 objective";
+  EXPECT_EQ(raw.b_, gzip.b_) << "gzip rhs";
+  EXPECT_EQ(raw.b_, bzip2.b_) << "bzip2 rhs";
+  EXPECT_EQ(raw.variable_lower_bounds_, gzip.variable_lower_bounds_) << "gzip lower bounds";
+  EXPECT_EQ(raw.variable_lower_bounds_, bzip2.variable_lower_bounds_) << "bzip2 lower bounds";
+  EXPECT_EQ(raw.variable_upper_bounds_, gzip.variable_upper_bounds_) << "gzip upper bounds";
+  EXPECT_EQ(raw.variable_upper_bounds_, bzip2.variable_upper_bounds_) << "bzip2 upper bounds";
+  EXPECT_EQ(raw.var_types_, gzip.var_types_) << "gzip var types";
+  EXPECT_EQ(raw.var_types_, bzip2.var_types_) << "bzip2 var types";
+}
+
+TEST(FastMpsParserEdgeTest, QcMatrixRowsMatchReferenceBitwise)
+{  // Two QCMATRIX sections (rows QC1 and QC2) alongside a linear row; both parsers must agree.
+  verify_fixture_bitwise("qcmatrix rows",
+                         "NAME QCMATRIX_TEST\n"
+                         "ROWS\n"
+                         " N OBJ\n"
+                         " L LIN\n"
+                         " L QC1\n"
+                         " G QC2\n"
+                         "COLUMNS\n"
+                         " X1 OBJ 1 LIN 2\n"
+                         " X1 QC1 3 QC2 4\n"
+                         " X2 OBJ 2 LIN 5\n"
+                         " X2 QC1 6 QC2 7\n"
+                         "RHS\n"
+                         " RHS1 LIN 10 QC1 11\n"
+                         " RHS1 QC2 12\n"
+                         "QCMATRIX   QC1\n"
+                         " X1 X1 1.25\n"
+                         " X1 X2 -2.5\n"
+                         "QCMATRIX   QC2\n"
+                         " X2 X2 3.75\n"
+                         "ENDATA\n");
+}
+
+TEST(FastMpsParserEdgeTest, QcMatrixMalformedCasesMatchReference)
+{  // Each malformed QCMATRIX input must make both the reference and the fast parser throw.
+  const std::vector<std::string> cases = {
+    "NAME DUP_QC\n"  // case 1: QCMATRIX QC1 appears twice
+    "ROWS\n"
+    " N OBJ\n"
+    " L QC1\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 QC1 2\n"
+    "RHS\n"
+    " RHS1 QC1 3\n"
+    "QCMATRIX QC1\n"
+    " X1 X1 1\n"
+    "QCMATRIX QC1\n"
+    " X1 X1 2\n"
+    "ENDATA\n",
+    "NAME BAD_QC_ROW\n"  // case 2: QCMATRIX names a row (UNKNOWN) that ROWS never declared
+    "ROWS\n"
+    " N OBJ\n"
+    " L QC1\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 QC1 2\n"
+    "RHS\n"
+    " RHS1 QC1 3\n"
+    "QCMATRIX UNKNOWN\n"
+    " X1 X1 1\n"
+    "ENDATA\n",
+    "NAME BAD_QC_VAR\n"  // case 3: QCMATRIX entry names a variable (XBAD) absent from COLUMNS
+    "ROWS\n"
+    " N OBJ\n"
+    " L QC1\n"
+    "COLUMNS\n"
+    " X1 OBJ 1 QC1 2\n"
+    "RHS\n"
+    " RHS1 QC1 3\n"
+    "QCMATRIX QC1\n"
+    " X1 XBAD 1\n"
+    "ENDATA\n"};
+
+  for (const auto& mps : cases) {
+    TempMpsFile file(mps);
+    EXPECT_THROW(((void)parse_reference_model(file.path)), std::exception);
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::exception);
+  }
+}
+
+TEST(FastMpsParserEdgeTest, QuadraticParserRejectsUnknownColumnOneRecords)
+{  // A QSECTION or CSECTION record inserted inside a QMATRIX section must make the parser throw.
+  const std::vector<std::string> records = {"QSECTION      QC1",
+                                            "CSECTION      QC1        0              QUAD"};
+
+  for (const auto& record : records) {
+    TempMpsFile file(
+      "NAME BAD_QUAD_RECORD\n"
+      "ROWS\n"
+      " N OBJ\n"
+      " L QC1\n"
+      "COLUMNS\n"
+      " X1 OBJ 1 QC1 2\n"
+      " X2 OBJ 3 QC1 4\n"
+      "RHS\n"
+      " RHS1 QC1 5\n"
+      "QMATRIX\n"
+      " X1 X1 1\n" +
+      record +  // spliced between two valid QMATRIX entries
+      "\n"
+      " X2 X2 2\n"
+      "ENDATA\n");
+    EXPECT_THROW(((void)parse_mps_fast_file<int, double>(file.path, FileReadMethod::Read)),
+                 std::exception)
+      << record;
+  }
+}
+
+}  // namespace cuopt::mathematical_optimization::io::detail
diff --git a/cpp/tests/linear_programming/parser_test.cpp b/cpp/tests/linear_programming/parser_test.cpp
index 9260c4d4fe..7fe70f136a 100644
--- a/cpp/tests/linear_programming/parser_test.cpp
+++ b/cpp/tests/linear_programming/parser_test.cpp
@@ -56,6 +56,21 @@ bool file_exists(const std::string& file)
 
 namespace {
 
+struct mps_reader_param_t {
+  const char* name;          // label returned by mps_reader_param_name for this parameter
+  mps_reader_type_t reader;  // reader passed to read<int, double>() by read_mps_file
+};
+
+constexpr mps_reader_param_t default_mps_reader_param{"default_reader",  // gtest name label
+                                                      mps_reader_type_t::default_reader};
+constexpr mps_reader_param_t fast_mps_reader_param{"fast_experimental",  // gtest name label
+                                                   mps_reader_type_t::fast_experimental};
+
+std::string mps_reader_param_name(const ::testing::TestParamInfo<mps_reader_param_t>& info)
+{
+  return info.param.name;  // const char* label, converted to the std::string return type
+}
+
 // Non-template forwarding wrapper around read_lp_from_string<int, double>.
 // Exists only so EXPECT_THROW(read_lp_string(R"LP(...)LP"), exc) is parsed
 // correctly — gtest's macro splits its args on top-level commas, and the
@@ -115,20 +130,21 @@ double q_entry(const mps_data_model_t<int, double>& m, int row, int col)
 // ===========================================================================
 // Per-fixture test classes. Each class describes one named problem fixture
 // and owns the checker for that problem's expected parsed data model. The
-// MPS and LP TEST_F cases within a fixture share the same `check_model`
+// MPS TEST_P and LP TEST_F cases within a fixture share the same `check_model`
 // method, so the expected values live in exactly one place per fixture.
 //
 // All fixtures inherit a common base that supplies read_mps_file and
 // read_lp_file helpers.
 // ===========================================================================
 
-class parser_fixture_base : public ::testing::Test {
+class parser_fixture_base : public ::testing::TestWithParam<mps_reader_param_t> {
  protected:
-  static mps_data_model_t<int, double> read_mps_file(const std::string& file,
-                                                     bool fixed_format = true)
+  mps_data_model_t<int, double> read_mps_file(const std::string& file,
+                                              bool fixed_format = true) const
   {
     const std::string& root = cuopt::test::get_rapids_dataset_root_dir();
-    return read_mps<int, double>(root + "/" + file, fixed_format);
+    const auto reader       = GetParam().reader;
+    return read<int, double>(root + "/" + file, reader, fixed_format);
   }
 
   static mps_data_model_t<int, double> read_lp_file(const std::string& file)
@@ -357,9 +373,13 @@ TEST(mps_parser, bad_mps_files)
   }
 }
 
-TEST_F(good_mps_1_test, mps)
+TEST_P(good_mps_1_test, mps)
+{
+  check_model(read_mps_file("linear_programming/good-mps-1.mps", false));  // fixed_format=false
+}
+
+TEST_F(good_mps_1_test, mps_parser_internals)
 {
-  check_model(read_mps_file("linear_programming/good-mps-1.mps"));
   // Parser-struct fields that are MPS-only (not exposed via the data model).
   auto mps = read_from_mps("linear_programming/good-mps-1.mps");
   EXPECT_EQ("good-1", mps.problem_name);
@@ -592,9 +612,13 @@ TEST(mps_parser_free_format, bad_mps_files_free_format)
   }
 }
 
-TEST_F(up_low_bounds_test, mps)
+TEST_P(up_low_bounds_test, mps)
 {
   check_model(read_mps_file("linear_programming/lp_model_with_var_bounds.mps", false));
+}
+
+TEST_F(up_low_bounds_test, mps_parser_internals)
+{
   auto mps = read_from_mps("linear_programming/lp_model_with_var_bounds.mps", false);
   EXPECT_EQ("lp_model_with_var_bounds", mps.problem_name);
   EXPECT_EQ("OBJ", mps.objective_name);
@@ -607,14 +631,14 @@ TEST_F(up_low_bounds_test, lp)
   check_model(read_lp_file("linear_programming/lp_model_with_var_bounds.lp"));
 }
 
-TEST_F(good_mps_1_test, mps_free_format)
+TEST_P(good_mps_1_test, mps_free_format)
 {
   // free-format-mps-1.mps encodes the same problem as good-mps-1 with default
   // [0, +inf) bounds (no BOUNDS section), so it satisfies the same checker.
   check_model(read_mps_file("linear_programming/free-format-mps-1.mps", false));
 }
 
-TEST_F(some_var_bounds_test, mps)
+TEST_P(some_var_bounds_test, mps)
 {
   check_model(read_mps_file("linear_programming/good-mps-some-var-bounds.mps"));
 }
@@ -624,7 +648,7 @@ TEST_F(some_var_bounds_test, lp)
   check_model(read_lp_file("linear_programming/good-mps-some-var-bounds.lp"));
 }
 
-TEST_F(fixed_var_bound_test, mps)
+TEST_P(fixed_var_bound_test, mps)
 {
   check_model(read_mps_file("linear_programming/good-mps-fixed-var.mps"));
 }
@@ -634,7 +658,7 @@ TEST_F(fixed_var_bound_test, lp)
   check_model(read_lp_file("linear_programming/good-mps-fixed-var.lp"));
 }
 
-TEST_F(free_var_bound_test, mps)
+TEST_P(free_var_bound_test, mps)
 {
   check_model(read_mps_file("linear_programming/good-mps-free-var.mps"));
 }
@@ -644,7 +668,7 @@ TEST_F(free_var_bound_test, lp)
   check_model(read_lp_file("linear_programming/good-mps-free-var.lp"));
 }
 
-TEST_F(lower_inf_var_bound_test, mps)
+TEST_P(lower_inf_var_bound_test, mps)
 {
   check_model(read_mps_file("linear_programming/good-mps-lower-bound-inf-var.mps"));
 }
@@ -662,7 +686,7 @@ TEST(mps_bounds, rhs_cost)
   EXPECT_EQ(int(-5), mps.objective_offset_value);
 }
 
-TEST_F(upper_inf_var_bound_test, mps)
+TEST_P(upper_inf_var_bound_test, mps)
 {
   check_model(read_mps_file("linear_programming/good-mps-upper-bound-inf-var.mps"));
 }
@@ -817,9 +841,13 @@ TEST(mps_bounds, unsupported_or_invalid_mps_types)
   };
 }
 
-TEST_F(mip_with_bounds_test, mps)
+TEST_P(mip_with_bounds_test, mps)
 {
   check_model(read_mps_file("mixed_integer_programming/good-mip-mps-1.mps", false));
+}
+
+TEST_F(mip_with_bounds_test, mps_parser_internals)
+{
   auto mps = read_from_mps("mixed_integer_programming/good-mip-mps-1.mps", false);
   EXPECT_EQ("COST", mps.objective_name);
   ASSERT_EQ(int(2), mps.row_types.size());
@@ -877,7 +905,7 @@ TEST(mps_parser, good_mps_file_mip_no_marker)
   EXPECT_EQ(10., mps.variable_upper_bounds[1]);
 }
 
-TEST_F(mip_no_bounds_test, mps)
+TEST_P(mip_no_bounds_test, mps)
 {
   check_model(read_mps_file("mixed_integer_programming/good-mip-mps-no-bounds.mps", false));
 }
@@ -887,7 +915,7 @@ TEST_F(mip_no_bounds_test, lp)
   check_model(read_lp_file("mixed_integer_programming/good-mip-mps-no-bounds.lp"));
 }
 
-TEST_F(mip_partial_bounds_test, mps)
+TEST_P(mip_partial_bounds_test, mps)
 {
   check_model(read_mps_file("mixed_integer_programming/good-mip-mps-partial-bounds.mps", false));
 }
@@ -897,6 +925,32 @@ TEST_F(mip_partial_bounds_test, lp)
   check_model(read_lp_file("mixed_integer_programming/good-mip-mps-partial-bounds.lp"));
 }
 
+#define INSTANTIATE_MPS_READER_TEST(Fixture)                                                   \
+  INSTANTIATE_TEST_SUITE_P(mps_readers,                                                        \
+                           Fixture,                                                            \
+                           ::testing::Values(default_mps_reader_param, fast_mps_reader_param), \
+                           mps_reader_param_name)
+// The macro above runs Fixture with both readers; the one below uses the default reader only.
+#define INSTANTIATE_DEFAULT_MPS_READER_TEST(Fixture) \
+  INSTANTIATE_TEST_SUITE_P(                          \
+    mps_readers, Fixture, ::testing::Values(default_mps_reader_param), mps_reader_param_name)
+
+INSTANTIATE_MPS_READER_TEST(good_mps_1_test);
+INSTANTIATE_MPS_READER_TEST(up_low_bounds_test);
+INSTANTIATE_MPS_READER_TEST(mip_with_bounds_test);
+INSTANTIATE_MPS_READER_TEST(mip_no_bounds_test);
+INSTANTIATE_MPS_READER_TEST(mip_partial_bounds_test);
+// The fast MPS parser does not support fixed format; these fixtures use the default reader only.
+INSTANTIATE_DEFAULT_MPS_READER_TEST(some_var_bounds_test);
+INSTANTIATE_DEFAULT_MPS_READER_TEST(fixed_var_bound_test);
+INSTANTIATE_DEFAULT_MPS_READER_TEST(free_var_bound_test);
+INSTANTIATE_DEFAULT_MPS_READER_TEST(lower_inf_var_bound_test);
+INSTANTIATE_DEFAULT_MPS_READER_TEST(upper_inf_var_bound_test);
+
+// NOTE: INSTANTIATE_MPS_READER_TEST / INSTANTIATE_DEFAULT_MPS_READER_TEST are intentionally
+// left defined here; the QP/QCQP file fixtures below reuse them. They are #undef-ed after the
+// last instantiation.
+
 #ifdef MPS_PARSER_WITH_BZIP2
 TEST(mps_parser, good_mps_file_bzip2_compressed)
 {
@@ -998,13 +1052,14 @@ TEST(qps_parser, quadratic_objective_basic)
   EXPECT_EQ(1.0, model.get_quadratic_objective_values()[1]);
 }
 
+class qps_file_reader_test : public parser_fixture_base {};  // QP/QCQP file tests, per reader
+
 // Test actual QPS files from the dataset
-TEST(qps_parser, test_qps_files)
+TEST_P(qps_file_reader_test, test_qps_files)
 {
   // Test QP_Test_1.qps if it exists
   if (file_exists("quadratic_programming/QP_Test_1.qps")) {
-    auto parsed_data = read_mps<int, double>(
-      cuopt::test::get_rapids_dataset_root_dir() + "/quadratic_programming/QP_Test_1.qps", false);
+    auto parsed_data = read_mps_file("quadratic_programming/QP_Test_1.qps", false);
 
     EXPECT_EQ("QP_Test_1", parsed_data.get_problem_name());
     EXPECT_EQ(2, parsed_data.get_n_variables());    // C------1 and C------2
@@ -1023,8 +1078,7 @@ TEST(qps_parser, test_qps_files)
 
   // Test QP_Test_2.qps if it exists
   if (file_exists("quadratic_programming/QP_Test_2.qps")) {
-    auto parsed_data = read_mps<int, double>(
-      cuopt::test::get_rapids_dataset_root_dir() + "/quadratic_programming/QP_Test_2.qps", false);
+    auto parsed_data = read_mps_file("quadratic_programming/QP_Test_2.qps", false);
 
     EXPECT_EQ("QP_Test_2", parsed_data.get_problem_name());
     EXPECT_EQ(3, parsed_data.get_n_variables());    // C------1, C------2, C------3
@@ -2582,6 +2636,19 @@ TEST(read, qps_extension_dispatches_to_mps_parser)
   EXPECT_EQ(m.get_variable_names()[0], "x");
 }
 
+TEST(read, qps_extension_dispatches_to_fast_experimental_reader)
+{
+  temp_file_t tmp(".qps");
+  {
+    std::ofstream out(tmp.string());
+    out << kTrivialMps;
+  }  // `out` is destroyed here, closing the file before it is read back
+  auto m = read<int, double>(tmp.string(), mps_reader_type_t::fast_experimental);
+  ASSERT_EQ(m.get_variable_names().size(), 1u);  // stop here: the checks below index [0]
+  EXPECT_EQ(m.get_variable_names()[0], "x");
+  EXPECT_NEAR(m.get_variable_upper_bounds()[0], 10.0, tolerance);
+}
+
 TEST(read, mps_gz_extension_dispatches_to_mps_parser)
 {
   auto m = read<int, double>(cuopt::test::get_rapids_dataset_root_dir() +
@@ -2796,13 +2863,12 @@ TEST(qps_parser, qcmatrix_append_api)
 }
 
 // QCQP MPS: each quadratic constraint bundles row + linear + rhs + quadratic.
-TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds)
+TEST_P(qps_file_reader_test, qcmatrix_mps_linear_rhs_and_bounds)
 {
   if (!file_exists("qcqp/QC_Test_1.mps")) {
     GTEST_SKIP() << "qcqp/QC_Test_1.mps not in dataset root";
   }
-  const auto model = read_mps<int, double>(
-    cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/QC_Test_1.mps", false);
+  const auto model = read_mps_file("qcqp/QC_Test_1.mps", false);
 
   ASSERT_TRUE(model.has_quadratic_constraints());
   const auto& qcs = model.get_quadratic_constraints();
@@ -2848,13 +2914,12 @@ TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds)
   EXPECT_DOUBLE_EQ(10.0, qcs[1].rhs_value);
 }
 
-TEST(qps_parser, qcqp_p0033_mps_sections)
+TEST_P(qps_file_reader_test, qcqp_p0033_mps_sections)
 {
   if (!file_exists("qcqp/p0033_qc1.mps")) {
     GTEST_SKIP() << "qcqp/p0033_qc1.mps not in dataset root";
   }
-  const auto model = read_mps<int, double>(
-    cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps", false);
+  const auto model = read_mps_file("qcqp/p0033_qc1.mps", false);
 
   EXPECT_EQ(12, model.get_n_constraints());
   EXPECT_EQ(33, model.get_n_variables());
@@ -2897,4 +2962,9 @@ TEST(mps_roundtrip, qcqp_p0033_qc1)
   auto reloaded_2 = read_mps<int, double>(temp_file_2.string(), false);
   compare_data_models(reloaded, reloaded_2);
 }
+
+INSTANTIATE_MPS_READER_TEST(qps_file_reader_test);
+
+#undef INSTANTIATE_MPS_READER_TEST
+#undef INSTANTIATE_DEFAULT_MPS_READER_TEST
 }  // namespace cuopt::mathematical_optimization::io
diff --git a/thirdparty/THIRD_PARTY_LICENSES b/thirdparty/THIRD_PARTY_LICENSES
index 128984dd54..7424a65232 100644
--- a/thirdparty/THIRD_PARTY_LICENSES
+++ b/thirdparty/THIRD_PARTY_LICENSES
@@ -537,3 +537,63 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+
+-----------------------------------------------------------------------------------------
+== LZ4
+
+Usage: cuopt uses LZ4 through dynamically loaded library symbols
+
+Copyright (c) Yann Collet. All rights reserved.
+
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+-----------------------------------------------------------------------------------------
+== SIMDe
+
+Usage: cuopt uses SIMDe in the SIMD compatibility code of the experimental fast MPS parser
+
+Copyright (c) 2017 Evan Nemerson <evan@nemerson.com>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.