From 8c808275bc83383798a31ef8896869576622d247 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Tue, 16 Jun 2026 07:06:43 +0000
Subject: [PATCH 01/14] IVF-PQ FP16 overflow detection

---
 .../neighbors/detail/cagra/cagra_build.cuh    |  16 ++
 .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 138 ++++++++++++++++++
 2 files changed, 154 insertions(+)
 create mode 100644 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index a7c15b4161..4ac33868fc 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -6,6 +6,7 @@
 
 #include "../../../core/nvtx.hpp"
 #include "../../../preprocessing/quantize/vpq_build-ext.cuh"
+#include "../../ivf_pq/ivf_pq_fp16_overflow.cuh"
 #include "graph_core.cuh"
 
 #include <raft/core/copy.cuh>
@@ -1637,6 +1638,21 @@ void build_knn_graph(
                  pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded,
                "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported");
 
+  // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32.
+  const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F ||
+                                   pq.search_params.coarse_search_dtype == CUDA_R_16F;
+  if (using_fp16_distance &&
+      ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) {
+    RAFT_LOG_WARN(
+      "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in "
+      "distance computations -> "
+      "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
+    pq.search_params.internal_distance_dtype = CUDA_R_32F;
+    pq.search_params.coarse_search_dtype     = CUDA_R_32F;
+    // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim
+    // and therefore, less likely to overflow.
+  }
+
   uint32_t node_degree = knn_graph.extent(1);
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::build_knn_graph<IVF-PQ>(%zu, %zu, %u)",
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
new file mode 100644
index 0000000000..e67e3a5d9c
--- /dev/null
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -0,0 +1,138 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "../detail/ann_utils.cuh"  // cuvs::spatial::knn::detail::utils::mapping
+
+#include <cuvs/distance/distance.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/sample_rows.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <cstdint>
+
+namespace cuvs::neighbors::ivf_pq::detail {
+
+/** Reduce the maximum squared L2 norm over a set of row-major vectors of element type DataT. */
+template <typename DataT>
+__global__ void kern_max_squared_norm(const DataT* __restrict__ data,
+                                      int64_t n_rows,
+                                      int64_t dim,
+                                      float* __restrict__ out_max_sq)
+{
+  for (int64_t row = blockIdx.x * blockDim.x + threadIdx.x; row < n_rows;
+       row += static_cast<int64_t>(gridDim.x) * blockDim.x) {
+    const DataT* v = data + row * dim;
+    float sq       = 0.0f;
+    for (int64_t d = 0; d < dim; d++) {
+      //
+      float e = cuvs::spatial::knn::detail::utils::mapping<float>{}(v[d]);
+      printf("raw float(v[d]): %f, mapped: %f\n", static_cast<float>(v[d]), e);
+      sq += e * e;
+    }
+    // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity
+    //   between float and int. This is valid when values are non-negative, which is the case
+    //   for squared norms.
+    // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming
+    //   low contention.
+    atomicMax(reinterpret_cast<int*>(out_max_sq), __float_as_int(sq));
+  }
+}
+
+/**
+ * Estimate max_i ||x_i||^2 over the dataset by sampling a fraction of its rows.
+ *
+ * NOTE: sampling yields a *lower-bound* estimate of the true max norm, so a too-small fraction can
+ * miss outlier vectors. Increase `sample_fraction` (up to 1.0 for an exact, no-false-negative scan)
+ * if you observe overflow slipping through.
+ */
+template <typename DataT, typename Accessor>
+float estimate_max_squared_norm(
+  raft::resources const& res,
+  raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
+  double sample_fraction)
+{
+  auto stream          = raft::resource::get_cuda_stream(res);
+  const int64_t n_rows = dataset.extent(0);
+  const int64_t dim    = dataset.extent(1);
+
+  // Determine sample size based on fraction, with bounds
+  int64_t n_sample = static_cast<int64_t>(sample_fraction * static_cast<double>(n_rows));
+  if (n_rows <= 1000) {
+    n_sample = n_rows;  // for small datasets, just use them all and skip the sampling overhead
+  } else if (n_rows > 100000) {
+    // cap the sample size to 100k for speed and keep memory use within the limited workspace
+    // resource
+    n_sample = 100000;
+    //
+  }
+
+  // Sample from the dataset
+  auto mr = raft::resource::get_workspace_resource_ref(res);
+  auto sample =
+    raft::make_device_mdarray<DataT>(res, mr, raft::make_extents<int64_t>(n_sample, dim));
+  raft::matrix::sample_rows<DataT, int64_t>(
+    res, raft::random::RngState{137}, dataset, sample.view());
+
+  auto d_max_sq            = raft::make_device_scalar<float>(res, 0.0f);
+  constexpr int block_size = 256;
+  const int grid_size      = static_cast<int>((n_sample + block_size - 1) / block_size);
+  kern_max_squared_norm<DataT><<<grid_size, block_size, 0, stream>>>(
+    sample.data_handle(), n_sample, dim, d_max_sq.data_handle());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+  float h_max_sq = 0.0f;
+  raft::update_host(&h_max_sq, d_max_sq.data_handle(), 1, stream);
+  raft::resource::sync_stream(res);
+  return h_max_sq;
+}
+
+}  // namespace cuvs::neighbors::ivf_pq::detail
+
+namespace cuvs::neighbors::ivf_pq::helpers {
+
+/**
+ * @brief Estimate whether FP16 is likely insufficient for IVF-PQ's full-magnitude distance
+ * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`).
+ *
+ * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i||
+ * (estimated from a random sample of the dataset):
+ *   - L2Expanded:     ||x - y||^2 = ||x||^2 + ||y||^2 - 2<x,y> <= (||x|| + ||y||)^2 <= 4 * R^2
+ *   - InnerProduct:   |<x, y>|    <= ||x|| * ||y||                                  <=     R^2
+ *   - CosineExpanded: data is L2-normalized, so |score| <= 1 and overflow is impossible.
+ */
+template <typename DataT, typename Accessor>
+bool estimate_fp16_overflow(
+  raft::resources const& res,
+  raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
+  cuvs::distance::DistanceType metric,
+  double sample_fraction = 0.01)
+{
+  // Cosine similarity scores does normalization itself, so overflow won't happen
+  if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; }
+
+  // FP16 largest finite value, with a defensive margin to also avoid precision loss near the limit.
+  constexpr float kFp16Max              = 65504.0f;
+  constexpr float kFp16DefensiveMargin  = 0.25f;
+  const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max;
+
+  const float max_vector_sq_norm =
+    cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset, sample_fraction);
+
+  const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded
+                                       ? 4.0f * max_vector_sq_norm
+                                       : max_vector_sq_norm;
+
+  return max_distance_sq_norm > overflow_detect_threshold;
+}
+
+}  // namespace cuvs::neighbors::ivf_pq::helpers

From d78f6644acd690967339fdc9d950b1d1477f8b75 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Tue, 16 Jun 2026 07:28:47 +0000
Subject: [PATCH 02/14] Explain the use of mapping

---
 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index e67e3a5d9c..3315607a85 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -34,13 +34,14 @@ __global__ void kern_max_squared_norm(const DataT* __restrict__ data,
     const DataT* v = data + row * dim;
     float sq       = 0.0f;
     for (int64_t d = 0; d < dim; d++) {
-      //
+      // internally, IVF-PQ distance computations will map the input data type (e.g. FP16) to float before
+      // doing arithmetic, so we need to apply the same mapping here to get a correct estimate of the squared norms
+      // instead of using static_cast<float>(v[d])
       float e = cuvs::spatial::knn::detail::utils::mapping<float>{}(v[d]);
-      printf("raw float(v[d]): %f, mapped: %f\n", static_cast<float>(v[d]), e);
       sq += e * e;
     }
     // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity
-    //   between float and int. This is valid when values are non-negative, which is the case
+    //   between float and int. This is valid when values are non-negative, which is the case 
     //   for squared norms.
     // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming
     //   low contention.
@@ -70,10 +71,8 @@ float estimate_max_squared_norm(
   if (n_rows <= 1000) {
     n_sample = n_rows;  // for small datasets, just use them all and skip the sampling overhead
   } else if (n_rows > 100000) {
-    // cap the sample size to 100k for speed and keep memory use within the limited workspace
-    // resource
+    // cap the sample size to 100k for speed and keep memory use within the limited workspace resource
     n_sample = 100000;
-    //
   }
 
   // Sample from the dataset

From edac3b92a0ff2672193c2a08d32f45bc296c8246 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 22 Jun 2026 08:50:43 +0000
Subject: [PATCH 03/14] Move params fallback to params init phase

---
 .../neighbors/detail/cagra/cagra_build.cuh    | 31 +++++++++----------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 4ac33868fc..6533e0b7a4 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1637,22 +1637,7 @@ void build_knn_graph(
                  pq.build_params.metric == cuvs::distance::DistanceType::InnerProduct ||
                  pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded,
                "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported");
-
-  // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32.
-  const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F ||
-                                   pq.search_params.coarse_search_dtype == CUDA_R_16F;
-  if (using_fp16_distance &&
-      ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) {
-    RAFT_LOG_WARN(
-      "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in "
-      "distance computations -> "
-      "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
-    pq.search_params.internal_distance_dtype = CUDA_R_32F;
-    pq.search_params.coarse_search_dtype     = CUDA_R_32F;
-    // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim
-    // and therefore, less likely to overflow.
-  }
-
+    
   uint32_t node_degree = knn_graph.extent(1);
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::build_knn_graph<IVF-PQ>(%zu, %zu, %u)",
@@ -2231,6 +2216,20 @@ index<T, IdxT> build(
     } else {
       RAFT_LOG_DEBUG("Selecting IVF-PQ solver");
       knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric);
+       // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32.
+      const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F ||
+                                      pq.search_params.coarse_search_dtype == CUDA_R_16F;
+      if (using_fp16_distance &&
+          ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) {
+        RAFT_LOG_WARN(
+          "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in "
+          "distance computations -> "
+          "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
+        pq.search_params.internal_distance_dtype = CUDA_R_32F;
+        pq.search_params.coarse_search_dtype     = CUDA_R_32F;
+        // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim
+        // and therefore, less likely to overflow.
+      }
     }
   }
   RAFT_EXPECTS(

From 8c501152db9481b986e72ee279a7f97c3bc8cea7 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 22 Jun 2026 08:51:12 +0000
Subject: [PATCH 04/14] Choose sampling equation

---
 .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index 3315607a85..4c0bc6be94 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -59,21 +59,20 @@ __global__ void kern_max_squared_norm(const DataT* __restrict__ data,
 template <typename DataT, typename Accessor>
 float estimate_max_squared_norm(
   raft::resources const& res,
-  raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
-  double sample_fraction)
+  raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset)
 {
   auto stream          = raft::resource::get_cuda_stream(res);
   const int64_t n_rows = dataset.extent(0);
   const int64_t dim    = dataset.extent(1);
 
-  // Determine sample size based on fraction, with bounds
-  int64_t n_sample = static_cast<int64_t>(sample_fraction * static_cast<double>(n_rows));
-  if (n_rows <= 1000) {
-    n_sample = n_rows;  // for small datasets, just use them all and skip the sampling overhead
-  } else if (n_rows > 100000) {
-    // cap the sample size to 100k for speed and keep memory use within the limited workspace resource
-    n_sample = 100000;
-  }
+  // Determine sample size based on a smooth saturation equation. The equation satisfies:
+  // - n_sample is always less than or equal to n_rows
+  // - n_sample saturates to kSaturation when n_rows is inf
+  // - n_sample increases fast for small n_rows and slow to saturation for large n_rows
+  // Idea: we greedily sample most of the dataset when it is small-sized, and cap it to kSaturation
+  // when dataset size grows very large.
+  constexpr int64_t kSaturation = 100000;
+  int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation);
 
   // Sample from the dataset
   auto mr = raft::resource::get_workspace_resource_ref(res);
@@ -113,8 +112,7 @@ template <typename DataT, typename Accessor>
 bool estimate_fp16_overflow(
   raft::resources const& res,
   raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
-  cuvs::distance::DistanceType metric,
-  double sample_fraction = 0.01)
+  cuvs::distance::DistanceType metric)
 {
   // Cosine similarity scores does normalization itself, so overflow won't happen
   if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; }
@@ -125,7 +123,7 @@ bool estimate_fp16_overflow(
   const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max;
 
   const float max_vector_sq_norm =
-    cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset, sample_fraction);
+    cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset);
 
   const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded
                                        ? 4.0f * max_vector_sq_norm

From f3e3dd8e58a7936229c232b62ab0091f50f703be Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 22 Jun 2026 10:18:20 +0000
Subject: [PATCH 05/14] Use raft built-in kernels and remove manual one.

---
 .../neighbors/detail/cagra/cagra_build.cuh    | 34 ++++---
 .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 98 +++++++++----------
 2 files changed, 64 insertions(+), 68 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 6533e0b7a4..99ae2f36b2 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1637,7 +1637,7 @@ void build_knn_graph(
                  pq.build_params.metric == cuvs::distance::DistanceType::InnerProduct ||
                  pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded,
                "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported");
-    
+
   uint32_t node_degree = knn_graph.extent(1);
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::build_knn_graph<IVF-PQ>(%zu, %zu, %u)",
@@ -2216,20 +2216,24 @@ index<T, IdxT> build(
     } else {
       RAFT_LOG_DEBUG("Selecting IVF-PQ solver");
       knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric);
-       // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32.
-      const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F ||
-                                      pq.search_params.coarse_search_dtype == CUDA_R_16F;
-      if (using_fp16_distance &&
-          ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) {
-        RAFT_LOG_WARN(
-          "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in "
-          "distance computations -> "
-          "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
-        pq.search_params.internal_distance_dtype = CUDA_R_32F;
-        pq.search_params.coarse_search_dtype     = CUDA_R_32F;
-        // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim
-        // and therefore, less likely to overflow.
-      }
+    }
+  }
+
+  // Guard against potential FP16 distance overflow for large-magnitude datasets. Applies to any
+  // IVF-PQ graph build (auto-selected above or explicitly set in params) -> fall back to FP32.
+  if (auto* pq = std::get_if<cagra::graph_build_params::ivf_pq_params>(&knn_build_params)) {
+    const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F ||
+                                     pq->search_params.coarse_search_dtype == CUDA_R_16F;
+    if (using_fp16_distance &&
+        ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric)) {
+      RAFT_LOG_WARN(
+        "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in "
+        "distance computations -> "
+        "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
+      pq->search_params.internal_distance_dtype = CUDA_R_32F;
+      pq->search_params.coarse_search_dtype     = CUDA_R_32F;
+      // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of
+      // pq_dim and therefore, less likely to overflow.
     }
   }
   RAFT_EXPECTS(
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index 4c0bc6be94..a184fb9701 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -11,9 +11,12 @@
 
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/mdspan.hpp>
+#include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/linalg/map_reduce.cuh>
+#include <raft/linalg/reduce.cuh>
 #include <raft/matrix/sample_rows.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -22,46 +25,15 @@
 
 namespace cuvs::neighbors::ivf_pq::detail {
 
-/** Reduce the maximum squared L2 norm over a set of row-major vectors of element type DataT. */
-template <typename DataT>
-__global__ void kern_max_squared_norm(const DataT* __restrict__ data,
-                                      int64_t n_rows,
-                                      int64_t dim,
-                                      float* __restrict__ out_max_sq)
-{
-  for (int64_t row = blockIdx.x * blockDim.x + threadIdx.x; row < n_rows;
-       row += static_cast<int64_t>(gridDim.x) * blockDim.x) {
-    const DataT* v = data + row * dim;
-    float sq       = 0.0f;
-    for (int64_t d = 0; d < dim; d++) {
-      // internally, IVF-PQ distance computations will map the input data type (e.g. FP16) to float before
-      // doing arithmetic, so we need to apply the same mapping here to get a correct estimate of the squared norms
-      // instead of using static_cast<float>(v[d])
-      float e = cuvs::spatial::knn::detail::utils::mapping<float>{}(v[d]);
-      sq += e * e;
-    }
-    // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity
-    //   between float and int. This is valid when values are non-negative, which is the case 
-    //   for squared norms.
-    // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming
-    //   low contention.
-    atomicMax(reinterpret_cast<int*>(out_max_sq), __float_as_int(sq));
-  }
-}
-
 /**
- * Estimate max_i ||x_i||^2 over the dataset by sampling a fraction of its rows.
- *
- * NOTE: sampling yields a *lower-bound* estimate of the true max norm, so a too-small fraction can
- * miss outlier vectors. Increase `sample_fraction` (up to 1.0 for an exact, no-false-negative scan)
- * if you observe overflow slipping through.
+ * Estimate max_i ||x_i||^2 over the dataset by sampling from its rows.
  */
 template <typename DataT, typename Accessor>
 float estimate_max_squared_norm(
-  raft::resources const& res,
+  raft::resources const& handle,
   raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset)
 {
-  auto stream          = raft::resource::get_cuda_stream(res);
+  auto stream          = raft::resource::get_cuda_stream(handle);
   const int64_t n_rows = dataset.extent(0);
   const int64_t dim    = dataset.extent(1);
 
@@ -69,29 +41,47 @@ float estimate_max_squared_norm(
   // - n_sample is always less than or equal to n_rows
   // - n_sample saturates to kSaturation when n_rows is inf
   // - n_sample increases fast for small n_rows and slow to saturation for large n_rows
-  // Idea: we greedily sample most of the dataset when it is small-sized, and cap it to kSaturation
-  // when dataset size grows very large.
+  // Idea: we sample most of the dataset when it is small-sized, and only a small fraction
+  // (up to a maximum/saturation number) when the dataset size grows large.
+  // kSaturation is selected as a compromise between runtime and recall.
   constexpr int64_t kSaturation = 100000;
   int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation);
 
   // Sample from the dataset
-  auto mr = raft::resource::get_workspace_resource_ref(res);
+  auto mr = raft::resource::get_workspace_resource_ref(handle);
   auto sample =
-    raft::make_device_mdarray<DataT>(res, mr, raft::make_extents<int64_t>(n_sample, dim));
+    raft::make_device_mdarray<DataT>(handle, mr, raft::make_extents<int64_t>(n_sample, dim));
   raft::matrix::sample_rows<DataT, int64_t>(
-    res, raft::random::RngState{137}, dataset, sample.view());
-
-  auto d_max_sq            = raft::make_device_scalar<float>(res, 0.0f);
-  constexpr int block_size = 256;
-  const int grid_size      = static_cast<int>((n_sample + block_size - 1) / block_size);
-  kern_max_squared_norm<DataT><<<grid_size, block_size, 0, stream>>>(
-    sample.data_handle(), n_sample, dim, d_max_sq.data_handle());
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  float h_max_sq = 0.0f;
-  raft::update_host(&h_max_sq, d_max_sq.data_handle(), 1, stream);
-  raft::resource::sync_stream(res);
-  return h_max_sq;
+    handle, raft::random::RngState{137}, dataset, sample.view());
+
+  // Compute mapped squared norm
+  auto d_map_sq_norm = raft::make_device_vector<float, int64_t>(handle, n_sample);
+  raft::linalg::reduce<raft::Apply::ALONG_ROWS>(
+    handle,
+    raft::make_const_mdspan(sample.view()),
+    d_map_sq_norm.view(),
+    0.0f,
+    false,
+    [] __device__(DataT v, auto) -> float {
+      float e = cuvs::spatial::knn::detail::utils::mapping<float>{}(v);
+      return e * e;
+    },
+    raft::add_op(),
+    raft::identity_op());
+  // Compute max of squared norm vector
+  auto d_max_sq = raft::make_device_scalar<float>(handle, 0.0f);
+  raft::linalg::map_reduce(handle,
+                           raft::make_const_mdspan(d_map_sq_norm.view()),
+                           d_max_sq.view(),
+                           0.0f,
+                           raft::identity_op(),
+                           raft::max_op());
+
+  float max_sq = 0.0f;
+  raft::update_host(&max_sq, d_max_sq.data_handle(), 1, stream);
+  raft::resource::sync_stream(handle);
+
+  return max_sq;
 }
 
 }  // namespace cuvs::neighbors::ivf_pq::detail
@@ -110,10 +100,12 @@ namespace cuvs::neighbors::ivf_pq::helpers {
  */
 template <typename DataT, typename Accessor>
 bool estimate_fp16_overflow(
-  raft::resources const& res,
+  raft::resources const& handle,
   raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
   cuvs::distance::DistanceType metric)
 {
+  if (dataset.extent(0) == 0) { return false; }
+
   // Cosine similarity scores does normalization itself, so overflow won't happen
   if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; }
 
@@ -123,7 +115,7 @@ bool estimate_fp16_overflow(
   const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max;
 
   const float max_vector_sq_norm =
-    cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset);
+    cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset);
 
   const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded
                                        ? 4.0f * max_vector_sq_norm

From 4d59c975734123d4462e6ef229114d1fc7b85466 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 22 Jun 2026 10:55:38 +0000
Subject: [PATCH 06/14] Add kDelay param to adjust the sampling fraction growth
 speed

---
 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index a184fb9701..b50e7e9a4e 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -10,6 +10,7 @@
 #include <cuvs/distance/distance.hpp>
 
 #include <raft/core/device_mdarray.hpp>
+#include <raft/core/error.hpp>
 #include <raft/core/mdspan.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
@@ -43,9 +44,13 @@ float estimate_max_squared_norm(
   // - n_sample increases fast for small n_rows and slow to saturation for large n_rows
   // Idea: we sample most of the dataset when it is small-sized, and only a small fraction
   // (up to a maximum/saturation number) when the dataset size grows large.
-  // kSaturation is selected as a compromise between runtime and recall.
+  // kSaturation and kDelay are selected as a compromise between runtime and outlier recall.
   constexpr int64_t kSaturation = 100000;
-  int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation);
+  constexpr int64_t kDelay      = kSaturation * 10;
+  RAFT_EXPECTS(kDelay >= kSaturation,
+               "kDelay must not be smaller than kSaturation so that n_sample is always less than "
+               "or equal to n_rows");
+  int64_t n_sample = (n_rows * kSaturation + (n_rows + kDelay - 1)) / (n_rows + kDelay);
 
   // Sample from the dataset
   auto mr = raft::resource::get_workspace_resource_ref(handle);

From c66b748b8599ef6cca43a63f54f8f8b617566fee Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 22 Jun 2026 11:24:33 +0000
Subject: [PATCH 07/14] Reduce kSaturation to 20k (just sufficient to detect
 FP16 overflow in SIFT 1M)

---
 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index b50e7e9a4e..99f2c9e60c 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -45,7 +45,7 @@ float estimate_max_squared_norm(
   // Idea: we sample most of the dataset when it is small-sized, and only a small fraction
   // (up to a maximum/saturation number) when the dataset size grows large.
   // kSaturation and kDelay are selected as a compromise between runtime and outlier recall.
-  constexpr int64_t kSaturation = 100000;
+  constexpr int64_t kSaturation = 20000;
   constexpr int64_t kDelay      = kSaturation * 10;
   RAFT_EXPECTS(kDelay >= kSaturation,
                "kDelay must not be smaller than kSaturation so that n_sample is always less than "

From adbe0a93379d52363215b6a81c88528dc7fdd631 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 22 Jun 2026 12:18:55 +0000
Subject: [PATCH 08/14] Explain uniform sampling decision

---
 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index 99f2c9e60c..2f99da9e06 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -27,7 +27,12 @@
 namespace cuvs::neighbors::ivf_pq::detail {
 
 /**
- * Estimate max_i ||x_i||^2 over the dataset by sampling from its rows.
+ * Estimate max_i ||x_i||^2 over the dataset by uniformly sampling a fraction from it.
+ *
+ * Decision: Uniform sampling is selected as it is sufficient to detect FP16 overflow in
+ * the datasets, where overflow-causing large vectors are frequent (e.g. SIFT 1M). For 
+ * dataset with rare large outliers, we might preferably sample biasedly towards large vectors,
+ * e.g. via top-k selection over the vectors with largest L_inf norm.
  */
 template <typename DataT, typename Accessor>
 float estimate_max_squared_norm(
@@ -59,7 +64,7 @@ float estimate_max_squared_norm(
   raft::matrix::sample_rows<DataT, int64_t>(
     handle, raft::random::RngState{137}, dataset, sample.view());
 
-  // Compute mapped squared norm
+  // Compute float-mapped squared norm
   auto d_map_sq_norm = raft::make_device_vector<float, int64_t>(handle, n_sample);
   raft::linalg::reduce<raft::Apply::ALONG_ROWS>(
     handle,

From 891098b71ba22bba52782f550995e974d1f142a2 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Fri, 26 Jun 2026 12:55:54 +0000
Subject: [PATCH 09/14] Avoid expensive random sampling over the full dataset.

Sampling is not necessary anyway.
---
 .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 24 +++++++------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index 2f99da9e06..a01cf8e220 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -18,8 +18,7 @@
 #include <raft/core/resources.hpp>
 #include <raft/linalg/map_reduce.cuh>
 #include <raft/linalg/reduce.cuh>
-#include <raft/matrix/sample_rows.cuh>
-#include <raft/random/rng.cuh>
+#include <raft/util/cuda_dev_essentials.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 #include <cstdint>
@@ -27,18 +26,14 @@
 namespace cuvs::neighbors::ivf_pq::detail {
 
 /**
- * Estimate max_i ||x_i||^2 over the dataset by uniformly sampling a fraction from it.
- *
- * Decision: Uniform sampling is selected as it is sufficient to detect FP16 overflow in
- * the datasets, where overflow-causing large vectors are frequent (e.g. SIFT 1M). For 
- * dataset with rare large outliers, we might preferably sample biasedly towards large vectors,
- * e.g. via top-k selection over the vectors with largest L_inf norm.
+ * Estimate max_i ||x_i||^2 over the dataset.
  */
 template <typename DataT, typename Accessor>
 float estimate_max_squared_norm(
   raft::resources const& handle,
   raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset)
 {
+  common::nvtx::range<common::nvtx::domain::cuvs> r("estimate_max_squared_norm");
   auto stream          = raft::resource::get_cuda_stream(handle);
   const int64_t n_rows = dataset.extent(0);
   const int64_t dim    = dataset.extent(1);
@@ -47,7 +42,7 @@ float estimate_max_squared_norm(
   // - n_sample is always less than or equal to n_rows
   // - n_sample saturates to kSaturation when n_rows is inf
   // - n_sample increases fast for small n_rows and slow to saturation for large n_rows
-  // Idea: we sample most of the dataset when it is small-sized, and only a small fraction
+  // Idea: we examine most of the dataset when it is small-sized, and only a small fraction
   // (up to a maximum/saturation number) when the dataset size grows large.
   // kSaturation and kDelay are selected as a compromise between runtime and outlier recall.
   constexpr int64_t kSaturation = 20000;
@@ -55,14 +50,11 @@ float estimate_max_squared_norm(
   RAFT_EXPECTS(kDelay >= kSaturation,
                "kDelay must not be smaller than kSaturation so that n_sample is always less than "
                "or equal to n_rows");
-  int64_t n_sample = (n_rows * kSaturation + (n_rows + kDelay - 1)) / (n_rows + kDelay);
+  int64_t n_sample = raft::ceildiv(n_rows * kSaturation, n_rows + kDelay);
 
-  // Sample from the dataset
   auto mr = raft::resource::get_workspace_resource_ref(handle);
-  auto sample =
-    raft::make_device_mdarray<DataT>(handle, mr, raft::make_extents<int64_t>(n_sample, dim));
-  raft::matrix::sample_rows<DataT, int64_t>(
-    handle, raft::random::RngState{137}, dataset, sample.view());
+  auto sample = raft::make_device_mdarray<DataT>(handle, mr, raft::make_extents<int64_t>(n_sample, dim));
+  raft::copy(sample.data_handle(), dataset.data_handle(), n_sample * dim, raft::resource::get_cuda_stream(handle));
 
   // Compute float-mapped squared norm
   auto d_map_sq_norm = raft::make_device_vector<float, int64_t>(handle, n_sample);
@@ -103,7 +95,7 @@ namespace cuvs::neighbors::ivf_pq::helpers {
  * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`).
  *
  * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i||
- * (estimated from a random sample of the dataset):
+ * (estimated from a fraction of the dataset):
  *   - L2Expanded:     ||x - y||^2 = ||x||^2 + ||y||^2 - 2<x,y> <= (||x|| + ||y||)^2 <= 4 * R^2
  *   - InnerProduct:   |<x, y>|    <= ||x|| * ||y||                                  <=     R^2
  *   - CosineExpanded: data is L2-normalized, so |score| <= 1 and overflow is impossible.

From 6c124c844aaa8eede31c5e248494c8479305847d Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Fri, 26 Jun 2026 12:57:12 +0000
Subject: [PATCH 10/14] Edit comments

---
 cpp/src/neighbors/detail/cagra/cagra_build.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 99ae2f36b2..cee49c8630 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -2219,8 +2219,8 @@ index<T, IdxT> build(
     }
   }
 
-  // Guard against potential FP16 distance overflow for large-magnitude datasets. Applies to any
-  // IVF-PQ graph build (auto-selected above or explicitly set in params) -> fall back to FP32.
+  // Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets 
+  // -> fall back to FP32.
   if (auto* pq = std::get_if<cagra::graph_build_params::ivf_pq_params>(&knn_build_params)) {
     const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F ||
                                      pq->search_params.coarse_search_dtype == CUDA_R_16F;

From ac65897434629787899dad2046ed7bbf62e38d23 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 29 Jun 2026 12:35:03 +0000
Subject: [PATCH 11/14] Fix sampling number and remove defensive margin

---
 .../neighbors/detail/cagra/cagra_build.cuh    |  4 +--
 .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 32 ++++++-------------
 2 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index cee49c8630..a3ab54d376 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -2219,7 +2219,7 @@ index<T, IdxT> build(
     }
   }
 
-  // Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets 
+  // Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets
   // -> fall back to FP32.
   if (auto* pq = std::get_if<cagra::graph_build_params::ivf_pq_params>(&knn_build_params)) {
     const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F ||
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index a01cf8e220..58f35a8c35 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -38,23 +38,15 @@ float estimate_max_squared_norm(
   const int64_t n_rows = dataset.extent(0);
   const int64_t dim    = dataset.extent(1);
 
-  // Determine sample size based on a smooth saturation equation. The equation satisfies:
-  // - n_sample is always less than or equal to n_rows
-  // - n_sample saturates to kSaturation when n_rows is inf
-  // - n_sample increases fast for small n_rows and slow to saturation for large n_rows
-  // Idea: we examine most of the dataset when it is small-sized, and only a small fraction
-  // (up to a maximum/saturation number) when the dataset size grows large.
-  // kSaturation and kDelay are selected as a compromise between runtime and outlier recall.
-  constexpr int64_t kSaturation = 20000;
-  constexpr int64_t kDelay      = kSaturation * 10;
-  RAFT_EXPECTS(kDelay >= kSaturation,
-               "kDelay must not be smaller than kSaturation so that n_sample is always less than "
-               "or equal to n_rows");
-  int64_t n_sample = raft::ceildiv(n_rows * kSaturation, n_rows + kDelay);
+  int64_t n_sample = std::min<int64_t>(n_rows, 20000);
 
   auto mr = raft::resource::get_workspace_resource_ref(handle);
-  auto sample = raft::make_device_mdarray<DataT>(handle, mr, raft::make_extents<int64_t>(n_sample, dim));
-  raft::copy(sample.data_handle(), dataset.data_handle(), n_sample * dim, raft::resource::get_cuda_stream(handle));
+  auto sample =
+    raft::make_device_mdarray<DataT>(handle, mr, raft::make_extents<int64_t>(n_sample, dim));
+  raft::copy(sample.data_handle(),
+             dataset.data_handle(),
+             n_sample * dim,
+             raft::resource::get_cuda_stream(handle));
 
   // Compute float-mapped squared norm
   auto d_map_sq_norm = raft::make_device_vector<float, int64_t>(handle, n_sample);
@@ -111,11 +103,6 @@ bool estimate_fp16_overflow(
   // Cosine similarity scores does normalization itself, so overflow won't happen
   if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; }
 
-  // FP16 largest finite value, with a defensive margin to also avoid precision loss near the limit.
-  constexpr float kFp16Max              = 65504.0f;
-  constexpr float kFp16DefensiveMargin  = 0.25f;
-  const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max;
-
   const float max_vector_sq_norm =
     cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset);
 
@@ -123,7 +110,8 @@ bool estimate_fp16_overflow(
                                        ? 4.0f * max_vector_sq_norm
                                        : max_vector_sq_norm;
 
-  return max_distance_sq_norm > overflow_detect_threshold;
+  constexpr float kFp16Max = 65504.0f;
+  return max_distance_sq_norm > kFp16Max;
 }
 
 }  // namespace cuvs::neighbors::ivf_pq::helpers

From 1d44ffabe27d5a56b2ddd7a394c9960c81539338 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Mon, 29 Jun 2026 13:58:43 +0000
Subject: [PATCH 12/14] Refactor for explicitness of supported distance types

---
 .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh    | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index 58f35a8c35..7c7dfe9fb1 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -100,15 +100,19 @@ bool estimate_fp16_overflow(
 {
   if (dataset.extent(0) == 0) { return false; }
 
-  // Cosine similarity scores does normalization itself, so overflow won't happen
-  if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; }
+  float dist_factor = 1.0f;
+  switch (metric) {
+    case cuvs::distance::DistanceType::L2Expanded: dist_factor = 4.0f; break;
+    case cuvs::distance::DistanceType::CosineExpanded:
+      // Cosine similarity scores does normalization itself, so overflow won't happen
+      return false;
+    case cuvs::distance::DistanceType::InnerProduct: dist_factor = 1.0f; break;
+    default: RAFT_FAIL("Unsupported distance type for IVF-PQ search %d.", int(metric));
+  }
 
   const float max_vector_sq_norm =
     cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset);
-
-  const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded
-                                       ? 4.0f * max_vector_sq_norm
-                                       : max_vector_sq_norm;
+  const float max_distance_sq_norm = dist_factor * max_vector_sq_norm;
 
   constexpr float kFp16Max = 65504.0f;
   return max_distance_sq_norm > kFp16Max;

From b0b2f856856e0836347bf3d452c57c752c5e07cd Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Wed, 1 Jul 2026 02:27:57 -0700
Subject: [PATCH 13/14] Benchmark 3 approaches of overflow detection

- estimate max norm on the first 20k rows
- estimate max norm on the cluster centers
- probe search and detect invalid distances
---
 .../neighbors/detail/cagra/cagra_build.cuh    |  65 ++++++++++-
 .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 107 +++++++++++++++++-
 2 files changed, 165 insertions(+), 7 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index a3ab54d376..508cf46b0e 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1,6 +1,6 @@
 /*
  * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
+ * reserved. SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
 
@@ -1665,6 +1665,54 @@ void build_knn_graph(
   RAFT_LOG_DEBUG("# Building IVF-PQ index %s", model_name.c_str());
   auto index = cuvs::neighbors::ivf_pq::build(res, pq.build_params, dataset);
 
+  // Empirically detect FP16 distance overflow on the just-built index: run a small FP16 probe
+  // search and downgrade the internal/coarse dtypes to FP32 if any distance comes back non-finite.
+  // This observes the actual computation, so it is agnostic of the selected distance type.
+  if (pq.search_params.internal_distance_dtype == CUDA_R_16F ||
+      pq.search_params.coarse_search_dtype == CUDA_R_16F) {
+    // --- Benchmark fp16 overflow detection: probe search with 128 queries ---
+    {
+      raft::resource::sync_stream(res);
+      const auto t0  = std::chrono::steady_clock::now();
+      const bool fp16_overflow = cuvs::neighbors::ivf_pq::helpers::detect_fp16_overflow(
+        res, index, pq.search_params, dataset);
+      const double ms =
+        std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - t0)
+          .count();
+      RAFT_LOG_INFO("detect_fp16_overflow (IVF-PQ search) took %.3f ms (overflow=%d)", ms, int(fp16_overflow));
+    }
+    // --- Benchmark fp16 overflow detection: estimate max norm on the first 20k rows ---
+    {
+      raft::resource::sync_stream(res);
+      const auto t0 = std::chrono::steady_clock::now();
+      const bool fp16_overflow = cuvs::neighbors::ivf_pq::helpers::estimate_fp16_overflow(
+        res, dataset, pq.build_params.metric);
+      const double ms =
+        std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - t0).count();
+      RAFT_LOG_INFO("estimate max norm on the first 20k rows took %.3f ms (overflow=%d)", ms, int(fp16_overflow));
+    }
+    // --- Benchmark fp16 overflow detection: estimate max norm on the IVF-PQ cluster centers ---
+    {
+      raft::resource::sync_stream(res);
+      const auto t0 = std::chrono::steady_clock::now();
+      const bool fp16_overflow = cuvs::neighbors::ivf_pq::helpers::estimate_fp16_overflow_centers(
+        res, index, pq.build_params.metric);
+      const double ms =
+        std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - t0).count();
+      RAFT_LOG_INFO("estimate max norm on the IVF-PQ cluster centers took %.3f ms (overflow=%d)", ms, int(fp16_overflow));
+    }
+    // if (fp16_overflow) {
+    //   RAFT_LOG_WARN(
+    //     "IVF-PQ FP16 distance produced non-finite results on a probe search for this dataset -> "
+    //     "switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
+    //   pq.search_params.internal_distance_dtype = CUDA_R_32F;
+    //   pq.search_params.coarse_search_dtype     = CUDA_R_32F;
+    // }
+    // TODO delete this after benchmarking 3 approaches
+    //pq.search_params.internal_distance_dtype = CUDA_R_32F;
+    //pq.search_params.coarse_search_dtype     = CUDA_R_32F;
+  }
+
   //
   // search top (k + 1) neighbors
   //
@@ -2224,16 +2272,21 @@ index<T, IdxT> build(
   if (auto* pq = std::get_if<cagra::graph_build_params::ivf_pq_params>(&knn_build_params)) {
     const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F ||
                                      pq->search_params.coarse_search_dtype == CUDA_R_16F;
+    raft::resource::sync_stream(res);
+    const auto detect_start  = std::chrono::steady_clock::now();
+    const bool first_20k_rows_check = ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric);
+    const double detect_ms =
+      std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - detect_start)
+        .count();
+    RAFT_LOG_INFO("1st attempt: estimate_fp16_overflow on first 20k rows took %.3f ms (overflow=%d)", detect_ms, int(first_20k_rows_check));
     if (using_fp16_distance &&
         ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric)) {
       RAFT_LOG_WARN(
-        "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in "
+        "IVF-PQ internal type of FP16 is insufficient for this dataset to avoid overflow in "
         "distance computations -> "
         "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
-      pq->search_params.internal_distance_dtype = CUDA_R_32F;
-      pq->search_params.coarse_search_dtype     = CUDA_R_32F;
-      // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of
-      // pq_dim and therefore, less likely to overflow.
+      //pq->search_params.internal_distance_dtype = CUDA_R_32F;
+      //pq->search_params.coarse_search_dtype     = CUDA_R_32F;
     }
   }
   RAFT_EXPECTS(
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index 7c7dfe9fb1..8e4198ad6f 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -8,6 +8,7 @@
 #include "../detail/ann_utils.cuh"  // cuvs::spatial::knn::detail::utils::mapping
 
 #include <cuvs/distance/distance.hpp>
+#include <cuvs/neighbors/ivf_pq.hpp>
 
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/error.hpp>
@@ -15,16 +16,24 @@
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/map_reduce.cuh>
 #include <raft/linalg/reduce.cuh>
 #include <raft/util/cuda_dev_essentials.cuh>
 #include <raft/util/cudart_utils.hpp>
 
+#include <thrust/logical.h>
+
+#include <algorithm>
 #include <cstdint>
 
 namespace cuvs::neighbors::ivf_pq::detail {
 
+struct is_non_finite_op {
+  __device__ auto operator()(float v) const -> bool { return isnan(v) || isinf(v); }
+};
+
 /**
  * Estimate max_i ||x_i||^2 over the dataset.
  */
@@ -83,7 +92,7 @@ float estimate_max_squared_norm(
 namespace cuvs::neighbors::ivf_pq::helpers {
 
 /**
- * @brief Estimate whether FP16 is likely insufficient for IVF-PQ's full-magnitude distance
+ * @brief Estimate whether FP16 is insufficient for IVF-PQ's full-magnitude distance
  * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`).
  *
  * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i||
@@ -98,6 +107,7 @@ bool estimate_fp16_overflow(
   raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
   cuvs::distance::DistanceType metric)
 {
+  common::nvtx::range<common::nvtx::domain::cuvs> r("estimate_fp16_overflow");
   if (dataset.extent(0) == 0) { return false; }
 
   float dist_factor = 1.0f;
@@ -118,4 +128,99 @@ bool estimate_fp16_overflow(
   return max_distance_sq_norm > kFp16Max;
 }
 
+/**
+ * @brief Estimate whether FP16 is insufficient, using the IVF-PQ cluster centers.
+ */
+inline bool estimate_fp16_overflow_centers(raft::resources const& handle,
+                                           const cuvs::neighbors::ivf_pq::index<int64_t>& index,
+                                           cuvs::distance::DistanceType metric)
+{
+  common::nvtx::range<common::nvtx::domain::cuvs> r("estimate_fp16_overflow_centers");
+  const int64_t n_lists = index.n_lists();
+  const int64_t dim     = index.dim();
+  if (n_lists == 0 || dim == 0) { return false; }
+
+  float dist_factor = 1.0f;
+  switch (metric) {
+    case cuvs::distance::DistanceType::L2Expanded: dist_factor = 4.0f; break;
+    case cuvs::distance::DistanceType::CosineExpanded:
+      // Cosine similarity scores does normalization itself, so overflow won't happen
+      return false;
+    case cuvs::distance::DistanceType::InnerProduct: dist_factor = 1.0f; break;
+    default: RAFT_FAIL("Unsupported distance type for IVF-PQ search %d.", int(metric));
+  }
+
+  const int64_t dim_ext = index.dim_ext();
+  auto stream           = raft::resource::get_cuda_stream(handle);
+  auto mr               = raft::resource::get_workspace_resource_ref(handle);
+  auto centers_contig =
+    raft::make_device_mdarray<float>(handle, mr, raft::make_extents<int64_t>(n_lists, dim));
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(centers_contig.data_handle(),
+                                  dim * sizeof(float),
+                                  index.centers().data_handle(),
+                                  dim_ext * sizeof(float),
+                                  dim * sizeof(float),
+                                  n_lists,
+                                  cudaMemcpyDeviceToDevice,
+                                  stream));
+
+  const float max_vector_sq_norm = cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(
+    handle, raft::make_const_mdspan(centers_contig.view()));
+  const float max_distance_sq_norm = dist_factor * max_vector_sq_norm;
+
+  constexpr float kFp16Max = 65504.0f;
+  return max_distance_sq_norm > kFp16Max;
+}
+
+/**
+ * @brief Detect whether FP16 internal distance dtypes overflow for this dataset during search.
+ *
+ * Runs a small probe search against an already-built IVF-PQ index with FP16 internal/coarse
+ * distance dtypes, and reports whether any returned distance is non-finite (inf/NaN).
+ */
+template <typename DataT, typename Accessor>
+bool detect_fp16_overflow(
+  raft::resources const& handle,
+  const cuvs::neighbors::ivf_pq::index<int64_t>& index,
+  cuvs::neighbors::ivf_pq::search_params search_params,
+  raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset)
+{
+  common::nvtx::range<common::nvtx::domain::cuvs> r("detect_fp16_overflow");
+  const int64_t n_rows = dataset.extent(0);
+  if (n_rows == 0) { return false; }
+
+  auto stream       = raft::resource::get_cuda_stream(handle);
+  const int64_t dim = dataset.extent(1);
+
+  constexpr int64_t kMaxSampleQueries = 128;
+  constexpr uint32_t kProbeTopK       = 32;
+  const int64_t n_sample              = std::min<int64_t>(n_rows, kMaxSampleQueries);
+  const uint32_t top_k = std::min<uint32_t>(static_cast<uint32_t>(n_rows), kProbeTopK);
+
+  auto mr = raft::resource::get_workspace_resource_ref(handle);
+  auto queries =
+    raft::make_device_mdarray<DataT>(handle, mr, raft::make_extents<int64_t>(n_sample, dim));
+  raft::copy(queries.data_handle(), dataset.data_handle(), n_sample * dim, stream);
+
+  auto neighbors =
+    raft::make_device_mdarray<int64_t>(handle, mr, raft::make_extents<int64_t>(n_sample, top_k));
+  auto distances =
+    raft::make_device_mdarray<float>(handle, mr, raft::make_extents<int64_t>(n_sample, top_k));
+
+  cuvs::neighbors::ivf_pq::search(handle,
+                                  search_params,
+                                  index,
+                                  raft::make_const_mdspan(queries.view()),
+                                  neighbors.view(),
+                                  distances.view());
+
+  const int64_t count       = n_sample * static_cast<int64_t>(top_k);
+  const bool any_non_finite = thrust::any_of(raft::resource::get_thrust_policy(handle),
+                                             distances.data_handle(),
+                                             distances.data_handle() + count,
+                                             cuvs::neighbors::ivf_pq::detail::is_non_finite_op{});
+  raft::resource::sync_stream(handle);
+  return any_non_finite;
+}
+
 }  // namespace cuvs::neighbors::ivf_pq::helpers

From 85f7e571b1d417df463351bd168790a45774170d Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huuanhhuyn@nvidia.com>
Date: Thu, 2 Jul 2026 11:46:37 +0000
Subject: [PATCH 14/14] Select probe search approach

---
 .../neighbors/detail/cagra/cagra_build.cuh    |  66 +-------
 .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 150 +-----------------
 2 files changed, 11 insertions(+), 205 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 508cf46b0e..f118146fba 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -1670,47 +1670,17 @@ void build_knn_graph(
   // This observes the actual computation, so it is agnostic of the selected distance type.
   if (pq.search_params.internal_distance_dtype == CUDA_R_16F ||
       pq.search_params.coarse_search_dtype == CUDA_R_16F) {
-    // --- Benchmark fp16 overflow detection: probe search with 128 queries ---
     {
-      raft::resource::sync_stream(res);
-      const auto t0  = std::chrono::steady_clock::now();
       const bool fp16_overflow = cuvs::neighbors::ivf_pq::helpers::detect_fp16_overflow(
         res, index, pq.search_params, dataset);
-      const double ms =
-        std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - t0)
-          .count();
-      RAFT_LOG_INFO("detect_fp16_overflow (IVF-PQ search) took %.3f ms (overflow=%d)", ms, int(fp16_overflow));
-    }
-    // --- Benchmark fp16 overflow detection: estimate max norm on the first 20k rows ---
-    {
-      raft::resource::sync_stream(res);
-      const auto t0 = std::chrono::steady_clock::now();
-      const bool fp16_overflow = cuvs::neighbors::ivf_pq::helpers::estimate_fp16_overflow(
-        res, dataset, pq.build_params.metric);
-      const double ms =
-        std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - t0).count();
-      RAFT_LOG_INFO("estimate max norm on the first 20k rows took %.3f ms (overflow=%d)", ms, int(fp16_overflow));
-    }
-    // --- Benchmark fp16 overflow detection: estimate max norm on the IVF-PQ cluster centers ---
-    {
-      raft::resource::sync_stream(res);
-      const auto t0 = std::chrono::steady_clock::now();
-      const bool fp16_overflow = cuvs::neighbors::ivf_pq::helpers::estimate_fp16_overflow_centers(
-        res, index, pq.build_params.metric);
-      const double ms =
-        std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - t0).count();
-      RAFT_LOG_INFO("estimate max norm on the IVF-PQ cluster centers took %.3f ms (overflow=%d)", ms, int(fp16_overflow));
+      if (fp16_overflow) {
+        RAFT_LOG_WARN(
+          "IVF-PQ FP16 distance produced non-finite results on a probe search for this dataset -> "
+          "switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
+        pq.search_params.internal_distance_dtype = CUDA_R_32F;
+        pq.search_params.coarse_search_dtype     = CUDA_R_32F;
+      }
     }
-    // if (fp16_overflow) {
-    //   RAFT_LOG_WARN(
-    //     "IVF-PQ FP16 distance produced non-finite results on a probe search for this dataset -> "
-    //     "switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
-    //   pq.search_params.internal_distance_dtype = CUDA_R_32F;
-    //   pq.search_params.coarse_search_dtype     = CUDA_R_32F;
-    // }
-    // TODO delete this after benchmarking 3 approaches
-    //pq.search_params.internal_distance_dtype = CUDA_R_32F;
-    //pq.search_params.coarse_search_dtype     = CUDA_R_32F;
   }
 
   //
@@ -2267,28 +2237,6 @@ index<T, IdxT> build(
     }
   }
 
-  // Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets
-  // -> fall back to FP32.
-  if (auto* pq = std::get_if<cagra::graph_build_params::ivf_pq_params>(&knn_build_params)) {
-    const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F ||
-                                     pq->search_params.coarse_search_dtype == CUDA_R_16F;
-    raft::resource::sync_stream(res);
-    const auto detect_start  = std::chrono::steady_clock::now();
-    const bool first_20k_rows_check = ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric);
-    const double detect_ms =
-      std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - detect_start)
-        .count();
-    RAFT_LOG_INFO("1st attempt: estimate_fp16_overflow on first 20k rows took %.3f ms (overflow=%d)", detect_ms, int(first_20k_rows_check));
-    if (using_fp16_distance &&
-        ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric)) {
-      RAFT_LOG_WARN(
-        "IVF-PQ internal type of FP16 is insufficient for this dataset to avoid overflow in "
-        "distance computations -> "
-        "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32");
-      //pq->search_params.internal_distance_dtype = CUDA_R_32F;
-      //pq->search_params.coarse_search_dtype     = CUDA_R_32F;
-    }
-  }
   RAFT_EXPECTS(
     params.metric != cuvs::distance::DistanceType::BitwiseHamming ||
       std::holds_alternative<cagra::graph_build_params::iterative_search_params>(
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
index 8e4198ad6f..4cd239af5d 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh
@@ -28,155 +28,13 @@
 #include <algorithm>
 #include <cstdint>
 
-namespace cuvs::neighbors::ivf_pq::detail {
-
-struct is_non_finite_op {
-  __device__ auto operator()(float v) const -> bool { return isnan(v) || isinf(v); }
-};
-
-/**
- * Estimate max_i ||x_i||^2 over the dataset.
- */
-template <typename DataT, typename Accessor>
-float estimate_max_squared_norm(
-  raft::resources const& handle,
-  raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset)
-{
-  common::nvtx::range<common::nvtx::domain::cuvs> r("estimate_max_squared_norm");
-  auto stream          = raft::resource::get_cuda_stream(handle);
-  const int64_t n_rows = dataset.extent(0);
-  const int64_t dim    = dataset.extent(1);
-
-  int64_t n_sample = std::min<int64_t>(n_rows, 20000);
-
-  auto mr = raft::resource::get_workspace_resource_ref(handle);
-  auto sample =
-    raft::make_device_mdarray<DataT>(handle, mr, raft::make_extents<int64_t>(n_sample, dim));
-  raft::copy(sample.data_handle(),
-             dataset.data_handle(),
-             n_sample * dim,
-             raft::resource::get_cuda_stream(handle));
-
-  // Compute float-mapped squared norm
-  auto d_map_sq_norm = raft::make_device_vector<float, int64_t>(handle, n_sample);
-  raft::linalg::reduce<raft::Apply::ALONG_ROWS>(
-    handle,
-    raft::make_const_mdspan(sample.view()),
-    d_map_sq_norm.view(),
-    0.0f,
-    false,
-    [] __device__(DataT v, auto) -> float {
-      float e = cuvs::spatial::knn::detail::utils::mapping<float>{}(v);
-      return e * e;
-    },
-    raft::add_op(),
-    raft::identity_op());
-  // Compute max of squared norm vector
-  auto d_max_sq = raft::make_device_scalar<float>(handle, 0.0f);
-  raft::linalg::map_reduce(handle,
-                           raft::make_const_mdspan(d_map_sq_norm.view()),
-                           d_max_sq.view(),
-                           0.0f,
-                           raft::identity_op(),
-                           raft::max_op());
-
-  float max_sq = 0.0f;
-  raft::update_host(&max_sq, d_max_sq.data_handle(), 1, stream);
-  raft::resource::sync_stream(handle);
-
-  return max_sq;
-}
-
-}  // namespace cuvs::neighbors::ivf_pq::detail
-
 namespace cuvs::neighbors::ivf_pq::helpers {
 
-/**
- * @brief Estimate whether FP16 is insufficient for IVF-PQ's full-magnitude distance
- * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`).
- *
- * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i||
- * (estimated from a fraction of the dataset):
- *   - L2Expanded:     ||x - y||^2 = ||x||^2 + ||y||^2 - 2<x,y> <= (||x|| + ||y||)^2 <= 4 * R^2
- *   - InnerProduct:   |<x, y>|    <= ||x|| * ||y||                                  <=     R^2
- *   - CosineExpanded: data is L2-normalized, so |score| <= 1 and overflow is impossible.
- */
-template <typename DataT, typename Accessor>
-bool estimate_fp16_overflow(
-  raft::resources const& handle,
-  raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset,
-  cuvs::distance::DistanceType metric)
-{
-  common::nvtx::range<common::nvtx::domain::cuvs> r("estimate_fp16_overflow");
-  if (dataset.extent(0) == 0) { return false; }
-
-  float dist_factor = 1.0f;
-  switch (metric) {
-    case cuvs::distance::DistanceType::L2Expanded: dist_factor = 4.0f; break;
-    case cuvs::distance::DistanceType::CosineExpanded:
-      // Cosine similarity scores does normalization itself, so overflow won't happen
-      return false;
-    case cuvs::distance::DistanceType::InnerProduct: dist_factor = 1.0f; break;
-    default: RAFT_FAIL("Unsupported distance type for IVF-PQ search %d.", int(metric));
-  }
-
-  const float max_vector_sq_norm =
-    cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset);
-  const float max_distance_sq_norm = dist_factor * max_vector_sq_norm;
-
-  constexpr float kFp16Max = 65504.0f;
-  return max_distance_sq_norm > kFp16Max;
-}
-
-/**
- * @brief Estimate whether FP16 is insufficient, using the IVF-PQ cluster centers.
- */
-inline bool estimate_fp16_overflow_centers(raft::resources const& handle,
-                                           const cuvs::neighbors::ivf_pq::index<int64_t>& index,
-                                           cuvs::distance::DistanceType metric)
-{
-  common::nvtx::range<common::nvtx::domain::cuvs> r("estimate_fp16_overflow_centers");
-  const int64_t n_lists = index.n_lists();
-  const int64_t dim     = index.dim();
-  if (n_lists == 0 || dim == 0) { return false; }
-
-  float dist_factor = 1.0f;
-  switch (metric) {
-    case cuvs::distance::DistanceType::L2Expanded: dist_factor = 4.0f; break;
-    case cuvs::distance::DistanceType::CosineExpanded:
-      // Cosine similarity scores does normalization itself, so overflow won't happen
-      return false;
-    case cuvs::distance::DistanceType::InnerProduct: dist_factor = 1.0f; break;
-    default: RAFT_FAIL("Unsupported distance type for IVF-PQ search %d.", int(metric));
-  }
-
-  const int64_t dim_ext = index.dim_ext();
-  auto stream           = raft::resource::get_cuda_stream(handle);
-  auto mr               = raft::resource::get_workspace_resource_ref(handle);
-  auto centers_contig =
-    raft::make_device_mdarray<float>(handle, mr, raft::make_extents<int64_t>(n_lists, dim));
-  RAFT_CUDA_TRY(cudaMemcpy2DAsync(centers_contig.data_handle(),
-                                  dim * sizeof(float),
-                                  index.centers().data_handle(),
-                                  dim_ext * sizeof(float),
-                                  dim * sizeof(float),
-                                  n_lists,
-                                  cudaMemcpyDeviceToDevice,
-                                  stream));
-
-  const float max_vector_sq_norm = cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(
-    handle, raft::make_const_mdspan(centers_contig.view()));
-  const float max_distance_sq_norm = dist_factor * max_vector_sq_norm;
-
-  constexpr float kFp16Max = 65504.0f;
-  return max_distance_sq_norm > kFp16Max;
-}
-
 /**
  * @brief Detect whether FP16 internal distance dtypes overflow for this dataset during search.
  *
- * Runs a small probe search against an already-built IVF-PQ index with FP16 internal/coarse
- * distance dtypes, and reports whether any returned distance is non-finite (inf/NaN).
+ * Runs a small probe search against an already-built IVF-PQ index with current distance types,
+ * and reports whether any returned distance is non-finite (inf/NaN).
  */
 template <typename DataT, typename Accessor>
 bool detect_fp16_overflow(
@@ -185,7 +43,6 @@ bool detect_fp16_overflow(
   cuvs::neighbors::ivf_pq::search_params search_params,
   raft::mdspan<const DataT, raft::matrix_extent<int64_t>, raft::row_major, Accessor> dataset)
 {
-  common::nvtx::range<common::nvtx::domain::cuvs> r("detect_fp16_overflow");
   const int64_t n_rows = dataset.extent(0);
   if (n_rows == 0) { return false; }
 
@@ -215,10 +72,11 @@ bool detect_fp16_overflow(
                                   distances.view());
 
   const int64_t count       = n_sample * static_cast<int64_t>(top_k);
+  auto is_non_finite_op     = [] __device__(float v) { return isnan(v) || isinf(v); };
   const bool any_non_finite = thrust::any_of(raft::resource::get_thrust_policy(handle),
                                              distances.data_handle(),
                                              distances.data_handle() + count,
-                                             cuvs::neighbors::ivf_pq::detail::is_non_finite_op{});
+                                             is_non_finite_op);
   raft::resource::sync_stream(handle);
   return any_non_finite;
 }