From 8c808275bc83383798a31ef8896869576622d247 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 16 Jun 2026 07:06:43 +0000 Subject: [PATCH 01/14] IVF-PQ FP16 overflow detection --- .../neighbors/detail/cagra/cagra_build.cuh | 16 ++ .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 138 ++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index a7c15b4161..4ac33868fc 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -6,6 +6,7 @@ #include "../../../core/nvtx.hpp" #include "../../../preprocessing/quantize/vpq_build-ext.cuh" +#include "../../ivf_pq/ivf_pq_fp16_overflow.cuh" #include "graph_core.cuh" #include @@ -1637,6 +1638,21 @@ void build_knn_graph( pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded, "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported"); + // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32. + const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F || + pq.search_params.coarse_search_dtype == CUDA_R_16F; + if (using_fp16_distance && + ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) { + RAFT_LOG_WARN( + "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " + "distance computations -> " + "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); + pq.search_params.internal_distance_dtype = CUDA_R_32F; + pq.search_params.coarse_search_dtype = CUDA_R_32F; + // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim + // and therefore, less likely to overflow. + } + uint32_t node_degree = knn_graph.extent(1); raft::common::nvtx::range fun_scope( "cagra::build_knn_graph(%zu, %zu, %u)", diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh new file mode 100644 index 0000000000..e67e3a5d9c --- /dev/null +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -0,0 +1,138 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "../detail/ann_utils.cuh" // cuvs::spatial::knn::detail::utils::mapping + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuvs::neighbors::ivf_pq::detail { + +/** Reduce the maximum squared L2 norm over a set of row-major vectors of element type DataT. */ +template +__global__ void kern_max_squared_norm(const DataT* __restrict__ data, + int64_t n_rows, + int64_t dim, + float* __restrict__ out_max_sq) +{ + for (int64_t row = blockIdx.x * blockDim.x + threadIdx.x; row < n_rows; + row += static_cast(gridDim.x) * blockDim.x) { + const DataT* v = data + row * dim; + float sq = 0.0f; + for (int64_t d = 0; d < dim; d++) { + // + float e = cuvs::spatial::knn::detail::utils::mapping{}(v[d]); + printf("raw float(v[d]): %f, mapped: %f\n", static_cast(v[d]), e); + sq += e * e; + } + // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity + // between float and int. This is valid when values are non-negative, which is the case + // for squared norms. + // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming + // low contention. + atomicMax(reinterpret_cast(out_max_sq), __float_as_int(sq)); + } +} + +/** + * Estimate max_i ||x_i||^2 over the dataset by sampling a fraction of its rows. + * + * NOTE: sampling yields a *lower-bound* estimate of the true max norm, so a too-small fraction can + * miss outlier vectors. Increase `sample_fraction` (up to 1.0 for an exact, no-false-negative scan) + * if you observe overflow slipping through. + */ +template +float estimate_max_squared_norm( + raft::resources const& res, + raft::mdspan, raft::row_major, Accessor> dataset, + double sample_fraction) +{ + auto stream = raft::resource::get_cuda_stream(res); + const int64_t n_rows = dataset.extent(0); + const int64_t dim = dataset.extent(1); + + // Determine sample size based on fraction, with bounds + int64_t n_sample = static_cast(sample_fraction * static_cast(n_rows)); + if (n_rows <= 1000) { + n_sample = n_rows; // for small datasets, just use them all and skip the sampling overhead + } else if (n_rows > 100000) { + // cap the sample size to 100k for speed and keep memory use within the limited workspace + // resource + n_sample = 100000; + // + } + + // Sample from the dataset + auto mr = raft::resource::get_workspace_resource_ref(res); + auto sample = + raft::make_device_mdarray(res, mr, raft::make_extents(n_sample, dim)); + raft::matrix::sample_rows( + res, raft::random::RngState{137}, dataset, sample.view()); + + auto d_max_sq = raft::make_device_scalar(res, 0.0f); + constexpr int block_size = 256; + const int grid_size = static_cast((n_sample + block_size - 1) / block_size); + kern_max_squared_norm<<>>( + sample.data_handle(), n_sample, dim, d_max_sq.data_handle()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + float h_max_sq = 0.0f; + raft::update_host(&h_max_sq, d_max_sq.data_handle(), 1, stream); + raft::resource::sync_stream(res); + return h_max_sq; +} + +} // namespace cuvs::neighbors::ivf_pq::detail + +namespace cuvs::neighbors::ivf_pq::helpers { + +/** + * @brief Estimate whether FP16 is likely insufficient for IVF-PQ's full-magnitude distance + * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`). + * + * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i|| + * (estimated from a random sample of the dataset): + * - L2Expanded: ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <= (||x|| + ||y||)^2 <= 4 * R^2 + * - InnerProduct: || <= ||x|| * ||y|| <= R^2 + * - CosineExpanded: data is L2-normalized, so |score| <= 1 and overflow is impossible. + */ +template +bool estimate_fp16_overflow( + raft::resources const& res, + raft::mdspan, raft::row_major, Accessor> dataset, + cuvs::distance::DistanceType metric, + double sample_fraction = 0.01) +{ + // Cosine similarity scores does normalization itself, so overflow won't happen + if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } + + // FP16 largest finite value, with a defensive margin to also avoid precision loss near the limit. + constexpr float kFp16Max = 65504.0f; + constexpr float kFp16DefensiveMargin = 0.25f; + const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max; + + const float max_vector_sq_norm = + cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset, sample_fraction); + + const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded + ? 4.0f * max_vector_sq_norm + : max_vector_sq_norm; + + return max_distance_sq_norm > overflow_detect_threshold; +} + +} // namespace cuvs::neighbors::ivf_pq::helpers From d78f6644acd690967339fdc9d950b1d1477f8b75 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 16 Jun 2026 07:28:47 +0000 Subject: [PATCH 02/14] Explain the use of mapping --- cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index e67e3a5d9c..3315607a85 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -34,13 +34,14 @@ __global__ void kern_max_squared_norm(const DataT* __restrict__ data, const DataT* v = data + row * dim; float sq = 0.0f; for (int64_t d = 0; d < dim; d++) { - // + // internally, IVF-PQ distance computations will map the input data type (e.g. FP16) to float before + // doing arithmetic, so we need to apply the same mapping here to get a correct estimate of the squared norms + // instead of using static_cast(v[d]) float e = cuvs::spatial::knn::detail::utils::mapping{}(v[d]); - printf("raw float(v[d]): %f, mapped: %f\n", static_cast(v[d]), e); sq += e * e; } // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity - // between float and int. This is valid when values are non-negative, which is the case + // between float and int. This is valid when values are non-negative, which is the case // for squared norms. // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming // low contention. @@ -70,10 +71,8 @@ float estimate_max_squared_norm( if (n_rows <= 1000) { n_sample = n_rows; // for small datasets, just use them all and skip the sampling overhead } else if (n_rows > 100000) { - // cap the sample size to 100k for speed and keep memory use within the limited workspace - // resource + // cap the sample size to 100k for speed and keep memory use within the limited workspace resource n_sample = 100000; - // } // Sample from the dataset From edac3b92a0ff2672193c2a08d32f45bc296c8246 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 08:50:43 +0000 Subject: [PATCH 03/14] Move params fallback to params init phase --- .../neighbors/detail/cagra/cagra_build.cuh | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 4ac33868fc..6533e0b7a4 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1637,22 +1637,7 @@ void build_knn_graph( pq.build_params.metric == cuvs::distance::DistanceType::InnerProduct || pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded, "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported"); - - // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32. - const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F || - pq.search_params.coarse_search_dtype == CUDA_R_16F; - if (using_fp16_distance && - ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) { - RAFT_LOG_WARN( - "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " - "distance computations -> " - "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); - pq.search_params.internal_distance_dtype = CUDA_R_32F; - pq.search_params.coarse_search_dtype = CUDA_R_32F; - // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim - // and therefore, less likely to overflow. - } - + uint32_t node_degree = knn_graph.extent(1); raft::common::nvtx::range fun_scope( "cagra::build_knn_graph(%zu, %zu, %u)", @@ -2231,6 +2216,20 @@ index build( } else { RAFT_LOG_DEBUG("Selecting IVF-PQ solver"); knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric); + // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32. + const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F || + pq.search_params.coarse_search_dtype == CUDA_R_16F; + if (using_fp16_distance && + ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) { + RAFT_LOG_WARN( + "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " + "distance computations -> " + "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); + pq.search_params.internal_distance_dtype = CUDA_R_32F; + pq.search_params.coarse_search_dtype = CUDA_R_32F; + // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim + // and therefore, less likely to overflow. + } } } RAFT_EXPECTS( From 8c501152db9481b986e72ee279a7f97c3bc8cea7 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 08:51:12 +0000 Subject: [PATCH 04/14] Choose sampling equation --- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 3315607a85..4c0bc6be94 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -59,21 +59,20 @@ __global__ void kern_max_squared_norm(const DataT* __restrict__ data, template float estimate_max_squared_norm( raft::resources const& res, - raft::mdspan, raft::row_major, Accessor> dataset, - double sample_fraction) + raft::mdspan, raft::row_major, Accessor> dataset) { auto stream = raft::resource::get_cuda_stream(res); const int64_t n_rows = dataset.extent(0); const int64_t dim = dataset.extent(1); - // Determine sample size based on fraction, with bounds - int64_t n_sample = static_cast(sample_fraction * static_cast(n_rows)); - if (n_rows <= 1000) { - n_sample = n_rows; // for small datasets, just use them all and skip the sampling overhead - } else if (n_rows > 100000) { - // cap the sample size to 100k for speed and keep memory use within the limited workspace resource - n_sample = 100000; - } + // Determine sample size based on a smooth saturation equation. The equation satisfies: + // - n_sample is always less than or equal to n_rows + // - n_sample saturates to kSaturation when n_rows is inf + // - n_sample increases fast for small n_rows and slow to saturation for large n_rows + // Idea: we greedily sample most of the dataset when it is small-sized, and cap it to kSaturation + // when dataset size grows very large. + constexpr int64_t kSaturation = 100000; + int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation); // Sample from the dataset auto mr = raft::resource::get_workspace_resource_ref(res); @@ -113,8 +112,7 @@ template bool estimate_fp16_overflow( raft::resources const& res, raft::mdspan, raft::row_major, Accessor> dataset, - cuvs::distance::DistanceType metric, - double sample_fraction = 0.01) + cuvs::distance::DistanceType metric) { // Cosine similarity scores does normalization itself, so overflow won't happen if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } @@ -125,7 +123,7 @@ bool estimate_fp16_overflow( const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max; const float max_vector_sq_norm = - cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset, sample_fraction); + cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset); const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded ? 4.0f * max_vector_sq_norm From f3e3dd8e58a7936229c232b62ab0091f50f703be Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 10:18:20 +0000 Subject: [PATCH 05/14] Use raft built-in kernels and remove manual one. --- .../neighbors/detail/cagra/cagra_build.cuh | 34 ++++--- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 98 +++++++++---------- 2 files changed, 64 insertions(+), 68 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 6533e0b7a4..99ae2f36b2 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1637,7 +1637,7 @@ void build_knn_graph( pq.build_params.metric == cuvs::distance::DistanceType::InnerProduct || pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded, "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported"); - + uint32_t node_degree = knn_graph.extent(1); raft::common::nvtx::range fun_scope( "cagra::build_knn_graph(%zu, %zu, %u)", @@ -2216,20 +2216,24 @@ index build( } else { RAFT_LOG_DEBUG("Selecting IVF-PQ solver"); knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric); - // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32. - const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F || - pq.search_params.coarse_search_dtype == CUDA_R_16F; - if (using_fp16_distance && - ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) { - RAFT_LOG_WARN( - "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " - "distance computations -> " - "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); - pq.search_params.internal_distance_dtype = CUDA_R_32F; - pq.search_params.coarse_search_dtype = CUDA_R_32F; - // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim - // and therefore, less likely to overflow. - } + } + } + + // Guard against potential FP16 distance overflow for large-magnitude datasets. Applies to any + // IVF-PQ graph build (auto-selected above or explicitly set in params) -> fall back to FP32. + if (auto* pq = std::get_if(&knn_build_params)) { + const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F || + pq->search_params.coarse_search_dtype == CUDA_R_16F; + if (using_fp16_distance && + ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric)) { + RAFT_LOG_WARN( + "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " + "distance computations -> " + "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); + pq->search_params.internal_distance_dtype = CUDA_R_32F; + pq->search_params.coarse_search_dtype = CUDA_R_32F; + // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of + // pq_dim and therefore, less likely to overflow. } } RAFT_EXPECTS( diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 4c0bc6be94..a184fb9701 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -11,9 +11,12 @@ #include #include +#include #include #include #include +#include +#include #include #include #include @@ -22,46 +25,15 @@ namespace cuvs::neighbors::ivf_pq::detail { -/** Reduce the maximum squared L2 norm over a set of row-major vectors of element type DataT. */ -template -__global__ void kern_max_squared_norm(const DataT* __restrict__ data, - int64_t n_rows, - int64_t dim, - float* __restrict__ out_max_sq) -{ - for (int64_t row = blockIdx.x * blockDim.x + threadIdx.x; row < n_rows; - row += static_cast(gridDim.x) * blockDim.x) { - const DataT* v = data + row * dim; - float sq = 0.0f; - for (int64_t d = 0; d < dim; d++) { - // internally, IVF-PQ distance computations will map the input data type (e.g. FP16) to float before - // doing arithmetic, so we need to apply the same mapping here to get a correct estimate of the squared norms - // instead of using static_cast(v[d]) - float e = cuvs::spatial::knn::detail::utils::mapping{}(v[d]); - sq += e * e; - } - // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity - // between float and int. This is valid when values are non-negative, which is the case - // for squared norms. - // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming - // low contention. - atomicMax(reinterpret_cast(out_max_sq), __float_as_int(sq)); - } -} - /** - * Estimate max_i ||x_i||^2 over the dataset by sampling a fraction of its rows. - * - * NOTE: sampling yields a *lower-bound* estimate of the true max norm, so a too-small fraction can - * miss outlier vectors. Increase `sample_fraction` (up to 1.0 for an exact, no-false-negative scan) - * if you observe overflow slipping through. + * Estimate max_i ||x_i||^2 over the dataset by sampling from its rows. */ template float estimate_max_squared_norm( - raft::resources const& res, + raft::resources const& handle, raft::mdspan, raft::row_major, Accessor> dataset) { - auto stream = raft::resource::get_cuda_stream(res); + auto stream = raft::resource::get_cuda_stream(handle); const int64_t n_rows = dataset.extent(0); const int64_t dim = dataset.extent(1); @@ -69,29 +41,47 @@ float estimate_max_squared_norm( // - n_sample is always less than or equal to n_rows // - n_sample saturates to kSaturation when n_rows is inf // - n_sample increases fast for small n_rows and slow to saturation for large n_rows - // Idea: we greedily sample most of the dataset when it is small-sized, and cap it to kSaturation - // when dataset size grows very large. + // Idea: we sample most of the dataset when it is small-sized, and only a small fraction + // (up to a maximum/saturation number) when the dataset size grows large. + // kSaturation is selected as a compromise between runtime and recall. constexpr int64_t kSaturation = 100000; int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation); // Sample from the dataset - auto mr = raft::resource::get_workspace_resource_ref(res); + auto mr = raft::resource::get_workspace_resource_ref(handle); auto sample = - raft::make_device_mdarray(res, mr, raft::make_extents(n_sample, dim)); + raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); raft::matrix::sample_rows( - res, raft::random::RngState{137}, dataset, sample.view()); - - auto d_max_sq = raft::make_device_scalar(res, 0.0f); - constexpr int block_size = 256; - const int grid_size = static_cast((n_sample + block_size - 1) / block_size); - kern_max_squared_norm<<>>( - sample.data_handle(), n_sample, dim, d_max_sq.data_handle()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - float h_max_sq = 0.0f; - raft::update_host(&h_max_sq, d_max_sq.data_handle(), 1, stream); - raft::resource::sync_stream(res); - return h_max_sq; + handle, raft::random::RngState{137}, dataset, sample.view()); + + // Compute mapped squared norm + auto d_map_sq_norm = raft::make_device_vector(handle, n_sample); + raft::linalg::reduce( + handle, + raft::make_const_mdspan(sample.view()), + d_map_sq_norm.view(), + 0.0f, + false, + [] __device__(DataT v, auto) -> float { + float e = cuvs::spatial::knn::detail::utils::mapping{}(v); + return e * e; + }, + raft::add_op(), + raft::identity_op()); + // Compute max of squared norm vector + auto d_max_sq = raft::make_device_scalar(handle, 0.0f); + raft::linalg::map_reduce(handle, + raft::make_const_mdspan(d_map_sq_norm.view()), + d_max_sq.view(), + 0.0f, + raft::identity_op(), + raft::max_op()); + + float max_sq = 0.0f; + raft::update_host(&max_sq, d_max_sq.data_handle(), 1, stream); + raft::resource::sync_stream(handle); + + return max_sq; } } // namespace cuvs::neighbors::ivf_pq::detail @@ -110,10 +100,12 @@ namespace cuvs::neighbors::ivf_pq::helpers { */ template bool estimate_fp16_overflow( - raft::resources const& res, + raft::resources const& handle, raft::mdspan, raft::row_major, Accessor> dataset, cuvs::distance::DistanceType metric) { + if (dataset.extent(0) == 0) { return false; } + // Cosine similarity scores does normalization itself, so overflow won't happen if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } @@ -123,7 +115,7 @@ bool estimate_fp16_overflow( const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max; const float max_vector_sq_norm = - cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset); + cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset); const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded ? 4.0f * max_vector_sq_norm From 4d59c975734123d4462e6ef229114d1fc7b85466 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 10:55:38 +0000 Subject: [PATCH 06/14] Add kDelay param to adjust the sampling fraction growth speed --- cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index a184fb9701..b50e7e9a4e 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -43,9 +44,13 @@ float estimate_max_squared_norm( // - n_sample increases fast for small n_rows and slow to saturation for large n_rows // Idea: we sample most of the dataset when it is small-sized, and only a small fraction // (up to a maximum/saturation number) when the dataset size grows large. - // kSaturation is selected as a compromise between runtime and recall. + // kSaturation and kDelay are selected as a compromise between runtime and outlier recall. constexpr int64_t kSaturation = 100000; - int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation); + constexpr int64_t kDelay = kSaturation * 10; + RAFT_EXPECTS(kDelay >= kSaturation, + "kDelay must not be smaller than kSaturation so that n_sample is always less than " + "or equal to n_rows"); + int64_t n_sample = (n_rows * kSaturation + (n_rows + kDelay - 1)) / (n_rows + kDelay); // Sample from the dataset auto mr = raft::resource::get_workspace_resource_ref(handle); From c66b748b8599ef6cca43a63f54f8f8b617566fee Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 11:24:33 +0000 Subject: [PATCH 07/14] Reduce kSaturation to 20k (just sufficient to detect FP16 overflow in SIFT 1M) --- cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index b50e7e9a4e..99f2c9e60c 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -45,7 +45,7 @@ float estimate_max_squared_norm( // Idea: we sample most of the dataset when it is small-sized, and only a small fraction // (up to a maximum/saturation number) when the dataset size grows large. // kSaturation and kDelay are selected as a compromise between runtime and outlier recall. - constexpr int64_t kSaturation = 100000; + constexpr int64_t kSaturation = 20000; constexpr int64_t kDelay = kSaturation * 10; RAFT_EXPECTS(kDelay >= kSaturation, "kDelay must not be smaller than kSaturation so that n_sample is always less than " From adbe0a93379d52363215b6a81c88528dc7fdd631 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 12:18:55 +0000 Subject: [PATCH 08/14] Explain uniform sampling decision --- cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 99f2c9e60c..2f99da9e06 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -27,7 +27,12 @@ namespace cuvs::neighbors::ivf_pq::detail { /** - * Estimate max_i ||x_i||^2 over the dataset by sampling from its rows. + * Estimate max_i ||x_i||^2 over the dataset by uniformly sampling a fraction from it. + * + * Decision: Uniform sampling is selected as it is sufficient to detect FP16 overflow in + * the datasets, where overflow-causing large vectors are frequent (e.g. SIFT 1M). For + * dataset with rare large outliers, we might preferably sample biasedly towards large vectors, + * e.g. via top-k selection over the vectors with largest L_inf norm. */ template float estimate_max_squared_norm( @@ -59,7 +64,7 @@ float estimate_max_squared_norm( raft::matrix::sample_rows( handle, raft::random::RngState{137}, dataset, sample.view()); - // Compute mapped squared norm + // Compute float-mapped squared norm auto d_map_sq_norm = raft::make_device_vector(handle, n_sample); raft::linalg::reduce( handle, From 891098b71ba22bba52782f550995e974d1f142a2 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Fri, 26 Jun 2026 12:55:54 +0000 Subject: [PATCH 09/14] Avoid expensive random sampling over the full dataset. Sampling is not necessary anyway. --- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 2f99da9e06..a01cf8e220 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -18,8 +18,7 @@ #include #include #include -#include -#include +#include #include #include @@ -27,18 +26,14 @@ namespace cuvs::neighbors::ivf_pq::detail { /** - * Estimate max_i ||x_i||^2 over the dataset by uniformly sampling a fraction from it. - * - * Decision: Uniform sampling is selected as it is sufficient to detect FP16 overflow in - * the datasets, where overflow-causing large vectors are frequent (e.g. SIFT 1M). For - * dataset with rare large outliers, we might preferably sample biasedly towards large vectors, - * e.g. via top-k selection over the vectors with largest L_inf norm. + * Estimate max_i ||x_i||^2 over the dataset. */ template float estimate_max_squared_norm( raft::resources const& handle, raft::mdspan, raft::row_major, Accessor> dataset) { + common::nvtx::range r("estimate_max_squared_norm"); auto stream = raft::resource::get_cuda_stream(handle); const int64_t n_rows = dataset.extent(0); const int64_t dim = dataset.extent(1); @@ -47,7 +42,7 @@ float estimate_max_squared_norm( // - n_sample is always less than or equal to n_rows // - n_sample saturates to kSaturation when n_rows is inf // - n_sample increases fast for small n_rows and slow to saturation for large n_rows - // Idea: we sample most of the dataset when it is small-sized, and only a small fraction + // Idea: we examine most of the dataset when it is small-sized, and only a small fraction // (up to a maximum/saturation number) when the dataset size grows large. // kSaturation and kDelay are selected as a compromise between runtime and outlier recall. constexpr int64_t kSaturation = 20000; @@ -55,14 +50,11 @@ float estimate_max_squared_norm( RAFT_EXPECTS(kDelay >= kSaturation, "kDelay must not be smaller than kSaturation so that n_sample is always less than " "or equal to n_rows"); - int64_t n_sample = (n_rows * kSaturation + (n_rows + kDelay - 1)) / (n_rows + kDelay); + int64_t n_sample = raft::ceildiv(n_rows * kSaturation, n_rows + kDelay); - // Sample from the dataset auto mr = raft::resource::get_workspace_resource_ref(handle); - auto sample = - raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); - raft::matrix::sample_rows( - handle, raft::random::RngState{137}, dataset, sample.view()); + auto sample = raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); + raft::copy(sample.data_handle(), dataset.data_handle(), n_sample * dim, raft::resource::get_cuda_stream(handle)); // Compute float-mapped squared norm auto d_map_sq_norm = raft::make_device_vector(handle, n_sample); @@ -103,7 +95,7 @@ namespace cuvs::neighbors::ivf_pq::helpers { * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`). * * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i|| - * (estimated from a random sample of the dataset): + * (estimated from a fraction of the dataset): * - L2Expanded: ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <= (||x|| + ||y||)^2 <= 4 * R^2 * - InnerProduct: || <= ||x|| * ||y|| <= R^2 * - CosineExpanded: data is L2-normalized, so |score| <= 1 and overflow is impossible. From 6c124c844aaa8eede31c5e248494c8479305847d Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Fri, 26 Jun 2026 12:57:12 +0000 Subject: [PATCH 10/14] Edit comments --- cpp/src/neighbors/detail/cagra/cagra_build.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 99ae2f36b2..cee49c8630 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -2219,8 +2219,8 @@ index build( } } - // Guard against potential FP16 distance overflow for large-magnitude datasets. Applies to any - // IVF-PQ graph build (auto-selected above or explicitly set in params) -> fall back to FP32. + // Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets + // -> fall back to FP32. if (auto* pq = std::get_if(&knn_build_params)) { const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F || pq->search_params.coarse_search_dtype == CUDA_R_16F; From ac65897434629787899dad2046ed7bbf62e38d23 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 29 Jun 2026 12:35:03 +0000 Subject: [PATCH 11/14] Fix sampling number and remove defensive margin --- .../neighbors/detail/cagra/cagra_build.cuh | 4 +-- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 32 ++++++------------- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index cee49c8630..a3ab54d376 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -2219,7 +2219,7 @@ index build( } } - // Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets + // Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets // -> fall back to FP32. if (auto* pq = std::get_if(&knn_build_params)) { const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F || diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index a01cf8e220..58f35a8c35 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ @@ -38,23 +38,15 @@ float estimate_max_squared_norm( const int64_t n_rows = dataset.extent(0); const int64_t dim = dataset.extent(1); - // Determine sample size based on a smooth saturation equation. The equation satisfies: - // - n_sample is always less than or equal to n_rows - // - n_sample saturates to kSaturation when n_rows is inf - // - n_sample increases fast for small n_rows and slow to saturation for large n_rows - // Idea: we examine most of the dataset when it is small-sized, and only a small fraction - // (up to a maximum/saturation number) when the dataset size grows large. - // kSaturation and kDelay are selected as a compromise between runtime and outlier recall. - constexpr int64_t kSaturation = 20000; - constexpr int64_t kDelay = kSaturation * 10; - RAFT_EXPECTS(kDelay >= kSaturation, - "kDelay must not be smaller than kSaturation so that n_sample is always less than " - "or equal to n_rows"); - int64_t n_sample = raft::ceildiv(n_rows * kSaturation, n_rows + kDelay); + int64_t n_sample = std::min(n_rows, 20000); auto mr = raft::resource::get_workspace_resource_ref(handle); - auto sample = raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); - raft::copy(sample.data_handle(), dataset.data_handle(), n_sample * dim, raft::resource::get_cuda_stream(handle)); + auto sample = + raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); + raft::copy(sample.data_handle(), + dataset.data_handle(), + n_sample * dim, + raft::resource::get_cuda_stream(handle)); // Compute float-mapped squared norm auto d_map_sq_norm = raft::make_device_vector(handle, n_sample); @@ -111,11 +103,6 @@ bool estimate_fp16_overflow( // Cosine similarity scores does normalization itself, so overflow won't happen if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } - // FP16 largest finite value, with a defensive margin to also avoid precision loss near the limit. - constexpr float kFp16Max = 65504.0f; - constexpr float kFp16DefensiveMargin = 0.25f; - const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max; - const float max_vector_sq_norm = cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset); @@ -123,7 +110,8 @@ bool estimate_fp16_overflow( ? 4.0f * max_vector_sq_norm : max_vector_sq_norm; - return max_distance_sq_norm > overflow_detect_threshold; + constexpr float kFp16Max = 65504.0f; + return max_distance_sq_norm > kFp16Max; } } // namespace cuvs::neighbors::ivf_pq::helpers From 1d44ffabe27d5a56b2ddd7a394c9960c81539338 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 29 Jun 2026 13:58:43 +0000 Subject: [PATCH 12/14] Refactor for explicitness of supported distance types --- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 58f35a8c35..7c7dfe9fb1 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -100,15 +100,19 @@ bool estimate_fp16_overflow( { if (dataset.extent(0) == 0) { return false; } - // Cosine similarity scores does normalization itself, so overflow won't happen - if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } + float dist_factor = 1.0f; + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: dist_factor = 4.0f; break; + case cuvs::distance::DistanceType::CosineExpanded: + // Cosine similarity scores does normalization itself, so overflow won't happen + return false; + case cuvs::distance::DistanceType::InnerProduct: dist_factor = 1.0f; break; + default: RAFT_FAIL("Unsupported distance type for IVF-PQ search %d.", int(metric)); + } const float max_vector_sq_norm = cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset); - - const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded - ? 4.0f * max_vector_sq_norm - : max_vector_sq_norm; + const float max_distance_sq_norm = dist_factor * max_vector_sq_norm; constexpr float kFp16Max = 65504.0f; return max_distance_sq_norm > kFp16Max; From b0b2f856856e0836347bf3d452c57c752c5e07cd Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Wed, 1 Jul 2026 02:27:57 -0700 Subject: [PATCH 13/14] Benchmark 3 approaches of overflow detection - estimate max norm on the first 20k rows - estimate max norm on the cluster centers - probe search and detect invalid distances --- .../neighbors/detail/cagra/cagra_build.cuh | 65 ++++++++++- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 107 +++++++++++++++++- 2 files changed, 165 insertions(+), 7 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index a3ab54d376..508cf46b0e 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1,6 +1,6 @@ /* * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 + * reserved. SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -1665,6 +1665,54 @@ void build_knn_graph( RAFT_LOG_DEBUG("# Building IVF-PQ index %s", model_name.c_str()); auto index = cuvs::neighbors::ivf_pq::build(res, pq.build_params, dataset); + // Empirically detect FP16 distance overflow on the just-built index: run a small FP16 probe + // search and downgrade the internal/coarse dtypes to FP32 if any distance comes back non-finite. + // This observes the actual computation, so it is agnostic of the selected distance type. + if (pq.search_params.internal_distance_dtype == CUDA_R_16F || + pq.search_params.coarse_search_dtype == CUDA_R_16F) { + // --- Benchmark fp16 overflow detection: probe search with 128 queries --- + { + raft::resource::sync_stream(res); + const auto t0 = std::chrono::steady_clock::now(); + const bool fp16_overflow = cuvs::neighbors::ivf_pq::helpers::detect_fp16_overflow( + res, index, pq.search_params, dataset); + const double ms = + std::chrono::duration(std::chrono::steady_clock::now() - t0) + .count(); + RAFT_LOG_INFO("detect_fp16_overflow (IVF-PQ search) took %.3f ms (overflow=%d)", ms, int(fp16_overflow)); + } + // --- Benchmark fp16 overflow detection: estimate max norm on the first 20k rows --- + { + raft::resource::sync_stream(res); + const auto t0 = std::chrono::steady_clock::now(); + const bool fp16_overflow = cuvs::neighbors::ivf_pq::helpers::estimate_fp16_overflow( + res, dataset, pq.build_params.metric); + const double ms = + std::chrono::duration(std::chrono::steady_clock::now() - t0).count(); + RAFT_LOG_INFO("estimate max norm on the first 20k rows took %.3f ms (overflow=%d)", ms, int(fp16_overflow)); + } + // --- Benchmark fp16 overflow detection: estimate max norm on the IVF-PQ cluster centers --- + { + raft::resource::sync_stream(res); + const auto t0 = std::chrono::steady_clock::now(); + const bool fp16_overflow = cuvs::neighbors::ivf_pq::helpers::estimate_fp16_overflow_centers( + res, index, pq.build_params.metric); + const double ms = + std::chrono::duration(std::chrono::steady_clock::now() - t0).count(); + RAFT_LOG_INFO("estimate max norm on the IVF-PQ cluster centers took %.3f ms (overflow=%d)", ms, int(fp16_overflow)); + } + // if (fp16_overflow) { + // RAFT_LOG_WARN( + // "IVF-PQ FP16 distance produced non-finite results on a probe search for this dataset -> " + // "switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); + // pq.search_params.internal_distance_dtype = CUDA_R_32F; + // pq.search_params.coarse_search_dtype = CUDA_R_32F; + // } + // TODO delete this after benchmarking 3 approaches + //pq.search_params.internal_distance_dtype = CUDA_R_32F; + //pq.search_params.coarse_search_dtype = CUDA_R_32F; + } + // // search top (k + 1) neighbors // @@ -2224,16 +2272,21 @@ index build( if (auto* pq = std::get_if(&knn_build_params)) { const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F || pq->search_params.coarse_search_dtype == CUDA_R_16F; + raft::resource::sync_stream(res); + const auto detect_start = std::chrono::steady_clock::now(); + const bool first_20k_rows_check = ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric); + const double detect_ms = + std::chrono::duration(std::chrono::steady_clock::now() - detect_start) + .count(); + RAFT_LOG_INFO("1st attempt: estimate_fp16_overflow on first 20k rows took %.3f ms (overflow=%d)", detect_ms, int(first_20k_rows_check)); if (using_fp16_distance && ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric)) { RAFT_LOG_WARN( - "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " + "IVF-PQ internal type of FP16 is insufficient for this dataset to avoid overflow in " "distance computations -> " "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); - pq->search_params.internal_distance_dtype = CUDA_R_32F; - pq->search_params.coarse_search_dtype = CUDA_R_32F; - // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of - // pq_dim and therefore, less likely to overflow. + //pq->search_params.internal_distance_dtype = CUDA_R_32F; + //pq->search_params.coarse_search_dtype = CUDA_R_32F; } } RAFT_EXPECTS( diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 7c7dfe9fb1..8e4198ad6f 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -8,6 +8,7 @@ #include "../detail/ann_utils.cuh" // cuvs::spatial::knn::detail::utils::mapping #include +#include #include #include @@ -15,16 +16,24 @@ #include #include #include +#include #include #include #include #include #include +#include + +#include #include namespace cuvs::neighbors::ivf_pq::detail { +struct is_non_finite_op { + __device__ auto operator()(float v) const -> bool { return isnan(v) || isinf(v); } +}; + /** * Estimate max_i ||x_i||^2 over the dataset. */ @@ -83,7 +92,7 @@ float estimate_max_squared_norm( namespace cuvs::neighbors::ivf_pq::helpers { /** - * @brief Estimate whether FP16 is likely insufficient for IVF-PQ's full-magnitude distance + * @brief Estimate whether FP16 is insufficient for IVF-PQ's full-magnitude distance * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`). * * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i|| @@ -98,6 +107,7 @@ bool estimate_fp16_overflow( raft::mdspan, raft::row_major, Accessor> dataset, cuvs::distance::DistanceType metric) { + common::nvtx::range r("estimate_fp16_overflow"); if (dataset.extent(0) == 0) { return false; } float dist_factor = 1.0f; @@ -118,4 +128,99 @@ bool estimate_fp16_overflow( return max_distance_sq_norm > kFp16Max; } +/** + * @brief Estimate whether FP16 is insufficient, using the IVF-PQ cluster centers. + */ +inline bool estimate_fp16_overflow_centers(raft::resources const& handle, + const cuvs::neighbors::ivf_pq::index& index, + cuvs::distance::DistanceType metric) +{ + common::nvtx::range r("estimate_fp16_overflow_centers"); + const int64_t n_lists = index.n_lists(); + const int64_t dim = index.dim(); + if (n_lists == 0 || dim == 0) { return false; } + + float dist_factor = 1.0f; + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: dist_factor = 4.0f; break; + case cuvs::distance::DistanceType::CosineExpanded: + // Cosine similarity scores does normalization itself, so overflow won't happen + return false; + case cuvs::distance::DistanceType::InnerProduct: dist_factor = 1.0f; break; + default: RAFT_FAIL("Unsupported distance type for IVF-PQ search %d.", int(metric)); + } + + const int64_t dim_ext = index.dim_ext(); + auto stream = raft::resource::get_cuda_stream(handle); + auto mr = raft::resource::get_workspace_resource_ref(handle); + auto centers_contig = + raft::make_device_mdarray(handle, mr, raft::make_extents(n_lists, dim)); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(centers_contig.data_handle(), + dim * sizeof(float), + index.centers().data_handle(), + dim_ext * sizeof(float), + dim * sizeof(float), + n_lists, + cudaMemcpyDeviceToDevice, + stream)); + + const float max_vector_sq_norm = cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm( + handle, raft::make_const_mdspan(centers_contig.view())); + const float max_distance_sq_norm = dist_factor * max_vector_sq_norm; + + constexpr float kFp16Max = 65504.0f; + return max_distance_sq_norm > kFp16Max; +} + +/** + * @brief Detect whether FP16 internal distance dtypes overflow for this dataset during search. + * + * Runs a small probe search against an already-built IVF-PQ index with FP16 internal/coarse + * distance dtypes, and reports whether any returned distance is non-finite (inf/NaN). + */ +template +bool detect_fp16_overflow( + raft::resources const& handle, + const cuvs::neighbors::ivf_pq::index& index, + cuvs::neighbors::ivf_pq::search_params search_params, + raft::mdspan, raft::row_major, Accessor> dataset) +{ + common::nvtx::range r("detect_fp16_overflow"); + const int64_t n_rows = dataset.extent(0); + if (n_rows == 0) { return false; } + + auto stream = raft::resource::get_cuda_stream(handle); + const int64_t dim = dataset.extent(1); + + constexpr int64_t kMaxSampleQueries = 128; + constexpr uint32_t kProbeTopK = 32; + const int64_t n_sample = std::min(n_rows, kMaxSampleQueries); + const uint32_t top_k = std::min(static_cast(n_rows), kProbeTopK); + + auto mr = raft::resource::get_workspace_resource_ref(handle); + auto queries = + raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); + raft::copy(queries.data_handle(), dataset.data_handle(), n_sample * dim, stream); + + auto neighbors = + raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, top_k)); + auto distances = + raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, top_k)); + + cuvs::neighbors::ivf_pq::search(handle, + search_params, + index, + raft::make_const_mdspan(queries.view()), + neighbors.view(), + distances.view()); + + const int64_t count = n_sample * static_cast(top_k); + const bool any_non_finite = thrust::any_of(raft::resource::get_thrust_policy(handle), + distances.data_handle(), + distances.data_handle() + count, + cuvs::neighbors::ivf_pq::detail::is_non_finite_op{}); + raft::resource::sync_stream(handle); + return any_non_finite; +} + } // namespace cuvs::neighbors::ivf_pq::helpers From 85f7e571b1d417df463351bd168790a45774170d Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Thu, 2 Jul 2026 11:46:37 +0000 Subject: [PATCH 14/14] Select probe search approach --- .../neighbors/detail/cagra/cagra_build.cuh | 66 +------- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 150 +----------------- 2 files changed, 11 insertions(+), 205 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 508cf46b0e..f118146fba 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1670,47 +1670,17 @@ void build_knn_graph( // This observes the actual computation, so it is agnostic of the selected distance type. if (pq.search_params.internal_distance_dtype == CUDA_R_16F || pq.search_params.coarse_search_dtype == CUDA_R_16F) { - // --- Benchmark fp16 overflow detection: probe search with 128 queries --- { - raft::resource::sync_stream(res); - const auto t0 = std::chrono::steady_clock::now(); const bool fp16_overflow = cuvs::neighbors::ivf_pq::helpers::detect_fp16_overflow( res, index, pq.search_params, dataset); - const double ms = - std::chrono::duration(std::chrono::steady_clock::now() - t0) - .count(); - RAFT_LOG_INFO("detect_fp16_overflow (IVF-PQ search) took %.3f ms (overflow=%d)", ms, int(fp16_overflow)); - } - // --- Benchmark fp16 overflow detection: estimate max norm on the first 20k rows --- - { - raft::resource::sync_stream(res); - const auto t0 = std::chrono::steady_clock::now(); - const bool fp16_overflow = cuvs::neighbors::ivf_pq::helpers::estimate_fp16_overflow( - res, dataset, pq.build_params.metric); - const double ms = - std::chrono::duration(std::chrono::steady_clock::now() - t0).count(); - RAFT_LOG_INFO("estimate max norm on the first 20k rows took %.3f ms (overflow=%d)", ms, int(fp16_overflow)); - } - // --- Benchmark fp16 overflow detection: estimate max norm on the IVF-PQ cluster centers --- - { - raft::resource::sync_stream(res); - const auto t0 = std::chrono::steady_clock::now(); - const bool fp16_overflow = cuvs::neighbors::ivf_pq::helpers::estimate_fp16_overflow_centers( - res, index, pq.build_params.metric); - const double ms = - std::chrono::duration(std::chrono::steady_clock::now() - t0).count(); - RAFT_LOG_INFO("estimate max norm on the IVF-PQ cluster centers took %.3f ms (overflow=%d)", ms, int(fp16_overflow)); + if (fp16_overflow) { + RAFT_LOG_WARN( + "IVF-PQ FP16 distance produced non-finite results on a probe search for this dataset -> " + "switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); + pq.search_params.internal_distance_dtype = CUDA_R_32F; + pq.search_params.coarse_search_dtype = CUDA_R_32F; + } } - // if (fp16_overflow) { - // RAFT_LOG_WARN( - // "IVF-PQ FP16 distance produced non-finite results on a probe search for this dataset -> " - // "switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); - // pq.search_params.internal_distance_dtype = CUDA_R_32F; - // pq.search_params.coarse_search_dtype = CUDA_R_32F; - // } - // TODO delete this after benchmarking 3 approaches - //pq.search_params.internal_distance_dtype = CUDA_R_32F; - //pq.search_params.coarse_search_dtype = CUDA_R_32F; } // @@ -2267,28 +2237,6 @@ index build( } } - // Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets - // -> fall back to FP32. - if (auto* pq = std::get_if(&knn_build_params)) { - const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F || - pq->search_params.coarse_search_dtype == CUDA_R_16F; - raft::resource::sync_stream(res); - const auto detect_start = std::chrono::steady_clock::now(); - const bool first_20k_rows_check = ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric); - const double detect_ms = - std::chrono::duration(std::chrono::steady_clock::now() - detect_start) - .count(); - RAFT_LOG_INFO("1st attempt: estimate_fp16_overflow on first 20k rows took %.3f ms (overflow=%d)", detect_ms, int(first_20k_rows_check)); - if (using_fp16_distance && - ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric)) { - RAFT_LOG_WARN( - "IVF-PQ internal type of FP16 is insufficient for this dataset to avoid overflow in " - "distance computations -> " - "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); - //pq->search_params.internal_distance_dtype = CUDA_R_32F; - //pq->search_params.coarse_search_dtype = CUDA_R_32F; - } - } RAFT_EXPECTS( params.metric != cuvs::distance::DistanceType::BitwiseHamming || std::holds_alternative( diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 8e4198ad6f..4cd239af5d 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -28,155 +28,13 @@ #include #include -namespace cuvs::neighbors::ivf_pq::detail { - -struct is_non_finite_op { - __device__ auto operator()(float v) const -> bool { return isnan(v) || isinf(v); } -}; - -/** - * Estimate max_i ||x_i||^2 over the dataset. - */ -template -float estimate_max_squared_norm( - raft::resources const& handle, - raft::mdspan, raft::row_major, Accessor> dataset) -{ - common::nvtx::range r("estimate_max_squared_norm"); - auto stream = raft::resource::get_cuda_stream(handle); - const int64_t n_rows = dataset.extent(0); - const int64_t dim = dataset.extent(1); - - int64_t n_sample = std::min(n_rows, 20000); - - auto mr = raft::resource::get_workspace_resource_ref(handle); - auto sample = - raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); - raft::copy(sample.data_handle(), - dataset.data_handle(), - n_sample * dim, - raft::resource::get_cuda_stream(handle)); - - // Compute float-mapped squared norm - auto d_map_sq_norm = raft::make_device_vector(handle, n_sample); - raft::linalg::reduce( - handle, - raft::make_const_mdspan(sample.view()), - d_map_sq_norm.view(), - 0.0f, - false, - [] __device__(DataT v, auto) -> float { - float e = cuvs::spatial::knn::detail::utils::mapping{}(v); - return e * e; - }, - raft::add_op(), - raft::identity_op()); - // Compute max of squared norm vector - auto d_max_sq = raft::make_device_scalar(handle, 0.0f); - raft::linalg::map_reduce(handle, - raft::make_const_mdspan(d_map_sq_norm.view()), - d_max_sq.view(), - 0.0f, - raft::identity_op(), - raft::max_op()); - - float max_sq = 0.0f; - raft::update_host(&max_sq, d_max_sq.data_handle(), 1, stream); - raft::resource::sync_stream(handle); - - return max_sq; -} - -} // namespace cuvs::neighbors::ivf_pq::detail - namespace cuvs::neighbors::ivf_pq::helpers { -/** - * @brief Estimate whether FP16 is insufficient for IVF-PQ's full-magnitude distance - * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`). - * - * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i|| - * (estimated from a fraction of the dataset): - * - L2Expanded: ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <= (||x|| + ||y||)^2 <= 4 * R^2 - * - InnerProduct: || <= ||x|| * ||y|| <= R^2 - * - CosineExpanded: data is L2-normalized, so |score| <= 1 and overflow is impossible. - */ -template -bool estimate_fp16_overflow( - raft::resources const& handle, - raft::mdspan, raft::row_major, Accessor> dataset, - cuvs::distance::DistanceType metric) -{ - common::nvtx::range r("estimate_fp16_overflow"); - if (dataset.extent(0) == 0) { return false; } - - float dist_factor = 1.0f; - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: dist_factor = 4.0f; break; - case cuvs::distance::DistanceType::CosineExpanded: - // Cosine similarity scores does normalization itself, so overflow won't happen - return false; - case cuvs::distance::DistanceType::InnerProduct: dist_factor = 1.0f; break; - default: RAFT_FAIL("Unsupported distance type for IVF-PQ search %d.", int(metric)); - } - - const float max_vector_sq_norm = - cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset); - const float max_distance_sq_norm = dist_factor * max_vector_sq_norm; - - constexpr float kFp16Max = 65504.0f; - return max_distance_sq_norm > kFp16Max; -} - -/** - * @brief Estimate whether FP16 is insufficient, using the IVF-PQ cluster centers. - */ -inline bool estimate_fp16_overflow_centers(raft::resources const& handle, - const cuvs::neighbors::ivf_pq::index& index, - cuvs::distance::DistanceType metric) -{ - common::nvtx::range r("estimate_fp16_overflow_centers"); - const int64_t n_lists = index.n_lists(); - const int64_t dim = index.dim(); - if (n_lists == 0 || dim == 0) { return false; } - - float dist_factor = 1.0f; - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: dist_factor = 4.0f; break; - case cuvs::distance::DistanceType::CosineExpanded: - // Cosine similarity scores does normalization itself, so overflow won't happen - return false; - case cuvs::distance::DistanceType::InnerProduct: dist_factor = 1.0f; break; - default: RAFT_FAIL("Unsupported distance type for IVF-PQ search %d.", int(metric)); - } - - const int64_t dim_ext = index.dim_ext(); - auto stream = raft::resource::get_cuda_stream(handle); - auto mr = raft::resource::get_workspace_resource_ref(handle); - auto centers_contig = - raft::make_device_mdarray(handle, mr, raft::make_extents(n_lists, dim)); - RAFT_CUDA_TRY(cudaMemcpy2DAsync(centers_contig.data_handle(), - dim * sizeof(float), - index.centers().data_handle(), - dim_ext * sizeof(float), - dim * sizeof(float), - n_lists, - cudaMemcpyDeviceToDevice, - stream)); - - const float max_vector_sq_norm = cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm( - handle, raft::make_const_mdspan(centers_contig.view())); - const float max_distance_sq_norm = dist_factor * max_vector_sq_norm; - - constexpr float kFp16Max = 65504.0f; - return max_distance_sq_norm > kFp16Max; -} - /** * @brief Detect whether FP16 internal distance dtypes overflow for this dataset during search. * - * Runs a small probe search against an already-built IVF-PQ index with FP16 internal/coarse - * distance dtypes, and reports whether any returned distance is non-finite (inf/NaN). + * Runs a small probe search against an already-built IVF-PQ index with current distance types, + * and reports whether any returned distance is non-finite (inf/NaN). */ template bool detect_fp16_overflow( @@ -185,7 +43,6 @@ bool detect_fp16_overflow( cuvs::neighbors::ivf_pq::search_params search_params, raft::mdspan, raft::row_major, Accessor> dataset) { - common::nvtx::range r("detect_fp16_overflow"); const int64_t n_rows = dataset.extent(0); if (n_rows == 0) { return false; } @@ -215,10 +72,11 @@ bool detect_fp16_overflow( distances.view()); const int64_t count = n_sample * static_cast(top_k); + auto is_non_finite_op = [] __device__(float v) { return isnan(v) || isinf(v); }; const bool any_non_finite = thrust::any_of(raft::resource::get_thrust_policy(handle), distances.data_handle(), distances.data_handle() + count, - cuvs::neighbors::ivf_pq::detail::is_non_finite_op{}); + is_non_finite_op); raft::resource::sync_stream(handle); return any_non_finite; }