From 19bc18fb1e70e60f2f2b681b41595338d431be45 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 16 Jun 2026 07:06:43 +0000 Subject: [PATCH 01/12] IVF-PQ FP16 overflow detection --- .../neighbors/detail/cagra/cagra_build.cuh | 16 ++ .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 138 ++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index a7c15b4161..4ac33868fc 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -6,6 +6,7 @@ #include "../../../core/nvtx.hpp" #include "../../../preprocessing/quantize/vpq_build-ext.cuh" +#include "../../ivf_pq/ivf_pq_fp16_overflow.cuh" #include "graph_core.cuh" #include @@ -1637,6 +1638,21 @@ void build_knn_graph( pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded, "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported"); + // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32. + const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F || + pq.search_params.coarse_search_dtype == CUDA_R_16F; + if (using_fp16_distance && + ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) { + RAFT_LOG_WARN( + "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " + "distance computations -> " + "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); + pq.search_params.internal_distance_dtype = CUDA_R_32F; + pq.search_params.coarse_search_dtype = CUDA_R_32F; + // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim + // and therefore, less likely to overflow. + } + uint32_t node_degree = knn_graph.extent(1); raft::common::nvtx::range fun_scope( "cagra::build_knn_graph(%zu, %zu, %u)", diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh new file mode 100644 index 0000000000..e67e3a5d9c --- /dev/null +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -0,0 +1,138 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "../detail/ann_utils.cuh" // cuvs::spatial::knn::detail::utils::mapping + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuvs::neighbors::ivf_pq::detail { + +/** Reduce the maximum squared L2 norm over a set of row-major vectors of element type DataT. */ +template +__global__ void kern_max_squared_norm(const DataT* __restrict__ data, + int64_t n_rows, + int64_t dim, + float* __restrict__ out_max_sq) +{ + for (int64_t row = blockIdx.x * blockDim.x + threadIdx.x; row < n_rows; + row += static_cast(gridDim.x) * blockDim.x) { + const DataT* v = data + row * dim; + float sq = 0.0f; + for (int64_t d = 0; d < dim; d++) { + // + float e = cuvs::spatial::knn::detail::utils::mapping{}(v[d]); + printf("raw float(v[d]): %f, mapped: %f\n", static_cast(v[d]), e); + sq += e * e; + } + // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity + // between float and int. This is valid when values are non-negative, which is the case + // for squared norms. + // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming + // low contention. + atomicMax(reinterpret_cast(out_max_sq), __float_as_int(sq)); + } +} + +/** + * Estimate max_i ||x_i||^2 over the dataset by sampling a fraction of its rows. + * + * NOTE: sampling yields a *lower-bound* estimate of the true max norm, so a too-small fraction can + * miss outlier vectors. Increase `sample_fraction` (up to 1.0 for an exact, no-false-negative scan) + * if you observe overflow slipping through. + */ +template +float estimate_max_squared_norm( + raft::resources const& res, + raft::mdspan, raft::row_major, Accessor> dataset, + double sample_fraction) +{ + auto stream = raft::resource::get_cuda_stream(res); + const int64_t n_rows = dataset.extent(0); + const int64_t dim = dataset.extent(1); + + // Determine sample size based on fraction, with bounds + int64_t n_sample = static_cast(sample_fraction * static_cast(n_rows)); + if (n_rows <= 1000) { + n_sample = n_rows; // for small datasets, just use them all and skip the sampling overhead + } else if (n_rows > 100000) { + // cap the sample size to 100k for speed and keep memory use within the limited workspace + // resource + n_sample = 100000; + // + } + + // Sample from the dataset + auto mr = raft::resource::get_workspace_resource_ref(res); + auto sample = + raft::make_device_mdarray(res, mr, raft::make_extents(n_sample, dim)); + raft::matrix::sample_rows( + res, raft::random::RngState{137}, dataset, sample.view()); + + auto d_max_sq = raft::make_device_scalar(res, 0.0f); + constexpr int block_size = 256; + const int grid_size = static_cast((n_sample + block_size - 1) / block_size); + kern_max_squared_norm<<>>( + sample.data_handle(), n_sample, dim, d_max_sq.data_handle()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + float h_max_sq = 0.0f; + raft::update_host(&h_max_sq, d_max_sq.data_handle(), 1, stream); + raft::resource::sync_stream(res); + return h_max_sq; +} + +} // namespace cuvs::neighbors::ivf_pq::detail + +namespace cuvs::neighbors::ivf_pq::helpers { + +/** + * @brief Estimate whether FP16 is likely insufficient for IVF-PQ's full-magnitude distance + * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`). + * + * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i|| + * (estimated from a random sample of the dataset): + * - L2Expanded: ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <= (||x|| + ||y||)^2 <= 4 * R^2 + * - InnerProduct: || <= ||x|| * ||y|| <= R^2 + * - CosineExpanded: data is L2-normalized, so |score| <= 1 and overflow is impossible. + */ +template +bool estimate_fp16_overflow( + raft::resources const& res, + raft::mdspan, raft::row_major, Accessor> dataset, + cuvs::distance::DistanceType metric, + double sample_fraction = 0.01) +{ + // Cosine similarity scores does normalization itself, so overflow won't happen + if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } + + // FP16 largest finite value, with a defensive margin to also avoid precision loss near the limit. + constexpr float kFp16Max = 65504.0f; + constexpr float kFp16DefensiveMargin = 0.25f; + const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max; + + const float max_vector_sq_norm = + cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset, sample_fraction); + + const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded + ? 4.0f * max_vector_sq_norm + : max_vector_sq_norm; + + return max_distance_sq_norm > overflow_detect_threshold; +} + +} // namespace cuvs::neighbors::ivf_pq::helpers From 01f234a38aafbab686a7e2009a4b41b28436dd41 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 16 Jun 2026 07:28:47 +0000 Subject: [PATCH 02/12] Explain the use of mapping --- cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index e67e3a5d9c..3315607a85 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -34,13 +34,14 @@ __global__ void kern_max_squared_norm(const DataT* __restrict__ data, const DataT* v = data + row * dim; float sq = 0.0f; for (int64_t d = 0; d < dim; d++) { - // + // internally, IVF-PQ distance computations will map the input data type (e.g. FP16) to float before + // doing arithmetic, so we need to apply the same mapping here to get a correct estimate of the squared norms + // instead of using static_cast(v[d]) float e = cuvs::spatial::knn::detail::utils::mapping{}(v[d]); - printf("raw float(v[d]): %f, mapped: %f\n", static_cast(v[d]), e); sq += e * e; } // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity - // between float and int. This is valid when values are non-negative, which is the case + // between float and int. This is valid when values are non-negative, which is the case // for squared norms. // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming // low contention. @@ -70,10 +71,8 @@ float estimate_max_squared_norm( if (n_rows <= 1000) { n_sample = n_rows; // for small datasets, just use them all and skip the sampling overhead } else if (n_rows > 100000) { - // cap the sample size to 100k for speed and keep memory use within the limited workspace - // resource + // cap the sample size to 100k for speed and keep memory use within the limited workspace resource n_sample = 100000; - // } // Sample from the dataset From 08302cf8fed856b0f82bb05bcc83b0e52cc60317 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 08:50:43 +0000 Subject: [PATCH 03/12] Move params fallback to params init phase --- .../neighbors/detail/cagra/cagra_build.cuh | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 4ac33868fc..6533e0b7a4 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1637,22 +1637,7 @@ void build_knn_graph( pq.build_params.metric == cuvs::distance::DistanceType::InnerProduct || pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded, "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported"); - - // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32. - const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F || - pq.search_params.coarse_search_dtype == CUDA_R_16F; - if (using_fp16_distance && - ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) { - RAFT_LOG_WARN( - "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " - "distance computations -> " - "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); - pq.search_params.internal_distance_dtype = CUDA_R_32F; - pq.search_params.coarse_search_dtype = CUDA_R_32F; - // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim - // and therefore, less likely to overflow. - } - + uint32_t node_degree = knn_graph.extent(1); raft::common::nvtx::range fun_scope( "cagra::build_knn_graph(%zu, %zu, %u)", @@ -2231,6 +2216,20 @@ index build( } else { RAFT_LOG_DEBUG("Selecting IVF-PQ solver"); knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric); + // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32. + const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F || + pq.search_params.coarse_search_dtype == CUDA_R_16F; + if (using_fp16_distance && + ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) { + RAFT_LOG_WARN( + "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " + "distance computations -> " + "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); + pq.search_params.internal_distance_dtype = CUDA_R_32F; + pq.search_params.coarse_search_dtype = CUDA_R_32F; + // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim + // and therefore, less likely to overflow. + } } } RAFT_EXPECTS( From 1586a42b9a6a5dc98436dc1205fdf2e3c71fc0c2 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 08:51:12 +0000 Subject: [PATCH 04/12] Choose sampling equation --- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 3315607a85..4c0bc6be94 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -59,21 +59,20 @@ __global__ void kern_max_squared_norm(const DataT* __restrict__ data, template float estimate_max_squared_norm( raft::resources const& res, - raft::mdspan, raft::row_major, Accessor> dataset, - double sample_fraction) + raft::mdspan, raft::row_major, Accessor> dataset) { auto stream = raft::resource::get_cuda_stream(res); const int64_t n_rows = dataset.extent(0); const int64_t dim = dataset.extent(1); - // Determine sample size based on fraction, with bounds - int64_t n_sample = static_cast(sample_fraction * static_cast(n_rows)); - if (n_rows <= 1000) { - n_sample = n_rows; // for small datasets, just use them all and skip the sampling overhead - } else if (n_rows > 100000) { - // cap the sample size to 100k for speed and keep memory use within the limited workspace resource - n_sample = 100000; - } + // Determine sample size based on a smooth saturation equation. The equation satisfies: + // - n_sample is always less than or equal to n_rows + // - n_sample saturates to kSaturation when n_rows is inf + // - n_sample increases fast for small n_rows and slow to saturation for large n_rows + // Idea: we greedily sample most of the dataset when it is small-sized, and cap it to kSaturation + // when dataset size grows very large. + constexpr int64_t kSaturation = 100000; + int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation); // Sample from the dataset auto mr = raft::resource::get_workspace_resource_ref(res); @@ -113,8 +112,7 @@ template bool estimate_fp16_overflow( raft::resources const& res, raft::mdspan, raft::row_major, Accessor> dataset, - cuvs::distance::DistanceType metric, - double sample_fraction = 0.01) + cuvs::distance::DistanceType metric) { // Cosine similarity scores does normalization itself, so overflow won't happen if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } @@ -125,7 +123,7 @@ bool estimate_fp16_overflow( const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max; const float max_vector_sq_norm = - cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset, sample_fraction); + cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset); const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded ? 4.0f * max_vector_sq_norm From 431d3ac2c99c04f21e14543a372053278c19399b Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 10:18:20 +0000 Subject: [PATCH 05/12] Use raft built-in kernels and remove manual one. --- .../neighbors/detail/cagra/cagra_build.cuh | 34 ++++--- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 98 +++++++++---------- 2 files changed, 64 insertions(+), 68 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 6533e0b7a4..99ae2f36b2 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1637,7 +1637,7 @@ void build_knn_graph( pq.build_params.metric == cuvs::distance::DistanceType::InnerProduct || pq.build_params.metric == cuvs::distance::DistanceType::CosineExpanded, "Currently only L2Expanded, InnerProduct and CosineExpanded metrics are supported"); - + uint32_t node_degree = knn_graph.extent(1); raft::common::nvtx::range fun_scope( "cagra::build_knn_graph(%zu, %zu, %u)", @@ -2216,20 +2216,24 @@ index build( } else { RAFT_LOG_DEBUG("Selecting IVF-PQ solver"); knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric); - // Guard against potential FP16 distance overflow for large-magnitude datasets -> back to FP32. - const bool using_fp16_distance = pq.search_params.internal_distance_dtype == CUDA_R_16F || - pq.search_params.coarse_search_dtype == CUDA_R_16F; - if (using_fp16_distance && - ivf_pq::helpers::estimate_fp16_overflow(res, dataset, pq.build_params.metric)) { - RAFT_LOG_WARN( - "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " - "distance computations -> " - "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); - pq.search_params.internal_distance_dtype = CUDA_R_32F; - pq.search_params.coarse_search_dtype = CUDA_R_32F; - // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of pq_dim - // and therefore, less likely to overflow. - } + } + } + + // Guard against potential FP16 distance overflow for large-magnitude datasets. Applies to any + // IVF-PQ graph build (auto-selected above or explicitly set in params) -> fall back to FP32. + if (auto* pq = std::get_if(&knn_build_params)) { + const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F || + pq->search_params.coarse_search_dtype == CUDA_R_16F; + if (using_fp16_distance && + ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric)) { + RAFT_LOG_WARN( + "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " + "distance computations -> " + "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); + pq->search_params.internal_distance_dtype = CUDA_R_32F; + pq->search_params.coarse_search_dtype = CUDA_R_32F; + // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of + // pq_dim and therefore, less likely to overflow. } } RAFT_EXPECTS( diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 4c0bc6be94..a184fb9701 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -11,9 +11,12 @@ #include #include +#include #include #include #include +#include +#include #include #include #include @@ -22,46 +25,15 @@ namespace cuvs::neighbors::ivf_pq::detail { -/** Reduce the maximum squared L2 norm over a set of row-major vectors of element type DataT. */ -template -__global__ void kern_max_squared_norm(const DataT* __restrict__ data, - int64_t n_rows, - int64_t dim, - float* __restrict__ out_max_sq) -{ - for (int64_t row = blockIdx.x * blockDim.x + threadIdx.x; row < n_rows; - row += static_cast(gridDim.x) * blockDim.x) { - const DataT* v = data + row * dim; - float sq = 0.0f; - for (int64_t d = 0; d < dim; d++) { - // internally, IVF-PQ distance computations will map the input data type (e.g. FP16) to float before - // doing arithmetic, so we need to apply the same mapping here to get a correct estimate of the squared norms - // instead of using static_cast(v[d]) - float e = cuvs::spatial::knn::detail::utils::mapping{}(v[d]); - sq += e * e; - } - // - There is no atomicMax for floats, so we embrace the bitwise representation monoticity - // between float and int. This is valid when values are non-negative, which is the case - // for squared norms. - // - Choose global atomic instead of shared memory tree reduction for simplicity, assuming - // low contention. - atomicMax(reinterpret_cast(out_max_sq), __float_as_int(sq)); - } -} - /** - * Estimate max_i ||x_i||^2 over the dataset by sampling a fraction of its rows. - * - * NOTE: sampling yields a *lower-bound* estimate of the true max norm, so a too-small fraction can - * miss outlier vectors. Increase `sample_fraction` (up to 1.0 for an exact, no-false-negative scan) - * if you observe overflow slipping through. + * Estimate max_i ||x_i||^2 over the dataset by sampling from its rows. */ template float estimate_max_squared_norm( - raft::resources const& res, + raft::resources const& handle, raft::mdspan, raft::row_major, Accessor> dataset) { - auto stream = raft::resource::get_cuda_stream(res); + auto stream = raft::resource::get_cuda_stream(handle); const int64_t n_rows = dataset.extent(0); const int64_t dim = dataset.extent(1); @@ -69,29 +41,47 @@ float estimate_max_squared_norm( // - n_sample is always less than or equal to n_rows // - n_sample saturates to kSaturation when n_rows is inf // - n_sample increases fast for small n_rows and slow to saturation for large n_rows - // Idea: we greedily sample most of the dataset when it is small-sized, and cap it to kSaturation - // when dataset size grows very large. + // Idea: we sample most of the dataset when it is small-sized, and only a small fraction + // (up to a maximum/saturation number) when the dataset size grows large. + // kSaturation is selected as a compromise between runtime and recall. constexpr int64_t kSaturation = 100000; int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation); // Sample from the dataset - auto mr = raft::resource::get_workspace_resource_ref(res); + auto mr = raft::resource::get_workspace_resource_ref(handle); auto sample = - raft::make_device_mdarray(res, mr, raft::make_extents(n_sample, dim)); + raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); raft::matrix::sample_rows( - res, raft::random::RngState{137}, dataset, sample.view()); - - auto d_max_sq = raft::make_device_scalar(res, 0.0f); - constexpr int block_size = 256; - const int grid_size = static_cast((n_sample + block_size - 1) / block_size); - kern_max_squared_norm<<>>( - sample.data_handle(), n_sample, dim, d_max_sq.data_handle()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - float h_max_sq = 0.0f; - raft::update_host(&h_max_sq, d_max_sq.data_handle(), 1, stream); - raft::resource::sync_stream(res); - return h_max_sq; + handle, raft::random::RngState{137}, dataset, sample.view()); + + // Compute mapped squared norm + auto d_map_sq_norm = raft::make_device_vector(handle, n_sample); + raft::linalg::reduce( + handle, + raft::make_const_mdspan(sample.view()), + d_map_sq_norm.view(), + 0.0f, + false, + [] __device__(DataT v, auto) -> float { + float e = cuvs::spatial::knn::detail::utils::mapping{}(v); + return e * e; + }, + raft::add_op(), + raft::identity_op()); + // Compute max of squared norm vector + auto d_max_sq = raft::make_device_scalar(handle, 0.0f); + raft::linalg::map_reduce(handle, + raft::make_const_mdspan(d_map_sq_norm.view()), + d_max_sq.view(), + 0.0f, + raft::identity_op(), + raft::max_op()); + + float max_sq = 0.0f; + raft::update_host(&max_sq, d_max_sq.data_handle(), 1, stream); + raft::resource::sync_stream(handle); + + return max_sq; } } // namespace cuvs::neighbors::ivf_pq::detail @@ -110,10 +100,12 @@ namespace cuvs::neighbors::ivf_pq::helpers { */ template bool estimate_fp16_overflow( - raft::resources const& res, + raft::resources const& handle, raft::mdspan, raft::row_major, Accessor> dataset, cuvs::distance::DistanceType metric) { + if (dataset.extent(0) == 0) { return false; } + // Cosine similarity scores does normalization itself, so overflow won't happen if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } @@ -123,7 +115,7 @@ bool estimate_fp16_overflow( const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max; const float max_vector_sq_norm = - cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(res, dataset); + cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset); const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded ? 4.0f * max_vector_sq_norm From 7fcdb1d945044b294ebf84700d9909da66c0017a Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 10:55:38 +0000 Subject: [PATCH 06/12] Add kDelay param to adjust the sampling fraction growth speed --- cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index a184fb9701..b50e7e9a4e 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -43,9 +44,13 @@ float estimate_max_squared_norm( // - n_sample increases fast for small n_rows and slow to saturation for large n_rows // Idea: we sample most of the dataset when it is small-sized, and only a small fraction // (up to a maximum/saturation number) when the dataset size grows large. - // kSaturation is selected as a compromise between runtime and recall. + // kSaturation and kDelay are selected as a compromise between runtime and outlier recall. constexpr int64_t kSaturation = 100000; - int64_t n_sample = (n_rows * kSaturation + (n_rows + kSaturation - 1)) / (n_rows + kSaturation); + constexpr int64_t kDelay = kSaturation * 10; + RAFT_EXPECTS(kDelay >= kSaturation, + "kDelay must not be smaller than kSaturation so that n_sample is always less than " + "or equal to n_rows"); + int64_t n_sample = (n_rows * kSaturation + (n_rows + kDelay - 1)) / (n_rows + kDelay); // Sample from the dataset auto mr = raft::resource::get_workspace_resource_ref(handle); From e09cb73eac440933672369a145fc39d04021f2f9 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 11:24:33 +0000 Subject: [PATCH 07/12] Reduce kSaturation to 20k (just sufficient to detect FP16 overflow in SIFT 1M) --- cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index b50e7e9a4e..99f2c9e60c 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -45,7 +45,7 @@ float estimate_max_squared_norm( // Idea: we sample most of the dataset when it is small-sized, and only a small fraction // (up to a maximum/saturation number) when the dataset size grows large. // kSaturation and kDelay are selected as a compromise between runtime and outlier recall. - constexpr int64_t kSaturation = 100000; + constexpr int64_t kSaturation = 20000; constexpr int64_t kDelay = kSaturation * 10; RAFT_EXPECTS(kDelay >= kSaturation, "kDelay must not be smaller than kSaturation so that n_sample is always less than " From c4b344c7aabbfb9246bcc8dc5c1bc0756ced13b4 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 22 Jun 2026 12:18:55 +0000 Subject: [PATCH 08/12] Explain uniform sampling decision --- cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 99f2c9e60c..2f99da9e06 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -27,7 +27,12 @@ namespace cuvs::neighbors::ivf_pq::detail { /** - * Estimate max_i ||x_i||^2 over the dataset by sampling from its rows. + * Estimate max_i ||x_i||^2 over the dataset by uniformly sampling a fraction from it. + * + * Decision: Uniform sampling is selected as it is sufficient to detect FP16 overflow in + * the datasets, where overflow-causing large vectors are frequent (e.g. SIFT 1M). For + * dataset with rare large outliers, we might preferably sample biasedly towards large vectors, + * e.g. via top-k selection over the vectors with largest L_inf norm. */ template float estimate_max_squared_norm( @@ -59,7 +64,7 @@ float estimate_max_squared_norm( raft::matrix::sample_rows( handle, raft::random::RngState{137}, dataset, sample.view()); - // Compute mapped squared norm + // Compute float-mapped squared norm auto d_map_sq_norm = raft::make_device_vector(handle, n_sample); raft::linalg::reduce( handle, From 49ee283c72e8ed7f8f96580b3d2f6e9c0cb2d48e Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Fri, 26 Jun 2026 12:55:54 +0000 Subject: [PATCH 09/12] Avoid expensive random sampling over the full dataset. Sampling is not necessary anyway. --- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 2f99da9e06..a01cf8e220 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -18,8 +18,7 @@ #include #include #include -#include -#include +#include #include #include @@ -27,18 +26,14 @@ namespace cuvs::neighbors::ivf_pq::detail { /** - * Estimate max_i ||x_i||^2 over the dataset by uniformly sampling a fraction from it. - * - * Decision: Uniform sampling is selected as it is sufficient to detect FP16 overflow in - * the datasets, where overflow-causing large vectors are frequent (e.g. SIFT 1M). For - * dataset with rare large outliers, we might preferably sample biasedly towards large vectors, - * e.g. via top-k selection over the vectors with largest L_inf norm. + * Estimate max_i ||x_i||^2 over the dataset. */ template float estimate_max_squared_norm( raft::resources const& handle, raft::mdspan, raft::row_major, Accessor> dataset) { + common::nvtx::range r("estimate_max_squared_norm"); auto stream = raft::resource::get_cuda_stream(handle); const int64_t n_rows = dataset.extent(0); const int64_t dim = dataset.extent(1); @@ -47,7 +42,7 @@ float estimate_max_squared_norm( // - n_sample is always less than or equal to n_rows // - n_sample saturates to kSaturation when n_rows is inf // - n_sample increases fast for small n_rows and slow to saturation for large n_rows - // Idea: we sample most of the dataset when it is small-sized, and only a small fraction + // Idea: we examine most of the dataset when it is small-sized, and only a small fraction // (up to a maximum/saturation number) when the dataset size grows large. // kSaturation and kDelay are selected as a compromise between runtime and outlier recall. constexpr int64_t kSaturation = 20000; @@ -55,14 +50,11 @@ float estimate_max_squared_norm( RAFT_EXPECTS(kDelay >= kSaturation, "kDelay must not be smaller than kSaturation so that n_sample is always less than " "or equal to n_rows"); - int64_t n_sample = (n_rows * kSaturation + (n_rows + kDelay - 1)) / (n_rows + kDelay); + int64_t n_sample = raft::ceildiv(n_rows * kSaturation, n_rows + kDelay); - // Sample from the dataset auto mr = raft::resource::get_workspace_resource_ref(handle); - auto sample = - raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); - raft::matrix::sample_rows( - handle, raft::random::RngState{137}, dataset, sample.view()); + auto sample = raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); + raft::copy(sample.data_handle(), dataset.data_handle(), n_sample * dim, raft::resource::get_cuda_stream(handle)); // Compute float-mapped squared norm auto d_map_sq_norm = raft::make_device_vector(handle, n_sample); @@ -103,7 +95,7 @@ namespace cuvs::neighbors::ivf_pq::helpers { * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`). * * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i|| - * (estimated from a random sample of the dataset): + * (estimated from a fraction of the dataset): * - L2Expanded: ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <= (||x|| + ||y||)^2 <= 4 * R^2 * - InnerProduct: || <= ||x|| * ||y|| <= R^2 * - CosineExpanded: data is L2-normalized, so |score| <= 1 and overflow is impossible. From e4ee6181ff0495330ad1f988f184ae0e354d5449 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Fri, 26 Jun 2026 12:57:12 +0000 Subject: [PATCH 10/12] Edit comments --- cpp/src/neighbors/detail/cagra/cagra_build.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 99ae2f36b2..cee49c8630 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -2219,8 +2219,8 @@ index build( } } - // Guard against potential FP16 distance overflow for large-magnitude datasets. Applies to any - // IVF-PQ graph build (auto-selected above or explicitly set in params) -> fall back to FP32. + // Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets + // -> fall back to FP32. if (auto* pq = std::get_if(&knn_build_params)) { const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F || pq->search_params.coarse_search_dtype == CUDA_R_16F; From 3ddd98ab151fce5b4a10f7122be57f37c98ea35b Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 29 Jun 2026 12:35:03 +0000 Subject: [PATCH 11/12] Fix sampling number and remove defensive margin --- .../neighbors/detail/cagra/cagra_build.cuh | 4 +-- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 32 ++++++------------- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index cee49c8630..a3ab54d376 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -2219,7 +2219,7 @@ index build( } } - // Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets + // Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets // -> fall back to FP32. if (auto* pq = std::get_if(&knn_build_params)) { const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F || diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index a01cf8e220..58f35a8c35 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ @@ -38,23 +38,15 @@ float estimate_max_squared_norm( const int64_t n_rows = dataset.extent(0); const int64_t dim = dataset.extent(1); - // Determine sample size based on a smooth saturation equation. The equation satisfies: - // - n_sample is always less than or equal to n_rows - // - n_sample saturates to kSaturation when n_rows is inf - // - n_sample increases fast for small n_rows and slow to saturation for large n_rows - // Idea: we examine most of the dataset when it is small-sized, and only a small fraction - // (up to a maximum/saturation number) when the dataset size grows large. - // kSaturation and kDelay are selected as a compromise between runtime and outlier recall. - constexpr int64_t kSaturation = 20000; - constexpr int64_t kDelay = kSaturation * 10; - RAFT_EXPECTS(kDelay >= kSaturation, - "kDelay must not be smaller than kSaturation so that n_sample is always less than " - "or equal to n_rows"); - int64_t n_sample = raft::ceildiv(n_rows * kSaturation, n_rows + kDelay); + int64_t n_sample = std::min(n_rows, 20000); auto mr = raft::resource::get_workspace_resource_ref(handle); - auto sample = raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); - raft::copy(sample.data_handle(), dataset.data_handle(), n_sample * dim, raft::resource::get_cuda_stream(handle)); + auto sample = + raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); + raft::copy(sample.data_handle(), + dataset.data_handle(), + n_sample * dim, + raft::resource::get_cuda_stream(handle)); // Compute float-mapped squared norm auto d_map_sq_norm = raft::make_device_vector(handle, n_sample); @@ -111,11 +103,6 @@ bool estimate_fp16_overflow( // Cosine similarity scores does normalization itself, so overflow won't happen if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } - // FP16 largest finite value, with a defensive margin to also avoid precision loss near the limit. - constexpr float kFp16Max = 65504.0f; - constexpr float kFp16DefensiveMargin = 0.25f; - const float overflow_detect_threshold = kFp16DefensiveMargin * kFp16Max; - const float max_vector_sq_norm = cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset); @@ -123,7 +110,8 @@ bool estimate_fp16_overflow( ? 4.0f * max_vector_sq_norm : max_vector_sq_norm; - return max_distance_sq_norm > overflow_detect_threshold; + constexpr float kFp16Max = 65504.0f; + return max_distance_sq_norm > kFp16Max; } } // namespace cuvs::neighbors::ivf_pq::helpers From d94fe86c25be35ad87633f41d70b53abeeee691a Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 29 Jun 2026 13:58:43 +0000 Subject: [PATCH 12/12] Refactor for explicitness of supported distance types --- .../neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh index 58f35a8c35..7c7dfe9fb1 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -100,15 +100,19 @@ bool estimate_fp16_overflow( { if (dataset.extent(0) == 0) { return false; } - // Cosine similarity scores does normalization itself, so overflow won't happen - if (metric == cuvs::distance::DistanceType::CosineExpanded) { return false; } + float dist_factor = 1.0f; + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: dist_factor = 4.0f; break; + case cuvs::distance::DistanceType::CosineExpanded: + // Cosine similarity scores does normalization itself, so overflow won't happen + return false; + case cuvs::distance::DistanceType::InnerProduct: dist_factor = 1.0f; break; + default: RAFT_FAIL("Unsupported distance type for IVF-PQ search %d.", int(metric)); + } const float max_vector_sq_norm = cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset); - - const float max_distance_sq_norm = metric == cuvs::distance::DistanceType::L2Expanded - ? 4.0f * max_vector_sq_norm - : max_vector_sq_norm; + const float max_distance_sq_norm = dist_factor * max_vector_sq_norm; constexpr float kFp16Max = 65504.0f; return max_distance_sq_norm > kFp16Max;