diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index a7c15b4161..a3ab54d376 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1,11 +1,12 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include "../../../core/nvtx.hpp" #include "../../../preprocessing/quantize/vpq_build-ext.cuh" +#include "../../ivf_pq/ivf_pq_fp16_overflow.cuh" #include "graph_core.cuh" #include @@ -2217,6 +2218,24 @@ index build( knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric); } } + + // Predict potential FP16 distance overflow for large-magnitude (e.g. unnormalized) datasets + // -> fall back to FP32. + if (auto* pq = std::get_if(&knn_build_params)) { + const bool using_fp16_distance = pq->search_params.internal_distance_dtype == CUDA_R_16F || + pq->search_params.coarse_search_dtype == CUDA_R_16F; + if (using_fp16_distance && + ivf_pq::helpers::estimate_fp16_overflow(res, dataset, params.metric)) { + RAFT_LOG_WARN( + "IVF-PQ internal type of FP16 is likely insufficient for this dataset to avoid overflow in " + "distance computations -> " + "Switching 'internal_distance_dtype' and 'coarse_search_dtype' to FP32"); + pq->search_params.internal_distance_dtype = CUDA_R_32F; + pq->search_params.coarse_search_dtype = CUDA_R_32F; + // lut_dtype is left unchanged because its per-subspace terms are smaller by a factor of + // pq_dim and therefore, less likely to overflow. 
+ } + } RAFT_EXPECTS( params.metric != cuvs::distance::DistanceType::BitwiseHamming || std::holds_alternative( diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh new file mode 100644 index 0000000000..7c7dfe9fb1 --- /dev/null +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp16_overflow.cuh @@ -0,0 +1,121 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "../detail/ann_utils.cuh" // cuvs::spatial::knn::detail::utils::mapping + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuvs::neighbors::ivf_pq::detail { + +/** + * Estimate max_i ||x_i||^2 (after mapping to float) from the first min(n_rows, 20000) rows. + */ +template +float estimate_max_squared_norm( + raft::resources const& handle, + raft::mdspan, raft::row_major, Accessor> dataset) +{ + common::nvtx::range r("estimate_max_squared_norm"); + auto stream = raft::resource::get_cuda_stream(handle); + const int64_t n_rows = dataset.extent(0); + const int64_t dim = dataset.extent(1); + + int64_t n_sample = std::min(n_rows, 20000); // sample at most 20000 rows + + auto mr = raft::resource::get_workspace_resource_ref(handle); + auto sample = + raft::make_device_mdarray(handle, mr, raft::make_extents(n_sample, dim)); + raft::copy(sample.data_handle(), + dataset.data_handle(), + n_sample * dim, + raft::resource::get_cuda_stream(handle)); // first n_sample rows only + + // Sum of squared float-mapped elements; the output holds one value per sampled row + auto d_map_sq_norm = raft::make_device_vector(handle, n_sample); + raft::linalg::reduce( + handle, + raft::make_const_mdspan(sample.view()), + d_map_sq_norm.view(), + 0.0f, + false, + [] __device__(DataT v, auto) -> float { + float e = cuvs::spatial::knn::detail::utils::mapping{}(v); + return e * e; + }, + raft::add_op(), + raft::identity_op()); + // Reduce the squared norms to their maximum, starting from 0.0f + auto d_max_sq = raft::make_device_scalar(handle, 
+ 0.0f); + raft::linalg::map_reduce(handle, + raft::make_const_mdspan(d_map_sq_norm.view()), + d_max_sq.view(), + 0.0f, + raft::identity_op(), + raft::max_op()); + + float max_sq = 0.0f; + raft::update_host(&max_sq, d_max_sq.data_handle(), 1, stream); + raft::resource::sync_stream(handle); + + return max_sq; +} + +} // namespace cuvs::neighbors::ivf_pq::detail + +namespace cuvs::neighbors::ivf_pq::helpers { + +/** + * @brief Estimate whether FP16 is likely insufficient for IVF-PQ's full-magnitude distance + * computations on this dataset (i.e. `internal_distance_dtype` and `coarse_search_dtype`). + * + * We bound the largest achievable score from the dataset's vector norms. With R = max_i ||x_i|| + * (estimated from at most the first 20000 rows), overflow is predicted if the bound > 65504: + * - L2Expanded: ||x - y||^2 = ||x||^2 + ||y||^2 - 2<x,y> <= (||x|| + ||y||)^2 <= 4 * R^2 + * - InnerProduct: |<x,y>| <= ||x|| * ||y|| <= R^2 + * - CosineExpanded: |score| <= 1, so false is returned; any other metric hits RAFT_FAIL. + */ +template +bool estimate_fp16_overflow( + raft::resources const& handle, + raft::mdspan, raft::row_major, Accessor> dataset, + cuvs::distance::DistanceType metric) +{ + if (dataset.extent(0) == 0) { return false; } // nothing to sample + + float dist_factor = 1.0f; + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: dist_factor = 4.0f; break; // 4 * R^2 + case cuvs::distance::DistanceType::CosineExpanded: + // Cosine scoring normalizes the vectors itself, so the score cannot overflow + return false; + case cuvs::distance::DistanceType::InnerProduct: dist_factor = 1.0f; break; // R^2 + default: RAFT_FAIL("Unsupported distance type for IVF-PQ search %d.", int(metric)); + } + + const float max_vector_sq_norm = + cuvs::neighbors::ivf_pq::detail::estimate_max_squared_norm(handle, dataset); + const float max_distance_sq_norm = dist_factor * max_vector_sq_norm; + + constexpr float kFp16Max = 65504.0f; // largest finite FP16 value + return max_distance_sq_norm > kFp16Max; +} + +} // namespace cuvs::neighbors::ivf_pq::helpers