diff --git a/c/include/cuvs/neighbors/all_neighbors.h b/c/include/cuvs/neighbors/all_neighbors.h index f121b5cc33..dcbf4900a9 100644 --- a/c/include/cuvs/neighbors/all_neighbors.h +++ b/c/include/cuvs/neighbors/all_neighbors.h @@ -26,7 +26,8 @@ extern "C" { * provide the dataset on host. * * Notes: - * - Outputs (indices, distances, core_distances) are expected to be on device memory. + * - Outputs (indices, distances) can be on host memory (numpy arrays) + * or device memory (CUDA arrays). core_distances can likewise be host or device, but must reside in the same memory space as indices. * - Host variant accepts host-resident dataset; device variant accepts device-resident dataset. * - For batching, `overlap_factor < n_clusters` must hold. * - When `core_distances` is provided, mutual-reachability distances are produced (see alpha). @@ -92,8 +93,8 @@ cuvsError_t cuvsAllNeighborsIndexParamsDestroy(cuvsAllNeighborsIndexParams_t ind * resources * @param[in] params Build parameters (see cuvsAllNeighborsIndexParams) * @param[in] dataset 2D tensor [num_rows x dim] on host or device (auto-detected) - * @param[out] indices 2D tensor [num_rows x k] on device (int64) - * @param[out] distances Optional 2D tensor [num_rows x k] on device (float32); can be NULL + * @param[out] indices 2D tensor [num_rows x k] on host or device (int64) + * @param[out] distances Optional 2D tensor [num_rows x k] on host or device (float32); can be NULL * @param[out] core_distances Optional 1D tensor [num_rows] on device (float32); can be NULL * @param[in] alpha Mutual-reachability scaling; used only when core_distances is provided * @,01 7 @@ cuvsError_t cuvsAllNeighborsIndexParamsDestroy(cuvsAllNeighborsIndexParams_t ind * and calls the appropriate implementation. For host datasets, it partitions data into * `n_clusters` clusters and assigns each row to `overlap_factor` nearest clusters. For device * datasets, `n_clusters` must be 1 (no batching); `overlap_factor` is ignored. - * Outputs always reside in device memory. 
+ * Outputs can be on host memory (numpy arrays) or device memory (CUDA arrays). */ cuvsError_t cuvsAllNeighborsBuild(cuvsResources_t res, cuvsAllNeighborsIndexParams_t params, diff --git a/c/src/neighbors/all_neighbors.cpp b/c/src/neighbors/all_neighbors.cpp index fa110c4662..4a6df079a5 100644 --- a/c/src/neighbors/all_neighbors.cpp +++ b/c/src/neighbors/all_neighbors.cpp @@ -80,33 +80,36 @@ static cuvs::neighbors::all_neighbors::all_neighbors_params convert_params( return out; } -static void ensure_indices_dtype_and_device_compatibility(DLManagedTensor* indices) +static void ensure_indices_dtype_compatibility(DLManagedTensor* indices) { auto dtype = indices->dl_tensor.dtype; RAFT_EXPECTS(dtype.code == kDLInt && dtype.bits == 64, "indices must be int64 output tensor"); - RAFT_EXPECTS(cuvs::core::is_dlpack_device_compatible(indices->dl_tensor), - "indices tensor must be device-compatible"); + RAFT_EXPECTS(cuvs::core::is_dlpack_device_compatible(indices->dl_tensor) || + cuvs::core::is_dlpack_host_compatible(indices->dl_tensor), + "indices tensor must be either device-compatible or host-compatible"); } -static void ensure_optional_distance_dtype_and_device_compatibility(DLManagedTensor* distances) +static void ensure_optional_distance_dtype_compatibility(DLManagedTensor* distances) { if (distances == nullptr) { return; } auto dtype = distances->dl_tensor.dtype; RAFT_EXPECTS(dtype.code == kDLFloat && dtype.bits == 32, "distances must be float32 output tensor"); - RAFT_EXPECTS(cuvs::core::is_dlpack_device_compatible(distances->dl_tensor), - "distances tensor must be device-compatible"); + RAFT_EXPECTS(cuvs::core::is_dlpack_device_compatible(distances->dl_tensor) || + cuvs::core::is_dlpack_host_compatible(distances->dl_tensor), + "distances tensor must be either device-compatible or host-compatible"); } -static void ensure_optional_core_distance_dtype_and_device_compatibility( - DLManagedTensor* core_distances) +static void 
ensure_optional_core_distance_dtype_compatibility(DLManagedTensor* core_distances) { if (core_distances == nullptr) { return; } auto dtype = core_distances->dl_tensor.dtype; RAFT_EXPECTS(dtype.code == kDLFloat && dtype.bits == 32, "core_distances must be float32 output tensor"); - RAFT_EXPECTS(cuvs::core::is_dlpack_device_compatible(core_distances->dl_tensor), - "core_distances tensor must be device-compatible"); + RAFT_EXPECTS( + cuvs::core::is_dlpack_device_compatible(core_distances->dl_tensor) || + cuvs::core::is_dlpack_host_compatible(core_distances->dl_tensor), + "core_distances tensor must be either device-compatible or host-compatible"); } template @@ -124,9 +127,9 @@ void _build_host(cuvsResources_t res, RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(dlt), "Host build expects host-compatible dataset tensor"); - ensure_indices_dtype_and_device_compatibility(indices_tensor); - ensure_optional_distance_dtype_and_device_compatibility(distances_tensor); - ensure_optional_core_distance_dtype_and_device_compatibility(core_distances_tensor); + ensure_indices_dtype_compatibility(indices_tensor); + ensure_optional_distance_dtype_compatibility(distances_tensor); + ensure_optional_core_distance_dtype_compatibility(core_distances_tensor); // Check dependencies between parameters if (core_distances_tensor != nullptr && distances_tensor == nullptr) { @@ -138,26 +141,63 @@ void _build_host(cuvsResources_t res, auto cpp_params = convert_params(params, n_rows, n_cols); - using dataset_mdspan_t = raft::host_matrix_view; - using indices_mdspan_t = raft::device_matrix_view; - using distances_mdspan_t = raft::device_matrix_view; - using core_mdspan_t = raft::device_vector_view; + using dataset_mdspan_t = raft::host_matrix_view; - auto dataset = cuvs::core::from_dlpack(dataset_tensor); - auto indices = cuvs::core::from_dlpack(indices_tensor); + bool indices_is_host = cuvs::core::is_dlpack_host_compatible(indices_tensor->dl_tensor); + bool distances_is_host = distances_tensor 
? cuvs::core::is_dlpack_host_compatible(distances_tensor->dl_tensor) : indices_is_host; - std::optional distances = std::nullopt; - if (distances_tensor) { - distances = cuvs::core::from_dlpack(distances_tensor); + if (distances_tensor && distances_is_host != indices_is_host) { + RAFT_FAIL("distances and indices must be on the same memory location (both host or both device)"); } - std::optional core_distances = std::nullopt; if (core_distances_tensor) { - core_distances = cuvs::core::from_dlpack(core_distances_tensor); + bool core_distances_is_host = + cuvs::core::is_dlpack_host_compatible(core_distances_tensor->dl_tensor); + RAFT_EXPECTS(core_distances_is_host == indices_is_host, + "core_distances must be on the same memory location as indices and distances"); } - cuvs::neighbors::all_neighbors::build( - cpp_res, cpp_params, dataset, indices, distances, core_distances, alpha); + auto dataset = cuvs::core::from_dlpack(dataset_tensor); + + if (indices_is_host) { + using indices_mdspan_t = raft::host_matrix_view; + using distances_mdspan_t = raft::host_matrix_view; + using core_mdspan_t = raft::host_vector_view; + + auto indices = cuvs::core::from_dlpack(indices_tensor); + + std::optional distances = std::nullopt; + if (distances_tensor) { + distances = cuvs::core::from_dlpack(distances_tensor); + } + + std::optional core_distances = std::nullopt; + if (core_distances_tensor) { + core_distances = cuvs::core::from_dlpack(core_distances_tensor); + } + + cuvs::neighbors::all_neighbors::build( + cpp_res, cpp_params, dataset, indices, distances, core_distances, alpha); + } else { + using indices_mdspan_t = raft::device_matrix_view; + using distances_mdspan_t = raft::device_matrix_view; + using core_mdspan_t = raft::device_vector_view; + + auto indices = cuvs::core::from_dlpack(indices_tensor); + + std::optional distances = std::nullopt; + if (distances_tensor) { + distances = cuvs::core::from_dlpack(distances_tensor); + } + + std::optional core_distances = std::nullopt; + 
if (core_distances_tensor) { + core_distances = cuvs::core::from_dlpack(core_distances_tensor); + } + + cuvs::neighbors::all_neighbors::build( + cpp_res, cpp_params, dataset, indices, distances, core_distances, alpha); + } } template @@ -175,9 +215,9 @@ void _build_device(cuvsResources_t device_res, RAFT_EXPECTS(cuvs::core::is_dlpack_device_compatible(dlt), "Device build expects device-compatible dataset tensor"); - ensure_indices_dtype_and_device_compatibility(indices_tensor); - ensure_optional_distance_dtype_and_device_compatibility(distances_tensor); - ensure_optional_core_distance_dtype_and_device_compatibility(core_distances_tensor); + ensure_indices_dtype_compatibility(indices_tensor); + ensure_optional_distance_dtype_compatibility(distances_tensor); + ensure_optional_core_distance_dtype_compatibility(core_distances_tensor); // Check dependencies between parameters if (core_distances_tensor != nullptr && distances_tensor == nullptr) { @@ -189,26 +229,62 @@ void _build_device(cuvsResources_t device_res, auto cpp_params = convert_params(params, n_rows, n_cols); - using dataset_mdspan_t = raft::device_matrix_view; - using indices_mdspan_t = raft::device_matrix_view; - using distances_mdspan_t = raft::device_matrix_view; - using core_mdspan_t = raft::device_vector_view; + using dataset_mdspan_t = raft::device_matrix_view; + auto dataset = cuvs::core::from_dlpack(dataset_tensor); - auto dataset = cuvs::core::from_dlpack(dataset_tensor); - auto indices = cuvs::core::from_dlpack(indices_tensor); + bool indices_is_host = cuvs::core::is_dlpack_host_compatible(indices_tensor->dl_tensor); + bool distances_is_host = distances_tensor ? 
cuvs::core::is_dlpack_host_compatible(distances_tensor->dl_tensor) : indices_is_host; - std::optional distances = std::nullopt; - if (distances_tensor) { - distances = cuvs::core::from_dlpack(distances_tensor); + if (distances_tensor && distances_is_host != indices_is_host) { + RAFT_FAIL("distances and indices must be on the same memory location (both host or both device)"); } - std::optional core_distances = std::nullopt; if (core_distances_tensor) { - core_distances = cuvs::core::from_dlpack(core_distances_tensor); + bool core_distances_is_host = + cuvs::core::is_dlpack_host_compatible(core_distances_tensor->dl_tensor); + RAFT_EXPECTS(core_distances_is_host == indices_is_host, + "core_distances must be on the same memory location as indices and distances"); } - cuvs::neighbors::all_neighbors::build( - cpp_res, cpp_params, dataset, indices, distances, core_distances, alpha); + if (indices_is_host) { + using indices_mdspan_t = raft::host_matrix_view; + using distances_mdspan_t = raft::host_matrix_view; + using core_mdspan_t = raft::host_vector_view; + + auto indices = cuvs::core::from_dlpack(indices_tensor); + + std::optional distances = std::nullopt; + if (distances_tensor) { + distances = cuvs::core::from_dlpack(distances_tensor); + } + + std::optional core_distances = std::nullopt; + if (core_distances_tensor) { + core_distances = cuvs::core::from_dlpack(core_distances_tensor); + } + + cuvs::neighbors::all_neighbors::build( + cpp_res, cpp_params, dataset, indices, distances, core_distances, alpha); + } else { + using indices_mdspan_t = raft::device_matrix_view; + using distances_mdspan_t = raft::device_matrix_view; + using core_mdspan_t = raft::device_vector_view; + + auto indices = cuvs::core::from_dlpack(indices_tensor); + + std::optional distances = std::nullopt; + if (distances_tensor) { + distances = cuvs::core::from_dlpack(distances_tensor); + } + + std::optional core_distances = std::nullopt; + if (core_distances_tensor) { + core_distances = 
cuvs::core::from_dlpack(core_distances_tensor); + } + + cuvs::neighbors::all_neighbors::build( + cpp_res, cpp_params, dataset, indices, distances, core_distances, alpha); + } } } // namespace diff --git a/cpp/include/cuvs/neighbors/all_neighbors.hpp b/cpp/include/cuvs/neighbors/all_neighbors.hpp index 70e066ef2f..a9f134a390 100644 --- a/cpp/include/cuvs/neighbors/all_neighbors.hpp +++ b/cpp/include/cuvs/neighbors/all_neighbors.hpp @@ -116,11 +116,11 @@ struct all_neighbors_params { * to build all-neighbors knn graph * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory - * @param[out] indices nearest neighbor indices of shape [n_row x k] - * @param[out] distances nearest neighbor distances [n_row x k] - * @param[out] core_distances array for core distances of size [n_row]. Requires distances matrix to - * compute core_distances. If core_distances is given, the resulting indices and distances will be - * mutual reachability space. + * @param[out] indices nearest neighbor indices of shape [n_row x k] in device memory + * @param[out] distances nearest neighbor distances [n_row x k] in device memory + * @param[out] core_distances core distances of size [n_row] in device memory. Requires distances + * matrix to compute core_distances. If core_distances is given, the resulting indices and distances + * will be in mutual reachability space. * @param[in] alpha distance scaling parameter as used in robust single linkage. */ void build( @@ -152,11 +152,11 @@ void build( * to build all-neighbors knn graph * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory - * @param[out] indices nearest neighbor indices of shape [n_row x k] - * @param[out] distances nearest neighbor distances [n_row x k] - * @param[out] core_distances array for core distances of size [n_row]. Requires distances matrix to - * compute core_distances. 
If core_distances is given, the resulting indices and distances will be - * mutual reachability space. + * @param[out] indices nearest neighbor indices of shape [n_row x k] in device memory + * @param[out] distances nearest neighbor distances [n_row x k] in device memory + * @param[out] core_distances core distances of size [n_row] in device memory. Requires distances + * matrix to compute core_distances. If core_distances is given, the resulting indices and distances + * will be in mutual reachability space. * @param[in] alpha distance scaling parameter as used in robust single linkage. */ void build( @@ -168,5 +168,76 @@ void build( std::optional> core_distances = std::nullopt, float alpha = 1.0); +/** + * @brief Builds an approximate all-neighbors knn graph (find nearest neighbors for all the training + * vectors) with host memory output buffers. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * all_neighbors::all_neighbors_params params; + * auto indices = raft::make_host_matrix(handle, n_row, k); + * auto distances = raft::make_host_matrix(handle, n_row, k); + * all_neighbors::build(res, params, dataset, indices.view(), distances.view()); + * @endcode + * + * @param[in] handle raft::resources is an object managing resources + * @param[in] params an instance of all_neighbors::all_neighbors_params that are parameters + * to build all-neighbors knn graph + * @param[in] dataset raft::host_matrix_view input dataset expected to be located + * in host memory + * @param[out] indices nearest neighbor indices of shape [n_row x k] in host memory + * @param[out] distances nearest neighbor distances [n_row x k] in host memory + * @param[out] core_distances core distances of size [n_row] in host memory. Requires distances + * matrix to compute core_distances. If core_distances is given, the resulting indices and distances + * will be in mutual reachability space. 
+ * @param[in] alpha distance scaling parameter as used in robust single linkage. + */ +void build( + const raft::resources& handle, + const all_neighbors_params& params, + raft::host_matrix_view dataset, + raft::host_matrix_view indices, + std::optional> distances = std::nullopt, + std::optional> core_distances = std::nullopt, + float alpha = 1.0); + +/** + * @brief Builds an approximate all-neighbors knn graph (find nearest neighbors for all the training + * vectors) with host memory output buffers. params.n_clusters should be 1 for data on device. To + * use a larger params.n_clusters for efficient device memory usage, put data on host RAM. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * all_neighbors::all_neighbors_params params; + * auto indices = raft::make_host_matrix(handle, n_row, k); + * auto distances = raft::make_host_matrix(handle, n_row, k); + * all_neighbors::build(res, params, dataset, indices.view(), distances.view()); + * @endcode + * + * @param[in] handle raft::resources is an object managing resources + * @param[in] params an instance of all_neighbors::all_neighbors_params that are parameters + * to build all-neighbors knn graph + * @param[in] dataset raft::device_matrix_view input dataset expected to be located + * in device memory + * @param[out] indices nearest neighbor indices of shape [n_row x k] in host memory + * @param[out] distances nearest neighbor distances [n_row x k] in host memory + * @param[out] core_distances core distances of size [n_row] in host memory. Requires distances + * matrix to compute core_distances. If core_distances is given, the resulting indices and distances + * will be in mutual reachability space. + * @param[in] alpha distance scaling parameter as used in robust single linkage. 
+ */ +void build( + const raft::resources& handle, + const all_neighbors_params& params, + raft::device_matrix_view dataset, + raft::host_matrix_view indices, + std::optional> distances = std::nullopt, + std::optional> core_distances = std::nullopt, + float alpha = 1.0); + /** @} */ } // namespace cuvs::neighbors::all_neighbors diff --git a/cpp/src/neighbors/all_neighbors/all_neighbors.cu b/cpp/src/neighbors/all_neighbors/all_neighbors.cu index d44c8cb7a0..25f445bab2 100644 --- a/cpp/src/neighbors/all_neighbors/all_neighbors.cu +++ b/cpp/src/neighbors/all_neighbors/all_neighbors.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -30,6 +30,30 @@ namespace cuvs::neighbors::all_neighbors { { \ return all_neighbors::detail::build( \ handle, params, dataset, indices, distances, core_distances, alpha); \ + } \ + \ + void build(const raft::resources& handle, \ + const all_neighbors_params& params, \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view indices, \ + std::optional> distances, \ + std::optional> core_distances, \ + T alpha) \ + { \ + return all_neighbors::detail::build( \ + handle, params, dataset, indices, distances, core_distances, alpha); \ + } \ + \ + void build(const raft::resources& handle, \ + const all_neighbors_params& params, \ + raft::device_matrix_view dataset, \ + raft::host_matrix_view indices, \ + std::optional> distances, \ + std::optional> core_distances, \ + T alpha) \ + { \ + return all_neighbors::detail::build( \ + handle, params, dataset, indices, distances, core_distances, alpha); \ } CUVS_INST_ALL_NEIGHBORS(float, int64_t); diff --git a/cpp/src/neighbors/all_neighbors/all_neighbors.cuh b/cpp/src/neighbors/all_neighbors/all_neighbors.cuh index b6831d1f3c..7b0d6787ff 100644 --- a/cpp/src/neighbors/all_neighbors/all_neighbors.cuh +++ 
b/cpp/src/neighbors/all_neighbors/all_neighbors.cuh @@ -7,12 +7,43 @@ #include "../detail/reachability.cuh" #include "all_neighbors_batched.cuh" #include +#include #include #include namespace cuvs::neighbors::all_neighbors::detail { using namespace cuvs::neighbors; +// Host-side shift: shifts columns to the right by k, fills first column with row IDs (for indices) +// or 0 (for distances) +template +void host_shift_columns(Mdspan in_out, + size_t k, + std::optional fill_value = std::nullopt) +{ + size_t n_rows = in_out.extent(0); + size_t n_cols = in_out.extent(1); + RAFT_EXPECTS(n_cols > k, "Shift size k should be smaller than the number of columns in matrix."); + + // Shift columns to the right by k +#pragma omp parallel for + for (size_t i = 0; i < n_rows; i++) { + // Copy columns from right to left + for (size_t j = n_cols - 1; j >= k; j--) { + in_out(i, j) = in_out(i, j - k); + } + // Fill first k columns + for (size_t j = 0; j < k; j++) { + if (fill_value.has_value()) { + in_out(i, j) = fill_value.value(); + } else { + // Fill with row ID (for indices) or 0 (for distances) + in_out(i, j) = static_cast(i); + } + } + } +} + GRAPH_BUILD_ALGO check_params_validity(const all_neighbors_params& params, bool do_mutual_reachability_dist) { @@ -71,34 +102,89 @@ GRAPH_BUILD_ALGO check_params_validity(const all_neighbors_params& params, } // Single build (i.e. 
no batching) supports both host and device datasets -template -void single_build( - const raft::resources& handle, - const all_neighbors_params& params, - mdspan, row_major, Accessor> dataset, - raft::device_matrix_view indices, - std::optional> distances = std::nullopt, - DistEpilogueT dist_epilogue = DistEpilogueT{}) +template , + typename DistEpilogueT = raft::identity_op> +void single_build(const raft::resources& handle, + const all_neighbors_params& params, + DatasetMdspan dataset, + IndicesMdspan indices, + std::optional distances = std::nullopt, + DistEpilogueT dist_epilogue = DistEpilogueT{}) { size_t num_rows = static_cast(dataset.extent(0)); size_t num_cols = static_cast(dataset.extent(1)); + size_t k = indices.extent(1); + + constexpr bool indices_are_host = raft::is_output_host_mdspan_t::value; - auto knn_builder = get_knn_builder( - handle, params, num_rows, num_rows, indices.extent(1), indices, distances, dist_epilogue); + // Builder requires device-side arrays. Allocate device buffers when user provides host arrays. 
+ std::optional> indices_d; + std::optional> distances_d; + + auto indices_d_view = [&]() -> raft::device_matrix_view { + if constexpr (indices_are_host) { + indices_d.emplace(raft::make_device_matrix(handle, num_rows, k)); + return indices_d.value().view(); + } else { + return raft::make_device_matrix_view(indices.data_handle(), num_rows, k); + } + }(); + + auto distances_d_opt_view = [&]() -> std::optional> { + if constexpr (indices_are_host) { + if (distances.has_value()) { + distances_d.emplace(raft::make_device_matrix(handle, num_rows, k)); + return distances_d.value().view(); + } + } else { + return distances; + } + return std::nullopt; + }(); + + auto knn_builder = get_knn_builder(handle, + params, + num_rows, + num_rows, + indices.extent(1), + indices_d_view, + distances_d_opt_view, + dist_epilogue); knn_builder->prepare_build(dataset); knn_builder->build_knn(dataset); + + if constexpr (indices_are_host) { + raft::copy(indices.data_handle(), + indices_d_view.data_handle(), + num_rows * k, + raft::resource::get_cuda_stream(handle)); + if (distances.has_value()) { + raft::copy(distances.value().data_handle(), + distances_d.value().data_handle(), + num_rows * k, + raft::resource::get_cuda_stream(handle)); + } + raft::resource::sync_stream(handle); + } } -template -void build( - const raft::resources& handle, - const all_neighbors_params& params, - raft::host_matrix_view dataset, - raft::device_matrix_view indices, - std::optional> distances = std::nullopt, - std::optional> core_distances = std::nullopt, - T alpha = 1.0) +template , + typename CoreDistMdspan = raft::device_vector_view> +void build(const raft::resources& handle, + const all_neighbors_params& params, + raft::host_matrix_view dataset, + IndicesMdspan indices, + std::optional distances = std::nullopt, + std::optional core_distances = std::nullopt, + T alpha = 1.0) { auto build_algo = check_params_validity(params, core_distances.has_value()); @@ -116,9 +202,11 @@ void build( "distances matrix should 
be allocated to get mutual reachability distance."); } + constexpr bool outputs_are_host = raft::is_output_host_mdspan_t::value; + std::unique_ptr> aux_vectors; if (params.n_clusters == 1) { - single_build(handle, params, dataset, indices, distances); + single_build(handle, params, dataset, indices, distances); } else { if (core_distances.has_value()) { aux_vectors = std::make_unique>( @@ -132,52 +220,98 @@ void build( // NN Descent doesn't include self loops. Shifted to keep it consistent with brute force and ivfpq bool need_shift = (build_algo == GRAPH_BUILD_ALGO::NN_DESCENT) && (params.metric != cuvs::distance::DistanceType::InnerProduct); - if (need_shift) { - raft::matrix::shift(handle, indices, 1); - if (distances.has_value()) { - raft::matrix::shift(handle, distances.value(), 1, std::make_optional(0.0)); + if constexpr (outputs_are_host) { + host_shift_columns(indices, 1, std::nullopt); // fill with row ID + if (distances.has_value()) { + host_shift_columns(distances.value(), 1, std::make_optional(0.0)); + } + } else { + auto indices_d = raft::make_device_matrix_view( + const_cast(indices.data_handle()), indices.extent(0), indices.extent(1)); + raft::matrix::shift(handle, indices_d, 1); + if (distances.has_value()) { + auto distances_d = + raft::make_device_matrix_view(const_cast(distances.value().data_handle()), + distances.value().extent(0), + distances.value().extent(1)); + raft::matrix::shift(handle, distances_d, 1, std::make_optional(0.0)); + } } } if (core_distances.has_value()) { // calculate mutual reachability distances size_t k = indices.extent(1); size_t num_rows = core_distances.value().size(); - cuvs::neighbors::detail::reachability::core_distances( - handle, - distances.value().data_handle(), - k, - k, - num_rows, - core_distances.value().data_handle()); + + std::optional> core_distances_tmp; + T* core_distances_d_ptr; + if constexpr (outputs_are_host) { + // Core distance is the last column (index k-1) of distances +#pragma omp parallel for + 
for (size_t i = 0; i < num_rows; i++) { + core_distances.value()(i) = distances.value()(i, k - 1); + } + core_distances_tmp.emplace(raft::make_device_vector(handle, num_rows)); + core_distances_d_ptr = core_distances_tmp.value().data_handle(); + raft::copy(core_distances_d_ptr, + core_distances.value().data_handle(), + num_rows, + raft::resource::get_cuda_stream(handle)); + } else { + core_distances_d_ptr = core_distances.value().data_handle(); + cuvs::neighbors::detail::reachability::core_distances( + handle, distances.value().data_handle(), k, k, num_rows, core_distances_d_ptr); + } using ReachabilityPP = cuvs::neighbors::detail::reachability::ReachabilityPostProcess; - auto dist_epilogue = ReachabilityPP{core_distances.value().data_handle(), alpha, num_rows}; + auto dist_epilogue = ReachabilityPP{core_distances_d_ptr, alpha, num_rows}; if (params.n_clusters == 1) { - single_build(handle, params, dataset, indices, distances, dist_epilogue); + single_build(handle, params, dataset, indices, distances, dist_epilogue); } else { batch_build(handle, params, dataset, indices, distances, aux_vectors.get(), dist_epilogue); } if (need_shift) { - raft::matrix::shift(handle, indices, 1); - raft::matrix::shift(handle, - distances.value(), - raft::make_device_matrix_view( - core_distances.value().data_handle(), num_rows, 1)); + if constexpr (outputs_are_host) { + host_shift_columns(indices, 1, std::nullopt); + size_t n_cols = distances.value().extent(1); +#pragma omp parallel for + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = n_cols - 1; j >= 1; j--) { + distances.value()(i, j) = distances.value()(i, j - 1); + } + distances.value()(i, 0) = core_distances.value()(i); + } + } else { + auto indices_d = raft::make_device_matrix_view( + const_cast(indices.data_handle()), indices.extent(0), indices.extent(1)); + raft::matrix::shift(handle, indices_d, 1); + auto distances_d = + raft::make_device_matrix_view(const_cast(distances.value().data_handle()), + 
distances.value().extent(0), + distances.value().extent(1)); + raft::matrix::shift(handle, + distances_d, + raft::make_device_matrix_view( + core_distances.value().data_handle(), num_rows, 1)); + } } } } -template -void build( - const raft::resources& handle, - const all_neighbors_params& params, - raft::device_matrix_view dataset, - raft::device_matrix_view indices, - std::optional> distances = std::nullopt, - std::optional> core_distances = std::nullopt, - T alpha = 1.0) +template , + typename CoreDistMdspan = raft::device_vector_view> +void build(const raft::resources& handle, + const all_neighbors_params& params, + raft::device_matrix_view dataset, + IndicesMdspan indices, + std::optional distances = std::nullopt, + std::optional core_distances = std::nullopt, + T alpha = 1.0) { auto build_algo = check_params_validity(params, core_distances.has_value()); @@ -200,41 +334,85 @@ void build( "Batched all-neighbors build is not supported with data on device. Put data on host for " "batch build."); } else { - single_build(handle, params, dataset, indices, distances); + single_build(handle, params, dataset, indices, distances); } // NN Descent doesn't include self loops. 
Shifted to keep it consistent with brute force and ivfpq bool need_shift = (build_algo == GRAPH_BUILD_ALGO::NN_DESCENT) && (params.metric != cuvs::distance::DistanceType::InnerProduct); + constexpr bool outputs_are_host = raft::is_output_host_mdspan_t::value; if (need_shift) { - raft::matrix::shift(handle, indices, 1); - if (distances.has_value()) { - raft::matrix::shift(handle, distances.value(), 1, std::make_optional(0.0)); + if constexpr (outputs_are_host) { + host_shift_columns(indices, 1, std::nullopt); // fill first column with row ID + if (distances.has_value()) { // fill first column with 0 + host_shift_columns(distances.value(), 1, std::make_optional(0.0)); + } + } else { + auto indices_d = raft::make_device_matrix_view( + const_cast(indices.data_handle()), indices.extent(0), indices.extent(1)); + raft::matrix::shift(handle, indices_d, 1); + if (distances.has_value()) { + auto distances_d = + raft::make_device_matrix_view(const_cast(distances.value().data_handle()), + distances.value().extent(0), + distances.value().extent(1)); + raft::matrix::shift(handle, distances_d, 1, std::make_optional(0.0)); + } } } if (core_distances.has_value()) { size_t k = indices.extent(1); size_t num_rows = core_distances.value().size(); - cuvs::neighbors::detail::reachability::core_distances( - handle, - distances.value().data_handle(), - k, - k, - num_rows, - core_distances.value().data_handle()); + + std::optional> core_distances_tmp; + T* core_distances_d_ptr; + if constexpr (outputs_are_host) { +#pragma omp parallel for + for (size_t i = 0; i < num_rows; i++) { + core_distances.value()(i) = distances.value()(i, k - 1); + } + core_distances_tmp.emplace(raft::make_device_vector(handle, num_rows)); + core_distances_d_ptr = core_distances_tmp.value().data_handle(); + raft::copy(core_distances_d_ptr, + core_distances.value().data_handle(), + num_rows, + raft::resource::get_cuda_stream(handle)); + } else { + core_distances_d_ptr = core_distances.value().data_handle(); + 
cuvs::neighbors::detail::reachability::core_distances( + handle, distances.value().data_handle(), k, k, num_rows, core_distances_d_ptr); + } using ReachabilityPP = cuvs::neighbors::detail::reachability::ReachabilityPostProcess; - auto dist_epilogue = ReachabilityPP{core_distances.value().data_handle(), alpha, num_rows}; - single_build(handle, params, dataset, indices, distances, dist_epilogue); + auto dist_epilogue = ReachabilityPP{core_distances_d_ptr, alpha, num_rows}; + single_build(handle, params, dataset, indices, distances, dist_epilogue); if (need_shift) { - raft::matrix::shift(handle, indices, 1); - raft::matrix::shift(handle, - distances.value(), - raft::make_device_matrix_view( - core_distances.value().data_handle(), num_rows, 1)); + if constexpr (outputs_are_host) { + host_shift_columns(indices, 1, std::nullopt); + size_t n_cols = distances.value().extent(1); +#pragma omp parallel for + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = n_cols - 1; j >= 1; j--) { + distances.value()(i, j) = distances.value()(i, j - 1); + } + distances.value()(i, 0) = core_distances.value()(i); + } + } else { + auto indices_d = raft::make_device_matrix_view( + const_cast(indices.data_handle()), indices.extent(0), indices.extent(1)); + raft::matrix::shift(handle, indices_d, 1); + auto distances_d = + raft::make_device_matrix_view(const_cast(distances.value().data_handle()), + distances.value().extent(0), + distances.value().extent(1)); + raft::matrix::shift(handle, + distances_d, + raft::make_device_matrix_view( + core_distances.value().data_handle(), num_rows, 1)); + } } } } diff --git a/cpp/src/neighbors/all_neighbors/all_neighbors_batched.cuh b/cpp/src/neighbors/all_neighbors/all_neighbors_batched.cuh index 7e4ff748a4..f4b3638f54 100644 --- a/cpp/src/neighbors/all_neighbors/all_neighbors_batched.cuh +++ b/cpp/src/neighbors/all_neighbors/all_neighbors_batched.cuh @@ -466,15 +466,18 @@ struct BatchBuildAux { } }; -template -void batch_build( - const 
raft::resources& handle, - const all_neighbors_params& params, - raft::host_matrix_view dataset, - raft::device_matrix_view indices, - std::optional> distances = std::nullopt, - BatchBuildAux* aux_vectors = nullptr, - DistEpilogueT dist_epilogue = DistEpilogueT{}) +template , + typename DistEpilogueT = raft::identity_op> +void batch_build(const raft::resources& handle, + const all_neighbors_params& params, + raft::host_matrix_view dataset, + IndicesMdspan indices, + std::optional distances = std::nullopt, + BatchBuildAux* aux_vectors = nullptr, + DistEpilogueT dist_epilogue = DistEpilogueT{}) { if (raft::resource::is_multi_gpu(handle)) { // For efficient CPU-computation of omp parallel for regions per GPU @@ -606,15 +609,15 @@ void batch_build( inverted_indices_view); } - raft::copy( - handle, - raft::make_device_vector_view(indices.data_handle(), num_rows * k), - raft::make_device_vector_view(global_neighbors.data_handle(), num_rows * k)); + raft::copy(indices.data_handle(), + global_neighbors.data_handle(), + num_rows * k, + raft::resource::get_cuda_stream(handle)); if (distances.has_value()) { - raft::copy( - handle, - raft::make_device_vector_view(distances.value().data_handle(), num_rows * k), - raft::make_device_vector_view(global_distances.data_handle(), num_rows * k)); + raft::copy(distances.value().data_handle(), + global_distances.data_handle(), + num_rows * k, + raft::resource::get_cuda_stream(handle)); } } diff --git a/cpp/tests/neighbors/all_neighbors.cuh b/cpp/tests/neighbors/all_neighbors.cuh index 3b6b245a4e..7ccaf7003b 100644 --- a/cpp/tests/neighbors/all_neighbors.cuh +++ b/cpp/tests/neighbors/all_neighbors.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -10,6 +10,7 @@ #include "ann_utils.cuh" #include "naive_knn.cuh" #include +#include #include #include #include @@ -44,6 +45,7 @@ struct AllNeighborsInputs { int k; bool data_on_host; bool mutual_reach; + bool output_on_host; }; inline ::std::ostream& operator<<(::std::ostream& os, const AllNeighborsInputs& p) @@ -51,7 +53,8 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AllNeighborsInputs& os << "dataset shape=" << p.n_rows << "x" << p.dim << ", k=" << p.k << ", metric=" << static_cast(std::get<1>(p.build_algo_metric_recall)) << ", clusters=" << std::get<0>(p.cluster_nearestcluster) - << ", overlap_factor=" << std::get<1>(p.cluster_nearestcluster) << std::endl; + << ", overlap_factor=" << std::get<1>(p.cluster_nearestcluster) + << ", output_on_host=" << p.output_on_host << std::endl; return os; } @@ -150,38 +153,71 @@ void get_graphs(raft::resources& handle, } { - rmm::device_uvector distances_allNN_dev(queries_size, cuda_stream); - rmm::device_uvector indices_allNN_dev(queries_size, cuda_stream); - - if (ps.data_on_host) { - auto database_h = raft::make_host_matrix(ps.n_rows, ps.dim); - raft::copy(database_h.data_handle(), database.data(), ps.n_rows * ps.dim, cuda_stream); - - all_neighbors::build( - handle, - params, - raft::make_const_mdspan(database_h.view()), - raft::make_device_matrix_view(indices_allNN_dev.data(), ps.n_rows, ps.k), - raft::make_device_matrix_view(distances_allNN_dev.data(), ps.n_rows, ps.k), - ps.mutual_reach - ? 
std::make_optional(raft::make_device_vector(handle, ps.n_rows).view()) - : std::nullopt); - + if (ps.output_on_host) { + auto indices_allNN_h = raft::make_host_matrix(ps.n_rows, ps.k); + auto distances_allNN_h = raft::make_host_matrix(ps.n_rows, ps.k); + + if (ps.data_on_host) { + auto database_h = raft::make_host_matrix(ps.n_rows, ps.dim); + raft::copy(database_h.data_handle(), database.data(), ps.n_rows * ps.dim, cuda_stream); + + all_neighbors::build( + handle, + params, + raft::make_const_mdspan(database_h.view()), + indices_allNN_h.view(), + std::make_optional(distances_allNN_h.view()), + ps.mutual_reach + ? std::make_optional(raft::make_host_vector(ps.n_rows).view()) + : std::nullopt); + } else { + all_neighbors::build( + handle, + params, + raft::make_device_matrix_view(database.data(), ps.n_rows, ps.dim), + indices_allNN_h.view(), + std::make_optional(distances_allNN_h.view()), + ps.mutual_reach + ? std::make_optional(raft::make_host_vector(ps.n_rows).view()) + : std::nullopt); + } + + memcpy(indices_allNN.data(), indices_allNN_h.data_handle(), queries_size * sizeof(IdxT)); + memcpy( + distances_allNN.data(), distances_allNN_h.data_handle(), queries_size * sizeof(DistanceT)); } else { - all_neighbors::build( - handle, - params, - raft::make_device_matrix_view(database.data(), ps.n_rows, ps.dim), - raft::make_device_matrix_view(indices_allNN_dev.data(), ps.n_rows, ps.k), - raft::make_device_matrix_view(distances_allNN_dev.data(), ps.n_rows, ps.k), - ps.mutual_reach - ? 
std::make_optional(raft::make_device_vector(handle, ps.n_rows).view()) - : std::nullopt); + rmm::device_uvector distances_allNN_dev(queries_size, cuda_stream); + rmm::device_uvector indices_allNN_dev(queries_size, cuda_stream); + + if (ps.data_on_host) { + auto database_h = raft::make_host_matrix(ps.n_rows, ps.dim); + raft::copy(database_h.data_handle(), database.data(), ps.n_rows * ps.dim, cuda_stream); + + all_neighbors::build( + handle, + params, + raft::make_const_mdspan(database_h.view()), + raft::make_device_matrix_view(indices_allNN_dev.data(), ps.n_rows, ps.k), + raft::make_device_matrix_view(distances_allNN_dev.data(), ps.n_rows, ps.k), + ps.mutual_reach + ? std::make_optional(raft::make_device_vector(handle, ps.n_rows).view()) + : std::nullopt); + } else { + all_neighbors::build( + handle, + params, + raft::make_device_matrix_view(database.data(), ps.n_rows, ps.dim), + raft::make_device_matrix_view(indices_allNN_dev.data(), ps.n_rows, ps.k), + raft::make_device_matrix_view(distances_allNN_dev.data(), ps.n_rows, ps.k), + ps.mutual_reach + ? 
std::make_optional(raft::make_device_vector(handle, ps.n_rows).view()) + : std::nullopt); + } + + raft::copy(indices_allNN.data(), indices_allNN_dev.data(), queries_size, cuda_stream); + raft::copy(distances_allNN.data(), distances_allNN_dev.data(), queries_size, cuda_stream); + raft::resource::sync_stream(handle); } - - raft::copy(indices_allNN.data(), indices_allNN_dev.data(), queries_size, cuda_stream); - raft::copy(distances_allNN.data(), distances_allNN_dev.data(), queries_size, cuda_stream); - raft::resource::sync_stream(handle); } } @@ -258,7 +294,8 @@ const std::vector inputsSingle = {64, 137}, // dim {16, 23}, // graph_degree {false, true}, // data on host - {false} // mutual_reach + {false}, // mutual_reach + {false, true} // output on host ); const std::vector inputsBatch = @@ -281,7 +318,8 @@ const std::vector inputsBatch = {64, 137}, // dim {16, 23}, // graph_degree {true}, // data on host - {false} // mutual_reach + {false}, // mutual_reach + {false, true} // output on host ); const std::vector mutualReachSingle = @@ -297,7 +335,8 @@ const std::vector mutualReachSingle = {64, 137}, // dim {16, 23}, // graph_degree {false, true}, // data on host - {true} // mutual_reach + {true}, // mutual_reach + {false, true} // output on host ); const std::vector mutualReachBatch = @@ -317,7 +356,8 @@ const std::vector mutualReachBatch = {64, 137}, // dim {16, 23}, // graph_degree {true}, // data on host - {true} // mutual_reach + {true}, // mutual_reach + {false, true} // output on host ); } // namespace cuvs::neighbors::all_neighbors diff --git a/python/cuvs/cuvs/neighbors/all_neighbors/all_neighbors.pyx b/python/cuvs/cuvs/neighbors/all_neighbors/all_neighbors.pyx index fb4750d90d..812e1ebe24 100644 --- a/python/cuvs/cuvs/neighbors/all_neighbors/all_neighbors.pyx +++ b/python/cuvs/cuvs/neighbors/all_neighbors/all_neighbors.pyx @@ -1,9 +1,11 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # # cython: language_level=3 +import warnings + import numpy as np cimport cuvs.common.cydlpack @@ -198,7 +200,8 @@ def build(dataset, k, params, *, distances=None, core_distances=None, alpha=1.0, - resources=None): + resources=None, + return_on_host=False): """ All-neighbors allows building an approximate all-neighbors knn graph. Given a full dataset, it finds nearest neighbors for all the training @@ -217,14 +220,18 @@ def build(dataset, k, params, *, Parameters object containing all build settings including algorithm choice and algorithm-specific parameters. indices : array_like, optional - Optional output buffer for indices [num_rows x k] on device - (int64). If not provided, will be allocated automatically. + Optional output buffer for indices [num_rows x k] (int64). + Accepts a numpy array (host) or a device array (CUDA array + interface). If not provided, allocated according to return_on_host. distances : array_like, optional - Optional output buffer for distances [num_rows x k] on device - (float32) + Optional output buffer for distances [num_rows x k] (float32). + Accepts a numpy array (host) or a device array (CUDA array + interface). Has to be the same memory location as indices. core_distances : array_like, optional - Optional output buffer for core distances [num_rows] on device - (float32). Requires distances parameter to be provided. + Optional output buffer for core distances [num_rows] (float32). + Accepts a numpy array (host) or a device array (CUDA array + interface). Has to be the same memory location as indices. + Requires distances parameter to be provided. alpha : float, default=1.0 Mutual-reachability scaling; used only when core_distances is provided @@ -232,15 +239,20 @@ def build(dataset, k, params, *, CUDA resources to use for the operation. If not provided, a default Resources object will be created. 
Use MultiGpuResources to enable multi-GPU execution across multiple devices. + return_on_host : bool, default=False + Controls whether auto-allocated output arrays live on host (numpy) + or device (CUDA) memory. Ignored when indices or distances buffers + are provided, since host/device placement is inferred from those. Returns ------- indices : array_like - k-NN indices for each point [num_rows x k], always on device. + k-NN indices for each point [num_rows x k]. If indices buffer was provided, returns the same array filled with results. distances : array_like or None - k-NN distances if distances buffer was provided, None otherwise + k-NN distances if distances buffer was provided, None otherwise. + Returns the same array if provided (same memory location as input). core_distances : array_like or None Core distances if core_distances buffer was provided, None otherwise """ @@ -271,29 +283,70 @@ def build(dataset, k, params, *, "distances must be provided when core_distances is provided" ) - # Validate user-provided outputs (must be device arrays if provided) - if indices is not None and not hasattr( - indices, "__cuda_array_interface__" - ): - raise ValueError( - "indices must be a device array (CUDA array interface)" - ) - if distances is not None and not hasattr( - distances, "__cuda_array_interface__" - ): - raise ValueError( - "distances must be a device array (CUDA array interface)" - ) - if core_distances is not None and not hasattr( - core_distances, "__cuda_array_interface__" - ): - raise ValueError( - "core_distances must be a device array (CUDA array interface)" + # Validate user-provided outputs are either numpy (host) or device arrays + if indices is not None: + is_numpy = isinstance(indices, np.ndarray) + is_device = hasattr(indices, "__cuda_array_interface__") + if not (is_numpy or is_device): + raise ValueError( + "indices must be either a numpy array (host) or a device array " + "(CUDA array interface)" + ) + if distances is not None: + is_numpy = 
isinstance(distances, np.ndarray) + is_device = hasattr(distances, "__cuda_array_interface__") + if not (is_numpy or is_device): + raise ValueError( + "distances must be either a numpy array (host) or a device array " + "(CUDA array interface)" + ) + if indices is not None and distances is not None: + # checking if indices and distances are on the same memory location + indices_on_host = isinstance(indices, np.ndarray) + distances_on_host = isinstance(distances, np.ndarray) + if indices_on_host != distances_on_host: + raise ValueError( + "indices and distances must both be on host (numpy) or " + "both on device (CUDA array interface)" + ) + if core_distances is not None: + cd_is_numpy = isinstance(core_distances, np.ndarray) + cd_is_device = hasattr(core_distances, "__cuda_array_interface__") + if not (cd_is_numpy or cd_is_device): + raise ValueError( + "core_distances must be either a numpy array (host) or a " + "device array (CUDA array interface)" + ) + if indices is not None: + idx_on_host = isinstance(indices, np.ndarray) + cd_on_host = cd_is_numpy + if idx_on_host != cd_on_host: + raise ValueError( + "core_distances must be on the same memory location " + "as indices and distances (both host or both device)" + ) + + # Infer host vs device from user-provided output arrays; fall back to + # return_on_host only when neither indices nor distances is given. + if return_on_host and (indices is not None or distances is not None): + warnings.warn( + "return_on_host is ignored when indices or distances buffers " + "are provided. 
Output placement is inferred from the provided " + "arrays instead.", ) + if indices is not None: + on_host_output = isinstance(indices, np.ndarray) + elif distances is not None: + on_host_output = isinstance(distances, np.ndarray) + else: + # if nothing is provided, default to return_on_host + on_host_output = return_on_host - # Handle indices array (create if not provided) if indices is None: - indices = device_ndarray.empty((n_rows, k), dtype="int64") + if on_host_output: + indices = np.empty((n_rows, k), dtype="int64") + else: + indices = device_ndarray.empty((n_rows, k), dtype="int64") indices_out = wrap_array(indices) _check_input_array( diff --git a/python/cuvs/cuvs/tests/test_all_neighbors.py b/python/cuvs/cuvs/tests/test_all_neighbors.py index a232a58af8..2cd3c2e17e 100644 --- a/python/cuvs/cuvs/tests/test_all_neighbors.py +++ b/python/cuvs/cuvs/tests/test_all_neighbors.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # @@ -37,9 +37,19 @@ def make_cosine( @pytest.mark.parametrize("algo", ["nn_descent", "brute_force", "ivf_pq"]) @pytest.mark.parametrize("cluster", ["single_cluster", "multi_cluster"]) @pytest.mark.parametrize("metric", ["sqeuclidean", "cosine"]) -def test_all_neighbors_device_build_quality(algo, cluster, metric): +@pytest.mark.parametrize( + "output_location", + ["host_arrays", "device_arrays", "return_on_host", "return_on_device"], +) +def test_all_neighbors_device_build_quality( + algo, cluster, metric, output_location +): """Test device build with quality validation against brute force ground - truth. + truth. 
Exercises all output placement paths: + - host_arrays: pre-allocated numpy indices + distances + - device_arrays: pre-allocated cupy indices + distances + - return_on_host: auto-allocated via return_on_host=True + - return_on_device: auto-allocated via return_on_host=False """ n_rows, n_cols, k = 7151, 64, 16 @@ -98,24 +108,72 @@ def test_all_neighbors_device_build_quality(algo, cluster, metric): ) res = Resources() - indices, distances = all_neighbors.build( - X_device, - k, - params, - distances=cupy.empty((n_rows, k), dtype=cupy.float32), - resources=res, - ) + + distances_result = None + if output_location == "host_arrays": + indices_arg = np.empty((n_rows, k), dtype="int64") + distances_arg = np.empty((n_rows, k), dtype="float32") + indices_result, distances_result = all_neighbors.build( + X_device, + k, + params, + indices=indices_arg, + distances=distances_arg, + resources=res, + ) + assert isinstance(indices_result, np.ndarray) + assert isinstance(distances_result, np.ndarray) + elif output_location == "device_arrays": + indices_arg = cupy.empty((n_rows, k), dtype=cupy.int64) + distances_arg = cupy.empty((n_rows, k), dtype=cupy.float32) + indices_result, distances_result = all_neighbors.build( + X_device, + k, + params, + indices=indices_arg, + distances=distances_arg, + resources=res, + ) + assert hasattr(indices_result, "__cuda_array_interface__") + assert hasattr(distances_result, "__cuda_array_interface__") + elif output_location == "return_on_host": + indices_result = all_neighbors.build( + X_device, + k, + params, + return_on_host=True, + resources=res, + ) + assert isinstance(indices_result, np.ndarray) + elif output_location == "return_on_device": + indices_result = all_neighbors.build( + X_device, + k, + params, + return_on_host=False, + resources=res, + ) + assert hasattr(indices_result, "__cuda_array_interface__") bf_index = brute_force.build(X_device, metric=metric) bf_distances, bf_indices = brute_force.search(bf_index, X_device, k=k) - - 
indices_host = cupy.asnumpy(indices) bf_indices_host = cupy.asnumpy(bf_indices) - assert indices.shape == (n_rows, k) - assert indices.dtype == cupy.int64 - assert distances.shape == (n_rows, k) - assert distances.dtype == cupy.float32 + if isinstance(indices_result, np.ndarray): + indices_host = indices_result + else: + indices_host = cupy.asnumpy(indices_result) + + assert indices_host.shape == (n_rows, k) + assert indices_host.dtype == np.int64 + + if distances_result is not None: + if isinstance(distances_result, np.ndarray): + distances_host = distances_result + else: + distances_host = cupy.asnumpy(distances_result) + assert distances_host.shape == (n_rows, k) + assert distances_host.dtype == np.float32 recall = calc_recall(indices_host, bf_indices_host) assert recall > 0.85 @@ -124,9 +182,19 @@ def test_all_neighbors_device_build_quality(algo, cluster, metric): @pytest.mark.parametrize("algo", ["nn_descent", "brute_force", "ivf_pq"]) @pytest.mark.parametrize("cluster", ["single_cluster", "multi_cluster"]) @pytest.mark.parametrize("snmg", [False, True]) -def test_all_neighbors_host_build_quality(algo, cluster, snmg): +@pytest.mark.parametrize( + "output_location", + ["host_arrays", "device_arrays", "return_on_host", "return_on_device"], +) +def test_all_neighbors_host_build_quality( + algo, cluster, snmg, output_location +): """Test host build with quality validation against brute force ground - truth. + truth. 
Exercises all output placement paths: + - host_arrays: pre-allocated numpy indices + distances + - device_arrays: pre-allocated cupy indices + distances + - return_on_host: auto-allocated via return_on_host=True + - return_on_device: auto-allocated via return_on_host=False """ n_rows, n_cols, k = 7151, 64, 16 @@ -184,25 +252,71 @@ def test_all_neighbors_host_build_quality(algo, cluster, snmg): else: res = Resources() - indices, distances = all_neighbors.build( - X_host, - k, - params, - distances=cupy.empty((n_rows, k), dtype=cupy.float32), - resources=res, - ) + distances_result = None + if output_location == "host_arrays": + indices_arg = np.empty((n_rows, k), dtype="int64") + distances_arg = np.empty((n_rows, k), dtype="float32") + indices_result, distances_result = all_neighbors.build( + X_host, + k, + params, + indices=indices_arg, + distances=distances_arg, + resources=res, + ) + assert isinstance(indices_result, np.ndarray) + assert isinstance(distances_result, np.ndarray) + elif output_location == "device_arrays": + indices_arg = cupy.empty((n_rows, k), dtype=cupy.int64) + distances_arg = cupy.empty((n_rows, k), dtype=cupy.float32) + indices_result, distances_result = all_neighbors.build( + X_host, + k, + params, + indices=indices_arg, + distances=distances_arg, + resources=res, + ) + assert hasattr(indices_result, "__cuda_array_interface__") + assert hasattr(distances_result, "__cuda_array_interface__") + elif output_location == "return_on_host": + indices_result = all_neighbors.build( + X_host, + k, + params, + return_on_host=True, + resources=res, + ) + assert isinstance(indices_result, np.ndarray) + elif output_location == "return_on_device": + indices_result = all_neighbors.build( + X_host, + k, + params, + return_on_host=False, + resources=res, + ) + assert hasattr(indices_result, "__cuda_array_interface__") bf_index = brute_force.build(X_device, metric="sqeuclidean") bf_distances, bf_indices = brute_force.search(bf_index, X_device, k=k) - - 
indices_host = cupy.asnumpy(indices) bf_indices_host = cupy.asnumpy(bf_indices) - assert indices.shape == (n_rows, k) - assert indices.dtype == cupy.int64 - assert distances.shape == (n_rows, k) - assert distances.dtype == cupy.float32 + if isinstance(indices_result, np.ndarray): + indices_host = indices_result + else: + indices_host = cupy.asnumpy(indices_result) - recall = calc_recall(indices_host, bf_indices_host) + assert indices_host.shape == (n_rows, k) + assert indices_host.dtype == np.int64 + if distances_result is not None: + if isinstance(distances_result, np.ndarray): + distances_host = distances_result + else: + distances_host = cupy.asnumpy(distances_result) + assert distances_host.shape == (n_rows, k) + assert distances_host.dtype == np.float32 + + recall = calc_recall(indices_host, bf_indices_host) assert recall > 0.85