diff --git a/docs/source/python_api/cluster_kmeans.rst b/docs/source/python_api/cluster_kmeans.rst index 8fda17f80d..60f2c44d36 100644 --- a/docs/source/python_api/cluster_kmeans.rst +++ b/docs/source/python_api/cluster_kmeans.rst @@ -16,6 +16,11 @@ K-Means Fit .. autofunction:: cuvs.cluster.kmeans.fit +K-Means Fit-Predict +################### + +.. autofunction:: cuvs.cluster.kmeans.fit_predict + K-Means Predict ############### diff --git a/python/cuvs/cuvs/cluster/kmeans/__init__.py b/python/cuvs/cuvs/cluster/kmeans/__init__.py index f4765bcb64..3b2fa060be 100644 --- a/python/cuvs/cuvs/cluster/kmeans/__init__.py +++ b/python/cuvs/cuvs/cluster/kmeans/__init__.py @@ -1,7 +1,7 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 -from .kmeans import KMeansParams, cluster_cost, fit, predict +from .kmeans import KMeansParams, cluster_cost, fit, fit_predict, predict -__all__ = ["KMeansParams", "cluster_cost", "fit", "predict"] +__all__ = ["KMeansParams", "cluster_cost", "fit", "fit_predict", "predict"] diff --git a/python/cuvs/cuvs/cluster/kmeans/kmeans.pyx b/python/cuvs/cuvs/cluster/kmeans/kmeans.pyx index 489d983ac7..9169bf16b6 100644 --- a/python/cuvs/cuvs/cluster/kmeans/kmeans.pyx +++ b/python/cuvs/cuvs/cluster/kmeans/kmeans.pyx @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 # # cython: language_level=3 @@ -156,6 +156,10 @@ cdef class KMeansParams: FitOutput = namedtuple("FitOutput", "centroids inertia n_iter") +FitPredictOutput = namedtuple( + "FitPredictOutput", "labels centroids inertia n_iter" +) + @auto_sync_resources @auto_convert_output @@ -239,6 +243,86 @@ def fit( return FitOutput(centroids, inertia, n_iter) +@auto_sync_resources +@auto_convert_output +def fit_predict( + KMeansParams params, + X, + centroids=None, + sample_weights=None, + labels=None, + normalize_weight=True, + resources=None, +): + """ + Fit k-means on ``X`` and return cluster labels for the same data. + + This is equivalent to calling :func:`fit` followed by :func:`predict` on + ``X`` with the fitted centroids. + + Parameters + ---------- + params : KMeansParams + Parameters to use to fit the KMeans model + X : Input CUDA array interface compliant matrix shape (m, k) + centroids : Optional writable CUDA array interface compliant matrix + shape (n_clusters, k) + sample_weights : Optional input CUDA array interface compliant matrix shape + (m, 1) default: None + labels : Optional preallocated CUDA array interface matrix shape (m, 1) + to hold the output labels + normalize_weight : bool + Passed to :func:`predict`; True if the weights should be normalized + {resources_docstring} + + Returns + ------- + labels : raft.device_ndarray + Cluster index for each row of ``X`` + centroids : raft.device_ndarray + The fitted cluster centroids + inertia : float + Sum of squared distances of samples to their closest cluster center + (from the prediction step) + n_iter : int + Number of iterations used in :func:`fit` + + Examples + -------- + + >>> import cupy as cp + >>> + >>> from cuvs.cluster.kmeans import fit_predict, KMeansParams + >>> + >>> n_samples = 5000 + >>> n_features = 50 + >>> n_clusters = 3 + >>> + >>> X = cp.random.random_sample((n_samples, n_features), + ... 
dtype=cp.float32) + >>> + >>> params = KMeansParams(n_clusters=n_clusters) + >>> labels, centroids, inertia, n_iter = fit_predict(params, X) + """ + centroids_out, _, n_iter = fit( + params, + X, + centroids=centroids, + sample_weights=sample_weights, + resources=resources, + ) + labels_out, inertia_pred = predict( + params, + X, + centroids_out, + sample_weights=sample_weights, + labels=labels, + normalize_weight=normalize_weight, + resources=resources, + ) + return FitPredictOutput(labels_out, centroids_out, inertia_pred, n_iter) + + PredictOutput = namedtuple("PredictOutput", "labels inertia") diff --git a/python/cuvs/cuvs/tests/test_kmeans.py b/python/cuvs/cuvs/tests/test_kmeans.py index 6f18137b13..5fa956daf0 100644 --- a/python/cuvs/cuvs/tests/test_kmeans.py +++ b/python/cuvs/cuvs/tests/test_kmeans.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # @@ -6,7 +6,13 @@ import pytest from pylibraft.common import device_ndarray -from cuvs.cluster.kmeans import KMeansParams, cluster_cost, fit, predict +from cuvs.cluster.kmeans import ( + KMeansParams, + cluster_cost, + fit, + fit_predict, + predict, +) from cuvs.distance import pairwise_distance @@ -43,6 +49,52 @@ def test_kmeans_fit(n_rows, n_cols, n_clusters, dtype, hierarchical): assert np.all(labels.copy_to_host() == np.arange(labels.shape[0])) +@pytest.mark.parametrize("n_rows", [100]) +@pytest.mark.parametrize("n_cols", [5, 25]) +@pytest.mark.parametrize("n_clusters", [5, 15]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("hierarchical", [True, False]) +def test_kmeans_fit_predict(n_rows, n_cols, n_clusters, dtype, hierarchical): + if hierarchical and dtype == np.float64: + pytest.skip("hierarchical kmeans doesn't support float64") + + # generate some random input points / centroids + X_host = np.random.random_sample((n_rows, 
n_cols)).astype(dtype) + init_host = X_host[:n_clusters].copy() + + # initialize the centroids for fit_predict and sequential fit + init_for_fit_predict = device_ndarray(init_host) + init_for_sequential = device_ndarray(init_host.copy()) + + X = device_ndarray(X_host) + + params = KMeansParams( + n_clusters=n_clusters, + hierarchical=hierarchical, + init_method="Array", + ) + + labels_fp, centroids_fp, inertia_fp, n_iter_fp = fit_predict( + params, X, centroids=init_for_fit_predict + ) + centroids_seq, _, n_iter_seq = fit( + params, X, centroids=init_for_sequential + ) + + if hierarchical: + labels, pred_inertia = predict(params, X, centroids_fp) + assert inertia_fp == pred_inertia == 0.0 + else: + labels, _ = predict(params, X, centroids_seq) + assert n_iter_fp == n_iter_seq + + # make sure the labels are the same between fit_predict and sequential fit + np.testing.assert_array_equal( + labels_fp.copy_to_host(), + labels.copy_to_host(), + ) + + @pytest.mark.parametrize("n_rows", [100]) @pytest.mark.parametrize("n_cols", [5, 25]) @pytest.mark.parametrize("n_clusters", [4, 15])