diff --git a/.Rbuildignore b/.Rbuildignore index a4fee43..0a6f21b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -10,6 +10,7 @@ ^src/CMakeLists\.txt$ ^src/CMakeCache\.txt$ ^src/CMakeFiles/* +^src/\.cmake-build/* ^src/_deps/* ^src/eval_gpu_archs* ^src/*\.o$ diff --git a/DESCRIPTION b/DESCRIPTION index d88861e..a633018 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,5 +46,7 @@ LinkingTo: Rcpp Encoding: UTF-8 RoxygenNote: 7.3.3 OS_type: unix -SystemRequirements: RAPIDS cuML (see https://rapids.ai/start.html) +SystemRequirements: NVIDIA GPU and driver, CUDA Toolkit with nvcc, and uv or + Python/pip for automatic RAPIDS cuML bootstrap. Alternatively, an existing + RAPIDS cuML installation can be provided with CUML_PREFIX. NeedsCompilation: yes diff --git a/R/cuml_utils.R b/R/cuml_utils.R index abd8822..322f073 100644 --- a/R/cuml_utils.R +++ b/R/cuml_utils.R @@ -4,14 +4,23 @@ #' @return A logical value indicating whether the current installation \{cuda.ml\} #' was linked to a valid version of the RAPIDS cuML shared library. #' +#' @details +#' If this returns \code{FALSE}, \pkg{cuda.ml} was installed in stub-only mode. +#' On a GPU machine, verify that \code{nvidia-smi} and \code{nvcc --version} +#' both work, then reinstall \pkg{cuda.ml}. During installation, \pkg{cuda.ml} +#' can bootstrap RAPIDS cuML from pip wheels with \code{uv} or Python/pip. If +#' RAPIDS cuML is already installed, set \code{CUML_PREFIX} to a prefix +#' containing \code{include/cuml} and \code{lib/libcuml++.so} before +#' reinstalling. +#' #' @examples #' #' library(cuda.ml) #' #' if (!has_cuML()) { #' warning( -#' "Please install the RAPIDS cuML shared library first, and then re-", -#' "install {cuda.ml}." +#' "This installation was built without RAPIDS cuML. Verify `nvidia-smi` ", +#' "and `nvcc --version`, then reinstall {cuda.ml}." #' ) #' } #' @export diff --git a/R/knn.R b/R/knn.R index 4fc8ff6..d1ff50e 100644 --- a/R/knn.R +++ b/R/knn.R @@ -72,6 +72,7 @@ cuda_ml_knn_algo_ivfpq <- function(nlist, nprobe, m, n_bits, nlist = as.integer(nlist), nprobe = as.integer(nprobe), M = as.integer(m), + n_bits = as.integer(n_bits), usePrecomputedTables = as.logical(use_precomputed_tables) ) ) diff --git a/R/package.R b/R/package.R index e0ffb36..5de704c 100644 --- a/R/package.R +++ b/R/package.R @@ -2,6 +2,23 @@ #' #' This package provides a R interface for the RAPIDS cuML library. #' +#' @section Installation: +#' A functional GPU installation requires an NVIDIA GPU with a working driver, +#' a CUDA Toolkit installation that provides \code{nvcc}, and normal R package +#' build tools. During installation, \pkg{cuda.ml} first looks for an existing +#' RAPIDS installation through \code{CUML_PREFIX} or \code{CUDA_PATH}. If none +#' is found, it can bootstrap RAPIDS cuML from pip wheels with \code{uv} or +#' Python/pip and link against the resulting local prefix. +#' +#' On machines without a usable NVIDIA driver/GPU and \code{nvcc}, including +#' CRAN check machines, \pkg{cuda.ml} may install in stub-only mode. In that +#' mode \code{has_cuML()} returns \code{FALSE}, and cuML-backed algorithms are +#' unavailable until the system prerequisites are installed and \pkg{cuda.ml} +#' is reinstalled. +#' +#' Useful environment variables include \code{CUDA_HOME}, \code{CUML_PREFIX}, +#' \code{CUML_BOOTSTRAP}, and \code{CUML_BOOTSTRAP_CACHE}. +#' #' @author Yitao Li #' @import Rcpp #' @useDynLib cuda.ml, .registration = TRUE @@ -17,20 +34,15 @@ if (!has_cuML()) { packageStartupMessage( " - The current installation of {", pkgname, "} will not function as expected - because it was not linked with a valid version of the RAPIDS cuML shared - library. + The current installation of {", pkgname, "} was built without a usable + RAPIDS cuML shared library. + + To fix this, ensure `nvidia-smi` and `nvcc --version` both work, then + reinstall {", pkgname, "}. During installation, {", pkgname, "} can + bootstrap RAPIDS cuML from pip wheels with `uv` or Python/pip. - To fix this issue, please follow https://rapids.ai/start.html#get-rapids - to install the RAPIDS cuML shared library from Conda and ensure the - 'CUML_PREFIX' env variable is set to a valid RAPIDS conda env directory - (e.g., '/home/user/anaconda3/envs/rapids-21.06', '/usr', or similar) - during the installation of {", pkgname, "} or alternatively, follow - https://github.com/yitao-li/cuml-installation-notes#build-from-source-without-conda-and-without-multi-gpu-support - or - https://github.com/yitao-li/cuml-installation-notes#build-from-source-without-conda-and-with-multi-gpu-support - or similar to build and install RAPIDS cuML library from source, and - then re-install {", pkgname, "}.\n\n + If RAPIDS is already installed, set `CUML_PREFIX` to a prefix containing + include/cuml and lib/libcuml++.so before reinstalling.\n\n " ) } diff --git a/R/rand_forest.R b/R/rand_forest.R index 6d7380b..6e05882 100644 --- a/R/rand_forest.R +++ b/R/rand_forest.R @@ -331,6 +331,14 @@ cuda_ml_rand_forest_impl_regression <- function(processed, mtry, trees, min_n, #' @export cuda_ml_get_state.cuda_ml_rand_forest <- function(model) { + if (!cuda_ml_fil_enabled()) { + stop( + "Random forest serialization requires Treelite/FIL support, but FIL is ", + "disabled in this cuda.ml build.", + call. = FALSE + ) + } + get_state_impl <- switch(model$mode, classification = .rf_classifier_get_state, regression = .rf_regressor_get_state diff --git a/R/rand_proj.R b/R/rand_proj.R index 475ebec..92740a9 100644 --- a/R/rand_proj.R +++ b/R/rand_proj.R @@ -5,6 +5,22 @@ new_rproj_model <- function(rproj_ctx) { model } +cuda_ml_rand_proj_available <- function() { + tryCatch( + { + .rproj_johnson_lindenstrauss_min_dim(2L, 0.5) + TRUE + }, + error = function(e) { + if (grepl("random projection support is not available", e$message)) { + FALSE + } else { + stop(e) + } + } + ) +} + #' Random projection for dimensionality reduction. #' #' Generate a random projection matrix for dimensionality reduction, and diff --git a/R/tsvd.R b/R/tsvd.R index 92aec91..d2abde3 100644 --- a/R/tsvd.R +++ b/R/tsvd.R @@ -48,11 +48,25 @@ cuda_ml_tsvd <- function(x, transform_input = transform_input, verbosity = cuML_log_level ) + model <- tsvd_flip_signs(model) class(model) <- c("cuda_ml_tsvd", class(model)) model } +tsvd_flip_signs <- function(model) { + signs <- apply(model$components, 1L, function(x) { + if (x[[which.max(abs(x))]] < 0) -1 else 1 + }) + + model$components <- sweep(model$components, 1L, signs, `*`) + if (!is.null(model$transformed_data)) { + model$transformed_data <- sweep(model$transformed_data, 2L, signs, `*`) + } + + model +} + #' @export cuda_ml_transform.cuda_ml_tsvd <- function(model, x, ...) { .tsvd_transform(model = model, x = as.matrix(x)) diff --git a/README.Rmd b/README.Rmd index f8efc88..8089726 100644 --- a/README.Rmd +++ b/README.Rmd @@ -176,115 +176,117 @@ about the MNIST dataset: ## Installation -In order for {cuda.ml} to work as expected, the C++/CUDA source code of -{cuda.ml} must be linked with CUDA runtime and a valid copy of the RAPIDS cuML -library. +For a fully functional installation, {cuda.ml} needs: -Before installing {cuda.ml} itself, it may be worthwhile to take a quick look -through the sub-sections below on how to properly setup all of {cuda.ml}'s -required runtime dependencies. +- an NVIDIA GPU with a working NVIDIA driver; +- a CUDA Toolkit installation that provides `nvcc`; +- normal R package build tools; and +- either `uv` or Python with `pip`. -### Quick note on installing the RAPIDS cuML library: +When those prerequisites are present, {cuda.ml} can bootstrap RAPIDS cuML from +pip wheels during installation. You do not need conda, and you usually do not +need to set `CUML_PREFIX` manually. -Although Conda is the only officially supported distribution channel at the -moment for RAPIDS cuML (i.e., see https://rapids.ai/start.html#get-rapids), -you can still build and install this library from source without relying on -Conda. -See https://github.com/yitao-li/cuml-installation-notes for build-from-source -instructions. +On a new Ubuntu installation, install R/build/Python prerequisites: -### Quick install instructions for Ubuntu 20-04: - -#### Install deps: -``` -sudo apt install -y cmake ccache libblas3 liblapack3 +```bash +sudo apt update +sudo apt install -y r-base-dev build-essential git cmake \ + python3 python3-pip python3-venv ubuntu-drivers-common ``` +Install the NVIDIA driver, reboot, and verify that the driver can see your GPU: -### Install CUDA -(consult https://developer.nvidia.com/cuda-downloads for other platforms) ```bash -wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin -sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 -wget https://developer.download.nvidia.com/compute/cuda/11.4.2/local_installers/cuda-repo-ubuntu2004-11-4-local_11.4.2-470.57.02-1_amd64.deb -sudo dpkg -i cuda-repo-ubuntu2004-11-4-local_11.4.2-470.57.02-1_amd64.deb -sudo apt-key add /var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub -sudo apt-get update -sudo apt-get -y install cuda +sudo ubuntu-drivers install +sudo reboot + +nvidia-smi ``` -### Add CUDA executables to path -(nvcc is needed for building the C++/CUDA source code of {cuda.ml}) + +Install a CUDA Toolkit that includes `nvcc`. Use NVIDIA's CUDA Linux +installation guide for your Ubuntu release to add the CUDA apt repository, then: + ```bash -echo "export PATH=$PATH:/usr/local/cuda/bin" >> ~/.bashrc -source ~/.bashrc +sudo apt update +sudo apt install -y cuda-toolkit + +nvcc --version ``` -### Install Miniconda: +If the toolkit is installed but `nvcc` is not on `PATH`, set `CUDA_HOME` to the +toolkit prefix before installing {cuda.ml}, for example: + ```bash -wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -chmod +x Miniconda3-latest-Linux-x86_64.sh -./Miniconda3-latest-Linux-x86_64.sh -b -# consult https://rapids.ai/start.html for alternatives +export CUDA_HOME=/usr/local/cuda ``` -### Create and configure the conda env -``` -# This is a relatively big download, may take a while -~/miniconda3/bin/conda create -n rapids-21.08 -c rapidsai -c nvidia -c conda-forge \ - rapids-blazing=21.08 python=3.8 cudatoolkit=11.2 -``` +Then install {cuda.ml}: -### Install cmake -CUDA dependencies require a relatively recent version of CMake, so you need to install it manually -```bash -wget https://github.com/Kitware/CMake/releases/download/v3.22.0/cmake-3.22.0.tar.gz -cd cmake-3.22.0 -./bootstrap && make -j8 && sudo make install -cd .. +``` r +install.packages("cuda.ml") ``` -### Activate the conda env: -```bash -. ~/miniconda3/bin/activate -conda activate rapids-21.08 +And verify that the installed package was linked with real cuML: + +``` r +library(cuda.ml) +has_cuML() ``` -### Consider adjusting `LD_LIBRARY_PATH` +If this returns `TRUE`, {cuda.ml} is using RAPIDS cuML. If it returns `FALSE`, +the package installed in stub-only mode; check the install output for the first +missing prerequisite. -The subsequent steps may (or may not) fail without the following: +### What happens during installation -```bash -export LD_LIBRARY_PATH=~/miniconda3/envs/rapids-21.08/lib -``` +The configure script first looks for an existing RAPIDS installation through +`CUML_PREFIX` or `CUDA_PATH`. If no existing installation is found, and a +working NVIDIA driver/GPU plus `nvcc` are available, it bootstraps RAPIDS cuML +from pip wheels into a cache directory and links {cuda.ml} against that prefix. -If you get some error indicating a GLIBC version mismatch in the subsequent -steps, then please try adjusting `LD_LIBRARY_PATH` as a workaround. +The bootstrap prefers `uv` when available, then reticulate's managed `uv`, then +`python -m pip`, `python3 -m pip`, `pip`, and `pip3`. +Useful environment variables: -### Consider enabling ccache +- `CUDA_HOME`: CUDA Toolkit prefix containing `bin/nvcc`. +- `CUML_PREFIX`: existing RAPIDS prefix containing `include/cuml` and + `lib/libcuml++.so`. +- `CUML_BOOTSTRAP=0`: disable automatic RAPIDS pip bootstrap. +- `CUML_BOOTSTRAP_CACHE`: cache directory for bootstrapped RAPIDS headers and + libraries. +- `CUML_PIP_VERSION`: RAPIDS pip wheel version to install. -To speed up recompilation times during development, set this env var: -```bash -echo "export CUML4R_ENABLE_CCACHE=1" >> ~/.bashrc -. ~/.bashrc -``` +### CRAN and machines without GPUs -### Install {cuda.ml} the R package: +On CRAN, or on machines without a usable NVIDIA GPU/driver and `nvcc`, {cuda.ml} +can still install in stub-only mode. In that mode `has_cuML()` returns `FALSE` +and cuML-backed algorithms are not usable until the system prerequisites are +installed and {cuda.ml} is reinstalled. -You can install the released version of {cuda.ml} from -[CRAN](https://CRAN.R-project.org) with: +### Manual RAPIDS installations -``` r -install.packages("cuda.ml") -``` +If you already have RAPIDS cuML from pip, conda, or a source build, set +`CUML_PREFIX` to a prefix containing `include/cuml` and `lib/libcuml++.so` +before installing {cuda.ml}. In this case the automatic bootstrap is skipped. -And the development version from [GitHub](https://github.com/) with: +### Development version + +Install the development version from [GitHub](https://github.com/) with: ``` r # install.packages("devtools") devtools::install_github("mlverse/cuda.ml") ``` +To speed up recompilation times during development, set this env var: + +```bash +echo "export CUML4R_ENABLE_CCACHE=1" >> ~/.bashrc +. ~/.bashrc +``` + ## Appendix diff --git a/README.md b/README.md index 52460e0..277195c 100644 --- a/README.md +++ b/README.md @@ -263,110 +263,100 @@ From this type of visualization, we can qualitatively understand the following a ## Installation -In order for {cuda.ml} to work as expected, the C++/CUDA source code of {cuda.ml} must be linked with CUDA runtime and a valid copy of the RAPIDS cuML library. +For a fully functional installation, {cuda.ml} needs: -Before installing {cuda.ml} itself, it may be worthwhile to take a quick look through the sub-sections below on how to properly setup all of {cuda.ml}'s required runtime dependencies. +- an NVIDIA GPU with a working NVIDIA driver; +- a CUDA Toolkit installation that provides `nvcc`; +- normal R package build tools; and +- either `uv` or Python with `pip`. -### Quick note on installing the RAPIDS cuML library: +When those prerequisites are present, {cuda.ml} can bootstrap RAPIDS cuML from pip wheels during installation. You do not need conda, and you usually do not need to set `CUML_PREFIX` manually. -Although Conda is the only officially supported distribution channel at the moment for RAPIDS cuML (i.e., see ), you can still build and install this library from source without relying on Conda. See for build-from-source instructions. - -### Quick install instructions for Ubuntu 20-04: - -#### Install deps: - - sudo apt install -y cmake ccache libblas3 liblapack3 - -### Install CUDA - -(consult for other platforms) +On a new Ubuntu installation, install R/build/Python prerequisites: ``` bash -wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin -sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 -wget https://developer.download.nvidia.com/compute/cuda/11.4.2/local_installers/cuda-repo-ubuntu2004-11-4-local_11.4.2-470.57.02-1_amd64.deb -sudo dpkg -i cuda-repo-ubuntu2004-11-4-local_11.4.2-470.57.02-1_amd64.deb -sudo apt-key add /var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub -sudo apt-get update -sudo apt-get -y install cuda +sudo apt update +sudo apt install -y r-base-dev build-essential git cmake \ + python3 python3-pip python3-venv ubuntu-drivers-common ``` -### Add CUDA executables to path - -(nvcc is needed for building the C++/CUDA source code of {cuda.ml}) +Install the NVIDIA driver, reboot, and verify that the driver can see your GPU: ``` bash -echo "export PATH=$PATH:/usr/local/cuda/bin" >> ~/.bashrc -source ~/.bashrc +sudo ubuntu-drivers install +sudo reboot + +nvidia-smi ``` -### Install Miniconda: +Install a CUDA Toolkit that includes `nvcc`. Use NVIDIA's CUDA Linux installation guide for your Ubuntu release to add the CUDA apt repository, then: ``` bash -wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -chmod +x Miniconda3-latest-Linux-x86_64.sh -./Miniconda3-latest-Linux-x86_64.sh -b -# consult https://rapids.ai/start.html for alternatives -``` +sudo apt update +sudo apt install -y cuda-toolkit -### Create and configure the conda env +nvcc --version +``` - # This is a relatively big download, may take a while - ~/miniconda3/bin/conda create -n rapids-21.08 -c rapidsai -c nvidia -c conda-forge \ - rapids-blazing=21.08 python=3.8 cudatoolkit=11.2 +If the toolkit is installed but `nvcc` is not on `PATH`, set `CUDA_HOME` to the toolkit prefix before installing {cuda.ml}, for example: -### Install cmake +``` bash +export CUDA_HOME=/usr/local/cuda +``` -CUDA dependencies require a relatively recent version of CMake, so you need to install it manually +Then install {cuda.ml}: -``` bash -wget https://github.com/Kitware/CMake/releases/download/v3.22.0/cmake-3.22.0.tar.gz -cd cmake-3.22.0 -./bootstrap && make -j8 && sudo make install -cd .. +``` r +install.packages("cuda.ml") ``` -### Activate the conda env: +And verify that the installed package was linked with real cuML: -``` bash -. ~/miniconda3/bin/activate -conda activate rapids-21.08 +``` r +library(cuda.ml) +has_cuML() ``` -### Consider adjusting `LD_LIBRARY_PATH` +If this returns `TRUE`, {cuda.ml} is using RAPIDS cuML. If it returns `FALSE`, the package installed in stub-only mode; check the install output for the first missing prerequisite. -The subsequent steps may (or may not) fail without the following: +### What happens during installation -``` bash -export LD_LIBRARY_PATH=~/miniconda3/envs/rapids-21.08/lib -``` +The configure script first looks for an existing RAPIDS installation through `CUML_PREFIX` or `CUDA_PATH`. If no existing installation is found, and a working NVIDIA driver/GPU plus `nvcc` are available, it bootstraps RAPIDS cuML from pip wheels into a cache directory and links {cuda.ml} against that prefix. -If you get some error indicating a GLIBC version mismatch in the subsequent steps, then please try adjusting `LD_LIBRARY_PATH` as a workaround. +The bootstrap prefers `uv` when available, then reticulate's managed `uv`, then `python -m pip`, `python3 -m pip`, `pip`, and `pip3`. -### Consider enabling ccache +Useful environment variables: -To speed up recompilation times during development, set this env var: +- `CUDA_HOME`: CUDA Toolkit prefix containing `bin/nvcc`. +- `CUML_PREFIX`: existing RAPIDS prefix containing `include/cuml` and `lib/libcuml++.so`. +- `CUML_BOOTSTRAP=0`: disable automatic RAPIDS pip bootstrap. +- `CUML_BOOTSTRAP_CACHE`: cache directory for bootstrapped RAPIDS headers and libraries. +- `CUML_PIP_VERSION`: RAPIDS pip wheel version to install. -``` bash -echo "export CUML4R_ENABLE_CCACHE=1" >> ~/.bashrc -. ~/.bashrc -``` +### CRAN and machines without GPUs -### Install {cuda.ml} the R package: +On CRAN, or on machines without a usable NVIDIA GPU/driver and `nvcc`, {cuda.ml} can still install in stub-only mode. In that mode `has_cuML()` returns `FALSE` and cuML-backed algorithms are not usable until the system prerequisites are installed and {cuda.ml} is reinstalled. -You can install the released version of {cuda.ml} from [CRAN](https://CRAN.R-project.org) with: +### Manual RAPIDS installations -``` r -install.packages("cuda.ml") -``` +If you already have RAPIDS cuML from pip, conda, or a source build, set `CUML_PREFIX` to a prefix containing `include/cuml` and `lib/libcuml++.so` before installing {cuda.ml}. In this case the automatic bootstrap is skipped. -And the development version from [GitHub](https://github.com/) with: +### Development version + +Install the development version from [GitHub](https://github.com/) with: ``` r # install.packages("devtools") devtools::install_github("mlverse/cuda.ml") ``` +To speed up recompilation times during development, set this env var: + +``` bash +echo "export CUML4R_ENABLE_CCACHE=1" >> ~/.bashrc +. ~/.bashrc +``` + ## Appendix
Inspect MNIST images diff --git a/man/cuda.ml-package.Rd b/man/cuda.ml-package.Rd index b43d49e..8502f4d 100644 --- a/man/cuda.ml-package.Rd +++ b/man/cuda.ml-package.Rd @@ -8,6 +8,25 @@ \description{ This package provides a R interface for the RAPIDS cuML library. } +\section{Installation}{ + +A functional GPU installation requires an NVIDIA GPU with a working driver, +a CUDA Toolkit installation that provides \code{nvcc}, and normal R package +build tools. During installation, \pkg{cuda.ml} first looks for an existing +RAPIDS installation through \code{CUML_PREFIX} or \code{CUDA_PATH}. If none +is found, it can bootstrap RAPIDS cuML from pip wheels with \code{uv} or +Python/pip and link against the resulting local prefix. + +On machines without a usable NVIDIA driver/GPU and \code{nvcc}, including +CRAN check machines, \pkg{cuda.ml} may install in stub-only mode. In that +mode \code{has_cuML()} returns \code{FALSE}, and cuML-backed algorithms are +unavailable until the system prerequisites are installed and \pkg{cuda.ml} +is reinstalled. + +Useful environment variables include \code{CUDA_HOME}, \code{CUML_PREFIX}, +\code{CUML_BOOTSTRAP}, and \code{CUML_BOOTSTRAP_CACHE}. +} + \seealso{ Useful links: \itemize{ diff --git a/man/has_cuML.Rd b/man/has_cuML.Rd index 8305755..d22a51c 100644 --- a/man/has_cuML.Rd +++ b/man/has_cuML.Rd @@ -15,14 +15,23 @@ A logical value indicating whether the current installation \{cuda.ml\} Determine whether \{cuda.ml\} was linked to a valid version of the RAPIDS cuML shared library. } +\details{ +If this returns \code{FALSE}, \pkg{cuda.ml} was installed in stub-only mode. +On a GPU machine, verify that \code{nvidia-smi} and \code{nvcc --version} +both work, then reinstall \pkg{cuda.ml}. During installation, \pkg{cuda.ml} +can bootstrap RAPIDS cuML from pip wheels with \code{uv} or Python/pip. If +RAPIDS cuML is already installed, set \code{CUML_PREFIX} to a prefix +containing \code{include/cuml} and \code{lib/libcuml++.so} before +reinstalling. +} \examples{ library(cuda.ml) if (!has_cuML()) { warning( - "Please install the RAPIDS cuML shared library first, and then re-", - "install {cuda.ml}." + "This installation was built without RAPIDS cuML. Verify `nvidia-smi` ", + "and `nvcc --version`, then reinstall {cuda.ml}." ) } } diff --git a/src/CMakeLists.txt.in b/src/CMakeLists.txt.in index 030d323..22a022d 100644 --- a/src/CMakeLists.txt.in +++ b/src/CMakeLists.txt.in @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") @@ -17,6 +17,7 @@ FetchContent_Declare( rapids-cmake GIT_REPOSITORY https://github.com/rapidsai/rapids-cmake.git GIT_TAG origin/branch-21.10 + UPDATE_DISCONNECTED TRUE ) FetchContent_MakeAvailable(rapids-cmake) include(rapids-cuda) @@ -34,7 +35,13 @@ endif(DEFINED ENV{CUML4R_ENABLE_CCACHE}) if(DEFINED CUML_INCLUDE_DIR) # CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES is needed so that cuda_runtime.h is found # CUML_INCLUDE_DIR is needed so that kmeans/kmeans_c.h is found - set(CUML4R_INCLUDE_DIRS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUML_INCLUDE_DIR}) + set(CUML4R_INCLUDE_DIRS ${CUML_INCLUDE_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + if(EXISTS "${CUML_INCLUDE_DIR}/rapids") + # RAPIDS pip wheels may ship a CCCL copy that matches the RAPIDS shared + # libraries under include/rapids, plus a newer top-level CCCL copy. + # Prefer the ABI-compatible copy for cuda/cub/thrust/nv headers. + include_directories(BEFORE ${CUML_INCLUDE_DIR}/rapids) + endif() else() message(FATAL_ERROR "CUML_INCLUDE_DIR not specified.") endif(DEFINED CUML_INCLUDE_DIR) @@ -50,6 +57,20 @@ include_directories(@RCPP_INCLUDE_DIR@) include_directories(${TREELITE_C_API_INCLUDE_DIR}) +set(CUML4R_RPROJ_C_API_FOUND FALSE) +foreach(CUML4R_INC_DIR IN LISTS CUML4R_INCLUDE_DIRS) + if(EXISTS "${CUML4R_INC_DIR}/cuml/random_projection/rproj_c.h") + set(CUML4R_RPROJ_C_API_FOUND TRUE) + endif() +endforeach() +if(NOT CUML4R_RPROJ_C_API_FOUND) + message( + STATUS + "cuML random projection C API headers were not found; random projection support will be disabled." + ) + add_definitions(-DCUML4R_RPROJ_C_API_MISSING) +endif(NOT CUML4R_RPROJ_C_API_FOUND) + if(DEFINED ENV{CUML4R_ENABLE_ASAN}) if($ENV{CUML4R_ENABLE_ASAN} MATCHES "true") add_compile_options(-fno-omit-frame-pointer -fsanitize-recover=address) @@ -121,25 +142,36 @@ add_library( # Need to set linker language to CUDA to link the CUDA Runtime set_target_properties(cuda.ml PROPERTIES LINKER_LANGUAGE "CUDA") set_target_properties(cuda.ml PROPERTIES PREFIX "") +target_compile_options( + cuda.ml + PRIVATE + $<$:-Wno-deprecated-declarations> + $<$:-Xcompiler=-Wno-deprecated-declarations> +) set(CUML4R_LIBS cuda.ml PRIVATE cuml++ cublas cusolver cudart cusparse) -find_package(Treelite) +find_package(Treelite QUIET) if(Treelite_FOUND) - set(CUML4R_LIBS ${CUML4R_LIBS} treelite::treelite treelite::treelite_runtime) + target_compile_definitions(cuda.ml PRIVATE CUML_ENABLE_GPU) + if(TARGET treelite::treelite_static) + set(CUML4R_LIBS ${CUML4R_LIBS} treelite::treelite_static) + elseif(TARGET treelite::treelite) + set(CUML4R_LIBS ${CUML4R_LIBS} treelite::treelite) + else() + message(FATAL_ERROR "Treelite was found but no supported Treelite CMake target was exported.") + endif() + if(TARGET treelite::treelite_runtime) + set(CUML4R_LIBS ${CUML4R_LIBS} treelite::treelite_runtime) + endif() set(CUML4R_INCLUDE_DIRS ${CUML4R_INCLUDE_DIRS} ${Treelite_INCLUDE_DIRS}) message(STATUS "Treelite found, ignoring stub headers: ${CUML_STUB_HEADERS_DIR}") else() message( - WARNING - " - Unable to locate 'TreeLite' using CMake. Forest Inference Library (FIL) - functionalities from {cuda.ml} will be disabled! - - Please install the treelite C API and re-install {cuda.ml} if you want to - enable FIL functionalities. - " + STATUS + "Treelite was not found; Forest Inference Library (FIL) support will be disabled." ) + add_definitions(-DCUML4R_TREELITE_C_API_MISSING) set( CUML4R_INCLUDE_DIRS ${CUML4R_INCLUDE_DIRS} ${CUML_STUB_HEADERS_DIR} ) diff --git a/src/agglomerative_clustering.cu b/src/agglomerative_clustering.cu index d5bb06f..cf24067 100644 --- a/src/agglomerative_clustering.cu +++ b/src/agglomerative_clustering.cu @@ -6,9 +6,9 @@ #include "preprocessor.h" #include "stream_allocator.h" -#include -#include #include +#include +#include #include @@ -38,9 +38,18 @@ __host__ Rcpp::List agglomerative_clustering(Rcpp::NumericMatrix const& x, async_copy(stream_view.value(), h_x.cbegin(), h_x.cend(), d_x.begin()); // single-linkage hierarchical clustering output - auto out = std::make_unique>(); thrust::device_vector d_labels(n_samples); thrust::device_vector d_children((n_samples - 1) * 2); + +#if CUML_VERSION_MAJOR >= 24 + ML::linkage::single_linkage( + handle, /*X=*/d_x.data().get(), /*n_rows=*/n_samples, + /*n_cols=*/n_features, /*n_clusters=*/n_clusters, + /*metric=*/static_cast(metric), + /*children=*/d_children.data().get(), /*labels=*/d_labels.data().get(), + /*use_knn=*/!pairwise_conn, /*c=*/n_neighbors); +#else + auto out = std::make_unique>(); out->labels = d_labels.data().get(); out->children = d_children.data().get(); @@ -56,6 +65,7 @@ __host__ Rcpp::List agglomerative_clustering(Rcpp::NumericMatrix const& x, /*metric=*/static_cast(metric), /*c=*/n_neighbors, n_clusters); } +#endif CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value())); @@ -69,7 +79,11 @@ __host__ Rcpp::List agglomerative_clustering(Rcpp::NumericMatrix const& x, CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value())); +#if CUML_VERSION_MAJOR >= 24 + result["n_clusters"] = n_clusters; +#else result["n_clusters"] = out->n_clusters; +#endif result["children"] = Rcpp::transpose(Rcpp::IntegerMatrix(2, n_samples - 1, h_children.begin())); result["labels"] = Rcpp::IntegerVector(h_labels.cbegin(), h_labels.cend()); diff --git a/src/async_utils.cuh b/src/async_utils.cuh index ed9050a..2a6d0a4 100644 --- a/src/async_utils.cuh +++ b/src/async_utils.cuh @@ -4,33 +4,21 @@ #include "cuda_utils.h" #include "preprocessor.h" -#include "unique_marker.cuh" -#include -#include - -#include +#include +#include namespace cuml4r { -// To ensure the correct async behavior, an `AsyncCopyCtx` object must be -// destroyed after the stream associated with the copy operation is -// synchronized, not before. -struct AsyncCopyCtx { - thrust::system::cuda::unique_eager_event event; - unique_marker marker; -}; +struct AsyncCopyCtx {}; // perform a copy operation that is asynchronous with respect to the host // and synchronous with respect to the stream specified -template -__host__ CUML4R_NODISCARD auto async_copy(cudaStream_t stream, Args&&... args) { - auto e = thrust::async::copy(std::forward(args)...); - auto& s = e.stream(); - unique_marker m; - CUDA_RT_CALL(cudaEventRecord(m.get(), s.get())); - CUDA_RT_CALL(cudaStreamWaitEvent(stream, m.get(), cudaEventWaitDefault)); - return AsyncCopyCtx{std::move(e), std::move(m)}; +template +__host__ CUML4R_NODISCARD auto async_copy( + cudaStream_t stream, InputIt first, InputIt last, OutputIt result) { + thrust::copy(thrust::cuda::par.on(stream), first, last, result); + return AsyncCopyCtx{}; } } // namespace cuml4r diff --git a/src/cd_fit_impl.cu b/src/cd_fit_impl.cu index c6e052e..6c8e67d 100644 --- a/src/cd_fit_impl.cu +++ b/src/cd_fit_impl.cu @@ -1,6 +1,8 @@ #include "lm_params.h" +#include "preprocessor.h" #include +#include namespace cuml4r { namespace detail { @@ -14,8 +16,13 @@ __host__ void cd_fit_impl(raft::handle_t& handle, lm::Params const& params, /*labels=*/params.d_labels, /*coef=*/params.d_coef, /*intercept=*/params.intercept, /*fit_intercept=*/params.fit_intercept, +#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) < \ + CUML4R_LIBCUML_VERSION(24, 0)) /*normalize=*/params.normalize_input, epochs, loss, alpha, - l1_ratio, shuffle, tol); +#else + epochs, +#endif + loss, alpha, l1_ratio, shuffle, tol); } } // namespace detail diff --git a/src/cuml_utils.cpp b/src/cuml_utils.cpp index 4f07355..86a06db 100644 --- a/src/cuml_utils.cpp +++ b/src/cuml_utils.cpp @@ -4,9 +4,6 @@ #include -static_assert(CUML_VERSION_MAJOR == 21, - "{cuda.ml} currently only supports linking to RAPIDS cuML 21.x!"); - #endif #include diff --git a/src/dbscan.cu b/src/dbscan.cu index e385995..7c3a534 100644 --- a/src/dbscan.cu +++ b/src/dbscan.cu @@ -5,9 +5,9 @@ #include "preprocessor.h" #include "stream_allocator.h" -#include -#include #include +#include +#include #include @@ -41,10 +41,21 @@ __host__ Rcpp::List dbscan(Rcpp::NumericMatrix const& x, int const min_pts, ML::Dbscan::fit(handle, /*input=*/d_src_data.data().get(), /*n_rows=*/n_samples, /*n_cols=*/n_features, eps, min_pts, +#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \ + CUML4R_LIBCUML_VERSION(24, 0)) + /*metric=*/ML::distance::DistanceType::L2SqrtUnexpanded, + /*labels=*/d_labels.data().get(), + /*core_sample_indices=*/nullptr, /*sample_weight=*/nullptr, + max_bytes_per_batch, /*eps_nn_method=*/ML::Dbscan::BRUTE_FORCE, + /*verbosity=*/static_cast( + verbosity), + /*opg=*/false); +#else /*metric=*/raft::distance::L2SqrtUnexpanded, /*labels=*/d_labels.data().get(), /*core_sample_indices=*/nullptr, max_bytes_per_batch, /*verbosity=*/verbosity, /*opg=*/false); +#endif CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value())); diff --git a/src/device_allocator.cu b/src/device_allocator.cu index fe13909..139bc2d 100644 --- a/src/device_allocator.cu +++ b/src/device_allocator.cu @@ -2,6 +2,10 @@ #include "device_allocator.h" +#include + +#if CUML_VERSION_MAJOR < 24 + #include namespace { @@ -19,6 +23,8 @@ __host__ std::shared_ptr getDeviceAllocator() { } // namespace cuml4r +#endif + #else #include "warn_cuml_missing.h" diff --git a/src/device_allocator.h b/src/device_allocator.h index 124c3b1..098639d 100644 --- a/src/device_allocator.h +++ b/src/device_allocator.h @@ -2,6 +2,10 @@ #ifdef HAS_CUML +#include + +#if CUML_VERSION_MAJOR < 24 + #include namespace raft { @@ -20,6 +24,8 @@ std::shared_ptr getDeviceAllocator(); } // namespace cuml4r +#endif + #else #include "warn_cuml_missing.h" diff --git a/src/fil.cu b/src/fil.cu index 1545177..1ed555d 100644 --- a/src/fil.cu +++ b/src/fil.cu @@ -8,15 +8,18 @@ #include "stream_allocator.h" #include "treelite_utils.cuh" -#include -#include +#ifndef CUML4R_TREELITE_C_API_MISSING + #include #include +#include #include +#include #include #include +#include namespace cuml4r { namespace { @@ -25,43 +28,65 @@ enum class ModelType { XGBoost, XGBoostJSON, LightGBM }; struct FILModel { __host__ FILModel(std::unique_ptr handle, - fil::forest_uptr forest, size_t const num_classes) + fil::forest_uptr forest, bool const classification, + float const threshold, size_t const num_classes) : handle_(std::move(handle)), forest_(std::move(forest)), + classification_(classification), + threshold_(threshold), numClasses_(num_classes) {} std::unique_ptr const handle_; // NOTE: the destruction of `forest_` must precede the destruction of // `handle_`. fil::forest_uptr forest_; + bool const classification_; + float const threshold_; size_t const numClasses_; }; __host__ int treeliteLoadModel(ModelType const model_type, char const* filename, TreeliteHandle& tl_handle) { + auto constexpr config = "{}"; switch (model_type) { case ModelType::XGBoost: - return TreeliteLoadXGBoostModel(filename, tl_handle.get()); + return TreeliteLoadXGBoostModelLegacyBinary(filename, config, + tl_handle.get()); case ModelType::XGBoostJSON: - return TreeliteLoadXGBoostJSON(filename, tl_handle.get()); + return TreeliteLoadXGBoostModelJSON(filename, config, tl_handle.get()); case ModelType::LightGBM: - return TreeliteLoadLightGBMModel(filename, tl_handle.get()); + return TreeliteLoadLightGBMModel(filename, config, tl_handle.get()); } // unreachable return -1; } -/* - * The 'ML::fil::treelite_params_t::threads_per_tree' and - * 'ML::fil::treelite_params_t::n_items' parameters are only supported in - * RAPIDS cuML 21.08 or above. - */ -CUML4R_ASSIGN_IF_PRESENT(threads_per_tree) -CUML4R_NOOP_IF_ABSENT(threads_per_tree) +__host__ size_t treelite_num_classes(TreeliteHandle const& tl_handle, + bool const classification) { + if (!classification) { + return 0; + } + + auto const* model = static_cast(tl_handle.handle()); + auto num_classes = + model->num_class.Size() > 0 ? static_cast(model->num_class[0]) : 0; -CUML4R_ASSIGN_IF_PRESENT(n_items) -CUML4R_NOOP_IF_ABSENT(n_items) + // Treelite uses one output for binary classification in some import paths. + return std::max(num_classes, size_t(2)); +} + +template +__host__ Rcpp::NumericMatrix make_matrix(size_t const n_rows, + size_t const n_cols, F&& getter) { + Rcpp::NumericMatrix out(n_rows, n_cols); + for (size_t i = 0; i < n_rows; ++i) { + for (size_t j = 0; j < n_cols; ++j) { + out(i, j) = getter(i, j); + } + } + return out; +} } // namespace @@ -84,45 +109,18 @@ __host__ SEXP fil_load_model(int const model_type, std::string const& filename, } } - ML::fil::treelite_params_t params; - params.algo = static_cast(algo); - params.output_class = classification; - params.threshold = threshold; - params.storage_type = static_cast(storage_type); - params.blocks_per_sm = blocks_per_sm; - params.output_class = classification; - set_threads_per_tree(params, threads_per_tree); - set_n_items(params, n_items); - params.pforest_shape_str = nullptr; - auto stream_view = stream_allocator::getOrCreateStream(); auto handle = std::make_unique(); handle_utils::initializeHandle(*handle, stream_view.value()); - auto forest = fil::make_forest(*handle, /*src=*/[&] { - ML::fil::forest* f; - ML::fil::from_treelite(/*handle=*/*handle, /*pforest=*/&f, - /*model=*/*tl_handle.get(), - /*tl_params=*/¶ms); - return f; - }); - - size_t num_classes = 0; - if (classification) { - auto const rc = TreeliteQueryNumClass(/*handle=*/*tl_handle.get(), - /*out=*/&num_classes); - if (rc < 0) { - char const* err = TreeliteGetLastError(); - Rcpp::stop("TreeliteQueryNumClass failed: %s.", err); - } - - // Treelite returns 1 as number of classes for binary classification. - num_classes = std::max(num_classes, size_t(2)); - } + auto forest = fil::import_from_treelite( + *handle, tl_handle, fil::tree_layout_from_storage_type(storage_type)); + auto const num_classes = treelite_num_classes(tl_handle, classification); return Rcpp::XPtr( std::make_unique( - /*handle=*/std::move(handle), std::move(forest), num_classes) + /*handle=*/std::move(handle), std::move(forest), classification, + threshold, num_classes) .release()); } @@ -137,7 +135,7 @@ __host__ Rcpp::NumericMatrix fil_predict( auto const model_xptr = Rcpp::XPtr(model); auto const m = Matrix(x, /*transpose=*/false); - if (output_class_probabilities && model_xptr->numClasses_ == 0) { + if (output_class_probabilities && !model_xptr->classification_) { Rcpp::stop( "'output_class_probabilities' is not applicable for regressions!"); } @@ -150,15 +148,12 @@ __host__ Rcpp::NumericMatrix fil_predict( auto CUML4R_ANONYMOUS_VARIABLE(x_h2d) = async_copy(handle.get_stream(), h_x.cbegin(), h_x.cend(), d_x.begin()); - // ensemble output - thrust::device_vector d_preds(output_class_probabilities - ? model_xptr->numClasses_ * m.numRows - : m.numRows); + auto const n_outputs = + static_cast(model_xptr->forest_->num_outputs()); + thrust::device_vector d_preds(n_outputs * m.numRows); - ML::fil::predict(/*h=*/handle, /*f=*/model_xptr->forest_.get(), - /*preds=*/d_preds.data().get(), - /*data=*/d_x.data().get(), /*num_rows=*/m.numRows, - /*predict_proba=*/output_class_probabilities); + fil::predict(handle, *model_xptr->forest_, d_preds.data().get(), + d_x.data().get(), m.numRows); pinned_host_vector h_preds(d_preds.size()); auto CUML4R_ANONYMOUS_VARIABLE(preds_d2h) = async_copy( @@ -166,9 +161,47 @@ __host__ Rcpp::NumericMatrix fil_predict( CUDA_RT_CALL(cudaStreamSynchronize(handle.get_stream())); - return Rcpp::transpose(Rcpp::NumericMatrix( - output_class_probabilities ? model_xptr->numClasses_ : 1, m.numRows, - h_preds.begin())); + if (!model_xptr->classification_) { + return make_matrix(m.numRows, n_outputs, [&](size_t const i, + size_t const j) { + return h_preds[i * n_outputs + j]; + }); + } + + if (output_class_probabilities) { + if (n_outputs == model_xptr->numClasses_) { + return make_matrix(m.numRows, n_outputs, [&](size_t const i, + size_t const j) { + return h_preds[i * n_outputs + j]; + }); + } + if (n_outputs == 1 && model_xptr->numClasses_ == 2) { + return make_matrix(m.numRows, 2, [&](size_t const i, size_t const j) { + auto const p1 = static_cast(h_preds[i]); + return j == 0 ? 1.0 - p1 : p1; + }); + } + Rcpp::stop("FIL model returned %d outputs, but %d classes were expected.", + static_cast(n_outputs), + static_cast(model_xptr->numClasses_)); + } + + return make_matrix(m.numRows, 1, [&](size_t const i, size_t) { + if (n_outputs == 1) { + return model_xptr->numClasses_ == 2 + ? static_cast(h_preds[i] >= model_xptr->threshold_) + : static_cast(h_preds[i]); + } + if (model_xptr->forest_->row_postprocessing() == ML::fil::row_op::max_index) { + return static_cast(h_preds[i * n_outputs]); + } + + auto const row_begin = h_preds.begin() + i * n_outputs; + return static_cast( + std::distance(row_begin, std::max_element(row_begin, row_begin + n_outputs))); + }); } } // namespace cuml4r + +#endif diff --git a/src/fil_utils.cu b/src/fil_utils.cu index e36d501..63c95bb 100644 --- a/src/fil_utils.cu +++ b/src/fil_utils.cu @@ -1,21 +1,106 @@ #include "fil_utils.h" +#ifndef CUML4R_TREELITE_C_API_MISSING + +#include "cuda_utils.h" + +#include + +#include +#include +#include +#include + +namespace ML { +namespace fil { +namespace detail { +namespace device_initialization { + +CUML_FIL_INITIALIZE_DEVICE(template, 0) +CUML_FIL_INITIALIZE_DEVICE(template, 1) +CUML_FIL_INITIALIZE_DEVICE(template, 2) +CUML_FIL_INITIALIZE_DEVICE(template, 3) +CUML_FIL_INITIALIZE_DEVICE(template, 4) +CUML_FIL_INITIALIZE_DEVICE(template, 5) +CUML_FIL_INITIALIZE_DEVICE(template, 6) +CUML_FIL_INITIALIZE_DEVICE(template, 7) +CUML_FIL_INITIALIZE_DEVICE(template, 8) +CUML_FIL_INITIALIZE_DEVICE(template, 9) +CUML_FIL_INITIALIZE_DEVICE(template, 10) +CUML_FIL_INITIALIZE_DEVICE(template, 11) + +} // namespace device_initialization + +namespace inference { + +CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 0) +CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 1) +CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 2) +CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 3) +CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 4) +CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 5) +CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 6) +CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 7) +CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 8) +CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 9) +CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 10) +CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 11) + +} // namespace inference +} // namespace detail +} // namespace fil +} // namespace ML + namespace cuml4r { namespace fil { +namespace { -__host__ forest_uptr make_forest(raft::handle_t const& handle, - ML::fil::forest* const forest) { - return forest_uptr(forest, [&handle](auto* const f) { - if (f != nullptr) { - ML::fil::free(handle, f); - } - }); +__host__ int current_device() { + int device = 0; + CUDA_RT_CALL(cudaGetDevice(&device)); + return device; } -__host__ forest_uptr make_forest(raft::handle_t const& handle, - std::function src) { - return make_forest(handle, src()); +} // namespace + +__host__ ML::fil::tree_layout tree_layout_from_storage_type( + int const storage_type) { + switch (storage_type) { + case 1: + return ML::fil::tree_layout::breadth_first; + case 2: + return ML::fil::tree_layout::depth_first; + default: + return ML::fil::tree_layout::depth_first; + } +} + +__host__ forest_uptr import_from_treelite( + raft::handle_t const& handle, TreeliteHandle const& tl_handle, + ML::fil::tree_layout const layout) { + return std::make_unique( + ML::fil::import_from_treelite_handle( + /*tl_handle=*/tl_handle.handle(), /*layout=*/layout, + /*align_bytes=*/128, + /*use_double_precision=*/false, + /*dev_type=*/raft_proto::device_type::gpu, + /*device=*/current_device(), + /*stream=*/handle.get_stream())); +} + +__host__ void predict(raft::handle_t const& handle, + ML::fil::forest_model& forest, float* const output, + float* const input, std::size_t const num_rows, + ML::fil::infer_kind const infer_kind, + std::optional const chunk_size) { + raft_proto::handle_t fil_handle(handle); + forest.predict(fil_handle, output, input, num_rows, + raft_proto::device_type::gpu, raft_proto::device_type::gpu, + infer_kind, chunk_size); + fil_handle.synchronize(); } } // namespace fil } // namespace cuml4r + +#endif diff --git a/src/fil_utils.h b/src/fil_utils.h index a5702d0..828cdea 100644 --- a/src/fil_utils.h +++ b/src/fil_utils.h @@ -1,28 +1,36 @@ #pragma once -#include +#ifndef CUML4R_TREELITE_C_API_MISSING -#include +#include "treelite_utils.cuh" + +#include +#include +#include +#include + +#include #include +#include namespace cuml4r { namespace fil { -using forest_uptr = - std::unique_ptr>; +using forest_uptr = std::unique_ptr; -/* - * RAII wrapper for a `ML::fil::forest` pointer (a.k.a `ML::fil::forest_t`) - * - * NOTE: the resulting RAII wrapper does *not* take ownship of `handle`, and - * assumes `handle` will be destroyed *after* the FIL forest object itself is - * destroyed. - */ -forest_uptr make_forest(raft::handle_t const& handle, - ML::fil::forest* const forest); +ML::fil::tree_layout tree_layout_from_storage_type(int storage_type); -forest_uptr make_forest(raft::handle_t const& handle, - std::function src); +forest_uptr import_from_treelite( + raft::handle_t const& handle, TreeliteHandle const& tl_handle, + ML::fil::tree_layout layout = ML::fil::tree_layout::depth_first); + +void predict(raft::handle_t const& handle, ML::fil::forest_model& forest, + float* output, float* input, std::size_t num_rows, + ML::fil::infer_kind infer_kind = ML::fil::infer_kind::default_kind, + std::optional chunk_size = + std::optional{4}); } // namespace fil } // namespace cuml4r + +#endif diff --git a/src/handle_utils.cu b/src/handle_utils.cu index 9c61b7b..ada5ae0 100644 --- a/src/handle_utils.cu +++ b/src/handle_utils.cu @@ -3,6 +3,13 @@ #ifdef HAS_CUML +#include +#include +#include +#include + +#include + namespace cuml4r { namespace handle_utils { @@ -11,7 +18,13 @@ __host__ void initializeHandle(raft::handle_t& handle, if (stream_view.value() == 0) { stream_view = stream_allocator::getOrCreateStream(); } +#if CUML_VERSION_MAJOR >= 24 + raft::resource::set_cuda_stream(handle, stream_view); + raft::resource::set_cuda_stream_pool( + handle, std::make_shared(8)); +#else handle.set_stream(stream_view.value()); +#endif } } // namespace handle_utils diff --git a/src/handle_utils.h b/src/handle_utils.h index f00d622..bba115a 100644 --- a/src/handle_utils.h +++ b/src/handle_utils.h @@ -2,7 +2,7 @@ #ifdef HAS_CUML -#include +#include #include namespace cuml4r { diff --git a/src/kmeans.cu b/src/kmeans.cu index a3357b7..ad0c0b7 100644 --- a/src/kmeans.cu +++ b/src/kmeans.cu @@ -6,9 +6,9 @@ #include "preprocessor.h" #include "stream_allocator.h" -#include -#include #include +#include +#include #include @@ -35,8 +35,15 @@ __host__ Rcpp::List kmeans(Rcpp::NumericMatrix const& x, int const k, params.inertia_check = true; } params.init = static_cast(init_method); +#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \ + CUML4R_LIBCUML_VERSION(24, 0)) + params.rng_state = raft::random::RngState( + seed, raft::random::GeneratorType::GenPhilox); + params.verbosity = static_cast(verbosity); +#else params.seed = seed; params.verbosity = verbosity; +#endif auto stream_view = stream_allocator::getOrCreateStream(); raft::handle_t handle; @@ -53,7 +60,7 @@ __host__ Rcpp::List kmeans(Rcpp::NumericMatrix const& x, int const k, // kmeans outputs thrust::device_vector d_pred_centroids(n_centroid_values); - AsyncCopyCtx centroids_h2d; + CUML4R_MAYBE_UNUSED AsyncCopyCtx centroids_h2d; if (params.init == ML::kmeans::KMeansParams::InitMethod::Array) { auto const m_centroids = Matrix<>(centroids, /*transpose=*/false); auto const& h_centroids = m_centroids.values; @@ -64,9 +71,20 @@ __host__ Rcpp::List kmeans(Rcpp::NumericMatrix const& x, int const k, double inertia = 0; int n_iter = 0; +#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \ + CUML4R_LIBCUML_VERSION(24, 0)) + ML::kmeans::fit(handle, params, d_src_data.data().get(), n_samples, + n_features, /*sample_weight=*/nullptr, + d_pred_centroids.data().get(), inertia, n_iter); + ML::kmeans::predict(handle, params, d_pred_centroids.data().get(), + d_src_data.data().get(), n_samples, n_features, + /*sample_weight=*/nullptr, /*normalize_weights=*/false, + d_pred_labels.data().get(), inertia); +#else ML::kmeans::fit_predict(handle, params, d_src_data.data().get(), n_samples, n_features, 0, d_pred_centroids.data().get(), d_pred_labels.data().get(), inertia, n_iter); +#endif CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value())); diff --git a/src/knn.cu b/src/knn.cu index 13894d8..9af6c0f 100644 --- a/src/knn.cu +++ b/src/knn.cu @@ -8,7 +8,6 @@ #include "random_forest.cuh" #include "stream_allocator.h" -#include #include #include #include @@ -21,7 +20,16 @@ #include #include -#if CUML_VERSION_MAJOR == 21 +#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \ + CUML4R_LIBCUML_VERSION(24, 0)) + +using knnIndex = ML::knnIndex; +using knnIndexParam = ML::knnIndexParam; +using IVFFlatParam = ML::IVFFlatParam; +using IVFPQParam = ML::IVFPQParam; +using knnDistanceType = ML::distance::DistanceType; + +#elif CUML_VERSION_MAJOR == 21 #if CUML4R_CONCAT(0x, CUML_VERSION_MINOR) >= 0x08 #include @@ -32,6 +40,7 @@ using QuantizerType = raft::spatial::knn::QuantizerType; using IVFFlatParam = raft::spatial::knn::IVFFlatParam; using IVFPQParam = raft::spatial::knn::IVFPQParam; using IVFSQParam = raft::spatial::knn::IVFSQParam; +using knnDistanceType = raft::distance::DistanceType; #else @@ -41,6 +50,7 @@ using QuantizerType = ML::QuantizerType; using IVFFlatParam = ML::IVFFlatParam; using IVFPQParam = ML::IVFPQParam; using IVFSQParam = ML::IVFSQParam; +using knnDistanceType = raft::distance::DistanceType; #endif #endif @@ -66,6 +76,8 @@ constexpr auto kMetric = "metric"; constexpr auto kNumSamples = "n_samples"; constexpr auto kNumDims = "n_dims"; +#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) < \ + CUML4R_LIBCUML_VERSION(24, 0)) std::unordered_map const kQuantizerTypes{ {"QT_8bit", QuantizerType::QT_8bit}, {"QT_4bit", QuantizerType::QT_4bit}, @@ -74,6 +86,7 @@ std::unordered_map const kQuantizerTypes{ {"QT_fp16", QuantizerType::QT_fp16}, {"QT_8bit_direct", QuantizerType::QT_8bit_direct}, {"QT_6bit", QuantizerType::QT_6bit}}; +#endif // Additional info for setting KNN params struct ParamsDetails { @@ -105,8 +118,7 @@ class PredictionCtx { nFeatures_(x.ncol()), modelKnnIndex_(Rcpp::XPtr(static_cast(model[kIndex]))), modelAlgoType_(static_cast(Rcpp::as(model[kAlgo]))), - modelDistType_(static_cast( - Rcpp::as(model[kMetric]))), + modelDistType_(static_cast(Rcpp::as(model[kMetric]))), modelP_(Rcpp::as(model[kP])), modelNSamples_(Rcpp::as(model[kNumSamples])), modelNDims_(Rcpp::as(model[kNumDims])), @@ -167,7 +179,7 @@ class PredictionCtx { // attributes from the KNN model object Rcpp::XPtr const modelKnnIndex_; Algo const modelAlgoType_; - raft::distance::DistanceType const modelDistType_; + knnDistanceType const modelDistType_; float const modelP_; int const modelNSamples_; int const modelNDims_; @@ -235,6 +247,18 @@ __host__ std::unique_ptr build_ivfpq_algo_params( params[kNumLists] = 8; params[kNumProbes] = 3; +#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \ + CUML4R_LIBCUML_VERSION(24, 0)) + for (auto iter = kAllowedSubDimSize.crbegin(); + iter != kAllowedSubDimSize.crend(); ++iter) { + auto const pq_dim = *iter; + if (pq_dim <= d && d % pq_dim == 0) { + params[kUseComputedTables] = false; + params[kM] = pq_dim; + break; + } + } +#else for (auto const n_subq : kAllowedSubquantizers) { if (d % n_subq == 0 && std::find(kAllowedSubDimSize.cbegin(), kAllowedSubDimSize.cend(), @@ -244,6 +268,7 @@ __host__ std::unique_ptr build_ivfpq_algo_params( break; } } +#endif if (!params.containsElementNamed(kM)) { for (auto const n_subq : kAllowedSubquantizers) { @@ -256,9 +281,10 @@ __host__ std::unique_ptr build_ivfpq_algo_params( } params[kNumBits] = 4; - for (auto const n_bits : {8, 6, 5}) { + for (auto const n_bits : {8, 6, 5, 4}) { auto const min_train_points = (1 << n_bits) * 39; - if (n >= min_train_points) { + if (n >= min_train_points && + ((n_bits * Rcpp::as(params[kM])) % 8) == 0) { params[kNumBits] = n_bits; break; } @@ -278,6 +304,11 @@ __host__ std::unique_ptr build_ivfpq_algo_params( __host__ std::unique_ptr build_ivfsq_algo_params( Rcpp::List params, bool const automated) { +#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \ + CUML4R_LIBCUML_VERSION(24, 0)) + Rcpp::stop("IVFSQ KNN is unsupported by this cuML version"); + return nullptr; +#else if (automated) { params[kNumLists] = 8; params[kNumProbes] = 2; @@ -299,6 +330,7 @@ __host__ std::unique_ptr build_ivfsq_algo_params( algo_params->encodeResidual = Rcpp::as(params[kEncodeResidual]); return algo_params; +#endif } __host__ std::unique_ptr build_algo_params( @@ -324,7 +356,7 @@ __host__ std::unique_ptr build_algo_params( __host__ std::unique_ptr build_knn_index( raft::handle_t& handle, float* const d_input, int const n_samples, int const n_features, Algo const algo_type, - raft::distance::DistanceType const dist_type, float const p, + knnDistanceType const dist_type, float const p, Rcpp::List const& algo_params) { std::unique_ptr knn_index(nullptr); @@ -360,7 +392,7 @@ __host__ Rcpp::List knn_fit(Rcpp::NumericMatrix const& x, int const algo, int const metric, float const p, Rcpp::List const& algo_params) { auto const algo_type = static_cast(algo); - auto const dist_type = static_cast(metric); + auto const dist_type = static_cast(metric); auto const input_m = Matrix(x, /*transpose=*/false); int const n_samples = input_m.numRows; int const n_features = input_m.numCols; diff --git a/src/lm.cu b/src/lm.cu index de8b0b8..f6c3e93 100644 --- a/src/lm.cu +++ b/src/lm.cu @@ -8,7 +8,6 @@ #include "preprocessor.h" #include "stream_allocator.h" -#include #include #include diff --git a/src/lm_predict.cu b/src/lm_predict.cu index d8f5531..1a592c4 100644 --- a/src/lm_predict.cu +++ b/src/lm_predict.cu @@ -6,7 +6,6 @@ #include "preprocessor.h" #include "stream_allocator.h" -#include #include #include diff --git a/src/ols_fit_impl.cu b/src/ols_fit_impl.cu index d507b99..96b2b0c 100644 --- a/src/ols_fit_impl.cu +++ b/src/ols_fit_impl.cu @@ -1,6 +1,8 @@ #include "lm_params.h" +#include "preprocessor.h" #include +#include namespace cuml4r { namespace detail { @@ -14,7 +16,12 @@ __host__ void ols_fit_impl(raft::handle_t& handle, lm::Params const& params, /*coef=*/params.d_coef, /*intercept=*/params.intercept, /*fit_intercept=*/params.fit_intercept, +#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) < \ + CUML4R_LIBCUML_VERSION(24, 0)) /*normalize=*/params.normalize_input, algo); +#else + algo); +#endif } } // namespace detail diff --git a/src/pca.cu b/src/pca.cu index 3901667..592a674 100644 --- a/src/pca.cu +++ b/src/pca.cu @@ -6,9 +6,9 @@ #include "preprocessor.h" #include "stream_allocator.h" -#include #include #include +#include #include @@ -123,7 +123,13 @@ __host__ Rcpp::List pca_fit_transform(Rcpp::NumericMatrix const& x, /*singular_vals=*/d_singular_vals.data().get(), /*mu=*/d_mu.data().get(), /*noise_vars=*/d_noise_vars.data().get(), - /*prms=*/*params); + /*prms=*/*params +#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \ + CUML4R_LIBCUML_VERSION(24, 0)) + , + /*flip_signs_based_on_U=*/true +#endif + ); } else { ML::pcaFit(handle, /*input=*/d_input.data().get(), @@ -133,7 +139,13 @@ __host__ Rcpp::List pca_fit_transform(Rcpp::NumericMatrix const& x, /*singular_vals=*/d_singular_vals.data().get(), /*mu=*/d_mu.data().get(), /*noise_vars=*/d_noise_vars.data().get(), - /*prms=*/*params); + /*prms=*/*params +#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \ + CUML4R_LIBCUML_VERSION(24, 0)) + , + /*flip_signs_based_on_U=*/true +#endif + ); } CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value())); @@ -149,7 +161,7 @@ __host__ Rcpp::List pca_fit_transform(Rcpp::NumericMatrix const& x, pinned_host_vector h_mu(n_cols); pinned_host_vector h_noise_vars(1); - AsyncCopyCtx transformed_data_d2h; + CUML4R_MAYBE_UNUSED AsyncCopyCtx transformed_data_d2h; if (transform_input) { transformed_data_d2h = async_copy(stream_view.value(), d_transformed_data.cbegin(), diff --git a/src/pinned_host_vector.h b/src/pinned_host_vector.h index a0d6359..772787c 100644 --- a/src/pinned_host_vector.h +++ b/src/pinned_host_vector.h @@ -2,39 +2,15 @@ #ifdef HAS_CUML -#include -#include - -#include +#include namespace cuml4r { template -using pinned_host_vector = - thrust::host_vector>; +using pinned_host_vector = std::vector; } // namespace cuml4r -namespace Rcpp { -namespace traits { - -template