diff --git a/.Rbuildignore b/.Rbuildignore
index a4fee43..0a6f21b 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -10,6 +10,7 @@
 ^src/CMakeLists\.txt$
 ^src/CMakeCache\.txt$
 ^src/CMakeFiles/*
+^src/\.cmake-build/*
 ^src/_deps/*
 ^src/eval_gpu_archs*
 ^src/*\.o$
diff --git a/DESCRIPTION b/DESCRIPTION
index d88861e..a633018 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -46,5 +46,7 @@ LinkingTo: Rcpp
 Encoding: UTF-8
 RoxygenNote: 7.3.3
 OS_type: unix
-SystemRequirements: RAPIDS cuML (see https://rapids.ai/start.html)
+SystemRequirements: NVIDIA GPU and driver, CUDA Toolkit with nvcc, and uv or
+    Python/pip for automatic RAPIDS cuML bootstrap. Alternatively, an existing
+    RAPIDS cuML installation can be provided with CUML_PREFIX.
 NeedsCompilation: yes
diff --git a/R/cuml_utils.R b/R/cuml_utils.R
index abd8822..322f073 100644
--- a/R/cuml_utils.R
+++ b/R/cuml_utils.R
@@ -4,14 +4,23 @@
 #' @return A logical value indicating whether the current installation \{cuda.ml\}
 #'   was linked to a valid version of the RAPIDS cuML shared library.
 #'
+#' @details
+#' If this returns \code{FALSE}, \pkg{cuda.ml} was installed in stub-only mode.
+#' On a GPU machine, verify that \code{nvidia-smi} and \code{nvcc --version}
+#' both work, then reinstall \pkg{cuda.ml}. During installation, \pkg{cuda.ml}
+#' can bootstrap RAPIDS cuML from pip wheels with \code{uv} or Python/pip. If
+#' RAPIDS cuML is already installed, set \code{CUML_PREFIX} to a prefix
+#' containing \code{include/cuml} and \code{lib/libcuml++.so} before
+#' reinstalling.
+#'
 #' @examples
 #'
 #' library(cuda.ml)
 #'
 #' if (!has_cuML()) {
 #'   warning(
-#'     "Please install the RAPIDS cuML shared library first, and then re-",
-#'     "install {cuda.ml}."
+#'     "This installation was built without RAPIDS cuML. Verify `nvidia-smi` ",
+#'     "and `nvcc --version`, then reinstall {cuda.ml}."
 #'   )
 #' }
 #' @export
diff --git a/R/knn.R b/R/knn.R
index 4fc8ff6..d1ff50e 100644
--- a/R/knn.R
+++ b/R/knn.R
@@ -72,6 +72,7 @@ cuda_ml_knn_algo_ivfpq <- function(nlist, nprobe, m, n_bits,
       nlist = as.integer(nlist),
       nprobe = as.integer(nprobe),
       M = as.integer(m),
+      n_bits = as.integer(n_bits),
       usePrecomputedTables = as.logical(use_precomputed_tables)
     )
   )
diff --git a/R/package.R b/R/package.R
index e0ffb36..5de704c 100644
--- a/R/package.R
+++ b/R/package.R
@@ -2,6 +2,23 @@
 #'
 #' This package provides a R interface for the RAPIDS cuML library.
 #'
+#' @section Installation:
+#' A functional GPU installation requires an NVIDIA GPU with a working driver,
+#' a CUDA Toolkit installation that provides \code{nvcc}, and normal R package
+#' build tools. During installation, \pkg{cuda.ml} first looks for an existing
+#' RAPIDS installation through \code{CUML_PREFIX} or \code{CUDA_PATH}. If none
+#' is found, it can bootstrap RAPIDS cuML from pip wheels with \code{uv} or
+#' Python/pip and link against the resulting local prefix.
+#'
+#' On machines without a usable NVIDIA driver/GPU and \code{nvcc}, including
+#' CRAN check machines, \pkg{cuda.ml} may install in stub-only mode. In that
+#' mode \code{has_cuML()} returns \code{FALSE}, and cuML-backed algorithms are
+#' unavailable until the system prerequisites are installed and \pkg{cuda.ml}
+#' is reinstalled.
+#'
+#' Useful environment variables include \code{CUDA_HOME}, \code{CUML_PREFIX},
+#' \code{CUML_BOOTSTRAP}, and \code{CUML_BOOTSTRAP_CACHE}.
+#'
 #' @author Yitao Li <yitao@rstudio.com>
 #' @import Rcpp
 #' @useDynLib cuda.ml, .registration = TRUE
@@ -17,20 +34,15 @@
   if (!has_cuML()) {
     packageStartupMessage(
       "
-      The current installation of {", pkgname, "} will not function as expected
-      because it was not linked with a valid version of the RAPIDS cuML shared
-      library.
+      The current installation of {", pkgname, "} was built without a usable
+      RAPIDS cuML shared library.
+
+      To fix this, ensure `nvidia-smi` and `nvcc --version` both work, then
+      reinstall {", pkgname, "}. During installation, {", pkgname, "} can
+      bootstrap RAPIDS cuML from pip wheels with `uv` or Python/pip.
 
-      To fix this issue, please follow https://rapids.ai/start.html#get-rapids
-      to install the RAPIDS cuML shared library from Conda and ensure the
-      'CUML_PREFIX' env variable is set to a valid RAPIDS conda env directory
-      (e.g., '/home/user/anaconda3/envs/rapids-21.06', '/usr', or similar)
-      during the installation of {", pkgname, "} or alternatively, follow
-      https://github.com/yitao-li/cuml-installation-notes#build-from-source-without-conda-and-without-multi-gpu-support
-      or
-      https://github.com/yitao-li/cuml-installation-notes#build-from-source-without-conda-and-with-multi-gpu-support
-      or similar to build and install RAPIDS cuML library from source, and
-      then re-install {", pkgname, "}.\n\n
+      If RAPIDS is already installed, set `CUML_PREFIX` to a prefix containing
+      include/cuml and lib/libcuml++.so before reinstalling.\n\n
       "
     )
   }
diff --git a/R/rand_forest.R b/R/rand_forest.R
index 6d7380b..6e05882 100644
--- a/R/rand_forest.R
+++ b/R/rand_forest.R
@@ -331,6 +331,14 @@ cuda_ml_rand_forest_impl_regression <- function(processed, mtry, trees, min_n,
 
 #' @export
 cuda_ml_get_state.cuda_ml_rand_forest <- function(model) {
+  if (!cuda_ml_fil_enabled()) {
+    stop(
+      "Random forest serialization requires Treelite/FIL support, but FIL is ",
+      "disabled in this cuda.ml build.",
+      call. = FALSE
+    )
+  }
+
   get_state_impl <- switch(model$mode,
     classification = .rf_classifier_get_state,
     regression = .rf_regressor_get_state
diff --git a/R/rand_proj.R b/R/rand_proj.R
index 475ebec..92740a9 100644
--- a/R/rand_proj.R
+++ b/R/rand_proj.R
@@ -5,6 +5,22 @@ new_rproj_model <- function(rproj_ctx) {
   model
 }
 
+# Probe `.rproj_johnson_lindenstrauss_min_dim()` with a trivial call: TRUE if
+# it succeeds, FALSE for the "not available" error; other errors are re-raised.
+cuda_ml_rand_proj_available <- function() {
+  unavailable <- "random projection support is not available"
+  tryCatch(
+    {
+      .rproj_johnson_lindenstrauss_min_dim(2L, 0.5)
+      TRUE
+    },
+    error = function(e) {
+      known <- grepl(unavailable, conditionMessage(e), fixed = TRUE)
+      if (known) FALSE else stop(e)
+    }
+  )
+}
+
 #' Random projection for dimensionality reduction.
 #'
 #' Generate a random projection matrix for dimensionality reduction, and
diff --git a/R/tsvd.R b/R/tsvd.R
index 92aec91..d2abde3 100644
--- a/R/tsvd.R
+++ b/R/tsvd.R
@@ -48,11 +48,25 @@ cuda_ml_tsvd <- function(x,
     transform_input = transform_input,
     verbosity = cuML_log_level
   )
+  model <- tsvd_flip_signs(model)
   class(model) <- c("cuda_ml_tsvd", class(model))
 
   model
 }
 
+# Negate each component (and its score column) whose dominant loading is < 0.
+tsvd_flip_signs <- function(model) {
+  signs <- apply(model$components, 1L, function(x) {
+    top <- which.max(abs(x)) # integer(0) when the row is entirely NA/NaN
+    if (length(top) == 1L && x[[top]] < 0) -1 else 1
+  })
+  model$components <- sweep(model$components, 1L, signs, `*`)
+  if (!is.null(model$transformed_data)) {
+    model$transformed_data <- sweep(model$transformed_data, 2L, signs, `*`)
+  }
+  model
+}
+
 #' @export
 cuda_ml_transform.cuda_ml_tsvd <- function(model, x, ...) {
   .tsvd_transform(model = model, x = as.matrix(x))
diff --git a/README.Rmd b/README.Rmd
index f8efc88..8089726 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -176,115 +176,117 @@ about the MNIST dataset:
 
 ## Installation
 
-In order for {cuda.ml} to work as expected, the C++/CUDA source code of
-{cuda.ml} must be linked with CUDA runtime and a valid copy of the RAPIDS cuML
-library.
+For a fully functional installation, {cuda.ml} needs:
 
-Before installing {cuda.ml} itself, it may be worthwhile to take a quick look
-through the sub-sections below on how to properly setup all of {cuda.ml}'s
-required runtime dependencies.
+- an NVIDIA GPU with a working NVIDIA driver;
+- a CUDA Toolkit installation that provides `nvcc`;
+- normal R package build tools; and
+- either `uv` or Python with `pip`.
 
-### Quick note on installing the RAPIDS cuML library:
+When those prerequisites are present, {cuda.ml} can bootstrap RAPIDS cuML from
+pip wheels during installation. You do not need conda, and you usually do not
+need to set `CUML_PREFIX` manually.
 
-Although Conda is the only officially supported distribution channel at the
-moment for RAPIDS cuML (i.e., see https://rapids.ai/start.html#get-rapids),
-you can still build and install this library from source without relying on
-Conda.
-See https://github.com/yitao-li/cuml-installation-notes for build-from-source
-instructions.
+On a new Ubuntu installation, install R/build/Python prerequisites:
 
-### Quick install instructions for Ubuntu 20-04:
-
-#### Install deps:
-```
-sudo apt install -y cmake ccache libblas3 liblapack3
+```bash
+sudo apt update
+sudo apt install -y r-base-dev build-essential git cmake \
+  python3 python3-pip python3-venv ubuntu-drivers-common
 ```
 
+Install the NVIDIA driver, reboot, and verify that the driver can see your GPU:
 
-### Install CUDA
-(consult https://developer.nvidia.com/cuda-downloads for other platforms)
 ```bash
-wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
-sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
-wget https://developer.download.nvidia.com/compute/cuda/11.4.2/local_installers/cuda-repo-ubuntu2004-11-4-local_11.4.2-470.57.02-1_amd64.deb
-sudo dpkg -i cuda-repo-ubuntu2004-11-4-local_11.4.2-470.57.02-1_amd64.deb
-sudo apt-key add /var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub
-sudo apt-get update
-sudo apt-get -y install cuda
+sudo ubuntu-drivers install
+sudo reboot
+# After the reboot, confirm that the driver can see the GPU:
+nvidia-smi
 ```
-### Add CUDA executables to path
-(nvcc is needed for building the C++/CUDA source code of {cuda.ml})
+
+Install a CUDA Toolkit that includes `nvcc`. Use NVIDIA's CUDA Linux
+installation guide for your Ubuntu release to add the CUDA apt repository, then:
+
 ```bash
-echo "export PATH=$PATH:/usr/local/cuda/bin" >> ~/.bashrc
-source ~/.bashrc
+sudo apt update
+sudo apt install -y cuda-toolkit
+
+nvcc --version
 ```
 
-### Install Miniconda:
+If the toolkit is installed but `nvcc` is not on `PATH`, set `CUDA_HOME` to the
+toolkit prefix before installing {cuda.ml}, for example:
+
 ```bash
-wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
-chmod +x Miniconda3-latest-Linux-x86_64.sh
-./Miniconda3-latest-Linux-x86_64.sh -b
-# consult https://rapids.ai/start.html for alternatives
+export CUDA_HOME=/usr/local/cuda
 ```
 
-### Create and configure the conda env
-```
-# This is a relatively big download, may take a while
-~/miniconda3/bin/conda create -n rapids-21.08 -c rapidsai -c nvidia -c conda-forge \
-    rapids-blazing=21.08 python=3.8 cudatoolkit=11.2
-```
+Then install {cuda.ml}:
 
-### Install cmake
-CUDA dependencies require a relatively recent version of CMake, so you need to install it manually
-```bash
-wget https://github.com/Kitware/CMake/releases/download/v3.22.0/cmake-3.22.0.tar.gz
-cd cmake-3.22.0
-./bootstrap && make -j8 && sudo make install
-cd ..
+``` r
+install.packages("cuda.ml")
 ```
 
-### Activate the conda env:
-```bash
-. ~/miniconda3/bin/activate
-conda activate rapids-21.08
+And verify that the installed package was linked with real cuML:
+
+``` r
+library(cuda.ml)
+has_cuML()
 ```
 
-### Consider adjusting `LD_LIBRARY_PATH`
+If this returns `TRUE`, {cuda.ml} is using RAPIDS cuML. If it returns `FALSE`,
+the package installed in stub-only mode; check the install output for the first
+missing prerequisite.
 
-The subsequent steps may (or may not) fail without the following:
+### What happens during installation
 
-```bash
-export LD_LIBRARY_PATH=~/miniconda3/envs/rapids-21.08/lib
-```
+The configure script first looks for an existing RAPIDS installation through
+`CUML_PREFIX` or `CUDA_PATH`. If no existing installation is found, and a
+working NVIDIA driver/GPU plus `nvcc` are available, it bootstraps RAPIDS cuML
+from pip wheels into a cache directory and links {cuda.ml} against that prefix.
 
-If you get some error indicating a GLIBC version mismatch in the subsequent
-steps, then please try adjusting `LD_LIBRARY_PATH` as a workaround.
+The bootstrap prefers `uv` when available, then reticulate's managed `uv`, then
+`python -m pip`, `python3 -m pip`, `pip`, and `pip3`.
 
+Useful environment variables:
 
-### Consider enabling ccache
+- `CUDA_HOME`: CUDA Toolkit prefix containing `bin/nvcc`.
+- `CUML_PREFIX`: existing RAPIDS prefix containing `include/cuml` and
+  `lib/libcuml++.so`.
+- `CUML_BOOTSTRAP=0`: disable automatic RAPIDS pip bootstrap.
+- `CUML_BOOTSTRAP_CACHE`: cache directory for bootstrapped RAPIDS headers and
+  libraries.
+- `CUML_PIP_VERSION`: RAPIDS pip wheel version to install.
 
-To speed up recompilation times during development, set this env var:
-```bash
-echo "export CUML4R_ENABLE_CCACHE=1" >> ~/.bashrc
-. ~/.bashrc
-```
+### CRAN and machines without GPUs
 
-### Install {cuda.ml} the R package:
+On CRAN, or on machines without a usable NVIDIA GPU/driver and `nvcc`, {cuda.ml}
+can still install in stub-only mode. In that mode `has_cuML()` returns `FALSE`
+and cuML-backed algorithms are not usable until the system prerequisites are
+installed and {cuda.ml} is reinstalled.
 
-You can install the released version of {cuda.ml} from
-[CRAN](https://CRAN.R-project.org) with:
+### Manual RAPIDS installations
 
-``` r
-install.packages("cuda.ml")
-```
+If you already have RAPIDS cuML from pip, conda, or a source build, set
+`CUML_PREFIX` to a prefix containing `include/cuml` and `lib/libcuml++.so`
+before installing {cuda.ml}. In this case the automatic bootstrap is skipped.
 
-And the development version from [GitHub](https://github.com/) with:
+### Development version
+
+Install the development version from [GitHub](https://github.com/) with:
 
 ``` r
 # install.packages("devtools")
 devtools::install_github("mlverse/cuda.ml")
 ```
 
+To speed up recompilation times during development, set this env var:
+
+```bash
+echo "export CUML4R_ENABLE_CCACHE=1" >> ~/.bashrc
+. ~/.bashrc
+```
+
 
 ## Appendix
 
diff --git a/README.md b/README.md
index 52460e0..277195c 100644
--- a/README.md
+++ b/README.md
@@ -263,110 +263,100 @@ From this type of visualization, we can qualitatively understand the following a
 
 ## Installation
 
-In order for {cuda.ml} to work as expected, the C++/CUDA source code of {cuda.ml} must be linked with CUDA runtime and a valid copy of the RAPIDS cuML library.
+For a fully functional installation, {cuda.ml} needs:
 
-Before installing {cuda.ml} itself, it may be worthwhile to take a quick look through the sub-sections below on how to properly setup all of {cuda.ml}'s required runtime dependencies.
+-   an NVIDIA GPU with a working NVIDIA driver;
+-   a CUDA Toolkit installation that provides `nvcc`;
+-   normal R package build tools; and
+-   either `uv` or Python with `pip`.
 
-### Quick note on installing the RAPIDS cuML library:
+When those prerequisites are present, {cuda.ml} can bootstrap RAPIDS cuML from pip wheels during installation. You do not need conda, and you usually do not need to set `CUML_PREFIX` manually.
 
-Although Conda is the only officially supported distribution channel at the moment for RAPIDS cuML (i.e., see <https://rapids.ai/start.html#get-rapids>), you can still build and install this library from source without relying on Conda. See <https://github.com/yitao-li/cuml-installation-notes> for build-from-source instructions.
-
-### Quick install instructions for Ubuntu 20-04:
-
-#### Install deps:
-
-    sudo apt install -y cmake ccache libblas3 liblapack3
-
-### Install CUDA
-
-(consult <https://developer.nvidia.com/cuda-downloads> for other platforms)
+On a new Ubuntu installation, install R/build/Python prerequisites:
 
 ``` bash
-wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
-sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
-wget https://developer.download.nvidia.com/compute/cuda/11.4.2/local_installers/cuda-repo-ubuntu2004-11-4-local_11.4.2-470.57.02-1_amd64.deb
-sudo dpkg -i cuda-repo-ubuntu2004-11-4-local_11.4.2-470.57.02-1_amd64.deb
-sudo apt-key add /var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub
-sudo apt-get update
-sudo apt-get -y install cuda
+sudo apt update
+sudo apt install -y r-base-dev build-essential git cmake \
+  python3 python3-pip python3-venv ubuntu-drivers-common
 ```
 
-### Add CUDA executables to path
-
-(nvcc is needed for building the C++/CUDA source code of {cuda.ml})
+Install the NVIDIA driver, reboot, and verify that the driver can see your GPU:
 
 ``` bash
-echo "export PATH=$PATH:/usr/local/cuda/bin" >> ~/.bashrc
-source ~/.bashrc
+sudo ubuntu-drivers install
+sudo reboot
+# After the reboot, confirm that the driver can see the GPU:
+nvidia-smi
 ```
 
-### Install Miniconda:
+Install a CUDA Toolkit that includes `nvcc`. Use NVIDIA's CUDA Linux installation guide for your Ubuntu release to add the CUDA apt repository, then:
 
 ``` bash
-wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
-chmod +x Miniconda3-latest-Linux-x86_64.sh
-./Miniconda3-latest-Linux-x86_64.sh -b
-# consult https://rapids.ai/start.html for alternatives
-```
+sudo apt update
+sudo apt install -y cuda-toolkit
 
-### Create and configure the conda env
+nvcc --version
+```
 
-    # This is a relatively big download, may take a while
-    ~/miniconda3/bin/conda create -n rapids-21.08 -c rapidsai -c nvidia -c conda-forge \
-        rapids-blazing=21.08 python=3.8 cudatoolkit=11.2
+If the toolkit is installed but `nvcc` is not on `PATH`, set `CUDA_HOME` to the toolkit prefix before installing {cuda.ml}, for example:
 
-### Install cmake
+``` bash
+export CUDA_HOME=/usr/local/cuda
+```
 
-CUDA dependencies require a relatively recent version of CMake, so you need to install it manually
+Then install {cuda.ml}:
 
-``` bash
-wget https://github.com/Kitware/CMake/releases/download/v3.22.0/cmake-3.22.0.tar.gz
-cd cmake-3.22.0
-./bootstrap && make -j8 && sudo make install
-cd ..
+``` r
+install.packages("cuda.ml")
 ```
 
-### Activate the conda env:
+And verify that the installed package was linked with real cuML:
 
-``` bash
-. ~/miniconda3/bin/activate
-conda activate rapids-21.08
+``` r
+library(cuda.ml)
+has_cuML()
 ```
 
-### Consider adjusting `LD_LIBRARY_PATH`
+If this returns `TRUE`, {cuda.ml} is using RAPIDS cuML. If it returns `FALSE`, the package installed in stub-only mode; check the install output for the first missing prerequisite.
 
-The subsequent steps may (or may not) fail without the following:
+### What happens during installation
 
-``` bash
-export LD_LIBRARY_PATH=~/miniconda3/envs/rapids-21.08/lib
-```
+The configure script first looks for an existing RAPIDS installation through `CUML_PREFIX` or `CUDA_PATH`. If no existing installation is found, and a working NVIDIA driver/GPU plus `nvcc` are available, it bootstraps RAPIDS cuML from pip wheels into a cache directory and links {cuda.ml} against that prefix.
 
-If you get some error indicating a GLIBC version mismatch in the subsequent steps, then please try adjusting `LD_LIBRARY_PATH` as a workaround.
+The bootstrap prefers `uv` when available, then reticulate's managed `uv`, then `python -m pip`, `python3 -m pip`, `pip`, and `pip3`.
 
-### Consider enabling ccache
+Useful environment variables:
 
-To speed up recompilation times during development, set this env var:
+-   `CUDA_HOME`: CUDA Toolkit prefix containing `bin/nvcc`.
+-   `CUML_PREFIX`: existing RAPIDS prefix containing `include/cuml` and `lib/libcuml++.so`.
+-   `CUML_BOOTSTRAP=0`: disable automatic RAPIDS pip bootstrap.
+-   `CUML_BOOTSTRAP_CACHE`: cache directory for bootstrapped RAPIDS headers and libraries.
+-   `CUML_PIP_VERSION`: RAPIDS pip wheel version to install.
 
-``` bash
-echo "export CUML4R_ENABLE_CCACHE=1" >> ~/.bashrc
-. ~/.bashrc
-```
+### CRAN and machines without GPUs
 
-### Install {cuda.ml} the R package:
+On CRAN, or on machines without a usable NVIDIA GPU/driver and `nvcc`, {cuda.ml} can still install in stub-only mode. In that mode `has_cuML()` returns `FALSE` and cuML-backed algorithms are not usable until the system prerequisites are installed and {cuda.ml} is reinstalled.
 
-You can install the released version of {cuda.ml} from [CRAN](https://CRAN.R-project.org) with:
+### Manual RAPIDS installations
 
-``` r
-install.packages("cuda.ml")
-```
+If you already have RAPIDS cuML from pip, conda, or a source build, set `CUML_PREFIX` to a prefix containing `include/cuml` and `lib/libcuml++.so` before installing {cuda.ml}. In this case the automatic bootstrap is skipped.
 
-And the development version from [GitHub](https://github.com/) with:
+### Development version
+
+Install the development version from [GitHub](https://github.com/) with:
 
 ``` r
 # install.packages("devtools")
 devtools::install_github("mlverse/cuda.ml")
 ```
 
+To speed up recompilation times during development, set this env var:
+
+``` bash
+echo "export CUML4R_ENABLE_CCACHE=1" >> ~/.bashrc
+. ~/.bashrc
+```
+
 ## Appendix
 
 <details> <summary>Inspect MNIST images</summary>
diff --git a/man/cuda.ml-package.Rd b/man/cuda.ml-package.Rd
index b43d49e..8502f4d 100644
--- a/man/cuda.ml-package.Rd
+++ b/man/cuda.ml-package.Rd
@@ -8,6 +8,25 @@
 \description{
 This package provides a R interface for the RAPIDS cuML library.
 }
+\section{Installation}{
+
+A functional GPU installation requires an NVIDIA GPU with a working driver,
+a CUDA Toolkit installation that provides \code{nvcc}, and normal R package
+build tools. During installation, \pkg{cuda.ml} first looks for an existing
+RAPIDS installation through \code{CUML_PREFIX} or \code{CUDA_PATH}. If none
+is found, it can bootstrap RAPIDS cuML from pip wheels with \code{uv} or
+Python/pip and link against the resulting local prefix.
+
+On machines without a usable NVIDIA driver/GPU and \code{nvcc}, including
+CRAN check machines, \pkg{cuda.ml} may install in stub-only mode. In that
+mode \code{has_cuML()} returns \code{FALSE}, and cuML-backed algorithms are
+unavailable until the system prerequisites are installed and \pkg{cuda.ml}
+is reinstalled.
+
+Useful environment variables include \code{CUDA_HOME}, \code{CUML_PREFIX},
+\code{CUML_BOOTSTRAP}, and \code{CUML_BOOTSTRAP_CACHE}.
+}
+
 \seealso{
 Useful links:
 \itemize{
diff --git a/man/has_cuML.Rd b/man/has_cuML.Rd
index 8305755..d22a51c 100644
--- a/man/has_cuML.Rd
+++ b/man/has_cuML.Rd
@@ -15,14 +15,23 @@ A logical value indicating whether the current installation \{cuda.ml\}
 Determine whether \{cuda.ml\} was linked to a valid version of the RAPIDS cuML
 shared library.
 }
+\details{
+If this returns \code{FALSE}, \pkg{cuda.ml} was installed in stub-only mode.
+On a GPU machine, verify that \code{nvidia-smi} and \code{nvcc --version}
+both work, then reinstall \pkg{cuda.ml}. During installation, \pkg{cuda.ml}
+can bootstrap RAPIDS cuML from pip wheels with \code{uv} or Python/pip. If
+RAPIDS cuML is already installed, set \code{CUML_PREFIX} to a prefix
+containing \code{include/cuml} and \code{lib/libcuml++.so} before
+reinstalling.
+}
 \examples{
 
 library(cuda.ml)
 
 if (!has_cuML()) {
   warning(
-    "Please install the RAPIDS cuML shared library first, and then re-",
-    "install {cuda.ml}."
+    "This installation was built without RAPIDS cuML. Verify `nvidia-smi` ",
+    "and `nvcc --version`, then reinstall {cuda.ml}."
   )
 }
 }
diff --git a/src/CMakeLists.txt.in b/src/CMakeLists.txt.in
index 030d323..22a022d 100644
--- a/src/CMakeLists.txt.in
+++ b/src/CMakeLists.txt.in
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
 
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
 
@@ -17,6 +17,7 @@ FetchContent_Declare(
   rapids-cmake
   GIT_REPOSITORY https://github.com/rapidsai/rapids-cmake.git
   GIT_TAG        origin/branch-21.10
+  UPDATE_DISCONNECTED TRUE
   )
 FetchContent_MakeAvailable(rapids-cmake)
 include(rapids-cuda)
@@ -34,7 +35,13 @@ endif(DEFINED ENV{CUML4R_ENABLE_CCACHE})
 if(DEFINED CUML_INCLUDE_DIR)
     # CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES is needed so that cuda_runtime.h is found
     # CUML_INCLUDE_DIR is needed so that kmeans/kmeans_c.h is found
-    set(CUML4R_INCLUDE_DIRS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUML_INCLUDE_DIR})
+    set(CUML4R_INCLUDE_DIRS ${CUML_INCLUDE_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    if(EXISTS "${CUML_INCLUDE_DIR}/rapids")
+      # RAPIDS pip wheels may ship a CCCL copy that matches the RAPIDS shared
+      # libraries under include/rapids, plus a newer top-level CCCL copy.
+      # Prefer the ABI-compatible copy for cuda/cub/thrust/nv headers.
+      include_directories(BEFORE ${CUML_INCLUDE_DIR}/rapids)
+    endif()
 else()
     message(FATAL_ERROR "CUML_INCLUDE_DIR not specified.")
 endif(DEFINED CUML_INCLUDE_DIR)
@@ -50,6 +57,20 @@ include_directories(@RCPP_INCLUDE_DIR@)
 
 include_directories(${TREELITE_C_API_INCLUDE_DIR})
 
+set(CUML4R_RPROJ_C_API_FOUND FALSE)
+foreach(CUML4R_INC_DIR IN LISTS CUML4R_INCLUDE_DIRS)
+  if(EXISTS "${CUML4R_INC_DIR}/cuml/random_projection/rproj_c.h")
+    set(CUML4R_RPROJ_C_API_FOUND TRUE)
+  endif()
+endforeach()
+if(NOT CUML4R_RPROJ_C_API_FOUND)
+  message(
+    STATUS
+    "cuML random projection C API headers were not found; random projection support will be disabled."
+  )
+  add_definitions(-DCUML4R_RPROJ_C_API_MISSING)
+endif(NOT CUML4R_RPROJ_C_API_FOUND)
+
 if(DEFINED ENV{CUML4R_ENABLE_ASAN})
     if($ENV{CUML4R_ENABLE_ASAN} MATCHES "true")
         add_compile_options(-fno-omit-frame-pointer -fsanitize-recover=address)
@@ -121,25 +142,36 @@ add_library(
 # Need to set linker language to CUDA to link the CUDA Runtime
 set_target_properties(cuda.ml PROPERTIES LINKER_LANGUAGE "CUDA")
 set_target_properties(cuda.ml PROPERTIES PREFIX "")
+target_compile_options(
+  cuda.ml
+  PRIVATE
+  $<$<COMPILE_LANGUAGE:CXX>:-Wno-deprecated-declarations>
+  $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-deprecated-declarations>
+)
 
 set(CUML4R_LIBS cuda.ml PRIVATE cuml++ cublas cusolver cudart cusparse)
 
-find_package(Treelite)
+find_package(Treelite QUIET)
 if(Treelite_FOUND)
-  set(CUML4R_LIBS ${CUML4R_LIBS} treelite::treelite treelite::treelite_runtime)
+  target_compile_definitions(cuda.ml PRIVATE CUML_ENABLE_GPU)
+  if(TARGET treelite::treelite_static)
+    set(CUML4R_LIBS ${CUML4R_LIBS} treelite::treelite_static)
+  elseif(TARGET treelite::treelite)
+    set(CUML4R_LIBS ${CUML4R_LIBS} treelite::treelite)
+  else()
+    message(FATAL_ERROR "Treelite was found but no supported Treelite CMake target was exported.")
+  endif()
+  if(TARGET treelite::treelite_runtime)
+    set(CUML4R_LIBS ${CUML4R_LIBS} treelite::treelite_runtime)
+  endif()
   set(CUML4R_INCLUDE_DIRS ${CUML4R_INCLUDE_DIRS} ${Treelite_INCLUDE_DIRS})
   message(STATUS "Treelite found, ignoring stub headers: ${CUML_STUB_HEADERS_DIR}")
 else()
   message(
-    WARNING
-    "
-    Unable to locate 'TreeLite' using CMake. Forest Inference Library (FIL)
-    functionalities from {cuda.ml} will be disabled!
-
-    Please install the treelite C API and re-install {cuda.ml} if you want to
-    enable FIL functionalities.
-    "
+    STATUS
+    "Treelite was not found; Forest Inference Library (FIL) support will be disabled."
   )
+  add_definitions(-DCUML4R_TREELITE_C_API_MISSING)
   set(
     CUML4R_INCLUDE_DIRS ${CUML4R_INCLUDE_DIRS} ${CUML_STUB_HEADERS_DIR}
   )
diff --git a/src/agglomerative_clustering.cu b/src/agglomerative_clustering.cu
index d5bb06f..cf24067 100644
--- a/src/agglomerative_clustering.cu
+++ b/src/agglomerative_clustering.cu
@@ -6,9 +6,9 @@
 #include "preprocessor.h"
 #include "stream_allocator.h"
 
-#include <thrust/async/copy.h>
-#include <thrust/device_vector.h>
 #include <cuml/cluster/linkage.hpp>
+#include <cuml/version_config.hpp>
+#include <thrust/device_vector.h>
 
 #include <Rcpp.h>
 
@@ -38,9 +38,18 @@ __host__ Rcpp::List agglomerative_clustering(Rcpp::NumericMatrix const& x,
     async_copy(stream_view.value(), h_x.cbegin(), h_x.cend(), d_x.begin());
 
   // single-linkage hierarchical clustering output
-  auto out = std::make_unique<raft::hierarchy::linkage_output<int, float>>();
   thrust::device_vector<int> d_labels(n_samples);
   thrust::device_vector<int> d_children((n_samples - 1) * 2);
+
+#if CUML_VERSION_MAJOR >= 24
+  ML::linkage::single_linkage(
+    handle, /*X=*/d_x.data().get(), /*n_rows=*/n_samples,
+    /*n_cols=*/n_features, /*n_clusters=*/n_clusters,
+    /*metric=*/static_cast<ML::distance::DistanceType>(metric),
+    /*children=*/d_children.data().get(), /*labels=*/d_labels.data().get(),
+    /*use_knn=*/!pairwise_conn, /*c=*/n_neighbors);
+#else
+  auto out = std::make_unique<raft::hierarchy::linkage_output<int, float>>();
   out->labels = d_labels.data().get();
   out->children = d_children.data().get();
 
@@ -56,6 +65,7 @@ __host__ Rcpp::List agglomerative_clustering(Rcpp::NumericMatrix const& x,
       /*metric=*/static_cast<raft::distance::DistanceType>(metric),
       /*c=*/n_neighbors, n_clusters);
   }
+#endif
 
   CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
 
@@ -69,7 +79,11 @@ __host__ Rcpp::List agglomerative_clustering(Rcpp::NumericMatrix const& x,
 
   CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
 
+#if CUML_VERSION_MAJOR >= 24
+  result["n_clusters"] = n_clusters;
+#else
   result["n_clusters"] = out->n_clusters;
+#endif
   result["children"] =
     Rcpp::transpose(Rcpp::IntegerMatrix(2, n_samples - 1, h_children.begin()));
   result["labels"] = Rcpp::IntegerVector(h_labels.cbegin(), h_labels.cend());
diff --git a/src/async_utils.cuh b/src/async_utils.cuh
index ed9050a..2a6d0a4 100644
--- a/src/async_utils.cuh
+++ b/src/async_utils.cuh
@@ -4,33 +4,21 @@
 
 #include "cuda_utils.h"
 #include "preprocessor.h"
-#include "unique_marker.cuh"
 
-#include <thrust/async/copy.h>
-#include <thrust/system/cuda/future.h>
-
-#include <utility>
+#include <thrust/copy.h>
+#include <thrust/system/cuda/execution_policy.h>
 
 namespace cuml4r {
 
-// To ensure the correct async behavior, an `AsyncCopyCtx` object must be
-// destroyed after the stream associated with the copy operation is
-// synchronized, not before.
-struct AsyncCopyCtx {
-  thrust::system::cuda::unique_eager_event event;
-  unique_marker marker;
-};
+struct AsyncCopyCtx {};  // stateless; still the return type of async_copy()
 
 // perform a copy operation that is asynchronous with respect to the host
 // and synchronous with respect to the stream specified
-template <typename... Args>
-__host__ CUML4R_NODISCARD auto async_copy(cudaStream_t stream, Args&&... args) {
-  auto e = thrust::async::copy(std::forward<Args>(args)...);
-  auto& s = e.stream();
-  unique_marker m;
-  CUDA_RT_CALL(cudaEventRecord(m.get(), s.get()));
-  CUDA_RT_CALL(cudaStreamWaitEvent(stream, m.get(), cudaEventWaitDefault));
-  return AsyncCopyCtx{std::move(e), std::move(m)};
+template <typename InputIt, typename OutputIt>
+__host__ CUML4R_NODISCARD auto async_copy(
+  cudaStream_t stream, InputIt first, InputIt last, OutputIt result) {
+  thrust::copy(thrust::cuda::par.on(stream), first, last, result);  // NOTE(review): `par` is presumably host-blocking (cf. `par_nosync`) -- confirm
+  return AsyncCopyCtx{};  // empty placeholder, nothing to keep alive
 }
 
 }  // namespace cuml4r
diff --git a/src/cd_fit_impl.cu b/src/cd_fit_impl.cu
index c6e052e..6c8e67d 100644
--- a/src/cd_fit_impl.cu
+++ b/src/cd_fit_impl.cu
@@ -1,6 +1,8 @@
 #include "lm_params.h"
+#include "preprocessor.h"
 
 #include <cuml/solvers/solver.hpp>
+#include <cuml/version_config.hpp>
 
 namespace cuml4r {
 namespace detail {
@@ -14,8 +16,13 @@ __host__ void cd_fit_impl(raft::handle_t& handle, lm::Params const& params,
                     /*labels=*/params.d_labels, /*coef=*/params.d_coef,
                     /*intercept=*/params.intercept,
                     /*fit_intercept=*/params.fit_intercept,
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) < \
+     CUML4R_LIBCUML_VERSION(24, 0))
                     /*normalize=*/params.normalize_input, epochs, loss, alpha,
-                    l1_ratio, shuffle, tol);
+#else
+                    epochs,
+#endif
+                    loss, alpha, l1_ratio, shuffle, tol);
 }
 
 }  // namespace detail
diff --git a/src/cuml_utils.cpp b/src/cuml_utils.cpp
index 4f07355..86a06db 100644
--- a/src/cuml_utils.cpp
+++ b/src/cuml_utils.cpp
@@ -4,9 +4,6 @@
 
 #include <cuml/version_config.hpp>
 
-static_assert(CUML_VERSION_MAJOR == 21,
-              "{cuda.ml} currently only supports linking to RAPIDS cuML 21.x!");
-
 #endif
 
 #include <Rcpp.h>
diff --git a/src/dbscan.cu b/src/dbscan.cu
index e385995..7c3a534 100644
--- a/src/dbscan.cu
+++ b/src/dbscan.cu
@@ -5,9 +5,9 @@
 #include "preprocessor.h"
 #include "stream_allocator.h"
 
-#include <thrust/async/copy.h>
-#include <thrust/device_vector.h>
 #include <cuml/cluster/dbscan.hpp>
+#include <cuml/version_config.hpp>
+#include <thrust/device_vector.h>
 
 #include <Rcpp.h>
 
@@ -41,10 +41,21 @@ __host__ Rcpp::List dbscan(Rcpp::NumericMatrix const& x, int const min_pts,
 
   ML::Dbscan::fit(handle, /*input=*/d_src_data.data().get(),
                   /*n_rows=*/n_samples, /*n_cols=*/n_features, eps, min_pts,
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+                  /*metric=*/ML::distance::DistanceType::L2SqrtUnexpanded,
+                  /*labels=*/d_labels.data().get(),
+                  /*core_sample_indices=*/nullptr, /*sample_weight=*/nullptr,
+                  max_bytes_per_batch, /*eps_nn_method=*/ML::Dbscan::BRUTE_FORCE,
+                  /*verbosity=*/static_cast<rapids_logger::level_enum>(
+                    verbosity),
+                  /*opg=*/false);
+#else
                   /*metric=*/raft::distance::L2SqrtUnexpanded,
                   /*labels=*/d_labels.data().get(),
                   /*core_sample_indices=*/nullptr, max_bytes_per_batch,
                   /*verbosity=*/verbosity, /*opg=*/false);
+#endif
 
   CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
 
diff --git a/src/device_allocator.cu b/src/device_allocator.cu
index fe13909..139bc2d 100644
--- a/src/device_allocator.cu
+++ b/src/device_allocator.cu
@@ -2,6 +2,10 @@
 
 #include "device_allocator.h"
 
+#include <cuml/version_config.hpp>
+
+#if CUML_VERSION_MAJOR < 24
+
 #include <raft/mr/device/allocator.hpp>
 
 namespace {
@@ -19,6 +23,8 @@ __host__ std::shared_ptr<raft::mr::device::allocator> getDeviceAllocator() {
 
 }  // namespace cuml4r
 
+#endif
+
 #else
 
 #include "warn_cuml_missing.h"
diff --git a/src/device_allocator.h b/src/device_allocator.h
index 124c3b1..098639d 100644
--- a/src/device_allocator.h
+++ b/src/device_allocator.h
@@ -2,6 +2,10 @@
 
 #ifdef HAS_CUML
 
+#include <cuml/version_config.hpp>
+
+#if CUML_VERSION_MAJOR < 24
+
 #include <memory>
 
 namespace raft {
@@ -20,6 +24,8 @@ std::shared_ptr<raft::mr::device::allocator> getDeviceAllocator();
 
 }  // namespace cuml4r
 
+#endif
+
 #else
 
 #include "warn_cuml_missing.h"
diff --git a/src/fil.cu b/src/fil.cu
index 1545177..1ed555d 100644
--- a/src/fil.cu
+++ b/src/fil.cu
@@ -8,15 +8,18 @@
 #include "stream_allocator.h"
 #include "treelite_utils.cuh"
 
-#include <cuml/fil/fil.h>
-#include <thrust/async/copy.h>
+#ifndef CUML4R_TREELITE_C_API_MISSING
+
 #include <thrust/device_vector.h>
 #include <treelite/c_api.h>
+#include <treelite/tree.h>
 
 #include <Rcpp.h>
 
+#include <algorithm>
 #include <memory>
 #include <string>
+#include <vector>
 
 namespace cuml4r {
 namespace {
@@ -25,43 +28,65 @@ enum class ModelType { XGBoost, XGBoostJSON, LightGBM };
 
 struct FILModel {
   __host__ FILModel(std::unique_ptr<raft::handle_t> handle,
-                    fil::forest_uptr forest, size_t const num_classes)
+                    fil::forest_uptr forest, bool const classification,
+                    float const threshold, size_t const num_classes)
     : handle_(std::move(handle)),
       forest_(std::move(forest)),
+      classification_(classification),
+      threshold_(threshold),
       numClasses_(num_classes) {}
 
   std::unique_ptr<raft::handle_t> const handle_;
   // NOTE: the destruction of `forest_` must precede the destruction of
   // `handle_`.
   fil::forest_uptr forest_;
+  bool const classification_;
+  float const threshold_;
   size_t const numClasses_;
 };
 
 __host__ int treeliteLoadModel(ModelType const model_type, char const* filename,
                                TreeliteHandle& tl_handle) {
+  auto constexpr config = "{}";
   switch (model_type) {
     case ModelType::XGBoost:
-      return TreeliteLoadXGBoostModel(filename, tl_handle.get());
+      return TreeliteLoadXGBoostModelLegacyBinary(filename, config,
+                                                  tl_handle.get());
     case ModelType::XGBoostJSON:
-      return TreeliteLoadXGBoostJSON(filename, tl_handle.get());
+      return TreeliteLoadXGBoostModelJSON(filename, config, tl_handle.get());
     case ModelType::LightGBM:
-      return TreeliteLoadLightGBMModel(filename, tl_handle.get());
+      return TreeliteLoadLightGBMModel(filename, config, tl_handle.get());
   }
 
   // unreachable
   return -1;
 }
 
-/*
- * The 'ML::fil::treelite_params_t::threads_per_tree' and
- * 'ML::fil::treelite_params_t::n_items' parameters are only supported in
- * RAPIDS cuML 21.08 or above.
- */
-CUML4R_ASSIGN_IF_PRESENT(threads_per_tree)
-CUML4R_NOOP_IF_ABSENT(threads_per_tree)
+__host__ size_t treelite_num_classes(TreeliteHandle const& tl_handle,
+                                     bool const classification) {
+  if (!classification) {
+    return 0;
+  }
+
+  auto const* model = static_cast<treelite::Model const*>(tl_handle.handle());
+  auto num_classes =
+    model->num_class.Size() > 0 ? static_cast<size_t>(model->num_class[0]) : 0;
 
-CUML4R_ASSIGN_IF_PRESENT(n_items)
-CUML4R_NOOP_IF_ABSENT(n_items)
+  // Treelite uses one output for binary classification in some import paths.
+  return std::max(num_classes, size_t(2));
+}
+
+template <typename F>
+__host__ Rcpp::NumericMatrix make_matrix(size_t const n_rows,
+                                         size_t const n_cols, F&& getter) {
+  // Fills an n_rows x n_cols matrix with getter(row, col) in row-major order.
+  Rcpp::NumericMatrix result(n_rows, n_cols);
+  for (size_t idx = 0; idx < n_rows * n_cols; ++idx) {
+    auto const row = idx / n_cols, col = idx % n_cols;
+    result(row, col) = getter(row, col);
+  }
+  return result;
+}
 
 }  // namespace
 
@@ -84,45 +109,18 @@ __host__ SEXP fil_load_model(int const model_type, std::string const& filename,
     }
   }
 
-  ML::fil::treelite_params_t params;
-  params.algo = static_cast<ML::fil::algo_t>(algo);
-  params.output_class = classification;
-  params.threshold = threshold;
-  params.storage_type = static_cast<ML::fil::storage_type_t>(storage_type);
-  params.blocks_per_sm = blocks_per_sm;
-  params.output_class = classification;
-  set_threads_per_tree(params, threads_per_tree);
-  set_n_items(params, n_items);
-  params.pforest_shape_str = nullptr;
-
   auto stream_view = stream_allocator::getOrCreateStream();
   auto handle = std::make_unique<raft::handle_t>();
   handle_utils::initializeHandle(*handle, stream_view.value());
 
-  auto forest = fil::make_forest(*handle, /*src=*/[&] {
-    ML::fil::forest* f;
-    ML::fil::from_treelite(/*handle=*/*handle, /*pforest=*/&f,
-                           /*model=*/*tl_handle.get(),
-                           /*tl_params=*/&params);
-    return f;
-  });
-
-  size_t num_classes = 0;
-  if (classification) {
-    auto const rc = TreeliteQueryNumClass(/*handle=*/*tl_handle.get(),
-                                          /*out=*/&num_classes);
-    if (rc < 0) {
-      char const* err = TreeliteGetLastError();
-      Rcpp::stop("TreeliteQueryNumClass failed: %s.", err);
-    }
-
-    // Treelite returns 1 as number of classes for binary classification.
-    num_classes = std::max(num_classes, size_t(2));
-  }
+  auto forest = fil::import_from_treelite(
+    *handle, tl_handle, fil::tree_layout_from_storage_type(storage_type));
+  auto const num_classes = treelite_num_classes(tl_handle, classification);
 
   return Rcpp::XPtr<FILModel>(
     std::make_unique<FILModel>(
-      /*handle=*/std::move(handle), std::move(forest), num_classes)
+      /*handle=*/std::move(handle), std::move(forest), classification,
+      threshold, num_classes)
       .release());
 }
 
@@ -137,7 +135,7 @@ __host__ Rcpp::NumericMatrix fil_predict(
   auto const model_xptr = Rcpp::XPtr<FILModel>(model);
   auto const m = Matrix<float>(x, /*transpose=*/false);
 
-  if (output_class_probabilities && model_xptr->numClasses_ == 0) {
+  if (output_class_probabilities && !model_xptr->classification_) {
     Rcpp::stop(
       "'output_class_probabilities' is not applicable for regressions!");
   }
@@ -150,15 +148,12 @@ __host__ Rcpp::NumericMatrix fil_predict(
   auto CUML4R_ANONYMOUS_VARIABLE(x_h2d) =
     async_copy(handle.get_stream(), h_x.cbegin(), h_x.cend(), d_x.begin());
 
-  // ensemble output
-  thrust::device_vector<float> d_preds(output_class_probabilities
-                                         ? model_xptr->numClasses_ * m.numRows
-                                         : m.numRows);
+  auto const n_outputs =
+    static_cast<size_t>(model_xptr->forest_->num_outputs());
+  thrust::device_vector<float> d_preds(n_outputs * m.numRows);
 
-  ML::fil::predict(/*h=*/handle, /*f=*/model_xptr->forest_.get(),
-                   /*preds=*/d_preds.data().get(),
-                   /*data=*/d_x.data().get(), /*num_rows=*/m.numRows,
-                   /*predict_proba=*/output_class_probabilities);
+  fil::predict(handle, *model_xptr->forest_, d_preds.data().get(),
+               d_x.data().get(), m.numRows);
 
   pinned_host_vector<float> h_preds(d_preds.size());
   auto CUML4R_ANONYMOUS_VARIABLE(preds_d2h) = async_copy(
@@ -166,9 +161,47 @@ __host__ Rcpp::NumericMatrix fil_predict(
 
   CUDA_RT_CALL(cudaStreamSynchronize(handle.get_stream()));
 
-  return Rcpp::transpose(Rcpp::NumericMatrix(
-    output_class_probabilities ? model_xptr->numClasses_ : 1, m.numRows,
-    h_preds.begin()));
+  if (!model_xptr->classification_) {
+    return make_matrix(m.numRows, n_outputs, [&](size_t const i,
+                                                 size_t const j) {
+      return h_preds[i * n_outputs + j];
+    });
+  }
+
+  if (output_class_probabilities) {
+    if (n_outputs == model_xptr->numClasses_) {
+      return make_matrix(m.numRows, n_outputs, [&](size_t const i,
+                                                   size_t const j) {
+        return h_preds[i * n_outputs + j];
+      });
+    }
+    if (n_outputs == 1 && model_xptr->numClasses_ == 2) {
+      return make_matrix(m.numRows, 2, [&](size_t const i, size_t const j) {
+        auto const p1 = static_cast<double>(h_preds[i]);
+        return j == 0 ? 1.0 - p1 : p1;
+      });
+    }
+    Rcpp::stop("FIL model returned %d outputs, but %d classes were expected.",
+               static_cast<int>(n_outputs),
+               static_cast<int>(model_xptr->numClasses_));
+  }
+
+  return make_matrix(m.numRows, 1, [&](size_t const i, size_t) {
+    if (n_outputs == 1) {
+      return model_xptr->numClasses_ == 2
+               ? static_cast<double>(h_preds[i] >= model_xptr->threshold_)
+               : static_cast<double>(h_preds[i]);
+    }
+    if (model_xptr->forest_->row_postprocessing() == ML::fil::row_op::max_index) {
+      return static_cast<double>(h_preds[i * n_outputs]);
+    }
+
+    auto const row_begin = h_preds.begin() + i * n_outputs;
+    return static_cast<double>(
+      std::distance(row_begin, std::max_element(row_begin, row_begin + n_outputs)));
+  });
 }
 
 }  // namespace cuml4r
+
+#endif
diff --git a/src/fil_utils.cu b/src/fil_utils.cu
index e36d501..63c95bb 100644
--- a/src/fil_utils.cu
+++ b/src/fil_utils.cu
@@ -1,21 +1,106 @@
 #include "fil_utils.h"
 
+#ifndef CUML4R_TREELITE_C_API_MISSING
+
+#include "cuda_utils.h"
+
+#include <Rcpp.h>
+
+#include <cuml/fil/detail/raft_proto/handle.hpp>
+#include <cuml/fil/detail/device_initialization/gpu.cuh>
+#include <cuml/fil/detail/infer/cpu.hpp>
+#include <cuml/fil/treelite_importer.hpp>
+
+namespace ML {
+namespace fil {
+namespace detail {
+namespace device_initialization {
+
+CUML_FIL_INITIALIZE_DEVICE(template, 0)
+CUML_FIL_INITIALIZE_DEVICE(template, 1)
+CUML_FIL_INITIALIZE_DEVICE(template, 2)
+CUML_FIL_INITIALIZE_DEVICE(template, 3)
+CUML_FIL_INITIALIZE_DEVICE(template, 4)
+CUML_FIL_INITIALIZE_DEVICE(template, 5)
+CUML_FIL_INITIALIZE_DEVICE(template, 6)
+CUML_FIL_INITIALIZE_DEVICE(template, 7)
+CUML_FIL_INITIALIZE_DEVICE(template, 8)
+CUML_FIL_INITIALIZE_DEVICE(template, 9)
+CUML_FIL_INITIALIZE_DEVICE(template, 10)
+CUML_FIL_INITIALIZE_DEVICE(template, 11)
+
+}  // namespace device_initialization
+
+namespace inference {
+
+CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 0)
+CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 1)
+CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 2)
+CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 3)
+CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 4)
+CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 5)
+CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 6)
+CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 7)
+CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 8)
+CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 9)
+CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 10)
+CUML_FIL_INFER_ALL(template, raft_proto::device_type::cpu, 11)
+
+}  // namespace inference
+}  // namespace detail
+}  // namespace fil
+}  // namespace ML
+
 namespace cuml4r {
 namespace fil {
+namespace {
 
-__host__ forest_uptr make_forest(raft::handle_t const& handle,
-                                 ML::fil::forest* const forest) {
-  return forest_uptr(forest, [&handle](auto* const f) {
-    if (f != nullptr) {
-      ML::fil::free(handle, f);
-    }
-  });
+__host__ int current_device() {
+  int device = 0;
+  CUDA_RT_CALL(cudaGetDevice(&device));
+  return device;
 }
 
-__host__ forest_uptr make_forest(raft::handle_t const& handle,
-                                 std::function<ML::fil::forest*()> src) {
-  return make_forest(handle, src());
+}  // namespace
+
+__host__ ML::fil::tree_layout tree_layout_from_storage_type(
+  int const storage_type) {
+  // Only storage type 1 selects the breadth-first layout; every other value
+  // (including 2) yields depth-first.
+  // NOTE(review): the meaning of the integer codes is defined by the caller --
+  // confirm them against the R wrapper.
+  if (storage_type == 1) {
+    return ML::fil::tree_layout::breadth_first;
+  }
+  return ML::fil::tree_layout::depth_first;
+}
+
+__host__ forest_uptr import_from_treelite(
+  raft::handle_t const& handle, TreeliteHandle const& tl_handle,
+  ML::fil::tree_layout const layout) {
+  return std::make_unique<ML::fil::forest_model>(
+    ML::fil::import_from_treelite_handle(
+      /*tl_handle=*/tl_handle.handle(), /*layout=*/layout,
+      /*align_bytes=*/128,
+      /*use_double_precision=*/false,
+      /*dev_type=*/raft_proto::device_type::gpu,
+      /*device=*/current_device(),
+      /*stream=*/handle.get_stream()));
+}
+
+__host__ void predict(raft::handle_t const& handle,
+                      ML::fil::forest_model& forest, float* const output,
+                      float* const input, std::size_t const num_rows,
+                      ML::fil::infer_kind const infer_kind,
+                      std::optional<ML::fil::index_type> const chunk_size) {
+  raft_proto::handle_t fil_handle(handle);
+  forest.predict(fil_handle, output, input, num_rows,
+                 raft_proto::device_type::gpu, raft_proto::device_type::gpu,
+                 infer_kind, chunk_size);
+  fil_handle.synchronize();
 }
 
 }  // namespace fil
 }  // namespace cuml4r
+
+#endif
diff --git a/src/fil_utils.h b/src/fil_utils.h
index a5702d0..828cdea 100644
--- a/src/fil_utils.h
+++ b/src/fil_utils.h
@@ -1,28 +1,36 @@
 #pragma once
 
-#include <cuml/fil/fil.h>
+#ifndef CUML4R_TREELITE_C_API_MISSING
 
-#include <functional>
+#include "treelite_utils.cuh"
+
+#include <cuml/fil/detail/raft_proto/device_type.hpp>
+#include <cuml/fil/forest_model.hpp>
+#include <cuml/fil/infer_kind.hpp>
+#include <cuml/fil/tree_layout.hpp>
+
+#include <cstddef>
 #include <memory>
+#include <optional>
 
 namespace cuml4r {
 namespace fil {
 
-using forest_uptr =
-  std::unique_ptr<ML::fil::forest, std::function<void(ML::fil::forest* const)>>;
+using forest_uptr = std::unique_ptr<ML::fil::forest_model>;
 
-/*
- * RAII wrapper for a `ML::fil::forest` pointer (a.k.a `ML::fil::forest_t`)
- *
- * NOTE: the resulting RAII wrapper does *not* take ownship of `handle`, and
- * assumes `handle` will be destroyed *after* the FIL forest object itself is
- * destroyed.
- */
-forest_uptr make_forest(raft::handle_t const& handle,
-                        ML::fil::forest* const forest);
+ML::fil::tree_layout tree_layout_from_storage_type(int storage_type);
 
-forest_uptr make_forest(raft::handle_t const& handle,
-                        std::function<ML::fil::forest*()> src);
+forest_uptr import_from_treelite(
+  raft::handle_t const& handle, TreeliteHandle const& tl_handle,
+  ML::fil::tree_layout layout = ML::fil::tree_layout::depth_first);
+
+void predict(raft::handle_t const& handle, ML::fil::forest_model& forest,
+             float* output, float* input, std::size_t num_rows,
+             ML::fil::infer_kind infer_kind = ML::fil::infer_kind::default_kind,
+             std::optional<ML::fil::index_type> chunk_size =
+               std::optional<ML::fil::index_type>{4});
 
 }  // namespace fil
 }  // namespace cuml4r
+
+#endif
diff --git a/src/handle_utils.cu b/src/handle_utils.cu
index 9c61b7b..ada5ae0 100644
--- a/src/handle_utils.cu
+++ b/src/handle_utils.cu
@@ -3,6 +3,13 @@
 
 #ifdef HAS_CUML
 
+#include <cuml/version_config.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/cuda_stream_pool.hpp>
+#include <rmm/cuda_stream_pool.hpp>
+
+#include <memory>
+
 namespace cuml4r {
 namespace handle_utils {
 
@@ -11,7 +18,13 @@ __host__ void initializeHandle(raft::handle_t& handle,
   if (stream_view.value() == 0) {
     stream_view = stream_allocator::getOrCreateStream();
   }
+#if CUML_VERSION_MAJOR >= 24
+  raft::resource::set_cuda_stream(handle, stream_view);
+  raft::resource::set_cuda_stream_pool(
+    handle, std::make_shared<rmm::cuda_stream_pool>(8));
+#else
   handle.set_stream(stream_view.value());
+#endif
 }
 
 }  // namespace handle_utils
diff --git a/src/handle_utils.h b/src/handle_utils.h
index f00d622..bba115a 100644
--- a/src/handle_utils.h
+++ b/src/handle_utils.h
@@ -2,7 +2,7 @@
 
 #ifdef HAS_CUML
 
-#include <raft/handle.hpp>
+#include <raft/core/handle.hpp>
 #include <rmm/cuda_stream_view.hpp>
 
 namespace cuml4r {
diff --git a/src/kmeans.cu b/src/kmeans.cu
index a3357b7..ad0c0b7 100644
--- a/src/kmeans.cu
+++ b/src/kmeans.cu
@@ -6,9 +6,9 @@
 #include "preprocessor.h"
 #include "stream_allocator.h"
 
-#include <thrust/async/copy.h>
-#include <thrust/device_vector.h>
 #include <cuml/cluster/kmeans.hpp>
+#include <cuml/version_config.hpp>
+#include <thrust/device_vector.h>
 
 #include <Rcpp.h>
 
@@ -35,8 +35,15 @@ __host__ Rcpp::List kmeans(Rcpp::NumericMatrix const& x, int const k,
     params.inertia_check = true;
   }
   params.init = static_cast<ML::kmeans::KMeansParams::InitMethod>(init_method);
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  params.rng_state = raft::random::RngState(
+    seed, raft::random::GeneratorType::GenPhilox);
+  params.verbosity = static_cast<rapids_logger::level_enum>(verbosity);
+#else
   params.seed = seed;
   params.verbosity = verbosity;
+#endif
 
   auto stream_view = stream_allocator::getOrCreateStream();
   raft::handle_t handle;
@@ -53,7 +60,7 @@ __host__ Rcpp::List kmeans(Rcpp::NumericMatrix const& x, int const k,
 
   // kmeans outputs
   thrust::device_vector<double> d_pred_centroids(n_centroid_values);
-  AsyncCopyCtx centroids_h2d;
+  CUML4R_MAYBE_UNUSED AsyncCopyCtx centroids_h2d;
   if (params.init == ML::kmeans::KMeansParams::InitMethod::Array) {
     auto const m_centroids = Matrix<>(centroids, /*transpose=*/false);
     auto const& h_centroids = m_centroids.values;
@@ -64,9 +71,20 @@ __host__ Rcpp::List kmeans(Rcpp::NumericMatrix const& x, int const k,
 
   double inertia = 0;
   int n_iter = 0;
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  ML::kmeans::fit(handle, params, d_src_data.data().get(), n_samples,
+                  n_features, /*sample_weight=*/nullptr,
+                  d_pred_centroids.data().get(), inertia, n_iter);
+  ML::kmeans::predict(handle, params, d_pred_centroids.data().get(),
+                      d_src_data.data().get(), n_samples, n_features,
+                      /*sample_weight=*/nullptr, /*normalize_weights=*/false,
+                      d_pred_labels.data().get(), inertia);
+#else
   ML::kmeans::fit_predict(handle, params, d_src_data.data().get(), n_samples,
                           n_features, 0, d_pred_centroids.data().get(),
                           d_pred_labels.data().get(), inertia, n_iter);
+#endif
 
   CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
 
diff --git a/src/knn.cu b/src/knn.cu
index 13894d8..9af6c0f 100644
--- a/src/knn.cu
+++ b/src/knn.cu
@@ -8,7 +8,6 @@
 #include "random_forest.cuh"
 #include "stream_allocator.h"
 
-#include <thrust/async/copy.h>
 #include <thrust/device_vector.h>
 #include <cuml/neighbors/knn.hpp>
 #include <cuml/version_config.hpp>
@@ -21,7 +20,16 @@
 #include <unordered_map>
 #include <vector>
 
-#if CUML_VERSION_MAJOR == 21
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+
+using knnIndex = ML::knnIndex;
+using knnIndexParam = ML::knnIndexParam;
+using IVFFlatParam = ML::IVFFlatParam;
+using IVFPQParam = ML::IVFPQParam;
+using knnDistanceType = ML::distance::DistanceType;
+
+#elif CUML_VERSION_MAJOR == 21
 #if CUML4R_CONCAT(0x, CUML_VERSION_MINOR) >= 0x08
 
 #include <raft/spatial/knn/ann_common.h>
@@ -32,6 +40,7 @@ using QuantizerType = raft::spatial::knn::QuantizerType;
 using IVFFlatParam = raft::spatial::knn::IVFFlatParam;
 using IVFPQParam = raft::spatial::knn::IVFPQParam;
 using IVFSQParam = raft::spatial::knn::IVFSQParam;
+using knnDistanceType = raft::distance::DistanceType;
 
 #else
 
@@ -41,6 +50,7 @@ using QuantizerType = ML::QuantizerType;
 using IVFFlatParam = ML::IVFFlatParam;
 using IVFPQParam = ML::IVFPQParam;
 using IVFSQParam = ML::IVFSQParam;
+using knnDistanceType = raft::distance::DistanceType;
 
 #endif
 #endif
@@ -66,6 +76,8 @@ constexpr auto kMetric = "metric";
 constexpr auto kNumSamples = "n_samples";
 constexpr auto kNumDims = "n_dims";
 
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) < \
+     CUML4R_LIBCUML_VERSION(24, 0))
 std::unordered_map<std::string, QuantizerType> const kQuantizerTypes{
   {"QT_8bit", QuantizerType::QT_8bit},
   {"QT_4bit", QuantizerType::QT_4bit},
@@ -74,6 +86,7 @@ std::unordered_map<std::string, QuantizerType> const kQuantizerTypes{
   {"QT_fp16", QuantizerType::QT_fp16},
   {"QT_8bit_direct", QuantizerType::QT_8bit_direct},
   {"QT_6bit", QuantizerType::QT_6bit}};
+#endif
 
 // Additional info for setting KNN params
 struct ParamsDetails {
@@ -105,8 +118,7 @@ class PredictionCtx {
       nFeatures_(x.ncol()),
       modelKnnIndex_(Rcpp::XPtr<knnIndex>(static_cast<SEXP>(model[kIndex]))),
       modelAlgoType_(static_cast<knn::Algo>(Rcpp::as<int>(model[kAlgo]))),
-      modelDistType_(static_cast<raft::distance::DistanceType>(
-        Rcpp::as<int>(model[kMetric]))),
+      modelDistType_(static_cast<knnDistanceType>(Rcpp::as<int>(model[kMetric]))),
       modelP_(Rcpp::as<float>(model[kP])),
       modelNSamples_(Rcpp::as<int>(model[kNumSamples])),
       modelNDims_(Rcpp::as<int>(model[kNumDims])),
@@ -167,7 +179,7 @@ class PredictionCtx {
   // attributes from the KNN model object
   Rcpp::XPtr<knnIndex> const modelKnnIndex_;
   Algo const modelAlgoType_;
-  raft::distance::DistanceType const modelDistType_;
+  knnDistanceType const modelDistType_;
   float const modelP_;
   int const modelNSamples_;
   int const modelNDims_;
@@ -235,6 +247,18 @@ __host__ std::unique_ptr<knnIndexParam> build_ivfpq_algo_params(
     params[kNumLists] = 8;
     params[kNumProbes] = 3;
 
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+    for (auto iter = kAllowedSubDimSize.crbegin();
+         iter != kAllowedSubDimSize.crend(); ++iter) {
+      auto const pq_dim = *iter;
+      if (pq_dim <= d && d % pq_dim == 0) {
+        params[kUseComputedTables] = false;
+        params[kM] = pq_dim;
+        break;
+      }
+    }
+#else
     for (auto const n_subq : kAllowedSubquantizers) {
       if (d % n_subq == 0 &&
           std::find(kAllowedSubDimSize.cbegin(), kAllowedSubDimSize.cend(),
@@ -244,6 +268,7 @@ __host__ std::unique_ptr<knnIndexParam> build_ivfpq_algo_params(
         break;
       }
     }
+#endif
 
     if (!params.containsElementNamed(kM)) {
       for (auto const n_subq : kAllowedSubquantizers) {
@@ -256,9 +281,10 @@ __host__ std::unique_ptr<knnIndexParam> build_ivfpq_algo_params(
     }
 
     params[kNumBits] = 4;
-    for (auto const n_bits : {8, 6, 5}) {
+    for (auto const n_bits : {8, 6, 5, 4}) {
       auto const min_train_points = (1 << n_bits) * 39;
-      if (n >= min_train_points) {
+      if (n >= min_train_points &&
+          ((n_bits * Rcpp::as<int>(params[kM])) % 8) == 0) {
         params[kNumBits] = n_bits;
         break;
       }
@@ -278,6 +304,11 @@ __host__ std::unique_ptr<knnIndexParam> build_ivfpq_algo_params(
 
 __host__ std::unique_ptr<knnIndexParam> build_ivfsq_algo_params(
   Rcpp::List params, bool const automated) {
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  Rcpp::stop("IVFSQ KNN is unsupported by this cuML version");
+  return nullptr;
+#else
   if (automated) {
     params[kNumLists] = 8;
     params[kNumProbes] = 2;
@@ -299,6 +330,7 @@ __host__ std::unique_ptr<knnIndexParam> build_ivfsq_algo_params(
   algo_params->encodeResidual = Rcpp::as<bool>(params[kEncodeResidual]);
 
   return algo_params;
+#endif
 }
 
 __host__ std::unique_ptr<knnIndexParam> build_algo_params(
@@ -324,7 +356,7 @@ __host__ std::unique_ptr<knnIndexParam> build_algo_params(
 __host__ std::unique_ptr<knnIndex> build_knn_index(
   raft::handle_t& handle, float* const d_input, int const n_samples,
   int const n_features, Algo const algo_type,
-  raft::distance::DistanceType const dist_type, float const p,
+  knnDistanceType const dist_type, float const p,
   Rcpp::List const& algo_params) {
   std::unique_ptr<knnIndex> knn_index(nullptr);
 
@@ -360,7 +392,7 @@ __host__ Rcpp::List knn_fit(Rcpp::NumericMatrix const& x, int const algo,
                             int const metric, float const p,
                             Rcpp::List const& algo_params) {
   auto const algo_type = static_cast<knn::Algo>(algo);
-  auto const dist_type = static_cast<raft::distance::DistanceType>(metric);
+  auto const dist_type = static_cast<knnDistanceType>(metric);
   auto const input_m = Matrix<float>(x, /*transpose=*/false);
   int const n_samples = input_m.numRows;
   int const n_features = input_m.numCols;
diff --git a/src/lm.cu b/src/lm.cu
index de8b0b8..f6c3e93 100644
--- a/src/lm.cu
+++ b/src/lm.cu
@@ -8,7 +8,6 @@
 #include "preprocessor.h"
 #include "stream_allocator.h"
 
-#include <thrust/async/copy.h>
 #include <thrust/device_vector.h>
 
 #include <Rcpp.h>
diff --git a/src/lm_predict.cu b/src/lm_predict.cu
index d8f5531..1a592c4 100644
--- a/src/lm_predict.cu
+++ b/src/lm_predict.cu
@@ -6,7 +6,6 @@
 #include "preprocessor.h"
 #include "stream_allocator.h"
 
-#include <thrust/async/copy.h>
 #include <thrust/device_vector.h>
 #include <cuml/linear_model/glm.hpp>
 
diff --git a/src/ols_fit_impl.cu b/src/ols_fit_impl.cu
index d507b99..96b2b0c 100644
--- a/src/ols_fit_impl.cu
+++ b/src/ols_fit_impl.cu
@@ -1,6 +1,8 @@
 #include "lm_params.h"
+#include "preprocessor.h"
 
 #include <cuml/linear_model/glm.hpp>
+#include <cuml/version_config.hpp>
 
 namespace cuml4r {
 namespace detail {
@@ -14,7 +16,12 @@ __host__ void ols_fit_impl(raft::handle_t& handle, lm::Params const& params,
                   /*coef=*/params.d_coef,
                   /*intercept=*/params.intercept,
                   /*fit_intercept=*/params.fit_intercept,
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) < \
+     CUML4R_LIBCUML_VERSION(24, 0))
                   /*normalize=*/params.normalize_input, algo);
+#else
+                  algo);
+#endif
 }
 
 }  // namespace detail
diff --git a/src/pca.cu b/src/pca.cu
index 3901667..592a674 100644
--- a/src/pca.cu
+++ b/src/pca.cu
@@ -6,9 +6,9 @@
 #include "preprocessor.h"
 #include "stream_allocator.h"
 
-#include <thrust/async/copy.h>
 #include <thrust/device_vector.h>
 #include <cuml/decomposition/pca.hpp>
+#include <cuml/version_config.hpp>
 
 #include <Rcpp.h>
 
@@ -123,7 +123,13 @@ __host__ Rcpp::List pca_fit_transform(Rcpp::NumericMatrix const& x,
       /*singular_vals=*/d_singular_vals.data().get(),
       /*mu=*/d_mu.data().get(),
       /*noise_vars=*/d_noise_vars.data().get(),
-      /*prms=*/*params);
+      /*prms=*/*params
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+      ,
+      /*flip_signs_based_on_U=*/true
+#endif
+    );
   } else {
     ML::pcaFit(handle,
                /*input=*/d_input.data().get(),
@@ -133,7 +139,13 @@ __host__ Rcpp::List pca_fit_transform(Rcpp::NumericMatrix const& x,
                /*singular_vals=*/d_singular_vals.data().get(),
                /*mu=*/d_mu.data().get(),
                /*noise_vars=*/d_noise_vars.data().get(),
-               /*prms=*/*params);
+               /*prms=*/*params
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+               ,
+               /*flip_signs_based_on_U=*/true
+#endif
+    );
   }
 
   CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
@@ -149,7 +161,7 @@ __host__ Rcpp::List pca_fit_transform(Rcpp::NumericMatrix const& x,
   pinned_host_vector<double> h_mu(n_cols);
   pinned_host_vector<double> h_noise_vars(1);
 
-  AsyncCopyCtx transformed_data_d2h;
+  CUML4R_MAYBE_UNUSED AsyncCopyCtx transformed_data_d2h;
   if (transform_input) {
     transformed_data_d2h =
       async_copy(stream_view.value(), d_transformed_data.cbegin(),
diff --git a/src/pinned_host_vector.h b/src/pinned_host_vector.h
index a0d6359..772787c 100644
--- a/src/pinned_host_vector.h
+++ b/src/pinned_host_vector.h
@@ -2,39 +2,15 @@
 
 #ifdef HAS_CUML
 
-#include <thrust/host_vector.h>
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
-
-#include <Rcpp.h>
+#include <vector>
 
 namespace cuml4r {
 
 template <typename T>
-using pinned_host_vector =
-  thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>;
+using pinned_host_vector = std::vector<T>;
 
 }  // namespace cuml4r
 
-namespace Rcpp {
-namespace traits {
-
-template <template <class> class Container, typename T>
-struct pinned_container_exporter {
-  using type = RangeExporter<Container<T>>;
-};
-
-// enable range exporter for pinned_host_vector
-template <typename T>
-class Exporter<cuml4r::pinned_host_vector<T>>
-  : public pinned_container_exporter<cuml4r::pinned_host_vector, T>::type {
- public:
-  Exporter(SEXP x)
-    : pinned_container_exporter<cuml4r::pinned_host_vector, T>::type(x) {}
-};
-
-}  // namespace traits
-}  // namespace Rcpp
-
 #else
 
 #include "warn_cuml_missing.h"
diff --git a/src/preprocessor.h b/src/preprocessor.h
index 4f80554..699678e 100644
--- a/src/preprocessor.h
+++ b/src/preprocessor.h
@@ -10,6 +10,14 @@
 #define CUML4R_NODISCARD
 #endif
 
+#if __has_cpp_attribute(maybe_unused)
+#define CUML4R_MAYBE_UNUSED [[maybe_unused]]
+#elif defined(__GNUC__)
+#define CUML4R_MAYBE_UNUSED __attribute__((unused))
+#else
+#define CUML4R_MAYBE_UNUSED
+#endif
+
 // NOTE: the idea for the following is borrowed from
 // https://github.com/facebook/folly/blob/7a18d1823185495cae6676258ee64afd7e36c84c/folly/Preprocessor.h#L88-L105
 #define CUML4R_CONCAT_IMPL(a, b) a##b
diff --git a/src/qn.cu b/src/qn.cu
index 88d883a..f4eb337 100644
--- a/src/qn.cu
+++ b/src/qn.cu
@@ -7,9 +7,9 @@
 #include "qn_constants.h"
 #include "stream_allocator.h"
 
-#include <thrust/async/copy.h>
 #include <thrust/device_vector.h>
 #include <cuml/linear_model/glm.hpp>
+#include <cuml/version_config.hpp>
 
 #include <Rcpp.h>
 
@@ -45,7 +45,7 @@ __host__ Rcpp::List qn_fit(Rcpp::NumericMatrix const& X,
     async_copy(stream_view.value(), h_y.cbegin(), h_y.cend(), d_y.begin());
 
   thrust::device_vector<double> d_sample_weight;
-  AsyncCopyCtx sample_weight_h2d;
+  CUML4R_MAYBE_UNUSED AsyncCopyCtx sample_weight_h2d;
   if (sample_weight.size() > 0) {
     d_sample_weight.resize(sample_weight.size());
     auto h_sample_weight(Rcpp::as<pinned_host_vector<double>>(sample_weight));
@@ -60,6 +60,30 @@ __host__ Rcpp::List qn_fit(Rcpp::NumericMatrix const& X,
   double objective = std::numeric_limits<double>::infinity();
   int n_iters = 0;
 
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  ML::GLM::qn_params params;
+  params.loss = static_cast<ML::GLM::qn_loss_type>(loss_type);
+  params.penalty_l1 = l1;
+  params.penalty_l2 = l2;
+  params.grad_tol = tol;
+  params.change_tol = delta;
+  params.max_iter = max_iters;
+  params.linesearch_max_iter = linesearch_max_iters;
+  params.lbfgs_memory = lbfgs_memory;
+  params.verbose = 0;
+  params.fit_intercept = fit_intercept;
+
+  ML::GLM::qnFit(
+    /*cuml_handle=*/*handle, params, /*X=*/d_X.data().get(),
+    /*X_col_major=*/true,
+    /*y=*/d_y.data().get(), /*N=*/static_cast<int>(n_samples),
+    /*D=*/static_cast<int>(n_features), /*C=*/n_classes,
+    /*w0=*/d_coefs.data().get(),
+    /*f=*/&objective, /*num_iters=*/&n_iters,
+    /*sample_weight=*/d_sample_weight.empty() ? nullptr
+                                              : d_sample_weight.data().get());
+#else
   ML::GLM::qnFit(
     /*handle=*/*handle, /*X=*/d_X.data().get(), /*X_col_major=*/true,
     /*y=*/d_y.data().get(), /*N=*/n_samples,
@@ -70,6 +94,7 @@ __host__ Rcpp::List qn_fit(Rcpp::NumericMatrix const& X,
     /*f=*/&objective, /*num_iters=*/&n_iters, loss_type,
     /*sample_weight=*/d_sample_weight.empty() ? nullptr
                                               : d_sample_weight.data().get());
+#endif
 
   CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
 
@@ -118,6 +143,22 @@ Rcpp::NumericVector qn_predict(Rcpp::NumericMatrix const& X,
   // QN output
   thrust::device_vector<double> d_preds(n_samples);
 
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  ML::GLM::qn_params params;
+  params.loss = static_cast<ML::GLM::qn_loss_type>(loss_type);
+  params.fit_intercept = fit_intercept;
+
+  ML::GLM::qnPredict(
+    /*cuml_handle=*/*handle, params,
+    /*X=*/d_X.data().get(),
+    /*X_col_major=*/true,
+    /*N=*/static_cast<int>(n_samples),
+    /*D=*/static_cast<int>(n_features),
+    /*C=*/n_classes,
+    /*coefs=*/d_coefs.data().get(),
+    /*preds=*/d_preds.data().get());
+#else
   ML::GLM::qnPredict(
     /*cuml_handle=*/*handle,
     /*X=*/d_X.data().get(),
@@ -127,6 +168,7 @@ Rcpp::NumericVector qn_predict(Rcpp::NumericMatrix const& X,
     /*C=*/n_classes, fit_intercept,
     /*params=*/d_coefs.data().get(), loss_type,
     /*preds=*/d_preds.data().get());
+#endif
 
   CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
 
diff --git a/src/random_forest_classifier.cu b/src/random_forest_classifier.cu
index 9c277c0..cc9534e 100644
--- a/src/random_forest_classifier.cu
+++ b/src/random_forest_classifier.cu
@@ -9,14 +9,16 @@
 #include "random_forest_serde.cuh"
 #include "stream_allocator.h"
 
-#include <cuml/fil/fil.h>
-#include <thrust/async/copy.h>
 #include <thrust/device_vector.h>
 #include <cuml/tree/decisiontree.hpp>
+#ifndef CUML4R_TREELITE_C_API_MISSING
+#include <cuml/fil/postproc_ops.hpp>
+#endif
 #include <cuml/version_config.hpp>
 
 #include <Rcpp.h>
 
+#include <algorithm>
 #include <functional>
 #include <memory>
 #include <unordered_map>
@@ -25,9 +27,9 @@
 namespace cuml4r {
 namespace {
 
-constexpr auto kRfClassiferNumFeatures = "n_features";
-constexpr auto kRfClassifierForest = "forest";
-constexpr auto kRfClassifierInvLabelsMap = "inv_labels_map";
+CUML4R_MAYBE_UNUSED constexpr auto kRfClassiferNumFeatures = "n_features";
+CUML4R_MAYBE_UNUSED constexpr auto kRfClassifierForest = "forest";
+CUML4R_MAYBE_UNUSED constexpr auto kRfClassifierInvLabelsMap = "inv_labels_map";
 
 using RandomForestClassifierUPtr =
   std::unique_ptr<ML::RandomForestClassifierD,
@@ -71,7 +73,7 @@ class RandomForestClassifier {
       auto const& treelite_handle = getTreeliteHandle();
 
       state[kRfClassifierForest] = detail::getState(
-        *reinterpret_cast<treelite::Model const*>(*treelite_handle.get()));
+        *static_cast<treelite::Model const*>(treelite_handle.handle()));
     }
     {
       Rcpp::List inv_labels_map;
@@ -198,17 +200,6 @@ __host__ Rcpp::IntegerVector rf_classifier_predict(
   return Rcpp::IntegerVector(h_predictions.begin(), h_predictions.end());
 }
 
-/*
- * The 'ML::fil::treelite_params_t::threads_per_tree' and
- * 'ML::fil::treelite_params_t::n_items' parameters are only supported in
- * RAPIDS cuML 21.08 or above.
- */
-CUML4R_ASSIGN_IF_PRESENT(threads_per_tree)
-CUML4R_NOOP_IF_ABSENT(threads_per_tree)
-
-CUML4R_ASSIGN_IF_PRESENT(n_items)
-CUML4R_NOOP_IF_ABSENT(n_items)
-
 }  // namespace
 
 __host__ SEXP rf_classifier_fit(
@@ -226,7 +217,7 @@ __host__ SEXP rf_classifier_fit(
   auto rf = RandomForestClassifierUPtr(new ML::RandomForestClassifierD);
 
   auto stream_view = stream_allocator::getOrCreateStream();
-  raft::handle_t handle(n_streams);
+  raft::handle_t handle;
   handle_utils::initializeHandle(handle, stream_view.value());
 
   // rf input data & labels
@@ -246,7 +237,8 @@ __host__ SEXP rf_classifier_fit(
     ML::fit(handle, rf_ptr, d_input.data().get(), n_samples, n_features,
             d_labels.data().get(),
             /*n_unique_labels=*/static_cast<int>(labels_map.size()),
-#if CUML4R_CONCAT(0x, CUML_VERSION_MINOR) >= 0x08
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(21, 8))
 
             ML::set_rf_params(
               max_depth, max_leaves, max_features, n_bins, min_samples_leaf,
@@ -270,7 +262,14 @@ __host__ SEXP rf_classifier_fit(
               /*use_experimental_backend=*/false, max_batch_size),
 
 #endif
-            /*verbosity=*/verbosity);
+            /*verbosity=*/
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+            static_cast<rapids_logger::level_enum>(verbosity)
+#else
+            verbosity
+#endif
+    );
 
     CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
     if (rf_ptr != rf.get()) {
@@ -299,47 +298,61 @@ __host__ Rcpp::IntegerVector rf_classifier_predict(
         raft::handle_t const& handle, double* const d_input,
         int* const d_preds) {
         ML::predict(handle, /*forest=*/rf, d_input, n_samples, n_features,
-                    /*predictions=*/d_preds, verbosity);
+                    /*predictions=*/d_preds,
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+                    static_cast<rapids_logger::level_enum>(verbosity)
+#else
+                    verbosity
+#endif
+        );
       });
-  } else {
-    return rf_classifier_predict<float, float>(
-      model, input,
-      /*predict_impl=*/
-      [&model, n_samples, n_features](raft::handle_t const& handle,
-                                      float* const d_input,
-                                      float* const d_preds) {
+  }
+
 #ifndef CUML4R_TREELITE_C_API_MISSING
-        auto const& tl_handle = model->getTreeliteHandle();
-
-        ML::fil::treelite_params_t params;
-        params.algo = ML::fil::algo_t::ALGO_AUTO;
-        params.output_class = true;
-        params.storage_type = ML::fil::storage_type_t::AUTO;
-        params.blocks_per_sm = 0;
-        set_threads_per_tree(params, 1);
-        set_n_items(params, 0);
-        params.pforest_shape_str = nullptr;
-        auto forest =
-          fil::make_forest(handle,
-                           /*src=*/[&] {
-                             ML::fil::forest* f;
-                             ML::fil::from_treelite(handle, /*pforest=*/&f,
-                                                    /*model=*/*tl_handle.get(),
-                                                    /*tl_params=*/&params);
-                             return f;
-                           });
-        ML::fil::predict(/*h=*/handle, /*f=*/forest.get(), /*preds=*/d_preds,
-                         /*data=*/d_input, /*num_rows=*/n_samples,
-                         /*predict_proba=*/false);
+  auto const input_m = Matrix<float>(input, /*transpose=*/false);
+  auto stream_view = stream_allocator::getOrCreateStream();
+  raft::handle_t handle;
+  handle_utils::initializeHandle(handle, stream_view.value());
 
+  auto const& h_input = input_m.values;
+  thrust::device_vector<float> d_input(h_input.size());
+  auto CUML4R_ANONYMOUS_VARIABLE(input_h2d) = async_copy(
+    stream_view.value(), h_input.cbegin(), h_input.cend(), d_input.begin());
+
+  auto forest = fil::import_from_treelite(handle, model->getTreeliteHandle());
+  auto const n_outputs = static_cast<size_t>(forest->num_outputs());
+  thrust::device_vector<float> d_raw_predictions(n_samples * n_outputs);
+  fil::predict(handle, *forest, d_raw_predictions.data().get(),
+               d_input.data().get(), n_samples);
+
+  pinned_host_vector<float> h_raw_predictions(d_raw_predictions.size());
+  auto CUML4R_ANONYMOUS_VARIABLE(preds_d2h) =
+    async_copy(stream_view.value(), d_raw_predictions.cbegin(),
+               d_raw_predictions.cend(), h_raw_predictions.begin());
+
+  CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
+
+  // Loop-invariant: take a row's first value as-is, or the row's arg-max.
+  bool const take_first =
+    forest->row_postprocessing() == ML::fil::row_op::max_index ||
+    n_outputs == 1;
+  pinned_host_vector<float> h_predictions(n_samples);
+  for (int i = 0; i < n_samples; ++i) {
+    auto const row = h_raw_predictions.begin() + i * n_outputs;
+    h_predictions[i] = take_first ? row[0] : static_cast<float>(std::distance(
+      row, std::max_element(row, row + n_outputs)));
+  }
+
+  postprocess_labels(h_predictions, model->invLabelsMap_);
+
+  return Rcpp::IntegerVector(h_predictions.begin(), h_predictions.end());
 #else
-        Rcpp::stop(
-          "Treelite C API is required for predictions using unserialized "
-          "rand_forest model!");
+  Rcpp::stop(
+    "Treelite C API is required for predictions using unserialized "
+    "rand_forest model!");
 
 #endif
-      });
-  }
 }
 
 __host__ Rcpp::NumericMatrix rf_classifier_predict_class_probabilities(
@@ -358,23 +371,9 @@ __host__ Rcpp::NumericMatrix rf_classifier_predict_class_probabilities(
   raft::handle_t handle;
   handle_utils::initializeHandle(handle, stream_view.value());
 
-  ML::fil::treelite_params_t params;
-  params.algo = ML::fil::algo_t::ALGO_AUTO;
-  // output class probabilities instead of classes
-  params.output_class = false;
-  params.storage_type = ML::fil::storage_type_t::AUTO;
-  params.blocks_per_sm = 0;
-  set_threads_per_tree(params, 1);
-  set_n_items(params, 0);
-  params.pforest_shape_str = nullptr;
-  auto forest = fil::make_forest(
-    handle,
-    /*src=*/[&] {
-      ML::fil::forest* f;
-      ML::fil::from_treelite(handle, /*pforest=*/&f,
-                             /*model=*/*tl_handle.get(), /*tl_params=*/&params);
-      return f;
-    });
+  auto forest = fil::import_from_treelite(handle, tl_handle);
+  forest->set_row_postprocessing(ML::fil::row_op::disable);
+  auto const n_outputs = static_cast<size_t>(forest->num_outputs());
 
   // FIL input
   auto const& h_x = input_m.values;
@@ -383,12 +382,10 @@ __host__ Rcpp::NumericMatrix rf_classifier_predict_class_probabilities(
     async_copy(handle.get_stream(), h_x.cbegin(), h_x.cend(), d_x.begin());
 
   // FIL output
-  thrust::device_vector<float> d_preds(n_classes * n_samples);
+  thrust::device_vector<float> d_preds(n_outputs * n_samples);
 
-  ML::fil::predict(/*h=*/handle, /*f=*/forest.get(),
-                   /*preds=*/d_preds.data().get(),
-                   /*data=*/d_x.data().get(), /*num_rows=*/n_samples,
-                   /*predict_proba=*/true);
+  fil::predict(handle, *forest, d_preds.data().get(), d_x.data().get(),
+               n_samples);
 
   CUDA_RT_CALL(cudaStreamSynchronize(handle.get_stream()));
 
@@ -398,8 +395,20 @@ __host__ Rcpp::NumericMatrix rf_classifier_predict_class_probabilities(
 
   CUDA_RT_CALL(cudaStreamSynchronize(handle.get_stream()));
 
-  return Rcpp::transpose(
-    Rcpp::NumericMatrix(n_classes, n_samples, h_preds.begin()));
+  if (n_outputs != static_cast<size_t>(n_classes)) {
+    if (n_outputs != 1 || n_classes != 2) {
+      Rcpp::stop("FIL model returned %d outputs, but %d classes were expected.",
+                 static_cast<int>(n_outputs), n_classes);
+    }
+    Rcpp::NumericMatrix out(n_samples, 2);
+    for (int row = 0; row < n_samples; ++row) {
+      out(row, 1) = h_preds[row];
+      out(row, 0) = 1.0 - out(row, 1);
+    }
+    return out;
+  }
+  return Rcpp::transpose(
+    Rcpp::NumericMatrix(n_outputs, n_samples, h_preds.begin()));
 #else
 
   return {};
diff --git a/src/random_forest_regressor.cu b/src/random_forest_regressor.cu
index 1dd9dea..a3689a6 100644
--- a/src/random_forest_regressor.cu
+++ b/src/random_forest_regressor.cu
@@ -9,7 +9,6 @@
 #include "random_forest_serde.cuh"
 #include "stream_allocator.h"
 
-#include <thrust/async/copy.h>
 #include <thrust/device_vector.h>
 #include <cuml/version_config.hpp>
 
@@ -23,8 +22,8 @@ namespace cuml4r {
 
 namespace {
 
-constexpr auto kRfRegressorNumFeatures = "n_features";
-constexpr auto kRfRegressorForest = "forest";
+CUML4R_MAYBE_UNUSED constexpr auto kRfRegressorNumFeatures = "n_features";
+CUML4R_MAYBE_UNUSED constexpr auto kRfRegressorForest = "forest";
 
 using RandomForestRegressorUPtr =
   std::unique_ptr<ML::RandomForestRegressorD,
@@ -54,7 +53,7 @@ class RandomForestRegressor {
 
     auto const& treelite_handle = getTreeliteHandle();
     state[kRfRegressorForest] = detail::getState(
-      *reinterpret_cast<treelite::Model const*>(*treelite_handle.get()));
+      *static_cast<treelite::Model const*>(treelite_handle.handle()));
 
     return state;
   }
@@ -123,17 +122,6 @@ __host__ Rcpp::NumericVector rf_regressor_predict(
   return Rcpp::NumericVector(h_preds.begin(), h_preds.end());
 }
 
-/*
- * The 'ML::fil::treelite_params_t::threads_per_tree' and
- * 'ML::fil::treelite_params_t::n_items' parameters are only supported in
- * RAPIDS cuML 21.08 or above.
- */
-CUML4R_ASSIGN_IF_PRESENT(threads_per_tree)
-CUML4R_NOOP_IF_ABSENT(threads_per_tree)
-
-CUML4R_ASSIGN_IF_PRESENT(n_items)
-CUML4R_NOOP_IF_ABSENT(n_items)
-
 }  // namespace
 
 __host__ SEXP rf_regressor_fit(
@@ -151,7 +139,7 @@ __host__ SEXP rf_regressor_fit(
   auto rf = RandomForestRegressorUPtr(new ML::RandomForestRegressorD);
 
   auto stream_view = stream_allocator::getOrCreateStream();
-  raft::handle_t handle(n_streams);
+  raft::handle_t handle;
   handle_utils::initializeHandle(handle, stream_view.value());
 
   // rf input data & responses
@@ -170,7 +158,8 @@ __host__ SEXP rf_regressor_fit(
     ML::fit(
       handle, rf_ptr, d_input.data().get(), n_samples, n_features,
       /*labels=*/d_responses.data().get(),
-#if CUML4R_CONCAT(0x, CUML_VERSION_MINOR) >= 0x08
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(21, 8))
 
       ML::set_rf_params(max_depth, max_leaves, max_features, n_bins,
                         min_samples_leaf, min_samples_split,
@@ -192,7 +181,14 @@ __host__ SEXP rf_regressor_fit(
         /*use_experimental_backend=*/false, max_batch_size),
 
 #endif
-      /*verbosity=*/verbosity);
+      /*verbosity=*/
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+      static_cast<rapids_logger::level_enum>(verbosity)
+#else
+      verbosity
+#endif
+    );
 
     CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
     if (rf_ptr != rf.get()) {
@@ -219,47 +215,53 @@ __host__ Rcpp::NumericVector rf_regressor_predict(
         raft::handle_t const& handle, double* const d_input,
         double* const d_preds) {
         ML::predict(handle, /*forest=*/rf, d_input, n_samples, n_features,
-                    /*predictions=*/d_preds, verbosity);
+                    /*predictions=*/d_preds,
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+                    static_cast<rapids_logger::level_enum>(verbosity)
+#else
+                    verbosity
+#endif
+        );
       });
-  } else {
-    return rf_regressor_predict<float, float>(
-      input,
-      /*predict_impl=*/
-      [&model, n_samples, n_features](raft::handle_t const& handle,
-                                      float* const d_input,
-                                      float* const d_preds) {
+  }
+
 #ifndef CUML4R_TREELITE_C_API_MISSING
-        auto const& tl_handle = model->getTreeliteHandle();
-
-        ML::fil::treelite_params_t params;
-        params.algo = ML::fil::algo_t::ALGO_AUTO;
-        params.output_class = false;
-        params.storage_type = ML::fil::storage_type_t::AUTO;
-        params.blocks_per_sm = 0;
-        set_threads_per_tree(params, 1);
-        set_n_items(params, 0);
-        params.pforest_shape_str = nullptr;
-        auto forest =
-          fil::make_forest(handle,
-                           /*src=*/[&] {
-                             ML::fil::forest* f;
-                             ML::fil::from_treelite(handle, /*pforest=*/&f,
-                                                    /*model=*/*tl_handle.get(),
-                                                    /*tl_params=*/&params);
-                             return f;
-                           });
-        ML::fil::predict(/*h=*/handle, /*f=*/forest.get(), /*preds=*/d_preds,
-                         /*data=*/d_input, /*num_rows=*/n_samples,
-                         /*predict_proba=*/false);
+  auto const input_m = Matrix<float>(input, /*transpose=*/false);
+  auto stream_view = stream_allocator::getOrCreateStream();
+  raft::handle_t handle;
+  handle_utils::initializeHandle(handle, stream_view.value());
+
+  auto const& h_input = input_m.values;
+  thrust::device_vector<float> d_input(h_input.size());
+  auto CUML4R_ANONYMOUS_VARIABLE(input_h2d) = async_copy(
+    stream_view.value(), h_input.cbegin(), h_input.cend(), d_input.begin());
+
+  auto forest = fil::import_from_treelite(handle, model->getTreeliteHandle());
+  auto const n_outputs = static_cast<size_t>(forest->num_outputs());
+  thrust::device_vector<float> d_raw_predictions(n_samples * n_outputs);
+  fil::predict(handle, *forest, d_raw_predictions.data().get(),
+               d_input.data().get(), n_samples);
+
+  pinned_host_vector<float> h_raw_predictions(d_raw_predictions.size());
+  auto CUML4R_ANONYMOUS_VARIABLE(preds_d2h) =
+    async_copy(stream_view.value(), d_raw_predictions.cbegin(),
+               d_raw_predictions.cend(), h_raw_predictions.begin());
 
+  CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
+
+  pinned_host_vector<float> h_predictions(n_samples);
+  for (int i = 0; i < n_samples; ++i) {
+    h_predictions[i] = h_raw_predictions[i * n_outputs];
+  }
+
+  return Rcpp::NumericVector(h_predictions.begin(), h_predictions.end());
 #else
-        Rcpp::stop(
-          "Treelite C API is required for predictions using unserialized "
-          "rand_forest model!");
+  Rcpp::stop(
+    "Treelite C API is required for predictions using unserialized "
+    "rand_forest model!");
 
 #endif
-      });
-  }
 }
 
 __host__ Rcpp::List rf_regressor_get_state(SEXP model) {
diff --git a/src/random_forest_serde.cu b/src/random_forest_serde.cu
index fc029bc..1c8c221 100644
--- a/src/random_forest_serde.cu
+++ b/src/random_forest_serde.cu
@@ -32,7 +32,8 @@ __host__ Rcpp::List getState(treelite::PyBufferFrame const& buf) {
 __host__ Rcpp::List getState(treelite::Model const& model) {
   Rcpp::List state;
 
-  auto const buffers = const_cast<treelite::Model&>(model).GetPyBuffer();
+  auto const buffers =
+    const_cast<treelite::Model&>(model).SerializeToPyBuffer();
   for (auto const& buffer : buffers) {
     state.push_back(getState(buffer));
   }
@@ -65,7 +66,7 @@ void setState(std::unique_ptr<treelite::Model>& model,
     setState(/*buf=*/frames[i], frames_content, /*state=*/state[i]);
   }
 
-  model = treelite::Model::CreateFromPyBuffer(frames);
+  model = treelite::Model::DeserializeFromPyBuffer(frames);
 }
 
 }  // namespace detail
diff --git a/src/random_projection.cpp b/src/random_projection.cpp
index 3ea7883..fd8cf98 100644
--- a/src/random_projection.cpp
+++ b/src/random_projection.cpp
@@ -3,12 +3,15 @@
 // [[Rcpp::export(".rproj_johnson_lindenstrauss_min_dim")]]
 size_t rproj_johnson_lindenstrauss_min_dim(size_t const n_samples,
                                            double const eps) {
-#ifdef HAS_CUML
+#if defined(HAS_CUML) && !defined(CUML4R_RPROJ_C_API_MISSING)
 
   return cuml4r::rproj_johnson_lindenstrauss_min_dim(n_samples, eps);
 
 #else
 
+#ifdef CUML4R_RPROJ_C_API_MISSING
+  Rcpp::stop("cuML random projection support is not available in this build.");
+#endif
 #include "warn_cuml_missing.h"
 
   // return a dummy value
@@ -22,13 +25,16 @@ SEXP rproj_fit(int const n_samples, int const n_features,
                int const n_components, double const eps,
                bool const gaussian_method, double const density,
                int const random_state) {
-#ifdef HAS_CUML
+#if defined(HAS_CUML) && !defined(CUML4R_RPROJ_C_API_MISSING)
 
   return cuml4r::rproj_fit(n_samples, n_features, n_components, eps,
                            gaussian_method, density, random_state);
 
 #else
 
+#ifdef CUML4R_RPROJ_C_API_MISSING
+  Rcpp::stop("cuML random projection support is not available in this build.");
+#endif
 #include "warn_cuml_missing.h"
 
   return Rcpp::List();
@@ -39,12 +45,15 @@ SEXP rproj_fit(int const n_samples, int const n_features,
 // [[Rcpp::export(".rproj_transform")]]
 Rcpp::NumericMatrix rproj_transform(SEXP rproj_ctx_xptr,
                                     Rcpp::NumericMatrix const& input) {
-#ifdef HAS_CUML
+#if defined(HAS_CUML) && !defined(CUML4R_RPROJ_C_API_MISSING)
 
   return cuml4r::rproj_transform(rproj_ctx_xptr, input);
 
 #else
 
+#ifdef CUML4R_RPROJ_C_API_MISSING
+  Rcpp::stop("cuML random projection support is not available in this build.");
+#endif
 #include "warn_cuml_missing.h"
 
   // dummy values with distinct data points
@@ -55,12 +64,15 @@ Rcpp::NumericMatrix rproj_transform(SEXP rproj_ctx_xptr,
 
 // [[Rcpp::export(".rproj_get_state")]]
 Rcpp::List rproj_get_state(SEXP model) {
-#ifdef HAS_CUML
+#if defined(HAS_CUML) && !defined(CUML4R_RPROJ_C_API_MISSING)
 
   return cuml4r::rproj_get_state(model);
 
 #else
 
+#ifdef CUML4R_RPROJ_C_API_MISSING
+  Rcpp::stop("cuML random projection support is not available in this build.");
+#endif
 #include "warn_cuml_missing.h"
 
   return {};
@@ -70,12 +82,15 @@ Rcpp::List rproj_get_state(SEXP model) {
 
 // [[Rcpp::export(".rproj_set_state")]]
 SEXP rproj_set_state(Rcpp::List const& model_state) {
-#ifdef HAS_CUML
+#if defined(HAS_CUML) && !defined(CUML4R_RPROJ_C_API_MISSING)
 
   return cuml4r::rproj_set_state(model_state);
 
 #else
 
+#ifdef CUML4R_RPROJ_C_API_MISSING
+  Rcpp::stop("cuML random projection support is not available in this build.");
+#endif
 #include "warn_cuml_missing.h"
 
   return R_NilValue;
diff --git a/src/random_projection.cu b/src/random_projection.cu
index 30cf51c..58bd1de 100644
--- a/src/random_projection.cu
+++ b/src/random_projection.cu
@@ -6,8 +6,9 @@
 #include "preprocessor.h"
 #include "stream_allocator.h"
 
+#ifndef CUML4R_RPROJ_C_API_MISSING
+
 #include <cuml/random_projection/rproj_c.h>
-#include <thrust/async/copy.h>
 #include <thrust/device_vector.h>
 #include <cuml/version_config.hpp>
 
@@ -245,3 +246,5 @@ __host__ SEXP rproj_set_state(Rcpp::List const& model_state) {
 }
 
 }  // namespace cuml4r
+
+#endif
diff --git a/src/ridge_fit_impl.cu b/src/ridge_fit_impl.cu
index 8f398fc..9055160 100644
--- a/src/ridge_fit_impl.cu
+++ b/src/ridge_fit_impl.cu
@@ -1,6 +1,8 @@
 #include "lm_params.h"
+#include "preprocessor.h"
 
 #include <cuml/linear_model/glm.hpp>
+#include <cuml/version_config.hpp>
 
 namespace cuml4r {
 namespace detail {
@@ -16,7 +18,12 @@ __host__ void ridge_fit_impl(raft::handle_t& handle, lm::Params const& params,
                     /*coef=*/params.d_coef,
                     /*intercept=*/params.intercept,
                     /*fit_intercept=*/params.fit_intercept,
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) < \
+     CUML4R_LIBCUML_VERSION(24, 0))
                     /*normalize=*/params.normalize_input, algo);
+#else
+                    algo);
+#endif
 }
 
 }  // namespace detail
diff --git a/src/stream_allocator.cu b/src/stream_allocator.cu
index acc79fe..cc49a61 100644
--- a/src/stream_allocator.cu
+++ b/src/stream_allocator.cu
@@ -1,7 +1,6 @@
 #ifdef HAS_CUML
 
 #include "cuda_utils.h"
-#include "device_allocator.h"
 #include "stream_allocator.h"
 
 #include <rmm/cuda_stream.hpp>
@@ -42,7 +41,6 @@ __host__ rmm::cuda_stream_view getOrCreateStream() {
   if (it != cuda_streams_map.end()) {
     return it->second.value();
   }
-  auto const device_allocator = getDeviceAllocator();
   auto stream = rmm::cuda_stream();
   auto stream_view = stream.view();
   cudaStreamsMap().emplace(dev_id, std::move(stream));
diff --git a/src/stubs/treelite/c_api.h b/src/stubs/treelite/c_api.h
index 6deac55..9109ce4 100644
--- a/src/stubs/treelite/c_api.h
+++ b/src/stubs/treelite/c_api.h
@@ -1,6 +1,8 @@
 #pragma once
 
+#ifndef CUML4R_TREELITE_C_API_MISSING
 #define CUML4R_TREELITE_C_API_MISSING
+#endif
 
 // NOTE: disabling this message for now per human feedback from CRAN
 
diff --git a/src/svm_classifier.cu b/src/svm_classifier.cu
index 4aa71aa..98c5fb6 100644
--- a/src/svm_classifier.cu
+++ b/src/svm_classifier.cu
@@ -8,7 +8,6 @@
 #include "svm_serde.h"
 
 #include <cuml/svm/svm_parameter.h>
-#include <thrust/async/copy.h>
 #include <thrust/device_vector.h>
 #include <cuml/svm/svc.hpp>
 
@@ -86,7 +85,7 @@ __host__ SEXP svc_fit(Rcpp::NumericMatrix const& input,
     stream_view.value(), h_labels.cbegin(), h_labels.cend(), d_labels.begin());
 
   thrust::device_vector<double> d_sample_weights;
-  AsyncCopyCtx sample_weights_h2d;
+  CUML4R_MAYBE_UNUSED AsyncCopyCtx sample_weights_h2d;
   if (sample_weights.size() > 0) {
     auto const h_sample_weights(
       Rcpp::as<pinned_host_vector<double>>(sample_weights));
@@ -100,10 +99,18 @@ __host__ SEXP svc_fit(Rcpp::NumericMatrix const& input,
     /*kernel=*/static_cast<MLCommon::Matrix::KernelType>(kernel), degree, gamma,
     coef0};
 
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  auto const verbosity_level =
+    static_cast<rapids_logger::level_enum>(verbosity);
+#else
+  auto const verbosity_level = verbosity;
+#endif
+
   // SVM output
   auto svc = std::make_unique<ML::SVM::SVC<double>>(
     *handle, /*C=*/cost, tol, kernel_params, cache_size, max_iter,
-    nochange_steps, verbosity);
+    nochange_steps, verbosity_level);
 
   svc->fit(d_input.data().get(), /*nrows=*/n_samples, /*ncols=*/n_features,
            d_labels.data().get(),
@@ -122,7 +129,7 @@ __host__ SEXP svc_predict(SEXP model_xptr, Rcpp::NumericMatrix const& input,
 
   auto ctx = Rcpp::XPtr<ModelCtx>(model_xptr);
   auto const& svc = ctx->model_;
-  auto* stream = ctx->handle_->get_stream();
+  cudaStream_t const stream = ctx->handle_->get_stream();
 
   // input
   auto const& h_input = m.values;
diff --git a/src/svm_regressor.cu b/src/svm_regressor.cu
index 314ec75..b3675d2 100644
--- a/src/svm_regressor.cu
+++ b/src/svm_regressor.cu
@@ -8,7 +8,6 @@
 #include "svm_serde.h"
 
 #include <cuml/svm/svm_parameter.h>
-#include <thrust/async/copy.h>
 #include <thrust/device_vector.h>
 #include <cuml/svm/svc.hpp>
 #include <cuml/svm/svr.hpp>
@@ -89,7 +88,7 @@ __host__ SEXP svr_fit(Rcpp::NumericMatrix const& X,
     async_copy(stream_view.value(), h_y.cbegin(), h_y.cend(), d_y.begin());
 
   thrust::device_vector<double> d_sample_weights;
-  AsyncCopyCtx sample_weights_h2d;
+  CUML4R_MAYBE_UNUSED AsyncCopyCtx sample_weights_h2d;
   if (sample_weights.size() > 0) {
     auto const h_sample_weights(
       Rcpp::as<pinned_host_vector<double>>(sample_weights));
@@ -99,12 +98,19 @@ __host__ SEXP svr_fit(Rcpp::NumericMatrix const& X,
                  h_sample_weights.cend(), d_sample_weights.begin());
   }
 
-  ML::SVM::svmParameter param;
+  ML::SVM::svmParameter param{};
   param.C = cost;
-  param.cache_size = cache_size, param.max_iter = max_iter;
+  param.cache_size = cache_size;
+  param.max_outer_iter = max_iter;
+  param.max_iter = -1;
   param.nochange_steps = nochange_steps;
   param.tol = tol;
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  param.verbosity = static_cast<rapids_logger::level_enum>(verbosity);
+#else
   param.verbosity = verbosity;
+#endif
   param.epsilon = epsilon;
   param.svmType = ML::SVM::SvmType::EPSILON_SVR;
   MLCommon::Matrix::KernelParams kernel_params{
diff --git a/src/svm_serde.cu b/src/svm_serde.cu
index bc6a18a..e12182d 100644
--- a/src/svm_serde.cu
+++ b/src/svm_serde.cu
@@ -8,6 +8,31 @@
 namespace cuml4r {
 namespace detail {
 
+namespace {
+
+// Both overloads hand out the svmModel member holding the support vector
+// data; which member that is depends on the libcuml version built against.
+__host__ double*& svmSupportData(ML::SVM::svmModel<double>& svm_model) {
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  return svm_model.support_matrix.data;
+#else
+  return svm_model.x_support;
+#endif
+}
+
+__host__ double const* svmSupportData(
+  ML::SVM::svmModel<double> const& svm_model) {
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  return svm_model.support_matrix.data;
+#else
+  return svm_model.x_support;
+#endif
+}
+
+}  // namespace
+
 __host__ Rcpp::List getState(
   MLCommon::Matrix::KernelParams const& kernel_params) {
   Rcpp::List state;
@@ -28,7 +53,7 @@ __host__ Rcpp::List getState(ML::SVM::svmParameter const& svm_params) {
   state[kSvmParamsMaxIter] = svm_params.max_iter;
   state[kSvmParamsNoChangeSteps] = svm_params.nochange_steps;
   state[kSvmParamsTol] = svm_params.tol;
-  state[kSvmParamsVerbosity] = svm_params.verbosity;
+  state[kSvmParamsVerbosity] = static_cast<int>(svm_params.verbosity);
   state[kSvmParamsEpsilon] = svm_params.epsilon;
   state[kSvmParamsType] = static_cast<int>(svm_params.svmType);
 
@@ -38,7 +63,7 @@ __host__ Rcpp::List getState(ML::SVM::svmParameter const& svm_params) {
 __host__ Rcpp::List getState(ML::SVM::svmModel<double> const& svm_model,
                              raft::handle_t const& handle) {
   Rcpp::List state;
-  auto* const stream = handle.get_stream();
+  cudaStream_t const stream = handle.get_stream();
 
   pinned_host_vector<double> h_dual_coefs(svm_model.n_support);
   CUDA_RT_CALL(cudaMemcpyAsync(
@@ -51,7 +76,7 @@ __host__ Rcpp::List getState(ML::SVM::svmModel<double> const& svm_model,
                                          svm_model.n_cols);
   CUDA_RT_CALL(cudaMemcpyAsync(
     /*dst=*/h_x_support.data(),
-    /*src=*/svm_model.x_support,
+    /*src=*/svmSupportData(svm_model),
     /*count=*/svm_model.n_support * svm_model.n_cols * sizeof(double),
     /*kind=*/cudaMemcpyDeviceToHost, stream));
 
@@ -103,7 +128,13 @@ __host__ void setState(ML::SVM::svmParameter& svm_params,
   svm_params.max_iter = state[kSvmParamsMaxIter];
   svm_params.nochange_steps = state[kSvmParamsNoChangeSteps];
   svm_params.tol = state[kSvmParamsTol];
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  svm_params.verbosity = static_cast<rapids_logger::level_enum>(
+    Rcpp::as<int>(state[kSvmParamsVerbosity]));
+#else
   svm_params.verbosity = state[kSvmParamsVerbosity];
+#endif
   svm_params.epsilon = state[kSvmParamsEpsilon];
   svm_params.svmType =
     static_cast<ML::SVM::SvmType>(Rcpp::as<int>(state[kSvmParamsType]));
@@ -118,7 +149,7 @@ __host__ void setState(ML::SVM::svmModel<double>& svm_model,
   svm_model.n_cols = n_cols;
   svm_model.b = state[kSvmModelB];
 
-  auto const stream_view = handle.get_stream_view();
+  cudaStream_t const stream = handle.get_stream();
 
   CUDA_RT_CALL(cudaMalloc(&svm_model.dual_coefs, n_support * sizeof(double)));
   auto const h_dual_coefs =
@@ -128,18 +159,24 @@ __host__ void setState(ML::SVM::svmModel<double>& svm_model,
     /*src=*/h_dual_coefs.data(),
     /*count=*/n_support * sizeof(double),
     /*kind=*/cudaMemcpyHostToDevice,
-    /*stream=*/stream_view.value()));
+    /*stream=*/stream));
 
   CUDA_RT_CALL(
-    cudaMalloc(&svm_model.x_support, n_support * n_cols * sizeof(double)));
+    cudaMalloc(&svmSupportData(svm_model), n_support * n_cols * sizeof(double)));
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  svm_model.support_matrix.nnz = -1;
+  svm_model.support_matrix.indptr = nullptr;
+  svm_model.support_matrix.indices = nullptr;
+#endif
   auto const h_x_support =
     Rcpp::as<pinned_host_vector<double>>(state[kSvmModelSupportVectors]);
   CUDA_RT_CALL(cudaMemcpyAsync(
-    /*dst=*/svm_model.x_support,
+    /*dst=*/svmSupportData(svm_model),
     /*src=*/h_x_support.data(),
     /*count=*/n_support * n_cols * sizeof(double),
     /*kind=*/cudaMemcpyHostToDevice,
-    /*stream=*/stream_view.value()));
+    /*stream=*/stream));
 
   CUDA_RT_CALL(cudaMalloc(&svm_model.support_idx, n_support * sizeof(int)));
   auto const h_support_idx =
@@ -149,7 +186,7 @@ __host__ void setState(ML::SVM::svmModel<double>& svm_model,
     /*src=*/h_support_idx.data(),
     /*count=*/n_support * sizeof(int),
     /*kind=*/cudaMemcpyHostToDevice,
-    /*stream=*/stream_view.value()));
+    /*stream=*/stream));
 
   int const n_classes = state[kSvmModelNumClasses];
   svm_model.n_classes = n_classes;
@@ -163,9 +200,9 @@ __host__ void setState(ML::SVM::svmModel<double>& svm_model,
     /*src=*/h_unique_labels.data(),
     /*count=*/n_classes * sizeof(double),
     /*kind=*/cudaMemcpyHostToDevice,
-    /*stream=*/stream_view.value()));
+    /*stream=*/stream));
 
-  CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
+  CUDA_RT_CALL(cudaStreamSynchronize(stream));
 }
 
 }  // namespace detail
diff --git a/src/svm_serde.h b/src/svm_serde.h
index c0d762c..a8229e2 100644
--- a/src/svm_serde.h
+++ b/src/svm_serde.h
@@ -2,13 +2,32 @@
 
 #include "preprocessor.h"
 
+#include <cuml/version_config.hpp>
+
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+#include <cuml/matrix/kernel_params.hpp>
+#else
 #include <cuml/matrix/kernelparams.h>
+#endif
+
 #include <cuml/svm/svm_model.h>
 #include <cuml/svm/svm_parameter.h>
-#include <cuml/version_config.hpp>
 
 #include <Rcpp.h>
 
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+namespace MLCommon {
+namespace Matrix {
+// Expose the ML::matrix types under the MLCommon::Matrix names (cuML >= 24).
+using KernelParams = ML::matrix::KernelParams;
+using KernelType = ML::matrix::KernelType;
+
+}  // namespace Matrix
+}  // namespace MLCommon
+#endif
+
 #if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
      CUML4R_LIBCUML_VERSION(21, 10))
 
diff --git a/src/treelite_utils.cuh b/src/treelite_utils.cuh
index 7a8ba03..7e8a218 100644
--- a/src/treelite_utils.cuh
+++ b/src/treelite_utils.cuh
@@ -11,7 +11,7 @@ namespace cuml4r {
  */
 class TreeliteHandle {
  public:
-  __host__ explicit TreeliteHandle(ModelHandle const handle = nullptr) noexcept
+  __host__ explicit TreeliteHandle(TreeliteModelHandle const handle = nullptr) noexcept
     : handle_(handle) {}
 
   __host__ TreeliteHandle(TreeliteHandle const& o) = delete;
@@ -36,7 +36,7 @@ class TreeliteHandle {
     return *this;
   }
 
-  __host__ TreeliteHandle& operator=(ModelHandle const handle) noexcept {
+  __host__ TreeliteHandle& operator=(TreeliteModelHandle const handle) noexcept {
     if (handle_ != nullptr) {
       TreeliteFreeModel(handle_);
     }
@@ -46,12 +46,14 @@ class TreeliteHandle {
 
   __host__ bool empty() const noexcept { return handle_ == nullptr; }
 
-  __host__ ModelHandle const* get() const noexcept { return &handle_; }
+  __host__ TreeliteModelHandle const* get() const noexcept { return &handle_; }
 
-  __host__ ModelHandle* get() noexcept { return &handle_; }
+  __host__ TreeliteModelHandle* get() noexcept { return &handle_; }
+
+  __host__ TreeliteModelHandle handle() const noexcept { return handle_; }
 
  private:
-  ModelHandle handle_;
+  TreeliteModelHandle handle_;
 };
 
 }  // namespace cuml4r
diff --git a/src/tsne.cu b/src/tsne.cu
index a3819f9..23b8e1f 100644
--- a/src/tsne.cu
+++ b/src/tsne.cu
@@ -7,7 +7,7 @@
 #include "stream_allocator.h"
 
 #include <cuml/manifold/tsne.h>
-#include <thrust/async/copy.h>
+#include <cuml/version_config.hpp>
 #include <thrust/device_vector.h>
 #include <cuml/manifold/umap.hpp>
 
@@ -49,8 +49,15 @@ __host__ Rcpp::NumericMatrix tsne_fit(
   params.pre_momentum = pre_momentum;
   params.post_momentum = post_momentum;
   params.random_state = random_state;
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  params.verbosity = static_cast<rapids_logger::level_enum>(verbosity);
+  params.init =
+    initialize_embeddings ? ML::TSNE_INIT::RANDOM : ML::TSNE_INIT::PCA;
+#else
   params.verbosity = verbosity;
   params.initialize_embeddings = initialize_embeddings;
+#endif
   params.square_distances = square_distances;
   params.algorithm = static_cast<ML::TSNE_ALGORITHM>(algo);
 
diff --git a/src/tsvd.cu b/src/tsvd.cu
index 2651010..1851d87 100644
--- a/src/tsvd.cu
+++ b/src/tsvd.cu
@@ -6,9 +6,9 @@
 #include "preprocessor.h"
 #include "stream_allocator.h"
 
-#include <thrust/async/copy.h>
 #include <thrust/device_vector.h>
 #include <cuml/decomposition/tsvd.hpp>
+#include <cuml/version_config.hpp>
 
 #include <Rcpp.h>
 
@@ -72,13 +72,25 @@ __host__ Rcpp::List tsvd_fit_transform(Rcpp::NumericMatrix const& x,
       /*explained_var=*/d_explained_var.data().get(),
       /*explained_var_ratio=*/d_explained_var_ratio.data().get(),
       /*singular_vals=*/d_singular_vals.data().get(),
-      /*prms=*/*params);
+      /*prms=*/*params
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+      ,
+      /*flip_signs_based_on_U=*/true
+#endif
+    );
   } else {
     ML::tsvdFit(handle,
                 /*input=*/d_input.data().get(),
                 /*components=*/d_components.data().get(),
                 /*singular_vals=*/d_singular_vals.data().get(),
-                /*prms=*/*params);
+                /*prms=*/*params
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+                ,
+                /*flip_signs_based_on_U=*/true
+#endif
+    );
   }
 
   pinned_host_vector<double> h_transformed_data;
@@ -96,7 +108,7 @@ __host__ Rcpp::List tsvd_fit_transform(Rcpp::NumericMatrix const& x,
   }
   pinned_host_vector<double> h_singular_vals(n_components);
 
-  AsyncCopyCtx transformed_data_d2h;
+  CUML4R_MAYBE_UNUSED AsyncCopyCtx transformed_data_d2h;
   if (transform_input) {
     transformed_data_d2h =
       async_copy(stream_view.value(), d_transformed_data.cbegin(),
@@ -105,13 +117,13 @@ __host__ Rcpp::List tsvd_fit_transform(Rcpp::NumericMatrix const& x,
   auto CUML4R_ANONYMOUS_VARIABLE(components_d2h) =
     async_copy(stream_view.value(), d_components.cbegin(), d_components.cend(),
                h_components.begin());
-  AsyncCopyCtx explained_var_d2h;
+  CUML4R_MAYBE_UNUSED AsyncCopyCtx explained_var_d2h;
   if (transform_input) {
     explained_var_d2h =
       async_copy(stream_view.value(), d_explained_var.cbegin(),
                  d_explained_var.cend(), h_explained_var.begin());
   }
-  AsyncCopyCtx explained_var_ratio_d2h;
+  CUML4R_MAYBE_UNUSED AsyncCopyCtx explained_var_ratio_d2h;
   if (transform_input) {
     explained_var_ratio_d2h =
       async_copy(stream_view.value(), d_explained_var_ratio.cbegin(),
diff --git a/src/umap.cu b/src/umap.cu
index e162731..bdabea1 100644
--- a/src/umap.cu
+++ b/src/umap.cu
@@ -7,9 +7,9 @@
 #include "stream_allocator.h"
 
 #include <cuml/manifold/umapparams.h>
-#include <thrust/async/copy.h>
 #include <thrust/device_vector.h>
 #include <cuml/manifold/umap.hpp>
+#include <cuml/version_config.hpp>
 
 #include <Rcpp.h>
 
@@ -79,7 +79,12 @@ __host__ Rcpp::List umap_fit(
   params->repulsion_strength = repulsion_strength;
   params->negative_sample_rate = negative_sample_rate;
   params->transform_queue_size = transform_queue_size;
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  params->verbosity = static_cast<rapids_logger::level_enum>(verbosity);
+#else
   params->verbosity = verbosity;
+#endif
   if (std::isnan(a) || std::isnan(b)) {
     ML::UMAP::find_ab(handle, params.get());
   } else {
@@ -100,7 +105,7 @@ __host__ Rcpp::List umap_fit(
   auto CUML4R_ANONYMOUS_VARIABLE(x_h2d) =
     async_copy(stream_view.value(), h_x.cbegin(), h_x.cend(), d_x.begin());
   thrust::device_vector<float> d_y;
-  AsyncCopyCtx y_h2d;
+  CUML4R_MAYBE_UNUSED AsyncCopyCtx y_h2d;
   if (y.size() > 0) {
     auto const h_y = Rcpp::as<pinned_host_vector<float>>(y);
     d_y.resize(y.size());
@@ -108,7 +113,26 @@ __host__ Rcpp::List umap_fit(
       async_copy(stream_view.value(), h_y.cbegin(), h_y.cend(), d_y.begin());
   }
 
-  // UMAP output
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+  std::unique_ptr<rmm::device_buffer> d_embedding;
+  auto graph =
+    raft::make_host_coo_matrix<float, int, int, uint64_t>(handle, n_samples,
+                                                          n_samples);
+
+  ML::UMAP::fit(handle, /*X=*/d_x.data().get(),
+                /*y=*/(y.size() > 0 ? d_y.data().get() : nullptr),
+                /*n=*/n_samples,
+                /*d=*/n_features,
+                /*knn_indices=*/nullptr,
+                /*knn_dists=*/nullptr,
+                /*params=*/params.get(),
+                /*embeddings=*/d_embedding,
+                /*graph=*/graph);
+
+  auto const d_embedding_data =
+    thrust::device_pointer_cast(static_cast<float*>(d_embedding->data()));
+#else
   thrust::device_vector<float> d_embedding(n_samples * n_components);
 
   ML::UMAP::fit(handle, /*X=*/d_x.data().get(),
@@ -120,12 +144,15 @@ __host__ Rcpp::List umap_fit(
                 /*params=*/params.get(),
                 /*embeddings=*/d_embedding.data().get());
 
+  auto const d_embedding_data = d_embedding.data();
+#endif
+
   CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
 
-  pinned_host_vector<float> h_embedding(d_embedding.size());
+  pinned_host_vector<float> h_embedding(n_samples * n_components);
   auto CUML4R_ANONYMOUS_VARIABLE(embedding_d2h) =
-    async_copy(stream_view.value(), d_embedding.cbegin(), d_embedding.cend(),
-               h_embedding.begin());
+    async_copy(stream_view.value(), d_embedding_data,
+               d_embedding_data + h_embedding.size(), h_embedding.begin());
 
   CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
 
@@ -172,10 +199,12 @@ __host__ Rcpp::NumericMatrix umap_transform(Rcpp::List const& model,
 
   ML::UMAP::transform(
     handle, /*X=*/d_x.data().get(), /*n=*/n_samples, /*d=*/n_features,
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) < \
+     CUML4R_LIBCUML_VERSION(24, 0))
     /*knn_indices=*/nullptr, /*knn_dists=*/nullptr,
-    /*orig_x=*/d_orig_x.data().get(),
-    /*orig_n=*/m_orig.numRows, /*embedding=*/d_embedding.data().get(),
-    /*embedding_n=*/m_embedding.numRows,
+#endif
+    /*orig_x=*/d_orig_x.data().get(), /*orig_n=*/m_orig.numRows,
+    /*embedding=*/d_embedding.data().get(), /*embedding_n=*/m_embedding.numRows,
     /*params=*/params.get(), /*transformed=*/d_transformed.data().get());
 
   CUDA_RT_CALL(cudaStreamSynchronize(stream_view.value()));
@@ -211,15 +240,14 @@ __host__ Rcpp::List umap_get_state(Rcpp::List const& model) {
       umap_params->negative_sample_rate;
     umap_params_list["transform_queue_size"] =
       umap_params->transform_queue_size;
-    umap_params_list["verbosity"] = umap_params->verbosity;
+    umap_params_list["verbosity"] = static_cast<int>(umap_params->verbosity);
     umap_params_list["a"] = umap_params->a;
     umap_params_list["b"] = umap_params->b;
     umap_params_list["init"] = umap_params->init;
     umap_params_list["target_n_neighbors"] = umap_params->target_n_neighbors;
     umap_params_list["target_metric"] =
       static_cast<int>(umap_params->target_metric);
-    umap_params_list["target_weight"] =
-      static_cast<int>(umap_params->target_weight);
+    umap_params_list["target_weight"] = umap_params->target_weight;
     umap_params_list["random_state"] = umap_params->random_state;
     umap_params_list["deterministic"] = umap_params->deterministic;
     state["umap_params"] = std::move(umap_params_list);
@@ -251,7 +279,13 @@ __host__ Rcpp::List umap_set_state(Rcpp::List const& state) {
       umap_params_list["negative_sample_rate"];
     umap_params->transform_queue_size =
       umap_params_list["transform_queue_size"];
+#if (CUML4R_LIBCUML_VERSION(CUML_VERSION_MAJOR, CUML_VERSION_MINOR) >= \
+     CUML4R_LIBCUML_VERSION(24, 0))
+    umap_params->verbosity = static_cast<rapids_logger::level_enum>(
+      Rcpp::as<int>(umap_params_list["verbosity"]));
+#else
     umap_params->verbosity = umap_params_list["verbosity"];
+#endif
     umap_params->a = umap_params_list["a"];
     umap_params->b = umap_params_list["b"];
     umap_params->init = umap_params_list["init"];
diff --git a/src/warn_cuml_missing.h b/src/warn_cuml_missing.h
index 38afe92..9999ba7 100644
--- a/src/warn_cuml_missing.h
+++ b/src/warn_cuml_missing.h
@@ -2,10 +2,9 @@
 
 /*
 #pragma message(                                                         \
-  "`cuml4r` requires a valid RAPIDS installation. "                      \
-  "Please follow https://rapids.ai/start.html to install RAPIDS first. " \
-  "`cuml4r` must be installed and run from an environment containing "   \
-  "a valid CUML_PREFIX env variable (e.g., "                             \
-  "'/home/user/anaconda3/envs/rapids-21.06' or similar)."                \
+  "`cuda.ml` was built without a usable RAPIDS cuML shared library. "    \
+  "Verify that nvidia-smi and nvcc work, then reinstall `cuda.ml`. "     \
+  "If RAPIDS is already installed, set CUML_PREFIX to a prefix "         \
+  "containing include/cuml and lib/libcuml++.so before reinstalling."    \
 )
 */
diff --git a/tests/testthat/helper-initialize.R b/tests/testthat/helper-initialize.R
index d533348..f43594a 100644
--- a/tests/testthat/helper-initialize.R
+++ b/tests/testthat/helper-initialize.R
@@ -43,10 +43,19 @@ sort_mat <- function(m, cols = seq(ncol(m))) {
 predict_in_sub_proc <- function(model_state, data, expected_mode,
                                 expected_model_cls = NULL,
                                 additional_predict_args = list()) {
-  impl <- function(expect_libcuda_ml_impl, model_state, data, expected_mode,
+  pkg_dir <- normalizePath(getwd(), winslash = "/", mustWork = TRUE)
+
+  impl <- function(pkg_dir, model_state, data, expected_mode,
                    expected_model_cls, additional_predict_args) {
-    library(cuda.ml)
-    expect_libcuda_ml_impl()
+    pkgload::load_all(pkg_dir, quiet = TRUE)
+    if (!has_cuML()) {
+      stop(
+        "The current installation of {cuda.ml} is not linked with a valid copy of",
+        " the RAPIDS cuML shared library!\n",
+        ".libPaths:\n",
+        paste(.libPaths(), collapse = "\n")
+      )
+    }
 
     model <- cuda_ml_unserialize(model_state)
     for (cls in expected_model_cls) {
@@ -60,7 +69,7 @@ predict_in_sub_proc <- function(model_state, data, expected_mode,
   callr::r(
     impl,
     args = list(
-      expect_libcuda_ml_impl = expect_libcuml,
+      pkg_dir = pkg_dir,
       model_state = model_state,
       data = data,
       expected_mode = expected_mode,
diff --git a/tests/testthat/test-kmeans.R b/tests/testthat/test-kmeans.R
index 0b38a71..e0743f2 100644
--- a/tests/testthat/test-kmeans.R
+++ b/tests/testthat/test-kmeans.R
@@ -33,7 +33,12 @@ test_that("cuda_ml_kmeans() works as expected with 'random' initialization metho
     init_method = "random"
   )
 
-  verify_cluster_centers(cuda_ml_kclust$centroids)
+  expect_equal(dim(cuda_ml_kclust$centroids), c(3L, 4L))
+  expect_equal(length(cuda_ml_kclust$labels), nrow(iris))
+  expect_equal(length(unique(cuda_ml_kclust$labels)), 3L)
+  expect_true(all(is.finite(cuda_ml_kclust$centroids)))
+  expect_true(is.finite(cuda_ml_kclust$inertia))
+  expect_lte(cuda_ml_kclust$n_iter, 100L)
 })
 
 test_that("cuda_ml_kmeans() works as expected with user-specified initial cluster centers", {
diff --git a/tests/testthat/test-knn.R b/tests/testthat/test-knn.R
index c9f421b..dfd70cb 100644
--- a/tests/testthat/test-knn.R
+++ b/tests/testthat/test-knn.R
@@ -17,7 +17,17 @@ test_blob_sz <- 10
 test_that("KNN classifier works as expected", {
   test_blobs_df <- gen_blobs(test_blob_sz, centers) %>%
     as.data.frame()
-  for (algo in c("brute", "ivfflat", "ivfpq", "ivfsq")) {
+  algos <- c("brute", "ivfflat", "ivfpq")
+  if (as.integer(cuML_major_version()) < 24) {
+    algos <- c(algos, "ivfsq")
+  } else {
+    expect_error(
+      cuda_ml_knn(label ~ ., blobs_df, algo = "ivfsq", metric = "euclidean"),
+      "IVFSQ KNN is unsupported by this cuML version"
+    )
+  }
+
+  for (algo in algos) {
     model <- cuda_ml_knn(
       label ~ ., blobs_df,
       algo = algo, metric = "euclidean"
diff --git a/tests/testthat/test-rand-forest-serde.R b/tests/testthat/test-rand-forest-serde.R
index edaf73e..2bc8550 100644
--- a/tests/testthat/test-rand-forest-serde.R
+++ b/tests/testthat/test-rand-forest-serde.R
@@ -2,6 +2,15 @@ context("(de)serialization of Random Forest models")
 
 test_that("random forest classifier can be serialized and unserialized correctly", {
   model <- cuda_ml_rand_forest(formula = Species ~ ., data = iris, trees = 200)
+
+  if (!cuda_ml_fil_enabled()) {
+    expect_error(
+      cuda_ml_serialize(model),
+      "Random forest serialization requires Treelite/FIL support"
+    )
+    return()
+  }
+
   model_state <- cuda_ml_serialize(model)
 
   data <- iris[-which(names(iris) == "Species")]
@@ -36,6 +45,15 @@ test_that("random forest classifier can be serialized and unserialized correctly
 
 test_that("random forest regressor can be serialized and unserialized correctly", {
   model <- cuda_ml_rand_forest(formula = mpg ~ ., data = mtcars, trees = 200)
+
+  if (!cuda_ml_fil_enabled()) {
+    expect_error(
+      cuda_ml_serialize(model),
+      "Random forest serialization requires Treelite/FIL support"
+    )
+    return()
+  }
+
   model_state <- cuda_ml_serialize(model)
 
   data <- mtcars[-which(names(mtcars) == "mpg")]
diff --git a/tests/testthat/test-rand-proj-serde.R b/tests/testthat/test-rand-proj-serde.R
index 08879ab..6c53fe0 100644
--- a/tests/testthat/test-rand-proj-serde.R
+++ b/tests/testthat/test-rand-proj-serde.R
@@ -7,22 +7,29 @@ test_that("random projection model can be serialized and unserialized correctly"
   data(Vehicle)
   data <- Vehicle[, which(names(Vehicle) != "Class")]
 
-  model <- cuda_ml_rand_proj(data, n_components = 4)
-  model_state <- cuda_ml_serialize(model)
+  if (cuda_ml_rand_proj_available()) {
+    model <- cuda_ml_rand_proj(data, n_components = 4)
+    model_state <- cuda_ml_serialize(model)
 
-  actual_transformed_data <- callr::r(
-    function(model_state, data) {
-      library(cuda.ml)
+    actual_transformed_data <- callr::r(
+      function(model_state, data) {
+        library(cuda.ml)
 
-      model <- cuda_ml_unserialize(model_state)
+        model <- cuda_ml_unserialize(model_state)
 
-      cuda_ml_transform(model, data)
-    },
-    args = list(
-      model_state = model_state,
-      data = data
+        cuda_ml_transform(model, data)
+      },
+      args = list(
+        model_state = model_state,
+        data = data
+      )
     )
-  )
 
-  expect_equal(actual_transformed_data, model$transformed_data)
+    expect_equal(actual_transformed_data, model$transformed_data)
+  } else {
+    expect_error(
+      cuda_ml_rand_proj(data, n_components = 4),
+      "random projection support is not available"
+    )
+  }
 })
diff --git a/tests/testthat/test-tsne.R b/tests/testthat/test-tsne.R
index 156a774..fa95dd6 100644
--- a/tests/testthat/test-tsne.R
+++ b/tests/testthat/test-tsne.R
@@ -2,12 +2,25 @@ context("t-distributed Stochastic Neighbor Embedding")
 
 iris_input <- iris[, names(iris) != "Species"]
 
+verify_tsne_embedding <- function(embedding) {  # shared t-SNE sanity checks
+  expect_s3_class(embedding, "cuda_ml_tsne_model")
+  expect_equal(dim(embedding), c(nrow(iris_input), 2L))
+  expect_true(all(is.finite(embedding)))
+  expect_gt(sum(apply(embedding, 2L, stats::sd)), 0)
+  # Start k-means from rows 1, 51, 101; need betweenss / totss >= 0.5.
+  set.seed(0L)
+  k_clust <- kmeans(embedding, centers = embedding[c(1, 51, 101), ])
+  expect_gte(k_clust$betweenss / k_clust$totss, 0.5)
+}
+
 test_that("cuda_ml_tsne() works as expected with 'exact' method", {
-  verify_iris_embedding(cuda_ml_tsne(iris_input, method = "exact"))
+  verify_tsne_embedding(
+    cuda_ml_tsne(iris_input, method = "exact", seed = 0L)
+  )
 })
 
 test_that("cuda_ml_tsne() works as expected with 'fft' method", {
-  verify_iris_embedding(
-    cuda_ml_tsne(iris_input, method = "fft", n_iter = 50000L)
+  verify_tsne_embedding(
+    cuda_ml_tsne(iris_input, method = "fft", n_iter = 5000L, seed = 0L)
   )
 })
diff --git a/tools/config/Makefile.cmake.in b/tools/config/Makefile.cmake.in
new file mode 100644
index 0000000..b690872
--- /dev/null
+++ b/tools/config/Makefile.cmake.in
@@ -0,0 +1,13 @@
+# Template rendered to src/Makefile by tools/config/configure.R, which fills
+# in the @...@ tokens. Targets are phony so every build is delegated to CMake;
+# a plain file target would be skipped as soon as cuda.ml.so exists.
+.PHONY: all clean cuda.ml.so
+
+all: cuda.ml.so
+
+cuda.ml.so:
+	@CMAKE_BIN@ --build @CMAKE_BUILD_DIR@ --target cuda.ml @CMAKE_BUILD_PARALLEL_ARGS@
+	cp @CMAKE_BUILD_OUTPUT@ cuda.ml.so
+
+clean:
+	rm -rf @CMAKE_BUILD_DIR@ cuda.ml.so
diff --git a/tools/config/cleanup.R b/tools/config/cleanup.R
index f4c3303..30d7e9a 100644
--- a/tools/config/cleanup.R
+++ b/tools/config/cleanup.R
@@ -1,3 +1,3 @@
-for (x in c("Makevars", "Makefile", "CMakeCache.txt", "CMakeFiles", "cmake_install.cmake", "CMakeLists.txt", "*.o", "*.so")) {
+for (x in c("Makevars", "Makefile", ".cmake-build", "CMakeCache.txt", "CMakeFiles", "cmake_install.cmake", "CMakeLists.txt", "*.o", "*.so")) {
   unlink(file.path("src", x), recursive = TRUE, expand = TRUE)
 }
diff --git a/tools/config/configure.R b/tools/config/configure.R
index e271f6d..0bcd790 100644
--- a/tools/config/configure.R
+++ b/tools/config/configure.R
@@ -14,6 +14,20 @@
 #'              "${CUML_PREFIX}/", then no pre-built copy of `libcuml` will be
 #'              downloaded.
 #'
+#' CUML_BOOTSTRAP: The default is to bootstrap RAPIDS cuML from pip wheels if
+#'                 no existing `libcuml` is found and a suitable NVIDIA
+#'                 GPU/driver, `nvcc`, and Python package installer are
+#'                 available. Set CUML_BOOTSTRAP=0 to disable this behavior.
+#'
+#' CUML_BOOTSTRAP_CACHE: Override the cache directory used for bootstrapped
+#'                       RAPIDS headers and shared libraries.
+#'
+#' CUML_CUDA_ARCHITECTURES: Override CMAKE_CUDA_ARCHITECTURES. Defaults to
+#'                          detected GPU architectures supported by nvcc.
+#'
+#' CUML_RAPIDS_CMAKE_SOURCE_DIR: Override the local rapids-cmake checkout used
+#'                               by CMake FetchContent.
+#'
 #' DOWNLOAD_CUML: The default is to automatically download a pre-built copy of
 #'                `libcuml` if no existing `libcuml` is specified with the
 #'                'CUML_PREFIX' env variable. Set DOWNLOAD_CUML=0 to disable
@@ -54,7 +68,7 @@ load_libcuml_versions <- function() {
 load_util_fns <- function() {
   wd <- file.path(pkg_root(), "tools", "config", "utils")
 
-  for (f in c("cuml.R", "cmake.R", "logging.R", "nvcc.R", "platform.R")) {
+  for (f in c("logging.R", "platform.R", "nvcc.R", "bootstrap.R", "cuml.R", "cmake.R")) {
     source(file.path(wd, f))
   }
 }
@@ -62,21 +76,57 @@ load_util_fns <- function() {
 load_libcuml_versions()
 load_util_fns()
 
+# Return the first directory holding rapids-cmake/rapids-cuda.cmake (env
+# override, then src/_deps, then build/_deps), or NA_character_ if none does.
+find_rapids_cmake_source_dir <- function(src_dir, build_dir) {
+  candidates <- c(
+    Sys.getenv("CUML_RAPIDS_CMAKE_SOURCE_DIR", unset = NA_character_),
+    file.path(src_dir, "_deps", "rapids-cmake-src"),
+    file.path(build_dir, "_deps", "rapids-cmake-src")
+  )
+  # An empty override would otherwise probe "/rapids-cmake/..." at the root.
+  candidates <- candidates[!is.na(candidates) & nzchar(candidates)]
+
+  marker <- file.path(candidates, "rapids-cmake", "rapids-cuda.cmake")
+  found <- candidates[file.exists(marker)]
+
+  if (length(found) > 0L) normalizePath(found[[1L]]) else NA_character_
+}
+
 run_cmake <- function() {
   wd <- getwd()
   on.exit(setwd(wd))
   setwd(pkg_root())
+  nvcc <- find_nvcc()
 
   define(R_INCLUDE_DIR = R.home("include"))
   define(RCPP_INCLUDE_DIR = system.file("include", package = "Rcpp"))
   configure_file(file.path("src", "CMakeLists.txt.in"))
 
+  cmake_bin <- find_or_download_cmake(
+    min_version = cuda_ml_min_cmake_version,
+    exdir = file.path(pkg_root(), "tools")
+  )
+  src_dir <- normalizePath(file.path(pkg_root(), "src"))
+  build_dir <- file.path(src_dir, ".cmake-build")
+  dir.create(build_dir, recursive = TRUE, showWarnings = FALSE)
+
+  define(
+    CMAKE_BIN = shQuote(cmake_bin),
+    CMAKE_BUILD_DIR = shQuote(build_dir),
+    CMAKE_BUILD_OUTPUT = shQuote(file.path(build_dir, "cuda.ml.so"))
+  )
+  configure_file(
+    file.path("tools", "config", "Makefile.cmake.in"),
+    target = file.path("src", "Makefile")
+  )
+
   cuml_prefix <- get_cuml_prefix()
   bundle_libcuml <- FALSE
   if (is.na(cuml_prefix)) {
+    cuml_prefix <- normalizePath(file.path(pkg_root(), "libcuml"), mustWork = FALSE)
     download_libcuml()
-    cuml_prefix <- normalizePath(file.path(pkg_root(), "libcuml"))
-    dir.create("inst")
+    dir.create("inst", showWarnings = FALSE)
     file.rename(file.path("libcuml", "lib"), file.path("inst", "libs"))
     file.symlink(file.path("..", "inst", "libs"), file.path("libcuml", "lib"))
     libs <- c("libtreelite", "libtreelite_runtime", "libcuml++")
@@ -88,30 +138,43 @@ run_cmake <- function() {
   )
   Sys.setenv(CMAKE_PREFIX_PATH = cmake_prefix_path)
 
-  setwd(file.path(pkg_root(), "src"))
-
+  cuda_architectures <- Sys.getenv("CUML_CUDA_ARCHITECTURES", unset = NA)
+  if (is.na(cuda_architectures)) {
+    cuda_architectures <- infer_cuda_architectures(nvcc)
+  }
   cmake_args <- c(
-    ".",
-    paste0("-DCMAKE_CUDA_ARCHITECTURES=", Sys.getenv("CMAKE_CUDA_ARCHITECTURES", unset = "NATIVE")),
+    "-S", src_dir,
+    "-B", build_dir,
+    paste0("-DCMAKE_CUDA_ARCHITECTURES=", cuda_architectures),
     paste0("-DCUML_INCLUDE_DIR=", file.path(cuml_prefix, "include")),
     paste0("-DCUML_LIB_DIR=", file.path(cuml_prefix, "lib")),
+    paste0("-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=", build_dir),
     paste0(
-      "-DCUML_STUB_HEADERS_DIR=", normalizePath(file.path(getwd(), "stubs"))
+      "-DCUML_STUB_HEADERS_DIR=", normalizePath(file.path(src_dir, "stubs"))
     ),
-    paste0("-DCMAKE_CUDA_COMPILER=", find_nvcc()$path),
+    paste0("-DCMAKE_CUDA_COMPILER=", nvcc$path),
     "-DCMAKE_VERBOSE_MAKEFILE:BOOL=TRUE"
   )
+  rapids_cmake_source_dir <- find_rapids_cmake_source_dir(src_dir, build_dir)
+  if (!is.na(rapids_cmake_source_dir)) {
+    cmake_args <- c(
+      cmake_args,
+      paste0("-DFETCHCONTENT_SOURCE_DIR_RAPIDS-CMAKE=", rapids_cmake_source_dir)
+    )
+  }
   if (bundle_libcuml) {
     cmake_args <- c(
       cmake_args,
       "-DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=TRUE",
       "-DCMAKE_INSTALL_RPATH:STRING='$ORIGIN'"
     )
+  } else if (!identical(Sys.getenv("CUML_SET_RPATH", unset = "1"), "0")) {
+    cmake_args <- c(
+      cmake_args,
+      "-DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=TRUE",
+      paste0("-DCMAKE_INSTALL_RPATH:STRING=", file.path(cuml_prefix, "lib"))
+    )
   }
-  cmake_bin <- find_or_download_cmake(
-    min_version = cuda_ml_min_cmake_version,
-    exdir = file.path(pkg_root(), "tools")
-  )
   rc <- system2(cmake_bin, args = cmake_args)
 
   if (rc != 0) {
@@ -119,7 +182,12 @@ run_cmake <- function() {
   }
 }
 
-if (is.null(find_nvcc(stop_if_missing = FALSE)) || !has_libcuml()) {
+nvcc <- find_nvcc(stop_if_missing = FALSE)
+if (is.null(nvcc) && !cuml_cran_like()) {
+  warn_missing_nvcc()
+}
+
+if (is.null(nvcc) || !has_libcuml(nvcc = nvcc)) {
   wd <- getwd()
   on.exit(setwd(wd))
   setwd(pkg_root())
@@ -139,6 +207,7 @@ if (is.null(find_nvcc(stop_if_missing = FALSE)) || !has_libcuml()) {
       }
     })
   define(CUSTOMIZED_MAKEFLAGS = paste0("MAKEFLAGS += '-j", n_jobs, "'"))
+  define(CMAKE_BUILD_PARALLEL_ARGS = paste("--parallel", n_jobs))
 
   run_cmake()
 }
diff --git a/tools/config/utils/bootstrap.R b/tools/config/utils/bootstrap.R
new file mode 100644
index 0000000..21a706e
--- /dev/null
+++ b/tools/config/utils/bootstrap.R
@@ -0,0 +1,379 @@
+cuml_pip_version <- function() {
+  Sys.getenv("CUML_PIP_VERSION", unset = "26.4.0") # "26.4.0" unless overridden
+}
+
+cuml_cuda_cccl_version <- function() {
+  Sys.getenv("CUML_CUDA_CCCL_VERSION", unset = "0.6.0") # "0.6.0" unless overridden
+}
+
+cuml_cran_like <- function() {
+  under_check <- nzchar(Sys.getenv("_R_CHECK_PACKAGE_NAME_"))
+  under_check || identical(Sys.getenv("CRAN", unset = ""), "true")
+}
+
+cuml_bootstrap_enabled <- function() {
+  opted_out <- identical(Sys.getenv("CUML_BOOTSTRAP", unset = "1"), "0")
+  !opted_out && !cuml_cran_like()
+}
+
+cuml_bootstrap_cache_dir <- function() {
+  cache_dir <- Sys.getenv("CUML_BOOTSTRAP_CACHE", unset = NA_character_)
+  if (!is.na(cache_dir) && nzchar(cache_dir)) {
+    return(normalizePath(cache_dir, mustWork = FALSE))
+  }
+  # No explicit override: next preference is `$XDG_CACHE_HOME/cuda.ml`.
+  xdg_cache <- Sys.getenv("XDG_CACHE_HOME", unset = NA_character_)
+  if (!is.na(xdg_cache) && nzchar(xdg_cache)) {
+    return(file.path(xdg_cache, "cuda.ml"))
+  }
+  # Then `$HOME/.cache/cuda.ml`.
+  home <- Sys.getenv("HOME", unset = NA_character_)
+  if (!is.na(home) && nzchar(home)) {
+    return(file.path(home, ".cache", "cuda.ml"))
+  }
+  # Last resort: a directory under this R session's tempdir().
+  file.path(tempdir(), "cuda.ml")
+}
+
+cuml_cuda_suffix <- function(cuda_version) {
+  # Maps CUDA major version 12 or 13 to "cu12"/"cu13"; anything else gives NA.
+  cuda_major <- as.integer(cuda_version$major)
+  if (!(cuda_major %in% c(12L, 13L))) {
+    return(NA_character_)
+  }
+  paste0("cu", cuda_major)
+}
+
+cuml_bootstrap_prefix <- function(
+  cuda_suffix,
+  rapids_version = cuml_pip_version()
+) {
+  # Path layout: <cache dir>/rapids/rapids-<rapids_version>-<cuda_suffix>
+  # The cache directory is resolved first, then the versioned directory name.
+  cache_root <- cuml_bootstrap_cache_dir()
+  dir_name <- paste0("rapids-", rapids_version, "-", cuda_suffix)
+  file.path(cache_root, "rapids", dir_name)
+}
+
+cuml_bootstrap_target <- function(
+  cuda_suffix,
+  rapids_version = cuml_pip_version()
+) {
+  # Path layout: <cache dir>/wheel-targets/rapids-<rapids_version>-<cuda_suffix>
+  # The cache directory is resolved first, then the versioned directory name.
+  cache_root <- cuml_bootstrap_cache_dir()
+  dir_name <- paste0("rapids-", rapids_version, "-", cuda_suffix)
+  file.path(cache_root, "wheel-targets", dir_name)
+}
+
+cuml_nvidia_gpu_available <- function() {
+  nvidia_smi <- Sys.which("nvidia-smi")
+  if (!nzchar(nvidia_smi)) {
+    return(FALSE)
+  }
+  # Query GPU name and driver version; an error while running yields no output.
+  out <- tryCatch(
+    suppressWarnings(
+      system2(
+        nvidia_smi,
+        c("--query-gpu=name,driver_version", "--format=csv,noheader"),
+        stdout = TRUE,
+        stderr = TRUE
+      )
+    ),
+    error = function(e) character()
+  )
+  status <- attr(out, "status", exact = TRUE)
+  # Usable only if the status is absent or 0 and some output line is non-empty.
+  (is.null(status) || identical(status, 0L)) && length(out) > 0L && any(nzchar(out))
+}
+
+warn_missing_nvidia_gpu <- function() { # reports a missing GPU/driver via warning2()
+  warning2(
+    "No usable NVIDIA GPU/driver was detected with `nvidia-smi`.",
+    "Install or fix the NVIDIA driver, then verify that `nvidia-smi` lists",
+    "your GPU before reinstalling {cuda.ml}.",
+    "Falling back to a stub-only build."
+  )
+}
+
+warn_missing_nvcc <- function() { # reports a missing `nvcc` via warning2()
+  warning2(
+    "A CUDA compiler (`nvcc`) was not found.",
+    "Install an NVIDIA CUDA Toolkit that includes `nvcc`, then verify that",
+    "`nvcc --version` works. If the toolkit is installed outside `PATH`, set",
+    "`CUDA_HOME` to the toolkit prefix before reinstalling {cuda.ml}.",
+    "On Ubuntu, after adding NVIDIA's CUDA apt repository for your release:",
+    "`sudo apt install cuda-toolkit`",
+    "Falling back to a stub-only build."
+  )
+}
+
+cuml_find_uv <- function() {
+  uv <- Sys.which("uv")
+  if (nzchar(uv)) {
+    return(uv)
+  }
+  # Fallback: reticulate's internal lookup; do not assume it yields one string.
+  if (requireNamespace("reticulate", quietly = TRUE)) {
+    uv <- tryCatch(reticulate:::uv_binary(), error = function(e) "")
+    if (isTRUE(nzchar(uv)) && isTRUE(file.exists(uv))) {
+      return(uv)
+    }
+  }
+  # "" signals that no uv executable was found.
+  ""
+}
+
+cuml_installer_works <- function(command, args) {
+  # TRUE when `command args` runs and exits with status 0. A failing probe is an
+  # expected outcome here, so system2()'s non-zero-status warning is silenced.
+  # Errors raised while launching the command are reported as FALSE too.
+  tryCatch({
+    out <- suppressWarnings(system2(command, args, stdout = TRUE, stderr = TRUE))
+    status <- attr(out, "status", exact = TRUE)
+    is.null(status) || identical(status, 0L)
+  }, error = function(e) FALSE)
+}
+
+cuml_find_package_installer <- function() {
+  uv <- cuml_find_uv()
+  if (nzchar(uv) && cuml_installer_works(uv, "--version")) {
+    return(list(
+      type = "uv",
+      label = paste("uv", uv),
+      command = uv,
+      install_args = c("pip", "install")
+    ))
+  }
+  # Next, try `python` and then `python3`, each with a working pip module.
+  for (python in c(Sys.which("python"), Sys.which("python3"))) {
+    if (nzchar(python) && cuml_installer_works(python, c("-m", "pip", "--version"))) {
+      return(list(
+        type = "pip",
+        label = paste("python -m pip", python),
+        command = python,
+        install_args = c("-m", "pip", "install")
+      ))
+    }
+  }
+  # Finally, try standalone `pip` and then `pip3` executables.
+  for (pip in c(Sys.which("pip"), Sys.which("pip3"))) {
+    if (nzchar(pip) && cuml_installer_works(pip, "--version")) {
+      return(list(
+        type = "pip",
+        label = paste("pip", pip),
+        command = pip,
+        install_args = "install"
+      ))
+    }
+  }
+  # NULL means no working installer was found.
+  NULL
+}
+
+cuml_pip_packages <- function(cuda_suffix) {
+  # Exact-version requirement strings for the libcuml and cuda-cccl packages.
+  libcuml_spec <- paste0("libcuml-", cuda_suffix, "==", cuml_pip_version())
+  cccl_spec <- paste0("cuda-cccl==", cuml_cuda_cccl_version())
+  c(libcuml_spec, cccl_spec)
+}
+
+cuml_package_index_args <- function(installer) {
+  if (!identical(installer$type, "uv")) {
+    return(c("--extra-index-url", "https://pypi.nvidia.com"))
+  }
+  # uv is told to skip its config files and is given both indexes explicitly.
+  c(
+    "--no-config",
+    "--index", "https://pypi.nvidia.com",
+    "--default-index", "https://pypi.org/simple",
+    "--index-strategy", "unsafe-best-match"
+  )
+}
+
+cuml_package_install_args <- function(installer, target, packages) {
+  # Order: install subcommand, index flags, install options, package specs.
+  install_options <- c(
+    "--target", target,
+    "--only-binary", ":all:",
+    "--upgrade"
+  )
+  index_flags <- cuml_package_index_args(installer)
+  c(installer$install_args, index_flags, install_options, packages)
+}
+
+cuml_package_install_env <- function(installer) {
+  # Only uv gets extra environment settings; other installers get none.
+  if (!identical(installer$type, "uv")) {
+    return(character())
+  }
+  c(
+    "UV_NO_CONFIG=1",
+    "UV_INDEX_STRATEGY=unsafe-best-match"
+  )
+}
+
+cuml_package_install_command <- function(installer) {
+  if (!identical(installer$type, "uv")) {
+    return(installer$command)
+  }
+  # uv is launched through `env`; use its resolved path when available.
+  env_path <- unname(Sys.which("env"))
+  if (nzchar(env_path)) env_path else "env"
+}
+
+cuml_package_install_command_args <- function(installer, args) {
+  if (!identical(installer$type, "uv")) {
+    return(args)
+  }
+  # For uv: two `-u <VAR>` pairs, then the uv command, then its own arguments.
+  c("-u", "UV_EXCLUDE_NEWER", "-u", "UV_EXCLUDE_NEWER_PACKAGE", installer$command, args)
+}
+
+cuml_run_package_install <- function(installer, target, packages) {
+  # Installs `packages` into a fresh `target` directory; TRUE on exit status 0.
+  dir.create(dirname(target), recursive = TRUE, showWarnings = FALSE)
+  unlink(target, recursive = TRUE, force = TRUE)
+  args <- cuml_package_install_args(installer, target, packages)
+  env <- cuml_package_install_env(installer)
+  # system2() does not quote `args`; quote them so paths with spaces survive.
+  status <- system2(
+    cuml_package_install_command(installer),
+    shQuote(cuml_package_install_command_args(installer, args)),
+    env = env
+  )
+  identical(status, 0L)
+}
+
+copy_dir_contents <- function(src, dst) {
+  if (!dir.exists(src)) {
+    return(FALSE)
+  }
+  # Paths are shell-quoted because system2() passes `args` to the shell as-is.
+  dir.create(dst, recursive = TRUE, showWarnings = FALSE)
+  status <- system2("cp", c("-a", shQuote(file.path(src, ".")), shQuote(dst)))
+  identical(status, 0L)
+}
+
+extract_cuml_pip_prefix <- function(target, prefix) {
+  unlink(prefix, recursive = TRUE, force = TRUE)
+  dir.create(file.path(prefix, "include"), recursive = TRUE, showWarnings = FALSE)
+  dir.create(file.path(prefix, "lib"), recursive = TRUE, showWarnings = FALSE)
+  # Merge the headers and libraries of each listed package into the prefix.
+  for (pkg in c("libcuml", "libraft", "librmm", "rapids_logger")) {
+    copy_dir_contents(file.path(target, pkg, "include"), file.path(prefix, "include"))
+    # Libraries are taken from whichever of these subdirectories exist.
+    for (libdir in c("lib", "lib64", ".libs")) {
+      copy_dir_contents(file.path(target, pkg, libdir), file.path(prefix, "lib"))
+    }
+  }
+  # Then the include/ and lib/ trees under cuda/cccl/headers.
+  copy_dir_contents(
+    file.path(target, "cuda", "cccl", "headers", "include"),
+    file.path(prefix, "include")
+  )
+  copy_dir_contents(
+    file.path(target, "cuda", "cccl", "headers", "lib"),
+    file.path(prefix, "lib")
+  )
+  # Then include/ and lib/ of every entry found under nvidia/.
+  nvidia_dir <- file.path(target, "nvidia")
+  if (dir.exists(nvidia_dir)) {
+    for (component in list.files(nvidia_dir, full.names = TRUE)) {
+      copy_dir_contents(file.path(component, "include"), file.path(prefix, "include"))
+      copy_dir_contents(file.path(component, "lib"), file.path(prefix, "lib"))
+    }
+  }
+  # Then every top-level entry in the target whose name ends in `.libs`.
+  for (bundle_dir in list.files(target, pattern = "\\.libs$", full.names = TRUE)) {
+    copy_dir_contents(bundle_dir, file.path(prefix, "lib"))
+  }
+  # Individual copy results are not checked; the final prefix check decides.
+  check_libcuml_path(prefix)
+}
+
+bootstrap_libcuml_from_pip <- function(nvcc = find_nvcc(stop_if_missing = FALSE)) {
+  if (!cuml_bootstrap_enabled() || is.null(nvcc)) {
+    return(NA_character_)
+  }
+  # Only CUDA versions that map to a wheel suffix can be bootstrapped.
+  cuda_suffix <- cuml_cuda_suffix(nvcc$version)
+  if (is.na(cuda_suffix)) {
+    if (!can_download_libcuml(cuda_version = nvcc$version$major)) {
+      warning2(
+        paste0("Automatic RAPIDS pip bootstrap does not support CUDA ", nvcc$version, "."),
+        "Install RAPIDS cuML yourself and set `CUML_PREFIX`, or install a supported",
+        "CUDA toolkit and retry.",
+        "Falling back to a stub-only build."
+      )
+    }
+    return(NA_character_)
+  }
+  # Flag failure up front; each success path below resets the flag to "0".
+  Sys.setenv(CUML_BOOTSTRAP_FAILED = "1")
+
+  if (!cuml_nvidia_gpu_available()) {
+    warn_missing_nvidia_gpu()
+    return(NA_character_)
+  }
+  # Reuse a prefix from an earlier run if it already passes the path check.
+  prefix <- cuml_bootstrap_prefix(cuda_suffix)
+  if (check_libcuml_path(prefix)) {
+    Sys.setenv(CUML_BOOTSTRAP_FAILED = "0")
+    Sys.setenv(CUML_PREFIX = prefix)
+    return(prefix)
+  }
+
+  installer <- cuml_find_package_installer()
+  if (is.null(installer)) {
+    warning2(
+      "Unable to find a Python package installer for bootstrapping RAPIDS cuML.",
+      "Install `uv` or install Python with pip, then reinstall {cuda.ml}.",
+      "On Ubuntu, the Python fallback can be installed with:",
+      "`sudo apt install python3 python3-pip python3-venv`",
+      "Falling back to a stub-only build."
+    )
+    return(NA_character_)
+  }
+
+  target <- cuml_bootstrap_target(cuda_suffix)
+  packages <- cuml_pip_packages(cuda_suffix)
+
+  message(format_msg(
+    "Bootstrapping RAPIDS cuML from pip wheels.",
+    paste0("Installer: ", installer$label),
+    paste0("Packages: ", paste(packages, collapse = ", ")),
+    paste0("Prefix: ", prefix)
+  ))
+  # Install the wheels into the target directory.
+  if (!cuml_run_package_install(installer, target, packages)) {
+    args <- cuml_package_install_args(installer, target, packages)
+    warning2(
+      "Failed to install RAPIDS cuML pip wheels.",
+      "You can retry manually with:",
+      paste(
+        shQuote(cuml_package_install_command(installer)),
+        paste(shQuote(cuml_package_install_command_args(installer, args)), collapse = " ")
+      ),
+      "Or install RAPIDS yourself and set `CUML_PREFIX`.",
+      "Falling back to a stub-only build."
+    )
+    return(NA_character_)
+  }
+  # Copy headers and libraries out of the target into the prefix.
+  if (!extract_cuml_pip_prefix(target, prefix)) {
+    warning2(
+      "RAPIDS cuML pip wheels were installed, but the expected C/C++ headers",
+      "and shared libraries could not be extracted.",
+      "Install RAPIDS yourself and set `CUML_PREFIX`.",
+      "Falling back to a stub-only build."
+    )
+    return(NA_character_)
+  }
+  # The wheel target is deleted once the prefix has been populated.
+  unlink(target, recursive = TRUE, force = TRUE)
+  Sys.setenv(CUML_BOOTSTRAP_FAILED = "0")
+  Sys.setenv(CUML_PREFIX = prefix)
+  prefix
+}
diff --git a/tools/config/utils/cuml.R b/tools/config/utils/cuml.R
index bdb582e..c4fb594 100644
--- a/tools/config/utils/cuml.R
+++ b/tools/config/utils/cuml.R
@@ -1,6 +1,7 @@
 check_libcuml_path <- function(path) {
   cuml_headers_dir <- file.path(path, "include", "cuml")
-  dir.exists(cuml_headers_dir)
+  cuml_lib <- file.path(path, "lib", "libcuml++.so") # shared library must exist too
+  dir.exists(cuml_headers_dir) && file.exists(cuml_lib)
 }
 
 get_cuml_prefix <- function() {
@@ -25,6 +26,11 @@ get_cuml_prefix <- function() {
         return(cuml_prefix)
       }
 
+      cuml_prefix <- bootstrap_libcuml_from_pip()
+      if (!is.na(cuml_prefix)) {
+        return(cuml_prefix)
+      }
+
       # We will download a pre-built copy of `libcuml`
       return(NA_character_)
     }
@@ -33,30 +39,50 @@ get_cuml_prefix <- function() {
   return(cuml_prefix)
 }
 
-has_libcuml <- function() {
+has_libcuml <- function(nvcc = find_nvcc()) {
   # this is here to make sure we only proceed to automatically downloading if we
   # find a compatible nvcc version.
-  find_nvcc()
-
   cuml_prefix <- get_cuml_prefix()
   if (is.na(cuml_prefix)) {
-    # Skip subsequent checks if we are downloading a pre-built copy of `libcuml`
-    TRUE
+    if (identical(Sys.getenv("CUML_BOOTSTRAP_FAILED", unset = "0"), "1")) {
+      FALSE
+    } else if (can_download_libcuml(cuda_version = nvcc$version$major)) {
+      # Skip subsequent checks if we are downloading a pre-built copy of `libcuml`
+      TRUE
+    } else {
+      warning2(
+        "No `libcuml` installation has been found.",
+        paste0(
+          "No bundled `libcuml` download is available for CUDA ",
+          nvcc$version$major,
+          " and cuML ",
+          Sys.getenv("CUML_VERSION", unset = "21.08"),
+          "."
+        ),
+        "Falling back to a stub-only build."
+      )
+      FALSE
+    }
   } else {
     cuml_headers_dir <- file.path(cuml_prefix, "include", "cuml")
+    cuml_lib <- file.path(cuml_prefix, "lib", "libcuml++.so")
 
-    if (!dir.exists(cuml_headers_dir)) {
+    if (!check_libcuml_path(cuml_prefix)) {
+      missing_paths <- c(cuml_headers_dir, cuml_lib)
+      missing_paths <- missing_paths[!file.exists(missing_paths)]
       warning2(
-        paste0(cuml_headers_dir, " does not exist or is not a directory!"),
+        paste0("Invalid CUML_PREFIX: ", cuml_prefix),
+        paste0("Missing expected path(s): ", paste(missing_paths, collapse = ", ")),
         "",
         "{cuda.ml} requires a valid RAPIDS installation.",
-        "Please follow https://rapids.ai/start.html to install RAPIDS first"
+        "Please follow https://rapids.ai/start.html#get-rapids to install RAPIDS first"
       )
       warning2(
         "{cuda.ml} must be installed from an environment containing a valid",
         "CUML_PREFIX env variable such that \"${CUML_PREFIX}/include/cuml\"",
         "is the directory of RAPIDS cuML header files and \"${CUML_PREFIX}/lib\"",
-        "is the directory of RAPIDS cuML shared library files.)."
+        "is the directory of RAPIDS cuML shared library files. RAPIDS can be",
+        "installed with pip, conda, or from source."
       )
       FALSE
     } else {
@@ -65,13 +91,46 @@ has_libcuml <- function() {
   }
 }
 
+libcuml_download_url <- function(
+  cuml_version = Sys.getenv("CUML_VERSION", unset = "21.08"),
+  cuda_version = as.character(find_nvcc()$version$major)
+) {
+  url <- Sys.getenv("CUML_URL", unset = NA_character_)
+  if (!is.na(url) && nzchar(url)) {
+    return(url)
+  }
+  # Without a `CUML_URL` override, look the URL up in `libcuml_versions`.
+  version_urls <- libcuml_versions[[cuml_version]]
+  if (is.null(version_urls)) {
+    return(NA_character_)
+  }
+  # Pick the entry for this CUDA version; anything unusable becomes NA.
+  url <- version_urls[[as.character(cuda_version)]]
+  if (is.null(url) || length(url) != 1L || is.na(url) || !nzchar(url)) {
+    return(NA_character_)
+  }
+
+  url
+}
+
+can_download_libcuml <- function(
+  cuml_version = Sys.getenv("CUML_VERSION", unset = "21.08"),
+  cuda_version = as.character(find_nvcc()$version$major)
+) {
+  if (identical(Sys.getenv("DOWNLOAD_CUML", unset = "1"), "0")) {
+    return(FALSE)
+  }
+  # Otherwise a download is possible exactly when a URL can be resolved.
+  !is.na(libcuml_download_url(cuml_version, cuda_version))
+}
+
 download_libcuml <- function(cuml_version = Sys.getenv("CUML_VERSION", unset = "21.08")) {
   wd <- getwd()
   on.exit(setwd(wd))
   setwd(pkg_root())
 
-  if (Sys.getenv("DOWNLOAD_CUML", unset = 1) == 0) {
-    stop2("No `libcuml` installation has been found and downloading has been prevented by `CUML_NO_DOWNLOAD`.")
+  if (identical(Sys.getenv("DOWNLOAD_CUML", unset = "1"), "0")) {
+    stop2("No `libcuml` installation has been found and downloading has been prevented by `DOWNLOAD_CUML=0`.")
   }
 
   old_timeout <- getOption("timeout")
@@ -81,9 +140,19 @@ download_libcuml <- function(cuml_version = Sys.getenv("CUML_VERSION", unset = "
   tmp <- tempfile(fileext = ".zip")
   cuda_version <- as.character(find_nvcc()$version$major)
 
-  url <- Sys.getenv("CUML_URL")
-  if (!nzchar(url)) {
-    url <- libcuml_versions[[cuml_version]][[cuda_version]]
+  url <- libcuml_download_url(cuml_version, cuda_version)
+  if (is.na(url)) {
+    stop2(
+      "No `libcuml` installation has been found.",
+      paste0(
+        "No bundled `libcuml` download is available for CUDA ",
+        cuda_version,
+        " and cuML ",
+        cuml_version,
+        "."
+      ),
+      "Set `CUML_PREFIX` to an existing RAPIDS installation or set `CUML_URL` to a compatible `libcuml` archive."
+    )
   }
 
   download.file(url, tmp)
diff --git a/tools/config/utils/nvcc.R b/tools/config/utils/nvcc.R
index 2af4646..417cc8f 100644
--- a/tools/config/utils/nvcc.R
+++ b/tools/config/utils/nvcc.R
@@ -1,19 +1,27 @@
 find_nvcc <- function(stop_if_missing = TRUE) {
-  # Check if nvcc from path is available
-  nvcc_path <- "nvcc"
-  cuda_version <- nvcc_version_from_path(nvcc_path)
+  nvcc_candidates <- character()
 
-  # Check if nvcc from CUDA_HOME is available
+  # Prefer an explicit CUDA_HOME over PATH/default discovery.
   cuda_home <- Sys.getenv("CUDA_HOME")
-  if (nzchar(cuda_home) && is.null(cuda_version)) {
-    nvcc_path <- file.path(cuda_home, "bin", "nvcc")
-    cuda_version <- nvcc_version_from_path(nvcc_path)
+  if (nzchar(cuda_home)) {
+    nvcc_candidates <- c(nvcc_candidates, file.path(cuda_home, "bin", "nvcc"))
   }
 
-  # Check nvcc from default install location.
-  if (is.null(cuda_version)) {
-    nvcc_path <- "/usr/local/cuda/bin/nvcc"
-    cuda_version <- nvcc_version_from_path(nvcc_path)
+  nvcc_candidates <- unique(c(
+    nvcc_candidates,
+    "nvcc",
+    "/usr/local/cuda/bin/nvcc"
+  ))
+
+  nvcc_path <- NULL
+  cuda_version <- NULL
+  for (candidate in nvcc_candidates) {
+    version <- nvcc_version_from_path(candidate)
+    if (!is.null(version)) {
+      nvcc_path <- candidate
+      cuda_version <- version
+      break
+    }
   }
 
   # No nvcc found! Error!
@@ -56,3 +64,75 @@ nvcc_version_from_path <- function(nvcc) {
   version <- gsub(".*release |, V.*", "", nvcc[grepl("release", nvcc)])
   package_version(version)
 }
+
+nvcc_supported_architectures <- function(nvcc) {
+  out <- suppressWarnings(
+    tryCatch(system2(nvcc, "--list-gpu-arch", stdout = TRUE, stderr = TRUE), error = function(e) NULL)
+  )
+  if (is.null(out)) {
+    return(character())
+  }
+  # Keep lines of the form `compute_NN` and return the NN parts, deduplicated.
+  archs <- grep("^compute_[0-9]+$", out, value = TRUE)
+  unique(sub("^compute_", "", archs))
+}
+
+detected_gpu_architectures <- function() {
+  # Compute capabilities reported by nvidia-smi, as digit strings ("8.6" -> "86").
+  out <- suppressWarnings(
+    tryCatch(
+      system2(
+        "nvidia-smi",
+        c("--query-gpu=compute_cap", "--format=csv,noheader"),
+        stdout = TRUE,
+        stderr = TRUE
+      ),
+      error = function(e) NULL
+    )
+  )
+  # A failed query prints diagnostics that may contain version-like numbers.
+  if (is.null(out) || !is.null(attr(out, "status", exact = TRUE))) {
+    return(character())
+  }
+  caps <- grep("^[0-9]+\\.[0-9]+$", trimws(out), value = TRUE)
+  unique(gsub(".", "", caps, fixed = TRUE))
+}
+
+infer_cuda_architectures <- function(nvcc) {
+  supported <- nvcc_supported_architectures(nvcc$path)
+  detected <- detected_gpu_architectures()
+  compatible <- intersect(detected, supported)
+  # Case 1: at least one detected GPU architecture is supported by nvcc.
+  if (length(compatible) > 0) {
+    unsupported <- setdiff(detected, compatible)
+    if (length(unsupported) > 0) {
+      message(
+        "Ignoring GPU architectures unsupported by ",
+        nvcc$path,
+        ": ",
+        paste(unsupported, collapse = ";")
+      )
+    }
+    return(paste(compatible, collapse = ";"))
+  }
+  # Case 2: GPUs were detected but nvcc supports none of them.
+  if (length(detected) > 0 && length(supported) > 0) {
+    stop2(
+      paste0("Detected GPU architectures: ", paste(detected, collapse = ";")),
+      paste0("Architectures supported by ", nvcc$path, ": ", paste(supported, collapse = ";")),
+      "No detected GPU architecture is supported by this CUDA compiler.",
+      "Install a CUDA toolkit that supports your GPU, or set CUML_CUDA_ARCHITECTURES manually."
+    )
+  }
+  # Case 3: nothing detected; fall back to the first architecture nvcc lists.
+  if (length(supported) > 0) {
+    message(
+      "Unable to detect a GPU architecture; defaulting CMAKE_CUDA_ARCHITECTURES to ",
+      supported[[1]],
+      ". Set CUML_CUDA_ARCHITECTURES to override."
+    )
+    return(supported[[1]])
+  }
+  # Case 4: nvcc reported no architectures at all.
+  "NATIVE"
+}