2 changes: 2 additions & 0 deletions NAMESPACE
@@ -109,6 +109,8 @@ export(stratify)
export(superlearner)
export(test_intersection_sw)
export(test_zmax_onesided)
export(tune)
export(tuner)
import(Rcpp)
import(methods)
importFrom(R6,R6Class)
88 changes: 88 additions & 0 deletions R/tune.R
@@ -0,0 +1,88 @@
#' @title Hyperparameter tuning
#' @param model function returning an [ml_model] object
#' @param bounds named list of hyper-parameters to optimize over. Continuous
#'   parameters are given as numeric vectors of lower and upper bounds;
#'   categorical parameters as a list of possible values.
#' @param data data.frame
#' @param n_iter number of Bayesian optimization iterations
#' @param kappa exploration/exploitation trade-off parameter passed to
#'   [rBayesianOptimization::BayesianOptimization]
#' @param init_points number of randomly sampled initial points
#' @param nfolds number of folds to use in the cross-validation
#' @param model.score model scoring method (default MSE)
#' @param ... additional arguments to
#'   [rBayesianOptimization::BayesianOptimization]
#' @return list with the evaluation history (`history`), the estimated model
#'   with the best hyper-parameters (`model`), the best model score (`value`),
#'   and the best hyper-parameters (`best`)
#' @author Klaus Kähler Holst
#' @export
#' @aliases tune tuner
tune <- function(model, bounds, data,
n_iter = 15,
kappa = 2.576,
init_points = 4,
nfolds = 5,
model.score = targeted::scoring,
...) {
obj <- function(...) {
cl <- rlang::call_match(defaults = TRUE)
cl[1] <- expression(model)
f <- eval(cl)
res <- targeted::cv(list(f),
model.score = model.score,
nfolds = nfolds,
data = data,
silent = TRUE
)
list(Score = -coef(res)[1], Pred = 0)
}
bo.args <- c(list(
FUN = obj,
bounds = bounds,
init_points = init_points,
n_iter = n_iter,
kappa = kappa
), list(...))
arg <- names(bounds)
idx.cat <- which(unlist(lapply(bounds, is.list)))
## categorical parameters are specified as list elements of `bounds`
par.cat <- arg[idx.cat]
res <- c()
if (length(par.cat) == 0) {
op <- do.call(rBayesianOptimization::BayesianOptimization, bo.args)
res <- list(op)
} else {
par.cat.grid <- expand.grid(bounds[idx.cat])
bounds[idx.cat] <- NULL
bo.args$bounds <- bounds
colnames(par.cat.grid) <- par.cat
for (i in seq_len(nrow(par.cat.grid))) {
obj0 <- obj
formals(obj0)[par.cat] <- par.cat.grid[i, ]
bo.args$FUN <- obj0
op <- do.call(rBayesianOptimization::BayesianOptimization, bo.args)
for (j in seq_along(par.cat)) {
op$History[[par.cat[j]]] <-
par.cat.grid[i, j]
}
res <- c(res, list(op))
}
}
res <- Reduce(rbind, lapply(res, function(x) x$History)) |>
as.data.frame()
ii <- which.max(res$Value)
par.opt <- res[ii, arg, drop = TRUE]
mod <- do.call(model, par.opt)
mod$estimate(data)
return(list(
history = res,
model = mod,
value = -res$Value[ii],
best = par.opt
))
}

#' @export
tuner <- function(mod, bounds, data, ...) {
modwrap <- function(...) {
args <- list(...)
mod$args[names(args)] <- args
mod$description <- ""
return(mod)
}
tune(modwrap, bounds = bounds, data = data, ...)
}
100 changes: 100 additions & 0 deletions man/tune.Rd


83 changes: 83 additions & 0 deletions vignettes/test-tune.Rmd
@@ -0,0 +1,83 @@
---
title: "Test `tune` function"
output:
knitr:::html_vignette:
fig_caption: yes
fig_width: 5.15
fig_height: 3.5
fig_align: center
---

```{r, include = FALSE}
library(targeted)
```


We will use the Mayo Clinic Primary Biliary Cholangitis data:
```{r}
data(pbc, package = "survival")
```

In the following we are interested in predicting events (death or transplant)
within two years (730 days):
```{r}
pbc <- transform(pbc, y = (time < 730) * (status > 0))
```

A couple of benchmark models:
```{r}
m0 <- predictor_glm(y ~ 1)
m1 <- predictor_glm(y ~ age + sex + bili + edema + albumin)
## m2 <- predictor_nb(y ~ age + sex + bili + edema + albumin, kernel = TRUE)
cv(list(benchmark = m0, glm = m1), data = pbc)
```

Next we tune the hyper-parameters of an xgboost model with the `tuner`
function. As input we provide an `ml_model` object and a list of parameters to
optimize over, here defined by the `bounds` variable. For continuous
parameters (`eta`, `nrounds`) the lower and upper bounds are given as numeric
vectors; for categorical parameters (`max_depth`) the possible values are
given as the elements of a list. A grid search is performed over all
combinations of the categorical parameters, and for each combination a
Bayesian optimization routine is run over the continuous parameters with the
categorical parameters held fixed.
```{r}
mod <- predictor_xgboost_binary(
y ~ age + sex + bili + edema + albumin
)
bounds <- list(
eta = c(0.001, 1),
nrounds = c(1, 100),
  max_depth = list(1, 3, 6)
)
a <- tuner(mod, bounds, pbc, n_iter = 5)
a
```
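
When `bounds` contains only continuous parameters, no grid search is needed
and a single Bayesian optimization run is performed. A minimal sketch (not
evaluated here), reusing the `mod` object from above:
```{r, eval = FALSE}
## Continuous parameters only: a single Bayesian optimization run
bounds.cont <- list(
  eta = c(0.001, 1),
  nrounds = c(1, 100)
)
tuner(mod, bounds.cont, pbc, n_iter = 5)
```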

[OBS: I suggest this function should be implemented as an R6 class method,
allowing models to be tuned with the syntax `mod$tune(...)`]

Arguments can be given to control the underlying cross-validation assessment
and scoring method (the default is MSE).
The returned object contains all the function evaluations and argument values
that were explored (`history`), the model object with the hyper-parameters set
to the best identified values (`model`), the best model score (`value`, here
the MSE/Brier score), and the best hyper-parameters (`best`):
```{r}
names(a)
a$history
```
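
Additional arguments are forwarded to the cross-validation and the
optimization routine, for example the number of folds (`nfolds`) and the
number of initial random evaluations (`init_points`). A sketch (not evaluated
here):
```{r, eval = FALSE}
## More folds per evaluation and more random initialization points
a2 <- tuner(mod, bounds, pbc,
  n_iter = 5,
  nfolds = 10,      # folds used in each cross-validation assessment
  init_points = 8   # random points sampled before the optimization steps
)
```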

```{r}
cv(list(xgb = a$model, m0 = m0), pbc)
```
An alternative is to supply a model constructor function directly to `tune`:

```{r}
model <- function(eta, nrounds, max_depth) {
predictor_xgboost_binary(
y ~ age + sex + bili + edema + albumin,
eta = eta, nrounds = nrounds, max_depth = max_depth
)
}
b <- tune(model, bounds, pbc, n_iter = 2)
b
```
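
The constructor approach also makes it easy to hold some hyper-parameters
fixed while tuning the rest; a sketch (not evaluated here), assuming the same
data and model formula:
```{r, eval = FALSE}
## Fix max_depth and tune only the continuous parameters
model2 <- function(eta, nrounds) {
  predictor_xgboost_binary(
    y ~ age + sex + bili + edema + albumin,
    eta = eta, nrounds = nrounds, max_depth = 3
  )
}
tune(model2, list(eta = c(0.001, 1), nrounds = c(1, 100)), pbc, n_iter = 2)
```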