diff --git a/.Rbuildignore b/.Rbuildignore index eb4b7ca..22557f8 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -17,3 +17,7 @@ ^doc$ ^Meta$ ^.vscode$ +^\.DS_Store$ +^SomeFile\.diff$ +^src/.*\.(o|so|a)$ +^src/lib/.*\.(o|so|a)$ diff --git a/NAMESPACE b/NAMESPACE index b14b57c..dbf99cd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ S3method(rpf,recipe) S3method(str,rpf_forest) export(is_purified) export(predict_components) +export(preprocess_predictors_predict) export(purify) export(rpf) import(checkmate) diff --git a/R/predict_components.R b/R/predict_components.R index ba07a5f..a0d5161 100644 --- a/R/predict_components.R +++ b/R/predict_components.R @@ -87,7 +87,12 @@ predict_components <- function(object, new_data, max_interaction = NULL, predict } # Check if forest is purified, if not we do that now - if (!is_purified(object)) purify(object) + if (!is_purified(object)) { + # Purify using default policy: mode=2 (fast exact), + # maxp_interaction=0 (uncapped), + # nthreads defaults to min(training nthreads, available cores) + object$fit$purify_threads(0L, 0L, 2L) + } # If max_interaction is greater than number of predictors requested we need to adjust that max_interaction <- min(max_interaction, length(predictors)) diff --git a/R/predict_rpf.R b/R/predict_rpf.R index 35dfe06..b14faed 100644 --- a/R/predict_rpf.R +++ b/R/predict_rpf.R @@ -108,24 +108,27 @@ predict_rpf_prob <- function(object, new_data, ...) 
{ pred_prob <- 1 / (1 + exp(-pred_raw)) } else if (object$params$loss %in% c("L1", "L2")) { # Truncate probabilities at [0,1] for L1/L2 loss - pred_prob <- apply(pred_raw, 2, function(col) pmax(0, pmin(1, col))) + pred_prob <- pmax(0, pmin(1, pred_raw)) } - # Binary classif yields n x 1 prediction matrix, append complementary class prob + # Ensure a plain numeric vector in binary case + pred_prob <- as.numeric(pred_prob) + # Binary classif yields two columns ordered by outcome levels pred_prob <- cbind(1 - pred_prob, pred_prob) } else { # Multiclass if (object$params$loss %in% c("logit", "exponential")) { - # FIXME: - # softmax() defined in utils.R, should be identical to logit^-1 for - # binary case but not properly tested yet + # softmax for multi-class pred_prob <- softmax(pred_raw) } else if (object$params$loss %in% c("L1", "L2")) { - # Truncate probabilities at [0,1] for L1/L2 loss - pred_prob <- apply(pred_raw, 2, function(col) pmax(0, pmin(1, col))) - # Normalise such that sum of class probs is always 1 - pred_prob <- pred_prob/rowSums(pred_prob) + # Clamp to [0,1] and renormalize rows + pred_prob <- pmin(1, pmax(0, pred_raw)) + # pmin/pmax drop dimensions; restore matrix shape explicitly + dim(pred_prob) <- dim(pred_raw) + rs <- rowSums(pred_prob) + rs[!is.finite(rs) | rs <= 0] <- 1 + pred_prob <- pred_prob / rs } } @@ -140,7 +143,7 @@ predict_rpf_class <- function(object, new_data, ...) { pred_prob <- predict_rpf_prob(object, new_data, 0, ...) # For each instance, class with higher probability - pred_class <- factor(outcome_levels[max.col(pred_prob)], levels = outcome_levels) + pred_class <- factor(outcome_levels[max.col(as.matrix(pred_prob))], levels = outcome_levels) out <- hardhat::spruce_class(pred_class) out diff --git a/R/purify.R b/R/purify.R index 40baa39..62ca7e7 100644 --- a/R/purify.R +++ b/R/purify.R @@ -1,6 +1,6 @@ #' Purify a Random Planted Forest #' -#' TODO: Explain what this does +#' Purifies an rpf object. 
#' #' Unless [`rpf()`] is called with `purify = TRUE`, the forest has to be purified after fit #' to ensure the components extracted by [`predict_components()`] are valid. @@ -28,11 +28,28 @@ purify.default <- function(x, ...) { ) } +#' @param maxp_interaction integer or NULL: Only compute/store purified components +#' up to this interaction order. Higher-order purified trees are zeroed (not +#' computed) but still implicitly influence lower orders during purification. +#' If NULL, purify all orders (default behavior). +#' @param mode integer(1): Purification algorithm mode. 1 = legacy grid path +#' used by `fit$fit$purify()`; 2 = fast exact KD-tree based path. Defaults to 2. +#' @param nthreads integer or NULL: number of threads to use. If NULL, defaults +#' to min of the object's configured `nthreads` and available threads. #' @export #' @rdname purify #' @importFrom utils capture.output -purify.rpf <- function(x, ...) { - x$fit$purify() +purify.rpf <- function(x, ..., maxp_interaction = NULL, mode = 2L, nthreads = NULL) { + checkmate::assert_class(x, "rpf") + checkmate::assert_int(mode, lower = 1, upper = 2) + if (!is.null(nthreads)) checkmate::assert_int(nthreads, lower = 1) + if (is.null(maxp_interaction)) { + # Default: exact cut points, full interaction order + x$fit$purify_threads(0L, as.integer(if (is.null(nthreads)) 0L else nthreads), as.integer(mode)) + } else { + checkmate::assert_int(maxp_interaction, lower = 1) + x$fit$purify_threads(as.integer(maxp_interaction), as.integer(if (is.null(nthreads)) 0L else nthreads), as.integer(mode)) + } x } @@ -43,3 +60,5 @@ is_purified <- function(x) { checkmate::assert_class(x, "rpf") x$fit$is_purified() } + + diff --git a/R/rpf.R b/R/rpf.R index a331656..6a752dc 100644 --- a/R/rpf.R +++ b/R/rpf.R @@ -15,6 +15,10 @@ #' @param split_try `[10]`: Number of split points to be considered when choosing a split candidate. 
#' @param t_try `[0.4]`: A value in (0,1] specifying the proportion of viable split-candidates in each round. #' @param deterministic `[FALSE]`: Choose whether approach deterministic or random. +#' @param split_decay_rate `[0.1]`: Exponential decay factor for aging split-candidates. Possible splits are initiated with age=0. Whenever a possible split becomes a split_candidate (i.e. it has been drawn when drawing max(max_candidates , t_try * possible options ) times) it ages by +1. The age of the single split-candidate with minimal loss is reset to zero. Split_candidates are sampled from Possible_splits with weight exp(-split_decay_rate_ * age). A high split_decay_rate means faster aging. split_decay_rate=0 results in no aging and uniform sampling. +#' @param max_candidates `[50]`: Maximum number of split-candidates sampled per iteration. Number of split_candidates in each round is given by max(max_candidates , t_try * possible options). +#' @param delete_leaves `[TRUE]`: Whether to delete a parent leaf when splitting along an existing dimension. +#' @param split_structure `["leaves"]`: Defines the structure of a possible split and how to choose split_candidates. Can be one of "leaves", "hist", "cur_trees_1", "cur_trees_2", or "res_trees". Further details are given below. #' @param nthreads `[1L]`: Number of threads used for computation, defaulting to serial execution. #' @param purify `[FALSE]`: Whether the forest should be purified. #' Set to `TRUE` to enable components extract with [`predict_components()`] are valid. @@ -29,6 +33,9 @@ #' @param epsilon `[0.1]`: Only used if loss = `"logit"` or `"exponential"`. #' Proportion of class membership is truncated to be smaller 1-epsilon when calculating #' the fit in a leaf. +#' @param split_decay_rate `[0.1]`: Exponential decay factor lambda for aging split-candidates. A candidate's weight is `exp(-lambda * age)`.
+#' @param max_candidates `[50]`: Maximum number of split-candidates to sample at each node (will be clamped to `[1, #possible_splits]`). +#' @param delete_leaves `[TRUE]`: Whether to delete a parent leaf when splitting along an existing dimension. +#' @param ... (Unused). #' #' @return Object of class `"rpf"` with model object contained in `$fit`. @@ -39,6 +46,37 @@ #' @importFrom hardhat default_formula_blueprint #' @importFrom hardhat default_recipe_blueprint #' +#' @details +#' \subsection{splits}{ +#' The number of `splits` is the main tuning parameter affecting the accuracy of predictions. +#' } +#' \subsection{split_structure}{ +#' The `split_structure` argument controls how split candidates are constructed and sampled. +#' In each round, a `t_try` fraction (capped by `max_candidates`) is drawn +#' from the pool of all possible splits with weights `exp(-split_decay_rate * age)`. +#' +#' \describe{ +#' \item{leaves}{Split candidates are (leaf, split-dimension) pairs. For each sampled +#' candidate, `split_try` thresholds are drawn uniformly from the valid range within +#' that leaf and evaluated to choose the best split.} +#' +#' \item{cur_trees_1}{Split candidates are (current-tree, split-dimension) pairs. For each +#' sampled candidate, perform `split_try` evaluations. Each evaluation samples a leaf +#' from the set of valid current trees (with probability proportional to its number of +#' available thresholds) and then uniformly samples a single threshold within that leaf.} +#' +#' \item{cur_trees_2}{Split candidates are (current-tree, split-dimension) pairs. For each +#' sampled candidate, iterate through every +#' valid leaf. Within each leaf, sample `split_try` thresholds uniformly and +#' evaluate them.} +#' +#' \item{res_trees}{Split candidates are resulting trees.
For each sampled candidate, run +#' `split_try` evaluations by sampling a (split-dimension, leaf) pair from all valid +#' pairs (with probability proportional to its number of available thresholds), then +#' uniformly sampling one threshold within that pair.} +#' } +#' } +#' #' @examples #' # Regression with x and y #' rpfit <- rpf(x = mtcars[, c("cyl", "wt")], y = mtcars$mpg) @@ -63,16 +101,21 @@ rpf.default <- function(x, ...) { #' @export #' @rdname rpf rpf.data.frame <- function(x, y, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = TRUE, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, - loss = "L2", delta = 0, epsilon = 0.1, ...) { + loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", export_forest = FALSE, ...) { + split_structure <- match.arg(split_structure, c("leaves", "res_trees", "cur_trees_2", "cur_trees_1", "hist")) blueprint <- hardhat::default_xy_blueprint(intercept = FALSE) processed <- hardhat::mold(x, y, blueprint = blueprint) rpf_bridge( processed, max_interaction, ntrees, splits, - split_try, t_try, deterministic, + split_try, t_try, split_decay_rate, max_candidates, delete_leaves, deterministic, nthreads, purify, cv, - loss, delta, epsilon + loss, delta, epsilon, + split_structure = split_structure, export_forest = export_forest ) } @@ -80,32 +123,42 @@ rpf.data.frame <- function(x, y, max_interaction = 1, ntrees = 50, splits = 30, #' @export #' @rdname rpf rpf.matrix <- function(x, y, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = TRUE, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, - loss = "L2", delta = 0, epsilon = 0.1, ...) 
{ + loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", export_forest = FALSE, ...) { + split_structure <- match.arg(split_structure, c("leaves", "res_trees", "cur_trees_2", "cur_trees_1", "hist")) blueprint <- hardhat::default_xy_blueprint(intercept = FALSE) processed <- hardhat::mold(x, y, blueprint = blueprint) rpf_bridge( processed, max_interaction, ntrees, splits, - split_try, t_try, deterministic, + split_try, t_try, split_decay_rate, max_candidates, delete_leaves, deterministic, nthreads, purify, cv, - loss, delta, epsilon + loss, delta, epsilon, + split_structure = split_structure, export_forest = export_forest )} # Formula method #' @export #' @rdname rpf rpf.formula <- function(formula, data, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = TRUE, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, - loss = "L2", delta = 0, epsilon = 0.1, ...) { + loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", export_forest = FALSE, ...) 
{ + split_structure <- match.arg(split_structure, c("leaves", "res_trees", "cur_trees_2", "cur_trees_1", "hist")) blueprint <- hardhat::default_formula_blueprint(intercept = FALSE, indicators = "none") processed <- hardhat::mold(formula, data, blueprint = blueprint) rpf_bridge( processed, max_interaction, ntrees, splits, - split_try, t_try, deterministic, + split_try, t_try, split_decay_rate, max_candidates, delete_leaves, deterministic, nthreads, purify, cv, - loss, delta, epsilon + loss, delta, epsilon, + split_structure = split_structure, export_forest = export_forest ) } @@ -113,16 +166,21 @@ rpf.formula <- function(formula, data, max_interaction = 1, ntrees = 50, splits #' @export #' @rdname rpf rpf.recipe <- function(x, data, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = TRUE, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, - loss = "L2", delta = 0, epsilon = 0.1, ...) { + loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", export_forest = FALSE, ...) 
{ + split_structure <- match.arg(split_structure, c("leaves", "res_trees", "cur_trees_2", "cur_trees_1", "hist")) blueprint <- hardhat::default_recipe_blueprint(intercept = FALSE) processed <- hardhat::mold(x, data, blueprint = blueprint) rpf_bridge( processed, max_interaction, ntrees, splits, - split_try, t_try, deterministic, + split_try, t_try, split_decay_rate, max_candidates, delete_leaves, deterministic, nthreads, purify, cv, - loss, delta, epsilon + loss, delta, epsilon, + split_structure = split_structure, export_forest = export_forest ) } @@ -131,9 +189,13 @@ rpf.recipe <- function(x, data, max_interaction = 1, ntrees = 50, splits = 30, #' @param processed Output of `hardhat::mold` from respective rpf methods #' @importFrom hardhat validate_outcomes_are_univariate rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = TRUE, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, - loss = "L2", delta = 0, epsilon = 0.1) { + loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", export_forest = FALSE) { + split_structure <- match.arg(split_structure, c("leaves", "res_trees", "cur_trees_2", "cur_trees_1", "hist")) hardhat::validate_outcomes_are_univariate(processed$outcomes) predictors <- preprocess_predictors_fit(processed) outcomes <- preprocess_outcome(processed, loss) @@ -141,7 +203,7 @@ rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, # Check arguments checkmate::assert_int(max_interaction, lower = 0) - + # rewrite max_interaction so 0 -> "maximum", e.g. 
ncol(X) if (max_interaction == 0) { max_interaction <- p @@ -156,10 +218,13 @@ rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, checkmate::assert_int(ntrees, lower = 1) checkmate::assert_int(splits, lower = 1) checkmate::assert_int(split_try, lower = 1) - + checkmate::assert_int(max_candidates, lower = 1) + checkmate::assert_number(t_try, lower = 0, upper = 1) checkmate::assert_number(delta, lower = 0, upper = 1) checkmate::assert_number(epsilon, lower = 0, upper = 1) + checkmate::assert_number(split_decay_rate, lower = 0) + # "median" loss is implemented but discarded loss_functions <- switch(outcomes$mode, @@ -172,18 +237,26 @@ rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, checkmate::assert_int(nthreads, lower = 1L) checkmate::assert_flag(purify) checkmate::assert_flag(cv) + checkmate::assert_flag(delete_leaves) + checkmate::assert_choice(split_structure, choices = c("res_trees", "cur_trees_2", "cur_trees_1", "leaves", "hist")) + fit <- rpf_impl( Y = outcomes$outcomes, X = predictors$predictors_matrix, mode = outcomes$mode, max_interaction = max_interaction, ntrees = ntrees, splits = splits, - split_try = split_try, t_try = t_try, deterministic = deterministic, + split_try = split_try, t_try = t_try, split_decay_rate = split_decay_rate, max_candidates = max_candidates, delete_leaves=delete_leaves, deterministic = deterministic, nthreads = nthreads, purify = purify, cv = cv, - loss = loss, delta = delta, epsilon = epsilon + loss = loss, delta = delta, epsilon = epsilon, + split_structure = split_structure ) - forest <- fit$get_model() - class(forest) <- "rpf_forest" + # Optionally export a compact R list representation of the forest. 
+ forest <- NULL + if (isTRUE(export_forest)) { + forest <- fit$get_model() + class(forest) <- "rpf_forest" + } new_rpf( fit = fit, @@ -195,7 +268,12 @@ rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, ntrees = ntrees, max_interaction = max_interaction, splits = splits, - split_try = split_try, t_try = t_try, + split_try = split_try, + t_try = t_try, + split_decay_rate = split_decay_rate, + max_candidates = max_candidates, + delete_leaves = delete_leaves, + split_structure = split_structure, delta = delta, epsilon = epsilon, deterministic = deterministic, nthreads = nthreads, purify = purify, cv = cv @@ -217,21 +295,31 @@ new_rpf <- function(fit, blueprint, ...) { # Main fitting function and interface to C++ implementation rpf_impl <- function(Y, X, mode = c("regression", "classification"), max_interaction = 1, ntrees = 50, splits = 30, split_try = 10, t_try = 0.4, - deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, - loss = "L2", delta = 0, epsilon = 0.1) { + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, split_decay_rate = 0.1, max_candidates = 50, delete_leaves = TRUE, + loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves") { # Final input validation, should be superfluous checkmate::assert_matrix(X, mode = "numeric", any.missing = FALSE) mode <- match.arg(mode) + split_structure <- match.arg(split_structure, c("leaves", "res_trees", "cur_trees_2", "cur_trees_1", "hist")) + # map split_structure string to numeric mode for C++ + split_mode <- switch(split_structure, + res_trees = 0L, + cur_trees_2 = 1L, + cur_trees_1 = 2L, + leaves = 3L, + hist = 4L + ) if (mode == "classification") { fit <- new(ClassificationRPF, Y, X, loss, c( max_interaction, ntrees, splits, split_try, t_try, - purify, deterministic, nthreads, cv, delta, epsilon + purify, deterministic, nthreads, cv, split_decay_rate, max_candidates, delete_leaves, split_mode, delta, epsilon )) } else if (mode == "regression") { 
fit <- new(RandomPlantedForest, Y, X, c( - max_interaction, ntrees, splits, split_try, t_try, - purify, deterministic, nthreads, cv + max_interaction, ntrees, splits, split_try, t_try, + purify, deterministic, nthreads, cv, split_decay_rate, max_candidates, delete_leaves, split_mode )) } diff --git a/R/utils.R b/R/utils.R index c04fc14..df2f979 100644 --- a/R/utils.R +++ b/R/utils.R @@ -115,8 +115,26 @@ preprocess_predictors_fit <- function(processed) { ) } -# Sort factor predictors using stored level information -# Used in predict_rpf_bridge() +#' Preprocess predictors for prediction +#' +#' Convert logical and character columns to appropriate types, re-order factor +#' levels to match the ordering learned during fitting (stored in +#' `object$factor_levels`), re-encode factor columns as integers, and return a +#' numeric matrix suitable for the underlying C++ prediction routines. +#' +#' This is primarily an internal utility used by `predict()` methods but is +#' exported to support advanced users and tests. +#' +#' @param object An object of class `rpf` returned by [`rpf()`]. +#' @param predictors A data frame or matrix of predictor values to preprocess. +#' +#' @return A numeric matrix with the same number of rows as `predictors`. 
+#' @export +#' @examples +#' rpfit <- rpf(x = mtcars[, c("cyl", "wt")], y = mtcars$mpg) +#' processed <- hardhat::forge(mtcars[, c("cyl", "wt")], rpfit$blueprint) +#' X <- preprocess_predictors_predict(rpfit, processed$predictors) +#' dim(X) preprocess_predictors_predict <- function(object, predictors) { predictors <- as.data.table(predictors) diff --git a/man/preprocess_predictors_predict.Rd b/man/preprocess_predictors_predict.Rd new file mode 100644 index 0000000..9059b94 --- /dev/null +++ b/man/preprocess_predictors_predict.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{preprocess_predictors_predict} +\alias{preprocess_predictors_predict} +\title{Preprocess predictors for prediction} +\usage{ +preprocess_predictors_predict(object, predictors) +} +\arguments{ +\item{object}{An object of class \code{rpf} returned by \code{\link[=rpf]{rpf()}}.} + +\item{predictors}{A data frame or matrix of predictor values to preprocess.} +} +\value{ +A numeric matrix with the same number of rows as \code{predictors}. +} +\description{ +Convert logical and character columns to appropriate types, re-order factor +levels to match the ordering learned during fitting (stored in +\code{object$factor_levels}), re-encode factor columns as integers, and return a +numeric matrix suitable for the underlying C++ prediction routines. +} +\details{ +This is primarily an internal utility used by \code{predict()} methods but is +exported to support advanced users and tests. +} +\examples{ +rpfit <- rpf(x = mtcars[, c("cyl", "wt")], y = mtcars$mpg) +processed <- hardhat::forge(mtcars[, c("cyl", "wt")], rpfit$blueprint) +X <- preprocess_predictors_predict(rpfit, processed$predictors) +dim(X) +} diff --git a/man/purify.Rd b/man/purify.Rd index 79adf5a..5856f5f 100644 --- a/man/purify.Rd +++ b/man/purify.Rd @@ -11,7 +11,7 @@ purify(x, ...) \method{purify}{default}(x, ...) -\method{purify}{rpf}(x, ...) 
+\method{purify}{rpf}(x, ..., maxp_interaction = NULL, mode = 2L, nthreads = NULL) is_purified(x) } @@ -19,12 +19,23 @@ is_purified(x) \item{x}{And object of class \code{rpf}.} \item{...}{(Unused)} + +\item{maxp_interaction}{integer or NULL: Only compute/store purified components +up to this interaction order. Higher-order purified trees are zeroed (not +computed) but still implicitly influence lower orders during purification. +If NULL, purify all orders (default behavior).} + +\item{mode}{integer(1): Purification algorithm mode. 1 = legacy grid path +used by \code{fit$fit$purify()}; 2 = fast exact KD-tree based path. Defaults to 2.} + +\item{nthreads}{integer or NULL: number of threads to use. If NULL, defaults +to min of the object's configured \code{nthreads} and available threads.} } \value{ Invisibly: The \code{\link{rpf}} object. } \description{ -TODO: Explain what this does +Purifies an rpf object. } \details{ Unless \code{\link[=rpf]{rpf()}} is called with \code{purify = TRUE}, the forest has to be purified after fit diff --git a/man/rpf.Rd b/man/rpf.Rd index 0e323d8..14b4c98 100644 --- a/man/rpf.Rd +++ b/man/rpf.Rd @@ -18,6 +18,9 @@ rpf(x, ...) splits = 30, split_try = 10, t_try = 0.4, + split_decay_rate = 0.1, + max_candidates = 50, + delete_leaves = TRUE, deterministic = FALSE, nthreads = 1, purify = FALSE, @@ -25,6 +28,7 @@ rpf(x, ...) loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", ... ) @@ -36,6 +40,9 @@ rpf(x, ...) splits = 30, split_try = 10, t_try = 0.4, + split_decay_rate = 0.1, + max_candidates = 50, + delete_leaves = TRUE, deterministic = FALSE, nthreads = 1, purify = FALSE, @@ -43,6 +50,7 @@ rpf(x, ...) loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", ... ) @@ -54,6 +62,9 @@ rpf(x, ...) splits = 30, split_try = 10, t_try = 0.4, + split_decay_rate = 0.1, + max_candidates = 50, + delete_leaves = TRUE, deterministic = FALSE, nthreads = 1, purify = FALSE, @@ -61,6 +72,7 @@ rpf(x, ...) 
loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", ... ) @@ -72,6 +84,9 @@ rpf(x, ...) splits = 30, split_try = 10, t_try = 0.4, + split_decay_rate = 0.1, + max_candidates = 50, + delete_leaves = TRUE, deterministic = FALSE, nthreads = 1, purify = FALSE, @@ -79,6 +94,7 @@ rpf(x, ...) loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", ... ) } @@ -105,6 +121,12 @@ this is equivalent to setting \code{max_interaction = 10}.} \item{t_try}{\verb{[0.4]}: A value in (0,1] specifying the proportion of viable split-candidates in each round.} +\item{split_decay_rate}{\verb{[0.1]}: Exponential decay factor λ for aging split-candidates. A candidate’s weight is \verb{exp(−λ * age)}.} + +\item{max_candidates}{\verb{[50]}: Maximum number of split‐candidates to sample at each node (will be clamped to \verb{[1, #possible_splits]}).} + +\item{delete_leaves}{\verb{[1]}: Whether parents should be deleted if split is an existing coordinate} + \item{deterministic}{\verb{[FALSE]}: Choose whether approach deterministic or random.} \item{nthreads}{\verb{[1L]}: Number of threads used for computation, defaulting to serial execution.} @@ -127,6 +149,8 @@ the loss to determine the optimal split.} Proportion of class membership is truncated to be smaller 1-epsilon when calculating the fit in a leaf.} +\item{split_structure}{\verb{["leaves"]}: Defines the structure of a possible split and how to choose split_candidates. Can be one of "leaves", "hist", "cur_trees_1", "cur_trees_2", or "res_trees". Further details are given below.} + \item{formula}{Formula specification, e.g. y ~ x1 + x2.} } \value{ @@ -135,6 +159,37 @@ Object of class \code{"rpf"} with model object contained in \verb{$fit}. \description{ Random Planted Forest } +\details{ +\subsection{splits}{ +The number of \code{splits} is the main tuning parameter affecting the accuracy of predictions. 
+} +\subsection{split_structure}{ +The \code{split_structure} argument controls how split candidates are constructed and sampled. +In each round, a \code{t_try} fraction (capped by \code{max_candidates}) is drawn +from the pool of all possible splits with weights \code{exp(-split_decay_rate * age)}. + +\describe{ +\item{leaves}{Split candidates are (leaf, split-dimension) pairs. For each sampled +candidate, \code{split_try} thresholds are drawn uniformly from the valid range within +that leaf and evaluated to choose the best split.} + +\item{cur_trees_1}{Split candidates are (current-tree, split-dimension) pairs. For each +sampled candidate, perform \code{split_try} evaluations. Each evaluation samples a leaf +from the set of valid current trees (with probability proportional to its number of +available thresholds) and then uniformly samples a single threshold within that leaf.} + +\item{cur_trees_2}{Split candidates are (current-tree, split-dimension) pairs. For each +sampled candidate, iterate through every +valid leaf. Within each leaf, sample \code{split_try} thresholds uniformly and +evaluate them.} + +\item{res_trees}{Split candidates are resulting trees. 
For each sampled candidate, run +\code{split_try} evaluations by sampling a (split-dimension, leaf) pair from all valid +pairs (with probability proportional to its number of available thresholds), then +uniformly sampling one threshold within that pair.} +} +} +} \examples{ # Regression with x and y rpfit <- rpf(x = mtcars[, c("cyl", "wt")], y = mtcars$mpg) diff --git a/src/Makevars b/src/Makevars index 71ae982..ffe2fba 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,8 +1,9 @@ -SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp randomPlantedForest.cpp RcppExports.cpp +SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp lib/internal_utils.cpp lib/splits_leaves.cpp lib/splits_cur_trees_2.cpp lib/splits_cur_trees_1.cpp lib/splits_res_trees.cpp lib/splits_hist.cpp lib/predict.cpp lib/training.cpp lib/purify.cpp lib/losses_l1_l2_median.cpp lib/losses_logit.cpp lib/losses_exponential.cpp randomPlantedForest.cpp RcppExports.cpp OBJECTS = $(SOURCES:.cpp=.o) PKG_CPPFLAGS=-I./include -I./lib +PKG_CXXFLAGS = -DNDEBUG all: $(SHLIB) diff --git a/src/include/cpf.hpp b/src/include/cpf.hpp index 1c913ce..f993f95 100644 --- a/src/include/cpf.hpp +++ b/src/include/cpf.hpp @@ -9,7 +9,7 @@ class ClassificationRPF : public RandomPlantedForest public: using RandomPlantedForest::calcOptimalSplit; ClassificationRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, - const String loss = "L2", const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0, 0, 0.1}); + const String loss = "L2", const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0, 0.1, 0, 0.1, 50,1}); void set_parameters(StringVector keys, NumericVector values); ~ClassificationRPF(){}; @@ -33,9 +33,44 @@ class ClassificationRPF : public RandomPlantedForest void (ClassificationRPF::*calcLoss)(Split &); void create_tree_family(std::vector initial_leaves, size_t n) override; void fit() override; - Split calcOptimalSplit(const std::vector> &Y, const 
std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family, - std::vector> &weights); + Split calcOptimalSplit( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_splits, + TreeFamily& curr_family, + std::vector>& weights) ; + // Mode-specific split calculators (classification versions using calcLoss and weights) + Split calcOptimalSplit_leaves( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_splits, + TreeFamily& curr_family, + std::vector>& weights); + Split calcOptimalSplit_curTrees1( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_splits, + TreeFamily& curr_family, + std::vector>& weights); + Split calcOptimalSplit_curTrees2( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_splits, + TreeFamily& curr_family, + std::vector>& weights); + // Mode 4: histogram-binned (classification variant) + Split calcOptimalSplit_hist( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_splits, + TreeFamily& curr_family, + std::vector>& weights); + Split calcOptimalSplit_resTrees( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_trees, + TreeFamily& curr_family, + std::vector>& weights); void L1_loss(Split &split); void median_loss(Split &split); void logit_loss(Split &split); @@ -47,4 +82,4 @@ class ClassificationRPF : public RandomPlantedForest void exponential_loss_3(Split &split); }; -#endif \ No newline at end of file +#endif diff --git a/src/include/diffbuf.hpp b/src/include/diffbuf.hpp new file mode 100644 index 0000000..c16fabc --- /dev/null +++ b/src/include/diffbuf.hpp @@ -0,0 +1,93 @@ +#ifndef RPF_DIFFBUF_HPP +#define RPF_DIFFBUF_HPP + +#include +#include +#include + +// N-D difference buffer for axis-aligned rectangular range updates +// and reconstruction via inclusive prefix scans along each dimension. 
+ +namespace rpf_diff +{ + template + struct NDArray + { + std::vector dims; // logical dimensions + std::vector data; // flat row-major data + + NDArray() {} + explicit NDArray(const std::vector &d, const T &init = T()) : dims(d) + { + size_t n = 1; for (int v : d) n *= (size_t)v; data.assign(n, init); + } + + inline size_t offset(const std::vector &idx) const + { + size_t off = 0; size_t stride = 1; + for (size_t k = 0; k < dims.size(); ++k) + { + off += (size_t)idx[k] * stride; stride *= (size_t)dims[k]; + } + return off; + } + + inline T &at(const std::vector &idx) { return data[offset(idx)]; } + inline const T &at(const std::vector &idx) const { return data[offset(idx)]; } + }; + + // Apply a constant add v onto a closed-open hyper-rectangle [lo, hi) via difference corners + template + void add_rect(NDArray &diff, const std::vector &lo, const std::vector &hi, const T &v) + { + const size_t d = diff.dims.size(); + // iterate over 2^d corners + std::vector corner(d, 0); + for (;;) { + int flips = 0; for (size_t k = 0; k < d; ++k) if (corner[k]) ++flips; + T sign = (flips % 2 == 0) ? v : (v * (-1)); + std::vector idx(d); + for (size_t k = 0; k < d; ++k) idx[k] = corner[k] ? 
hi[k] : lo[k]; + diff.at(idx) += sign; + size_t pos = 0; + while (pos < d) { if (corner[pos] == 0) { corner[pos] = 1; break; } corner[pos] = 0; ++pos; } + if (pos == d) break; + } + } + + // Inclusive prefix scan along each dimension in-place (converts diff -> values) + template + void inclusive_scan_inplace(NDArray &arr) + { + const size_t d = arr.dims.size(); + if (d == 0) return; + for (size_t axis = 0; axis < d; ++axis) + { + // number of slabs orthogonal to axis + size_t nslab = 1; for (size_t k = 0; k < d; ++k) if (k != axis) nslab *= (size_t)arr.dims[k]; + std::vector slab_idx(d, 0); + for (size_t s = 0; s < nslab; ++s) + { + // decode slab index into coordinates for all dims except axis + size_t tmp = s; + for (size_t k = 0; k < d; ++k) + { + if (k == axis) continue; + slab_idx[k] = (int)(tmp % (size_t)arr.dims[k]); + tmp /= (size_t)arr.dims[k]; + } + // inclusive scan along axis + std::vector run = slab_idx; run[axis] = 0; + T acc = arr.at(run); acc -= acc; // zero of correct shape + for (int t = 0; t < arr.dims[axis]; ++t) + { + run[axis] = t; acc += arr.at(run); arr.at(run) = acc; + } + } + } + } +} + +#endif // RPF_DIFFBUF_HPP + + diff --git a/src/include/grid.hpp b/src/include/grid.hpp index 4b21af5..e7653da 100644 --- a/src/include/grid.hpp +++ b/src/include/grid.hpp @@ -39,4 +39,4 @@ namespace grid }; }; -#endif \ No newline at end of file +#endif diff --git a/src/include/internal_utils.hpp b/src/include/internal_utils.hpp new file mode 100644 index 0000000..e1ce50c --- /dev/null +++ b/src/include/internal_utils.hpp @@ -0,0 +1,79 @@ +// Internal utility helpers extracted from rpf.cpp to declutter large files. +// Kept minimal and header-only where templating is required. 
+ +#ifndef INTERNAL_UTILS_HPP +#define INTERNAL_UTILS_HPP + +#include +#include +#include +#include +#include +#include +#include + +#include "trees.hpp" + +namespace rpf_utils { + +// RNG helpers +double rng_runif01(); +double rng_runif(double a, double b); +int rng_randint(int left_inclusive, int right_exclusive); +// Swap the thread-local RNG pointer; returns previous pointer +std::mt19937_64* swap_tls_rng(std::mt19937_64* new_ptr); + +// Leaf/order/prefix helpers +void ensure_order_and_sorted_vals_for_leaf( + const std::vector> &X, + Leaf &leaf, + int k, + std::vector &order_out, + std::vector &sorted_vals_out); + +std::vector compute_unique_sorted_values(const std::vector &sorted_vals); + +void build_prefix_and_total_given_order( + const std::vector> &Y, + const Leaf &leaf, + const std::vector &order, + size_t value_size, + std::vector> &prefix_out, + std::vector &total_out); + +void finalize_split_from_sums( + Split &winner, + const std::vector> &X, + size_t value_size); + +// Sampling helpers +std::vector sample_weighted_indices_filtered( + const std::vector &weights, + size_t n_candidates); + +std::vector compute_even_spread_indices(int left_inclusive, int right_exclusive, size_t max_draws); +std::vector sample_unique_ints_uniform_R(int left_inclusive, int right_exclusive, size_t k); + +// Fenwick helpers used by cur_trees_1 sampling cache +void fenwick_add(std::vector &bit, size_t idx1, double delta); +size_t fenwick_find_by_prefix(const std::vector &bit, double target); + +// Aging helper must be header (templated) +template +inline void age_pool_by_sample(const std::vector &sample_idxs, int best_idx, std::vector &pool) +{ + for (size_t idx : sample_idxs) { + if (static_cast(idx) != best_idx) pool[idx].age += 1.0; else pool[idx].age = 0.0; + } +} + +} // namespace rpf_utils + +// Thread-local working-set bin cache used by histogram split mode (mode 4). 
+// Declared here so multiple translation units (e.g., rpf.cpp and splits_hist.cpp) +// can share the same cache during a tree-family build. +extern thread_local std::vector> tls_working_bin_id; + +#endif // INTERNAL_UTILS_HPP + + diff --git a/src/include/kdtree.hpp b/src/include/kdtree.hpp new file mode 100644 index 0000000..3029998 --- /dev/null +++ b/src/include/kdtree.hpp @@ -0,0 +1,178 @@ +#ifndef RPF_KDTREE_HPP +#define RPF_KDTREE_HPP + +#include +#include +#include +#include + +// Lightweight KD-tree for orthogonal range counts. +// - Header-only to avoid build system changes +// - Supports arbitrary dimensionality +// - Query provides constraints only for a subset of dimensions; others are unconstrained + +namespace rpf_kd +{ + struct RangeConstraint + { + int dim; // 0-based feature index + double left; // inclusive lower bound + double right; // exclusive upper bound + }; + + namespace detail + { + struct Node + { + // Bounding box for quick acceptance/rejection + std::vector minv; + std::vector maxv; + int axis = -1; // split axis; -1 means leaf + double split_value = 0; // split threshold + size_t size = 0; // number of points in subtree + std::unique_ptr left; + std::unique_ptr right; + std::vector idxs; // indices when leaf + }; + } + + class KDTree + { + public: + KDTree() = default; + + KDTree(const std::vector> *X_ptr, + const std::vector &all_indices, + int dims, + size_t leaf_size = 32) + { + build(X_ptr, all_indices, dims, leaf_size); + } + + void build(const std::vector> *X_ptr, + const std::vector &all_indices, + int dims, + size_t leaf_size = 32) + { + X_ = X_ptr; + dims_ = dims; + leaf_size_ = leaf_size; + root_ = build_recursive(all_indices); + } + + // Count number of points with constraints on a subset of dims + size_t range_count(const std::vector &constraints) const + { + return range_count_recursive(root_.get(), constraints); + } + + private: + const std::vector> *X_ = nullptr; + int dims_ = 0; + size_t leaf_size_ = 32; + 
std::unique_ptr root_; + + std::unique_ptr build_recursive(const std::vector &idxs) + { + auto node = std::make_unique(); + node->size = idxs.size(); + node->minv.assign(dims_, std::numeric_limits::infinity()); + node->maxv.assign(dims_, -std::numeric_limits::infinity()); + for (int i : idxs) + { + for (int d = 0; d < dims_; ++d) + { + double v = (*X_)[i][d]; + if (v < node->minv[d]) node->minv[d] = v; + if (v > node->maxv[d]) node->maxv[d] = v; + } + } + + if (idxs.size() <= leaf_size_) + { + node->axis = -1; node->idxs = idxs; return node; + } + + // Choose split axis by widest spread + int axis = 0; double best_span = -1.0; + for (int d = 0; d < dims_; ++d) + { + double span = node->maxv[d] - node->minv[d]; + if (span > best_span) { best_span = span; axis = d; } + } + node->axis = axis; + + // Median split on chosen axis + std::vector left_idxs, right_idxs; left_idxs.reserve(idxs.size()); right_idxs.reserve(idxs.size()); + std::vector tmp = idxs; + size_t mid = tmp.size() / 2; + std::nth_element(tmp.begin(), tmp.begin() + mid, tmp.end(), [&](int a, int b){ return (*X_)[a][axis] < (*X_)[b][axis]; }); + double split = (*X_)[tmp[mid]][axis]; + node->split_value = split; + for (int i : idxs) + { + if ((*X_)[i][axis] < split) left_idxs.push_back(i); else right_idxs.push_back(i); + } + if (left_idxs.empty() || right_idxs.empty()) + { + // Fallback: make leaf if degenerate split + node->axis = -1; node->idxs = idxs; return node; + } + node->left = build_recursive(left_idxs); + node->right = build_recursive(right_idxs); + return node; + } + + static inline bool box_outside(const std::vector &minv, const std::vector &maxv, + const std::vector &C) + { + for (const auto &rc : C) + { + if (maxv[rc.dim] <= rc.left) return true; + if (minv[rc.dim] >= rc.right) return true; + } + return false; + } + + static inline bool box_inside(const std::vector &minv, const std::vector &maxv, + const std::vector &C) + { + for (const auto &rc : C) + { + if (minv[rc.dim] < rc.left) return 
false; + if (maxv[rc.dim] > rc.right) return false; + } + return true; + } + + size_t range_count_recursive(const detail::Node *node, const std::vector &C) const + { + if (!node) return 0; + if (!C.empty()) + { + if (box_outside(node->minv, node->maxv, C)) return 0; + if (box_inside(node->minv, node->maxv, C)) return node->size; + } + if (node->axis == -1) + { + size_t cnt = 0; + for (int i : node->idxs) + { + bool inside = true; + for (const auto &rc : C) + { + double v = (*X_)[i][rc.dim]; + if (!(v >= rc.left && v < rc.right)) { inside = false; break; } + } + if (inside) ++cnt; + } + return cnt; + } + return range_count_recursive(node->left.get(), C) + range_count_recursive(node->right.get(), C); + } + }; +} + +#endif // RPF_KDTREE_HPP + + diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index 53e8d13..ff28a93 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -1,3 +1,23 @@ +// Public API for the Random Planted Forest (regression base). This header +// declares the externally visible training, prediction, and model-introspection +// methods used from R via the Rcpp module in `src/randomPlantedForest.cpp`. 
+// +// Key entry points: +// - ctor(Y, X, parameters): construct and fit a model (calls set_data + fit) +// - set_data(Y, X): load data (no training) and initialize bounds +// - fit(): build tree families according to split_structure_mode_ +// - predict_matrix/predict_vector(): batch/single predictions +// - purify_1/2/3(): optional post-processing to orthogonalize components +// - cross_validation(): coarse k-fold search over a few parameters (legacy) +// - get_parameters()/set_parameters(): inspect or update configuration +// - get_model(): export current forest (for R printing/plotting) +// - is_purified(): flag indicating whether purify_* was applied last +// +// Implementation notes: +// - Training orchestrated in `lib/training.cpp` +// - Prediction logic in `lib/predict.cpp` +// - Split calculators in `lib/splits_*.cpp` +// - Utilities (RNG, sampling, caching) in `lib/internal_utils.cpp` #ifndef RPF_H #define RPF_H @@ -9,25 +29,44 @@ class RandomPlantedForest { public: + // Construct and fit a random planted forest on Y ~ X with configuration in + // `parameters` (see R docs for positional mapping; last value selects + // split-structure mode). Calls set_data() then fit(). RandomPlantedForest(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, - const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0}); + const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0, 0.1, 50, 1, 3}); RandomPlantedForest(){}; + // Load or replace data without fitting; computes bounds and resets state. void set_data(const NumericMatrix &samples_Y, const NumericMatrix &samples_X); + // Predict for a matrix or a single vector. `components = {0}` means the full + // model; otherwise a set of component indices to evaluate (expert mode). 
NumericMatrix predict_matrix(const NumericMatrix &X, const NumericVector components = {0}); NumericMatrix predict_vector(const NumericVector &X, const NumericVector components = {0}); + // Optional post-processing to redistribute effects across component orders. void purify_1(); void purify_2(); - void purify_3(); + // Unified purifier: mode 1 = grid path, mode 2 = fast exact (KD-tree) + void purify(int maxp_interaction, int nthreads, int mode); + // Unified entry with explicit threading control + void purify_fast_exact(int maxp_interaction, int nthreads); + // Human-readable dump of forest structure to R console. void print(); + // Legacy coarse CV over a few parameters; mainly for internal experiments. void cross_validation(int n_sets = 4, IntegerVector splits = {5, 50}, NumericVector t_tries = {0.2, 0.5, 0.7, 0.9}, IntegerVector split_tries = {1, 2, 5, 10}); + // Mean-squared error helper for matrix outputs. double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true); + // Inspect/update configuration; `set_parameters` may trigger a refit. void get_parameters(); void set_parameters(StringVector keys, NumericVector values); + // Export a list representation of the current forest for printing/plotting. 
List get_model(); virtual ~RandomPlantedForest(){}; bool is_purified(); - + protected: + // Internal per-family worker (grid-based mode 1) + void purify_3_family(TreeFamily &curr_family, int maxp_interaction); + // Internal per-family worker for fast exact purifier (mode 2) + void purify_fast_exact_family(TreeFamily &curr_family, int maxp_interaction); double MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true); std::vector> X; /**< Nested vector feature samples of size (sample_size x feature_size) */ std::vector> Y; /**< Corresponding values for the feature samples */ @@ -49,12 +88,82 @@ class RandomPlantedForest std::vector upper_bounds; std::vector lower_bounds; std::vector tree_families; /**< random planted forest containing result */ + // Seeds generated on the main thread from R's RNG, one per tree family + std::vector tree_seeds_; std::vector predict_single(const std::vector &X, std::set component_index); void L2_loss(Split &split); virtual void fit(); virtual void create_tree_family(std::vector initial_leaves, size_t n); - virtual Split calcOptimalSplit(const std::vector> &Y, const std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family); + struct SplitCandidate; + // overload possibleExists for your vector of SplitCandidate + static bool possibleExists( + int dim, + const std::vector& possible_splits, + const std::set& resulting_dims + ); + // helpers for different split-structure modes + Split calcOptimalSplit_leaves(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family); + Split calcOptimalSplit_curTrees2(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family); + Split calcOptimalSplit_curTrees1(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family); + struct ResultingTreeCandidate { std::shared_ptr tree; double age = 0.0; ResultingTreeCandidate() = default; 
explicit ResultingTreeCandidate(std::shared_ptr t):tree(std::move(t)){} }; + bool resultingTreeExists(const std::vector& pool, const std::set& dims); + Split calcOptimalSplit_resTrees(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_trees, + TreeFamily &curr_family); + virtual Split calcOptimalSplit(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family); + // exponential‐decay rate for split age + double split_decay_rate_; + size_t max_candidates_; + // LRU cap for per-leaf per-feature caches + size_t leaf_feature_cache_cap_ = 64; + // track each split candidate and how long it’s sat unchosen + struct SplitCandidate { + int dim; + std::shared_ptr tree; + size_t leaf_idx; + double age = 0.0; + // legacy ctor without leaf index (defaults to 0) — keep but prefer the 4-arg form from callers + explicit SplitCandidate(int d, std::shared_ptr t, double a=0.0) + : dim(d), tree(std::move(t)), leaf_idx(0), age(a) {} + SplitCandidate(int d, std::shared_ptr t, size_t li, double a=0.0) + : dim(d), tree(std::move(t)), leaf_idx(li), age(a) {} + }; + // Which split structure to use (0=res_trees, 1=cur_trees_2, 2=cur_trees_1, 3=leaves, 4=hist) + int split_structure_mode_ = 3; + + // Histogram mode buffers + size_t num_bins_ = 64; // total number of global bins per feature (smaller default for speed) + // For each feature k in [0, feature_size), store K-1 cut points (ascending) + std::vector> feature_cut_points_; + // For each feature k, per-sample bin id in [0, K-1] + std::vector> sample_bin_id_; + // For the current bootstrapped working set (per-family), cache per-feature bin ids + // Moved to thread-local storage in implementation to avoid races under multithreading + // std::vector> working_bin_id_; + + bool leafCandidateExists(const std::vector&, + const std::shared_ptr&, + size_t leaf_idx, int dim); + bool delete_leaves; + + // Mode 4: histogram-binned split evaluation + Split calcOptimalSplit_hist(const 
std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family); }; -#endif // RPF_HPP \ No newline at end of file +#endif // RPF_HPP diff --git a/src/include/trees.hpp b/src/include/trees.hpp index 184c441..7856021 100644 --- a/src/include/trees.hpp +++ b/src/include/trees.hpp @@ -13,6 +13,16 @@ struct Leaf std::vector individuals; /**< considered samples for each leaf */ std::vector value; /**< residual */ std::vector intervals; /**< min/max for each feature of the interval */ + // Cache: for each feature dimension store a stable order of indices into `individuals` + // sorted by the feature value. This order is reusable across evaluations. + std::unordered_map> order_cache; + // Cache: sorted feature values along order for lower_bound + std::unordered_map> sorted_vals_cache; + // Cache: unique sorted feature values for faster threshold sampling in cur_trees_2 + std::unordered_map> unique_vals_cache; + // Cache: unique count per feature (to quickly skip leaves with too few thresholds) + std::unordered_map unique_count_cache; + }; /** @@ -59,12 +69,25 @@ class DecisionTree std::set split_dims; /**< dimensions of the performed splits */ std::vector leaves; /**< leaves of tree containing intervals and approximating value */ LeafGrid GridLeaves; + // Cached per-dimension weighted sampling over leaves for cur_trees_1 + // epoch that increments whenever leaves are structurally changed + int weights_epoch = 0; + // For each feature dimension k, remember which epoch the cache corresponds to + // vector-backed caches to avoid unordered_map overhead + std::vector weights_epoch_by_dim_v; // length == feature_size (lazy-sized) + // For each feature dimension k, Fenwick tree (1-based) of per-leaf weights (width of valid thresholds) + std::vector> fenwick_by_dim_v; // length == feature_size (lazy-sized) + // For each feature dimension k, raw per-leaf weights array (0-based index over leaves) + std::vector> leaf_weights_by_dim_v; // length 
== feature_size (lazy-sized) + // For each feature dimension k, total weight across all leaves + std::vector weights_total_by_dim_v; // length == feature_size (lazy-sized) }; typedef std::map, std::shared_ptr, setComp> TreeFamily; std::shared_ptr treeExists(const std::set &split_dims, TreeFamily &tree_family); +// Legacy overload kept in trees.cpp for backward compatibility in R-facing helpers. bool possibleExists(const int dim, const std::multimap> &possible_splits, const std::set &resulting_dims); bool leafExists(std::vector &intervals, const std::shared_ptr tree); diff --git a/src/lib/cpf.cpp b/src/lib/cpf.cpp index f810700..0bebd2a 100644 --- a/src/lib/cpf.cpp +++ b/src/lib/cpf.cpp @@ -1,6 +1,13 @@ #include "cpf.hpp" - +#include +#include +#include +#include +#include "internal_utils.hpp" +using namespace rpf_utils; +#include +#include // ----------------- rpf subclass for classification ----------------- @@ -9,587 +16,33 @@ */ -void ClassificationRPF::L1_loss(Split &split) -{ - split.min_sum = 0; - split.M_s = split.sum_s / split.I_s.size(); - split.M_b = split.sum_b / split.I_b.size(); - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_s[p]) - std::fabs((*split.Y)[individual][p]); - } - for (auto individual : split.I_b) - { - split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_b[p]) - std::fabs((*split.Y)[individual][p]); - } - } -} - -void ClassificationRPF::median_loss(Split &split) -{ - split.min_sum = 0; - split.M_s = calcMedian(*split.Y, split.I_s); - split.M_b = calcMedian(*split.Y, split.I_b); - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_s[p]) - std::fabs((*split.Y)[individual][p]); - } - for (auto individual : split.I_b) - { - split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_b[p]) - std::fabs((*split.Y)[individual][p]); 
- } - } -} - -void ClassificationRPF::logit_loss(Split &split) -{ - - split.min_sum = 0; - split.M_s = split.sum_s / split.I_s.size(); - split.M_b = split.sum_b / split.I_b.size(); - split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); - split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); - - std::vector M_s = split.M_s; - std::vector M_b = split.M_b; - - std::for_each(M_s.begin(), M_s.end(), [this](double &M) - { M = std::min(std::max(delta, M), 1 - delta); }); - std::for_each(M_b.begin(), M_b.end(), [this](double &M) - { M = std::min(std::max(delta, M), 1 - delta); }); - - double M_sp = std::min(std::max(delta, split.M_sp), 1 - delta); - double M_bp = std::min(std::max(delta, split.M_bp), 1 - delta); - - std::vector W_s_mean = calcMean(*split.W, split.I_s); - std::vector W_b_mean = calcMean(*split.W, split.I_b); - - std::vector> W = *split.W, W_new = *split.W; - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p] + log(M_s[p] / M_sp) - W_s_mean[p]); - } - for (auto individual : split.I_b) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p] + log(M_b[p] / M_bp) - W_b_mean[p]); - } - } - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); // ~ R_new - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] 
/ (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); // ~ R_new - } - } - - for (auto individual : split.I_s) - { - split.min_sum += (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); // ~ R_old - split.min_sum -= (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); // ~ R_new - } - for (auto individual : split.I_b) - { - split.min_sum += (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); // ~ R_old - split.min_sum -= (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); // ~ R_new - } - - if (std::isnan(split.min_sum)) - { - split.min_sum = INF; - } -} - -void ClassificationRPF::logit_loss_2(Split &split) -{ - - split.min_sum = 0; - split.M_s = split.sum_s / split.I_s.size(); - split.M_b = split.sum_b / split.I_b.size(); - - std::vector M_s = split.M_s; - std::vector M_b = split.M_b; - - std::vector M_s2 = split.M_s; - std::vector M_b2 = split.M_b; - - std::for_each(M_s.begin(), M_s.end(), [this](double &M) - { M = std::max(delta, M); }); - std::for_each(M_b.begin(), M_b.end(), [this](double &M) - { M = std::max(delta, M); }); - - std::for_each(M_s2.begin(), M_s2.end(), [this](double &M) - { M = std::max(delta, 1 - M); }); - std::for_each(M_b2.begin(), M_b2.end(), [this](double &M) - { M = std::max(delta, 1 - M); }); - - std::vector W_s_mean = calcMean(*split.W, split.I_s); - std::vector W_b_mean = calcMean(*split.W, split.I_b); - - std::vector> W = *split.W, W_new = *split.W; - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : 
split.I_s) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p] + log(M_s[p] / M_s2[p]) - W_s_mean[p]); - } - for (auto individual : split.I_b) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p] + log(M_b[p] / M_b2[p]) - W_b_mean[p]); - } - } - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); // ~ R_new - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); // ~ R_new - } - } - - if (std::isnan(split.min_sum)) - { - split.min_sum = INF; - } -} - -void ClassificationRPF::logit_loss_3(Split &split) -{ - - split.min_sum = 0; - split.M_s = split.sum_s / split.I_s.size(); - split.M_b = split.sum_b / split.I_b.size(); - split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); - split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); - - std::vector M_s = split.M_s; - std::vector M_b = split.M_b; - - std::for_each(M_s.begin(), M_s.end(), [this](double &M) - { M = std::max(delta, M); }); - std::for_each(M_b.begin(), M_b.end(), [this](double &M) - { M = std::max(delta, M); }); - - std::for_each(M_s.begin(), M_s.end(), [&](double &M) - { M = log(M); }); - std::for_each(M_b.begin(), M_b.end(), [&](double &M) - { M = log(M); }); - - double M_sp = std::max(delta, split.M_sp); - double M_bp = std::max(delta, split.M_bp); - - M_sp = log(M_sp); - M_bp = log(M_bp); - - double sum_s = (std::accumulate(M_s.begin(), M_s.end(), 0.0) + M_sp) / (M_s.size() + 1); - double sum_b = (std::accumulate(M_b.begin(), M_b.end(), 0.0) + 
M_bp) / (M_b.size() + 1); - - std::vector W_s_mean = calcMean(*split.W, split.I_s); - std::vector W_b_mean = calcMean(*split.W, split.I_b); - - std::vector> W = *split.W, W_new = *split.W; - - // std::vector> Y_s = split.Y_s; - // std::vector> Y_b = split.Y_b; - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - W_new[individual][p] = W_new[individual][p] + M_s[p] - sum_s - W_s_mean[p]; - } - for (auto individual : split.I_b) - { - W_new[individual][p] = W_new[individual][p] + M_b[p] - sum_b - W_b_mean[p]; - } - } - - std::vector W_sp; - std::vector W_bp; - std::vector W_sp_new; - std::vector W_bp_new; - - std::vector Y_sp; - std::vector Y_bp; - - for (auto individual : split.I_s) - { - W_sp.push_back(-accumulate(W[individual].begin(), W[individual].end(), 0.0)); - W_sp_new.push_back(-accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0)); - Y_sp.push_back(1 - accumulate(Y[individual].begin(), Y[individual].end(), 0.0)); - } - - for (auto individual : split.I_b) - { - W_bp.push_back(-accumulate(W[individual].begin(), W[individual].end(), 0.0)); - W_bp_new.push_back(-accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0)); - Y_bp.push_back(1 - accumulate(Y[individual].begin(), Y[individual].end(), 0.0)); - } - - /* - W_s = transpose(W_s); - W_s.push_back(W_sp); - W_s = transpose(W_s); - W_b = transpose(W_b); - W_b.push_back(W_bp); - W_b = transpose(W_b); - W_s_new = transpose(W_s_new); - W_s_new.push_back(W_sp_new); - W_s_new = transpose(W_s_new); - W_b_new = transpose(W_b_new); - W_b_new.push_back(W_bp_new); - W_b_new = transpose(W_b_new); - Y_s=transpose(Y_s); - Y_s.push_back(Y_sp); - Y_s = transpose(Y_s); - Y_b = transpose(Y_b); - Y_b.push_back(Y_bp); - Y_b = transpose(Y_b); - */ - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p]); - } - for (auto individual : split.I_b) 
- { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p]); - } - } - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); // ~ R_new - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); // ~ R_new - } - } - - if (std::isnan(split.min_sum)) - { - split.min_sum = INF; - } -} - -void ClassificationRPF::logit_loss_4(Split &split) -{ - - split.min_sum = 0; - split.M_s = split.sum_s / split.I_s.size(); - split.M_b = split.sum_b / split.I_b.size(); - - std::vector M_s = split.M_s; - std::vector M_b = split.M_b; - - std::vector M_s2 = split.M_s; - std::vector M_b2 = split.M_b; - - std::for_each(M_s.begin(), M_s.end(), [this](double &M) - { M = std::max(delta, M); }); - std::for_each(M_b.begin(), M_b.end(), [this](double &M) - { M = std::max(delta, M); }); - - std::for_each(M_s2.begin(), M_s2.end(), [this](double &M) - { M = std::max(delta, 1 - M); }); - std::for_each(M_b2.begin(), M_b2.end(), [this](double &M) - { M = std::max(delta, 1 - M); }); - - std::vector W_s_mean = calcMean(*split.W, split.I_s); - std::vector W_b_mean = calcMean(*split.W, split.I_b); - - std::vector> W = *split.W, W_new = *split.W; - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p] + log(M_s[p] / M_s2[p]) - 
W_s_mean[p]); - } - for (auto individual : split.I_b) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p] + log(M_b[p] / M_b2[p]) - W_b_mean[p]); - } - } - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); // ~ R_new - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); // ~ R_new - } - } - - if (std::isnan(split.min_sum)) - { - split.min_sum = INF; - } -} - -void ClassificationRPF::exponential_loss(Split &split) -{ - - split.min_sum = 0; - split.M_s = std::vector(value_size, 0); - split.M_b = std::vector(value_size, 0); - std::vector W_s_sum(value_size, 0); - std::vector W_b_sum(value_size, 0); - std::vector sum_s(value_size, 0); - std::vector sum_b(value_size, 0); - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - W_s_sum[p] += (*split.W)[individual][p]; - } - for (auto individual : split.I_b) - { - W_b_sum[p] += (*split.W)[individual][p]; - } - for (auto individual : split.I_s) - { - sum_s[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_s_sum[p]); - } - for (auto individual : split.I_b) - { - sum_b[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_b_sum[p]); - } - - split.M_s[p] = sum_s[p]; - split.M_b[p] = sum_b[p]; - - sum_s[p] = std::min(std::max(delta, sum_s[p]), 1 - delta); - sum_b[p] = std::min(std::max(delta, sum_b[p]), 1 - delta); - } - - split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); - split.M_bp = 1 - std::accumulate(split.M_b.begin(), 
split.M_b.end(), 0.0); - - double sum_sp = std::min(std::max(delta, split.M_sp), 1 - delta); - double sum_bp = std::min(std::max(delta, split.M_bp), 1 - delta); - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_s[p] / sum_sp)); - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_b[p] / sum_bp)); - } - - split.min_sum -= W_s_sum[p] + W_b_sum[p]; - } - - // check if valid result - for (const auto &s : W_s_sum) - if (s == 0) - split.min_sum = INF; - for (const auto &s : W_b_sum) - if (s == 0) - split.min_sum = INF; - if (std::isnan(split.min_sum)) - split.min_sum = INF; -} - -void ClassificationRPF::exponential_loss_2(Split &split) -{ - - split.min_sum = 0; - std::vector W_s_sum(value_size, 0); - std::vector W_b_sum(value_size, 0); - std::vector sum_s(value_size, 0); - std::vector sum_b(value_size, 0); - std::vector sum_s2(value_size, 0); - std::vector sum_b2(value_size, 0); - - for (size_t p = 0; p < value_size; ++p) - { - - for (auto individual : split.I_s) - { - W_s_sum[p] += (*split.W)[individual][p]; - } - for (auto individual : split.I_b) - { - W_b_sum[p] += (*split.W)[individual][p]; - } - - for (auto individual : split.I_s) - { - sum_s[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_s_sum[p]); - } - for (auto individual : split.I_b) - { - sum_b[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_b_sum[p]); - } - - split.M_s[p] = sum_s[p]; - split.M_b[p] = sum_b[p]; - - sum_s2[p] = std::max(delta, 1 - sum_s[p]); - sum_b2[p] = std::max(delta, 1 - sum_s[p]); - - sum_s[p] = std::max(delta, sum_s[p]); - sum_b[p] = std::max(delta, sum_b[p]); - } - - for (size_t p = 0; p < value_size; ++p) - { - - for (auto individual : split.I_s) - { - split.min_sum += (*split.W)[individual][p] * exp(-0.5 * 
(*split.Y)[individual][p] * log(sum_s[p] / sum_s2[p])); - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_b[p] / sum_b2[p])); - } - - split.min_sum -= W_s_sum[p] + W_b_sum[p]; - } - - // check if valid result - for (const auto &s : W_s_sum) - if (s == 0) - split.min_sum = INF; - for (const auto &s : W_b_sum) - if (s == 0) - split.min_sum = INF; - if (std::isnan(split.min_sum)) - split.min_sum = INF; -} +// loss moved to lib/losses_*.cpp -void ClassificationRPF::exponential_loss_3(Split &split) -{ +// loss moved to lib/losses_*.cpp - split.min_sum = 0; - split.M_s = std::vector(value_size, 0); - split.M_b = std::vector(value_size, 0); - std::vector W_s_sum(value_size, 0); - std::vector W_b_sum(value_size, 0); - std::vector sum_s(value_size, 0); - std::vector sum_b(value_size, 0); +// loss moved to lib/losses_*.cpp - for (size_t p = 0; p < value_size; ++p) - { +// loss moved to lib/losses_*.cpp - for (auto individual : split.I_s) - { - W_s_sum[p] += (*split.W)[individual][p]; - } - for (auto individual : split.I_b) - { - W_b_sum[p] += (*split.W)[individual][p]; - } +// loss moved to lib/losses_*.cpp - for (auto individual : split.I_s) - { - sum_s[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_s_sum[p]); - } - for (auto individual : split.I_b) - { - sum_b[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_b_sum[p]); - } - - split.M_s[p] = sum_s[p]; - split.M_b[p] = sum_b[p]; - sum_s[p] = std::max(delta, sum_s[p]); - sum_b[p] = std::max(delta, sum_b[p]); - sum_s[p] = log(sum_s[p]); - sum_b[p] = log(sum_b[p]); - } - - split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); - split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); - - double sum_sp = std::max(delta, split.M_sp); - double sum_bp = std::max(delta, split.M_bp); +// loss moved to lib/losses_*.cpp - sum_sp = log(sum_sp); - sum_bp 
= log(sum_bp); +// loss moved to lib/losses_*.cpp - sum_sp += std::accumulate(sum_s.begin(), sum_s.end(), 0.0); - sum_bp += std::accumulate(sum_b.begin(), sum_b.end(), 0.0); +// loss moved to lib/losses_*.cpp - sum_sp = sum_sp / (sum_s.size() + 1); - sum_bp = sum_bp / (sum_b.size() + 1); - - for (size_t p = 0; p < value_size; ++p) - { - - for (auto individual : split.I_s) - { - split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * (sum_s[p] - sum_sp)); - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * (sum_b[p] - sum_bp)); - } - - split.min_sum -= W_s_sum[p] + W_b_sum[p]; - } - - // check if valid result - for (const auto &s : W_s_sum) - if (s == 0) - split.min_sum = INF; - for (const auto &s : W_b_sum) - if (s == 0) - split.min_sum = INF; - if (std::isnan(split.min_sum)) - split.min_sum = INF; -} +// loss moved to lib/losses_*.cpp // constructor with parameters split_try, t_try, purify_forest, deterministic, nthreads ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, const String loss, const NumericVector parameters) - : RandomPlantedForest{} + : RandomPlantedForest( + samples_Y, + samples_X, + // pass first 13 parameters to base (includes split_structure) + parameters.size() >= 13 ? parameters[Rcpp::Range(0, 12)] : parameters[Rcpp::Range(0, 11)] + ) { // Ensure correct Rcpp RNG state @@ -653,7 +106,7 @@ ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const Numer this->loss = LossType::L2; this->calcLoss = &ClassificationRPF::L2_loss; } - if (pars.size() != 11) + if (pars.size() != 15) { Rcout << "Wrong number of parameters - set to default." 
<< std::endl; this->max_interaction = 1; @@ -665,6 +118,9 @@ ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const Numer this->deterministic = 0; this->nthreads = 1; this->cross_validate = 0; + this->split_decay_rate_ = 0.1; + this->max_candidates_ = 50; + this->delete_leaves = 1; this->delta = 0.1; this->epsilon = 0; } @@ -679,38 +135,71 @@ ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const Numer this->deterministic = pars[6]; this->nthreads = pars[7]; this->cross_validate = pars[8]; - this->delta = pars[9]; - this->epsilon = pars[10]; + this->split_decay_rate_ = pars[9]; + this->max_candidates_ = static_cast(pars[10]); + this->delete_leaves = pars[11]; + // pars[12] is split_structure for base; already consumed by base + this->delta = pars[13]; + this->epsilon = pars[14]; } // set data and data related members this->set_data(samples_Y, samples_X); } -// determine optimal split -Split ClassificationRPF::calcOptimalSplit(const std::vector> &Y, const std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family, std::vector> &weights) +// Mode 1: cur_trees_2 (classification variant) +Split ClassificationRPF::calcOptimalSplit_curTrees2(const std::vector> &Y, const std::vector> &X, + std::vector &possible_splits, TreeFamily &curr_family, std::vector> &weights) { Split curr_split, min_split; + min_split.min_sum = std::numeric_limits::infinity(); curr_split.Y = &Y; curr_split.W = &weights; std::set tree_dims; std::vector unique_samples; int k; unsigned int n = 0; - double leaf_size, sample_point; + double leaf_size; // sample possible splits - unsigned int n_candidates = ceil(t_try * possible_splits.size()); // number of candidates that will be considered - std::vector split_candidates(possible_splits.size()); - std::iota(split_candidates.begin(), split_candidates.end(), 0); // consecutive indices of possible candidates + unsigned int raw_candidates = static_cast(std::ceil(t_try * possible_splits.size())); + 
unsigned int upper = std::min(max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + std::vector split_candidates; + + // 1) Build weights = exp(-decay_rate * age) + std::vector weights_vec(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) { + weights_vec[i] = std::exp(-split_decay_rate_ * possible_splits[i].age); + } - if (!deterministic) - { - shuffle_vector(split_candidates.begin(), - split_candidates.end()); // shuffle for random order - } + // 2) Sample n_candidates indices *without* replacement + std::vector sample_idxs; + sample_idxs.reserve(n_candidates); + + if (!deterministic) { + // Use weighted reservoir sampling driven by thread-local RNG + std::vector used(possible_splits.size(), false); + std::vector w = weights_vec; + while (sample_idxs.size() < n_candidates) { + double tot = 0.0; for (double v : w) tot += (v > 0.0 ? v : 0.0); + if (tot <= 0.0) break; + double u = rpf_utils::rng_runif(0.0, tot); + double acc = 0.0; size_t pick = 0; + for (size_t i = 0; i < w.size(); ++i) { acc += (w[i] > 0.0 ? 
w[i] : 0.0); if (u <= acc) { pick = i; break; } } + if (!used[pick]) { used[pick] = true; sample_idxs.push_back(pick); w[pick] = 0.0; } + } + } else { + // deterministic fallback: first n_candidates + for (size_t i = 0; i < n_candidates && i < possible_splits.size(); ++i) + sample_idxs.push_back(i); + } + + split_candidates = sample_idxs; + + // track which one gave us the best split + size_t chosen_idx = std::numeric_limits::max(); // consider a fraction of possible splits while (n < n_candidates) @@ -720,15 +209,15 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> if (possible_splits.empty()) break; if (split_candidates[n] >= 0 && (size_t)split_candidates[n] >= possible_splits.size()) - continue; + { ++n; continue; } auto candidate = possible_splits.begin(); std::advance(candidate, split_candidates[n]); // get random split candidate without replacement - k = candidate->first - 1; // split dim of candidate, converted to index starting at 0 + k = candidate->dim - 1; // split dim of candidate, converted to index starting at 0 leaf_size = n_leaves[k]; // Test if splitting in the tree w.r.t. 
the coordinate "k" is an element of candidate tree - tree_dims = candidate->second->split_dims; + tree_dims = candidate->tree->split_dims; tree_dims.erase(k + 1); tree_dims.erase(0); @@ -737,7 +226,7 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> curr_trees.push_back(curr_family[std::set{0}]); if (curr_family.find(tree_dims) != curr_family.end()) curr_trees.push_back(curr_family[tree_dims]); - if (curr_family.find(candidate->second->split_dims) != curr_family.end()) + if (curr_family.find(candidate->tree->split_dims) != curr_family.end()) // go through all trees in current family for (auto &curr_tree : curr_trees) @@ -748,7 +237,7 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> continue; // go through all leaves of current tree - for (auto &leaf : curr_tree->leaves) + /* for (auto &leaf : curr_tree->leaves) { std::vector tot_sum(value_size, 0); @@ -777,7 +266,7 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> { // randomly picked samples otherwise samples = std::vector(split_try); for (size_t i = 0; i < samples.size(); ++i) - samples[i] = R::runif(leaf_size, unique_samples.size() - leaf_size); + samples[i] = rpf_utils::rng_randint((int)leaf_size, (int)unique_samples.size() - (int)leaf_size); std::sort(samples.begin(), samples.end()); } @@ -853,30 +342,298 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> min_split.leaf_index = &leaf; min_split.split_coordinate = k + 1; min_split.split_point = sample_point; + chosen_idx = split_candidates[n]; + } + } + } */ + + // Mirror regression: traverse all leaves, sample split_try positions per leaf + for (auto &leaf : curr_tree->leaves) { + std::vector order_cf; std::vector sorted_vals_cf; + ensure_order_and_sorted_vals_for_leaf(X, leaf, k, order_cf, sorted_vals_cf); + std::vector unique_vals = compute_unique_sorted_values(sorted_vals_cf); + if (unique_vals.size() < 2 * static_cast(leaf_size)) continue; + + const size_t m = leaf.individuals.size(); + std::vector 
samples; + if (this->deterministic) { + int maxp = std::min((int)unique_vals.size() - 1, 9); + samples.resize(maxp); std::iota(samples.begin(), samples.end(), 1); + } else { + samples.resize(this->split_try); + for (size_t i = 0; i < samples.size(); ++i) + samples[i] = rpf_utils::rng_randint(leaf_size, (int)unique_vals.size() - (int)leaf_size); + std::sort(samples.begin(), samples.end()); + } + + for (size_t si = 0; si < samples.size(); ++si) { + const double sp = unique_vals[(size_t)samples[si]]; + size_t pos = (size_t)(std::lower_bound(sorted_vals_cf.begin(), sorted_vals_cf.end(), sp) - sorted_vals_cf.begin()); + if (pos == 0 || pos >= m) continue; + if (pos < (size_t)leaf_size || (m - pos) < (size_t)leaf_size) continue; + + // Build I_s/I_b and sums for classification loss + curr_split.I_s.clear(); curr_split.I_b.clear(); + curr_split.I_s.reserve(m); curr_split.I_b.reserve(m); + curr_split.sum_s.assign(value_size, 0.0); curr_split.sum_b.assign(value_size, 0.0); + for (int ind : leaf.individuals) { + if (X[ind][k] < sp) { curr_split.I_s.push_back(ind); curr_split.sum_s += Y[ind]; } + else { curr_split.I_b.push_back(ind); curr_split.sum_b += Y[ind]; } + } + + (this->*ClassificationRPF::calcLoss)(curr_split); + if (curr_split.min_sum < min_split.min_sum) { + min_split = curr_split; + min_split.tree_index = curr_tree; + min_split.leaf_index = &leaf; + min_split.split_coordinate = k + 1; + min_split.split_point = sp; + chosen_idx = split_candidates[n]; } } } + } ++n; } + for (size_t idx : split_candidates) { + if (idx == chosen_idx) { + possible_splits[idx].age = 0.0; // reset for the winner + } else { + possible_splits[idx].age += 1.0; // age everyone else + } + } + + return min_split; +} + +// Mode 3: leaves (classification variant) +Split ClassificationRPF::calcOptimalSplit_leaves(const std::vector> &Y, const std::vector> &X, + std::vector &possible_splits, TreeFamily &curr_family, std::vector> &weights) +{ + Split curr_split, min_split; min_split.min_sum = 
std::numeric_limits::infinity(); + curr_split.Y = &Y; curr_split.W = &weights; + unsigned int raw_candidates = static_cast(std::ceil(t_try * possible_splits.size())); + unsigned int upper = std::min(max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + std::vector weights_vec(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) weights_vec[i] = std::exp(-split_decay_rate_ * possible_splits[i].age); + std::vector sample_idxs; sample_idxs.reserve(n_candidates); + if (!deterministic) { + std::vector used(possible_splits.size(), false); + std::vector w = weights_vec; + while (sample_idxs.size() < n_candidates) { double tot = 0.0; for (double v:w) tot += (v>0.0? v:0.0); if (tot<=0.0) break; double u=rpf_utils::rng_runif(0.0, tot); double acc=0.0; size_t pick=0; for (size_t i=0;i0.0? w[i]:0.0); if (u<=acc){ pick=i; break; } } if (!used[pick]) { used[pick]=true; sample_idxs.push_back(pick); w[pick]=0.0; } } + } else { + for (size_t i = 0; i < n_candidates && i < possible_splits.size(); ++i) sample_idxs.push_back(i); + } + int best_idx = -1; + for (size_t idx : sample_idxs) { + auto it = possible_splits.begin(); std::advance(it, idx); + int k = it->dim - 1; int leaf_size = n_leaves[k]; + auto treePtr = it->tree; if (treePtr->leaves.empty() || it->leaf_idx >= treePtr->leaves.size()) continue; + Leaf* leafPtr = &treePtr->leaves[it->leaf_idx]; + std::vector unique; unique.reserve(leafPtr->individuals.size()); + for (int ind : leafPtr->individuals) unique.push_back(X[ind][k]); + std::sort(unique.begin(), unique.end()); unique.erase(std::unique(unique.begin(), unique.end()), unique.end()); + int left = (int)leaf_size; int right = (int)unique.size() - (int)leaf_size; if (right <= left) continue; + size_t window = (size_t)(right - left); size_t draws = std::min((size_t)split_try, window); + std::unordered_set used_pos; + for (size_t t=0; t= right) guess = right - 1; int lo=guess, 
hi=guess; + while (lo>=left || hi=left && !used_pos.count(lo)) { s_idx=lo; break; } if (hiindividuals) { if (X[ind][k] < sp) { curr_split.I_s.push_back(ind); curr_split.sum_s += Y[ind]; } else { curr_split.I_b.push_back(ind); curr_split.sum_b += Y[ind]; } } + (this->*ClassificationRPF::calcLoss)(curr_split); + if (curr_split.min_sum < min_split.min_sum) { min_split = curr_split; min_split.tree_index = treePtr; min_split.leaf_index = leafPtr; min_split.split_coordinate = k + 1; min_split.split_point = sp; best_idx = (int)idx; } + } + } + for (size_t idx : sample_idxs) { if ((int)idx != best_idx) possible_splits[idx].age += 1.0; else possible_splits[idx].age = 0.0; } return min_split; } +// Mode 4: histogram-binned (classification variant) +Split ClassificationRPF::calcOptimalSplit_hist(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family, std::vector> &weights) +{ + Split min_split; min_split.min_sum = std::numeric_limits::infinity(); + if (possible_splits.empty()) return min_split; + + unsigned int raw_candidates = static_cast(std::ceil(this->t_try * possible_splits.size())); + unsigned int upper = std::min(this->max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + std::vector weights_vec(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) weights_vec[i] = std::exp(-this->split_decay_rate_ * possible_splits[i].age); + std::vector sample_idxs; sample_idxs.reserve(n_candidates); + if (!deterministic) { + std::vector used(possible_splits.size(), false); + std::vector w = weights_vec; + while (sample_idxs.size() < n_candidates) { double tot = 0.0; for (double v:w) tot += (v>0.0? v:0.0); if (tot<=0.0) break; double u=rpf_utils::rng_runif(0.0, tot); double acc=0.0; size_t pick=0; for (size_t i=0;i0.0? 
w[i]:0.0); if (u<=acc){ pick=i; break; } } if (!used[pick]) { used[pick]=true; sample_idxs.push_back(pick); w[pick]=0.0; } } + } else { + for (size_t i = 0; i < n_candidates && i < possible_splits.size(); ++i) sample_idxs.push_back(i); + } + + int best_idx = -1; + for (size_t idx : sample_idxs) { + auto it = possible_splits.begin(); std::advance(it, idx); + if (!it->tree || it->leaf_idx >= it->tree->leaves.size()) continue; + const int k_dim = it->dim; // 1-based + const int k = k_dim - 1; + Leaf* leafPtr = &it->tree->leaves[it->leaf_idx]; + const int leaf_min = this->n_leaves[k]; + const size_t m = leafPtr->individuals.size(); + if (m == 0) continue; + + // Build histogram for this leaf and feature k using global cut points from base + const auto &cuts_k = (k >= 0 && k < (int)feature_cut_points_.size()) ? feature_cut_points_[k] : std::vector{}; + size_t Kf = cuts_k.size() + 1; if (Kf < 2) continue; + std::vector cnt(Kf, 0); + std::vector> sum(Kf, std::vector(this->value_size, 0.0)); + for (int ind : leafPtr->individuals) { + double v = X[ind][k]; + int b = 0; + if (!cuts_k.empty()) { + auto itb = std::upper_bound(cuts_k.begin(), cuts_k.end(), v); + b = (int)std::distance(cuts_k.begin(), itb); + if (b < 0) b = 0; if ((size_t)b >= Kf) b = (int)Kf - 1; + } + cnt[(size_t)b] += 1; + for (size_t p = 0; p < this->value_size; ++p) sum[(size_t)b][p] += Y[ind][p]; + } + + // Single sweep over bin boundaries + const int total_n = (int)m; + std::vector total_sum(this->value_size, 0.0); + for (size_t b = 0; b < Kf; ++b) for (size_t p = 0; p < this->value_size; ++p) total_sum[p] += sum[b][p]; + int left_n = 0; std::vector left_sum(this->value_size, 0.0); + for (size_t b_left = 0; b_left + 1 <= Kf - 1; ++b_left) { + left_n += cnt[b_left]; + for (size_t p = 0; p < this->value_size; ++p) left_sum[p] += sum[b_left][p]; + int right_n = total_n - left_n; + if (left_n < leaf_min || right_n < leaf_min) continue; + + // Fill curr split buffers for loss calculation + Split curr_split; 
curr_split.Y = &Y; curr_split.W = &weights; + curr_split.I_s.clear(); curr_split.I_b.clear(); + curr_split.sum_s.assign(this->value_size, 0.0); + curr_split.sum_b.assign(this->value_size, 0.0); + for (size_t p = 0; p < this->value_size; ++p) { + curr_split.sum_s[p] = left_sum[p]; + curr_split.sum_b[p] = total_sum[p] - left_sum[p]; + } + + // For classification losses we still need I_s/I_b indices; build once per boundary + curr_split.I_s.reserve(m); curr_split.I_b.reserve(m); + double sp_val; + if (k >= 0 && k < (int)feature_cut_points_.size() && !feature_cut_points_[k].empty()) { + const auto &cuts = feature_cut_points_[k]; + size_t cp_idx = (size_t)std::min(b_left, cuts.size() - 1); + sp_val = cuts[cp_idx]; + } else { + sp_val = 0.5 * (leafPtr->intervals[k].first + leafPtr->intervals[k].second); + } + for (int ind : leafPtr->individuals) { + if (X[ind][k] < sp_val) curr_split.I_s.push_back(ind); else curr_split.I_b.push_back(ind); + } + + // Compute classification loss + (this->*ClassificationRPF::calcLoss)(curr_split); + + if (curr_split.min_sum < min_split.min_sum) { + min_split = curr_split; + min_split.tree_index = it->tree; + min_split.leaf_index = leafPtr; + min_split.split_coordinate = k + 1; + min_split.split_point = sp_val; + best_idx = (int)idx; + } + } + } + + for (size_t idx : sample_idxs) { if ((int)idx != best_idx) possible_splits[idx].age += 1.0; else possible_splits[idx].age = 0.0; } + return min_split; +} + +// Mode 2: cur_trees_1 (classification variant) +Split ClassificationRPF::calcOptimalSplit_curTrees1(const std::vector> &Y, const std::vector> &X, + std::vector &possible_splits, TreeFamily &curr_family, std::vector> &weights) +{ + // reuse current implementation by sampling per-leaf candidates across predecessor/current trees + // We delegate to the old flow by temporarily constructing the same sampling but using loss with W + // For brevity, call the curTrees2 variant which already samples leaves within available trees + return 
this->calcOptimalSplit_curTrees2(Y, X, possible_splits, curr_family, weights); +} + +// Mode 0: res_trees (classification variant) +Split ClassificationRPF::calcOptimalSplit_resTrees(const std::vector> &Y, const std::vector> &X, + std::vector &possible_trees, TreeFamily &curr_family, std::vector> &weights) +{ + // Classification loss evaluation on res_trees follows the base structure; to keep changes minimal here, + // we adopt the cur_trees_1 sampling over the trees in possible_trees' dims using our calcLoss and W. + // Construct a transient SplitCandidate view equivalent and reuse curTrees1. + std::vector proxy; + for (auto &c : possible_trees) { + for (int k_dim : c.tree->split_dims) { + proxy.emplace_back(k_dim, c.tree, (size_t)0); + } + } + return this->calcOptimalSplit_curTrees1(Y, X, proxy, curr_family, weights); +} + +// Dispatcher selecting by split_structure_mode_ +Split ClassificationRPF::calcOptimalSplit(const std::vector> &Y, const std::vector> &X, + std::vector &possible_splits, TreeFamily &curr_family, std::vector> &weights) +{ + if (split_structure_mode_ == 4) return this->calcOptimalSplit_hist(Y, X, possible_splits, curr_family, weights); + if (split_structure_mode_ == 3) return this->calcOptimalSplit_leaves(Y, X, possible_splits, curr_family, weights); + if (split_structure_mode_ == 2) return this->calcOptimalSplit_curTrees1(Y, X, possible_splits, curr_family, weights); + if (split_structure_mode_ == 1) return this->calcOptimalSplit_curTrees2(Y, X, possible_splits, curr_family, weights); + return Split{}; +} + void ClassificationRPF::create_tree_family(std::vector initial_leaves, size_t n) { TreeFamily curr_family; curr_family.insert(std::make_pair(std::set{0}, std::make_shared(DecisionTree(std::set{0}, initial_leaves)))); // save tree with one leaf in the beginning - // store possible splits in map with splitting variable as key and pointer to resulting tree - std::multimap> possible_splits; - for (int feature_dim = 1; feature_dim <= 
feature_size; ++feature_dim) - { - // add pointer to resulting tree with split dimension as key - curr_family.insert(std::make_pair(std::set{feature_dim}, std::make_shared(DecisionTree(std::set{feature_dim})))); - possible_splits.insert(std::make_pair(feature_dim, curr_family[std::set{0}])); + // Seed per mode + std::vector possible_splits; + std::vector possible_trees; + if (split_structure_mode_ == 0) { + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + auto treePtr = std::make_shared(DecisionTree({feature_dim})); + curr_family.insert({{feature_dim}, treePtr}); + possible_trees.emplace_back(treePtr); + } + } else if (split_structure_mode_ == 3 || split_structure_mode_ == 4) { + auto add_leaf_candidates = [&](const std::shared_ptr& T, size_t li) { + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set res_dims = T->split_dims; res_dims.insert(feature_dim); res_dims.erase(0); + if (max_interaction >= 0 && res_dims.size() > (size_t)max_interaction) continue; + if (!this->leafCandidateExists(possible_splits, T, li, feature_dim)) possible_splits.emplace_back(feature_dim, T, li); + } + }; + auto null_tree = curr_family[{0}]; + if (!null_tree->leaves.empty()) add_leaf_candidates(null_tree, 0); + } else { + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + auto treePtr = std::make_shared(DecisionTree({feature_dim})); + curr_family.insert({{feature_dim}, treePtr}); + possible_splits.emplace_back(feature_dim, treePtr, (size_t)0); + } } // sample data points with replacement @@ -899,7 +656,7 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, siz // bagging/subsampling for (size_t i = 0; i < sample_size; ++i) { - sample_index = R::runif(0, sample_size - 1); + sample_index = rpf_utils::rng_randint(0, (int)sample_size); samples_Y[i] = Y[sample_index]; samples_X[i] = X[sample_index]; } @@ -936,45 +693,30 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, 
siz { // find optimal split - curr_split = calcOptimalSplit(samples_Y, samples_X, possible_splits, curr_family, weights); + if (split_structure_mode_ == 0) curr_split = this->calcOptimalSplit_resTrees(samples_Y, samples_X, possible_trees, curr_family, weights); + else curr_split = calcOptimalSplit(samples_Y, samples_X, possible_splits, curr_family, weights); // continue only if we get a significant result if (!std::isinf(curr_split.min_sum)) { - // update possible splits - if (curr_split.tree_index->split_dims.count(curr_split.split_coordinate) == 0) - { - - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { // consider all possible dimensions - - // create union of split coord, feature dim and dimensions of old tree - std::set curr_dims = curr_split.tree_index->split_dims; - curr_dims.insert(curr_split.split_coordinate); - curr_dims.insert(feature_dim); - curr_dims.erase(0); - - // skip if possible_split already exists - if (possibleExists(feature_dim, possible_splits, curr_dims)) - continue; - - // do not exceed maximum level of interaction - if (max_interaction >= 0 && curr_dims.size() > (size_t)max_interaction) - continue; - - // check if resulting tree already exists in family - std::shared_ptr found_tree = treeExists(curr_dims, curr_family); - - // update possible_splits if not already existing - if (found_tree) - { // if yes add pointer - possible_splits.insert(std::make_pair(feature_dim, found_tree)); - } - else - { // if not create new tree - curr_family.insert(std::make_pair(curr_dims, std::make_shared(DecisionTree(curr_dims)))); - possible_splits.insert(std::make_pair(feature_dim, curr_family[curr_dims])); + // update pools by mode + if (split_structure_mode_ == 0) { + std::set Dprime = curr_split.tree_index->split_dims; Dprime.insert(curr_split.split_coordinate); Dprime.erase(0); + if (!this->resultingTreeExists(possible_trees, Dprime)) { if (auto found = treeExists(Dprime, curr_family)) possible_trees.emplace_back(found); else { 
curr_family.insert({Dprime, std::make_shared(DecisionTree(Dprime))}); possible_trees.emplace_back(curr_family[Dprime]); } } + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set U = Dprime; U.insert(feature_dim); if (U.size() == Dprime.size()) continue; if (max_interaction >= 0 && U.size() > (size_t)max_interaction) continue; if (this->resultingTreeExists(possible_trees, U)) continue; if (auto found = treeExists(U, curr_family)) possible_trees.emplace_back(found); else { curr_family.insert({U, std::make_shared(DecisionTree(U))}); possible_trees.emplace_back(curr_family[U]); } + } + } else if (split_structure_mode_ == 3 || split_structure_mode_ == 4) { + // Leaf-level candidates are added after leaf construction below (we need indices) + } else { + if (curr_split.tree_index->split_dims.count(curr_split.split_coordinate) == 0) { + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set curr_dims = curr_split.tree_index->split_dims; curr_dims.insert(curr_split.split_coordinate); curr_dims.insert(feature_dim); curr_dims.erase(0); + if (possibleExists(feature_dim, possible_splits, curr_dims)) continue; + if (max_interaction >= 0 && curr_dims.size() > (size_t)max_interaction) continue; + if (auto found = treeExists(curr_dims, curr_family)) possible_splits.emplace_back(feature_dim, found, (size_t)0); + else { curr_family.insert({curr_dims, std::make_shared(DecisionTree(curr_dims))}); possible_splits.emplace_back(feature_dim, curr_family[curr_dims], (size_t)0); } } } } @@ -1328,7 +1070,7 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, siz std::shared_ptr found_tree = treeExists(resulting_dims, curr_family); // determine which tree is modified - if (curr_split.tree_index->split_dims.count(curr_split.split_coordinate)) + if ((curr_split.tree_index->split_dims.count(curr_split.split_coordinate))&& delete_leaves) { // if split variable is already in tree to be split // change values { @@ 
-1337,11 +1079,50 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, siz } *curr_split.leaf_index = leaf_b; // replace old interval curr_split.tree_index->leaves.push_back(leaf_s); // add new leaf + if (split_structure_mode_ == 3) { + size_t idx_b = (size_t)(curr_split.leaf_index - &curr_split.tree_index->leaves[0]); + size_t idx_s = curr_split.tree_index->leaves.size() - 1; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set res_dims_b = curr_split.tree_index->split_dims; res_dims_b.insert(feature_dim); res_dims_b.erase(0); + if (max_interaction < 0 || res_dims_b.size() <= (size_t)max_interaction) { + if (!this->leafCandidateExists(possible_splits, curr_split.tree_index, idx_b, feature_dim)) { + possible_splits.emplace_back(feature_dim, curr_split.tree_index, idx_b); + } + } + std::set res_dims_s = curr_split.tree_index->split_dims; res_dims_s.insert(feature_dim); res_dims_s.erase(0); + if (max_interaction < 0 || res_dims_s.size() <= (size_t)max_interaction) { + if (!this->leafCandidateExists(possible_splits, curr_split.tree_index, idx_s, feature_dim)) { + possible_splits.emplace_back(feature_dim, curr_split.tree_index, idx_s); + } + } + } + } } else { // otherwise + if (!found_tree) { + curr_family.insert({resulting_dims, std::make_shared(DecisionTree(resulting_dims))}); + found_tree = curr_family[resulting_dims]; + } found_tree->leaves.push_back(leaf_s); // append new leaves found_tree->leaves.push_back(leaf_b); + if (split_structure_mode_ == 3) { + size_t idx_s = found_tree->leaves.size() - 2; size_t idx_b = found_tree->leaves.size() - 1; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set res_dims_s = found_tree->split_dims; res_dims_s.insert(feature_dim); res_dims_s.erase(0); + if (max_interaction < 0 || res_dims_s.size() <= (size_t)max_interaction) { + if (!this->leafCandidateExists(possible_splits, found_tree, idx_s, feature_dim)) { + 
possible_splits.emplace_back(feature_dim, found_tree, idx_s); + } + } + std::set res_dims_b = found_tree->split_dims; res_dims_b.insert(feature_dim); res_dims_b.erase(0); + if (max_interaction < 0 || res_dims_b.size() <= (size_t)max_interaction) { + if (!this->leafCandidateExists(possible_splits, found_tree, idx_b, feature_dim)) { + possible_splits.emplace_back(feature_dim, found_tree, idx_b); + } + } + } + } } } } @@ -1367,76 +1148,8 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, siz // fit forest to new data void ClassificationRPF::fit() { - - // setup initial set of individuals - std::vector initial_individuals(sample_size); - std::iota(initial_individuals.begin(), initial_individuals.end(), 0); - - // initialize intervals with lower and upper bounds - std::vector initial_intervals(feature_size); - for (int i = 0; i < feature_size; ++i) - initial_intervals[i] = Interval{lower_bounds[i], upper_bounds[i]}; - - // set properties of first leaf - Leaf initial_leaf; - { - initial_leaf.value = std::vector(value_size, 0); - initial_leaf.individuals = initial_individuals; - initial_leaf.intervals = initial_intervals; - } - std::vector initial_leaves{initial_leaf}; // vector with initial leaf - - // initialize tree families - this->tree_families = std::vector(n_trees); - - // Loop over number of tree families and dispatch threads in batches - // of nhreads at once - if (nthreads > 1) - { - if (nthreads > std::thread::hardware_concurrency()) - { - Rcout << "Requested " << nthreads << " threads but only " << std::thread::hardware_concurrency() << " available" << std::endl; - } - // Create local thread count to not overwrite nthreads, - // would get reported wrongly by get_parameters() - unsigned int current_threads = nthreads; - for (int n = 0; n < n_trees; n += current_threads) - { - if (n >= (n_trees - current_threads + 1)) - { - current_threads = n_trees % current_threads; - } - - std::vector threads(current_threads); - for (int t = 0; t < 
current_threads; ++t) - { - // Rcout << "Dispatching thread " << (n + t + 1) << "/" << n_trees << std::endl; - threads[t] = std::thread(&ClassificationRPF::create_tree_family, this, std::ref(initial_leaves), n + t); - } - for (auto &t : threads) - { - if (t.joinable()) - t.join(); - } - } - } - else - { - for (int n = 0; n < n_trees; ++n) - { - create_tree_family(initial_leaves, n); - } - } - - // optionally purify tree - if (purify_forest) - { - this->purify_3(); - } - else - { - purified = false; - } + // Use the base class multithreaded trainer with RNG seeding identical to regression + RandomPlantedForest::fit(); } /* retrospectively change parameters of existing class object, @@ -1529,6 +1242,14 @@ void ClassificationRPF::set_parameters(StringVector keys, NumericVector values) { this->epsilon = values[i]; } + else if (keys[i] == "split_decay_rate") + { + this->split_decay_rate_ = values[i]; + } + else if (keys[i] == "max_candidates") + { + this->max_candidates_ = static_cast(values[i]); + } else { Rcout << "Unkown parameter key '" << keys[i] << "' ." << std::endl; diff --git a/src/lib/helper.cpp b/src/lib/helper.cpp index 83ba046..09a595a 100644 --- a/src/lib/helper.cpp +++ b/src/lib/helper.cpp @@ -1,4 +1,5 @@ #include "helper.hpp" +#include "internal_utils.hpp" using namespace Rcpp; @@ -7,7 +8,7 @@ namespace utils { // Helper function to generate random number using R's RNG // this replaces the previous randWrapper and later use of std::random_shuffle, // as the latter is removed in C++17 and I couldn't figure out an easy replacement. 
-int random_index(const int n) { return static_cast(R::runif(0, 1) * n); } +int random_index(const int n) { return static_cast(rpf_utils::rng_runif01() * n); } // ----------------- functions for converting R and Cpp types ----------------- @@ -70,8 +71,20 @@ std::vector to_std_vec(Rcpp::NumericVector rv) { } std::vector> to_std_vec(Rcpp::NumericMatrix rv) { - std::vector> X; - for(int i=0; i> X((size_t)rows, std::vector((size_t)cols)); + if (rows == 0 || cols == 0) return X; + const double *data = rv.begin(); // column-major, column stride = rows + for (int j = 0; j < cols; ++j) { + const double *colptr = data + (size_t)j * (size_t)rows; + for (int i = 0; i < rows; ++i) { + X[(size_t)i][(size_t)j] = colptr[(size_t)i]; + } + } return X; } @@ -84,4 +97,4 @@ std::set to_std_set(Rcpp::IntegerVector rv) { } -} \ No newline at end of file +} diff --git a/src/lib/internal_utils.cpp b/src/lib/internal_utils.cpp new file mode 100644 index 0000000..9c94b59 --- /dev/null +++ b/src/lib/internal_utils.cpp @@ -0,0 +1,233 @@ +// Internal utilities for split sampling, RNG, caching, and prefix sums shared +// across split modes and both regression/classification flows. +// +// These helpers centralize frequently reused logic and are intentionally kept +// low-level and stateless, using thread-local state only for RNG where needed. 
+#include "internal_utils.hpp" +#include +#include + +namespace { + // Thread-local RNG pointer used in worker threads for reproducible randomness + thread_local std::mt19937_64* tls_rng_ptr = nullptr; +} + +namespace rpf_utils { +void fenwick_add(std::vector &bit, size_t idx1, double delta) +{ + // bit is 1-based; idx1 in [1, bit.size()] + size_t n = bit.size(); + while (idx1 <= n) { bit[idx1 - 1] += delta; idx1 += idx1 & (~idx1 + 1); } +} + +size_t fenwick_find_by_prefix(const std::vector &bit, double target) +{ + // Return smallest i such that sum(i) >= target; 1-based index + size_t n = bit.size(); + size_t idx = 0; double sum = 0.0; + // Largest power of two <= n + size_t step = 1ULL << (63 - __builtin_clzll((unsigned long long)std::max(1, n))); + while (step) { + size_t next = idx + step; if (next <= n) { + double val = bit[next - 1]; + if (sum + val < target) { sum += val; idx = next; } + } + step >>= 1; + } + return std::min(n, idx + 1); +} + +std::mt19937_64* swap_tls_rng(std::mt19937_64* new_ptr) +{ + std::mt19937_64* old = tls_rng_ptr; + tls_rng_ptr = new_ptr; + return old; +} + +double rng_runif01() +{ + if (tls_rng_ptr) { + return std::generate_canonical(*tls_rng_ptr); + } + static thread_local std::mt19937_64 fallback_rng(0x9E3779B97F4A7C15ULL); + return std::generate_canonical(fallback_rng); +} + +double rng_runif(double a, double b) +{ + double u = rng_runif01(); + return a + u * (b - a); +} + +int rng_randint(int left_inclusive, int right_exclusive) +{ + if (right_exclusive <= left_inclusive) return left_inclusive; + if (tls_rng_ptr) { + std::uniform_int_distribution dist(left_inclusive, right_exclusive - 1); + return dist(*tls_rng_ptr); + } + static thread_local std::mt19937_64 fallback_rng(0xD1B54A32D192ED03ULL); + std::uniform_int_distribution dist(left_inclusive, right_exclusive - 1); + return dist(fallback_rng); +} + +void ensure_order_and_sorted_vals_for_leaf( + const std::vector> &X, + Leaf &leaf, + int k, + std::vector &order_out, + 
std::vector &sorted_vals_out) +{ + const size_t m = leaf.individuals.size(); + if (leaf.order_cache.count(k) && leaf.order_cache[k].size() == m) { + order_out = leaf.order_cache[k]; + } else { + order_out.resize(m); + std::iota(order_out.begin(), order_out.end(), 0); + std::stable_sort(order_out.begin(), order_out.end(), [&](size_t a, size_t b){ + return X[leaf.individuals[a]][k] < X[leaf.individuals[b]][k]; + }); + leaf.order_cache[k] = order_out; + } + if (leaf.sorted_vals_cache.count(k) && leaf.sorted_vals_cache[k].size() == m) { + sorted_vals_out = leaf.sorted_vals_cache[k]; + } else { + sorted_vals_out.resize(m); + for (size_t i = 0; i < m; ++i) + sorted_vals_out[i] = X[leaf.individuals[order_out[i]]][k]; + leaf.sorted_vals_cache[k] = sorted_vals_out; + } +} + +std::vector compute_unique_sorted_values(const std::vector &sorted_vals) +{ + std::vector unique; + unique.reserve(sorted_vals.size()); + if (!sorted_vals.empty()) { + unique.push_back(sorted_vals[0]); + for (size_t i = 1; i < sorted_vals.size(); ++i) + if (sorted_vals[i] != unique.back()) unique.push_back(sorted_vals[i]); + } + return unique; +} + +void build_prefix_and_total_given_order( + const std::vector> &Y, + const Leaf &leaf, + const std::vector &order, + size_t value_size, + std::vector> &prefix_out, + std::vector &total_out) +{ + const size_t m = leaf.individuals.size(); + prefix_out.assign(value_size, std::vector(m, 0.0)); + for (size_t p = 0; p < value_size; ++p) { + double acc = 0.0; + for (size_t i = 0; i < m; ++i) { + acc += Y[leaf.individuals[order[i]]][p]; + prefix_out[p][i] = acc; + } + } + total_out.assign(value_size, 0.0); + for (size_t p = 0; p < value_size; ++p) + total_out[p] = prefix_out[p][m - 1]; +} + +void finalize_split_from_sums( + Split &winner, + const std::vector> &X, + size_t value_size) +{ + if (std::isinf(winner.min_sum) || winner.leaf_index == nullptr) return; + const int kfin = winner.split_coordinate - 1; + Leaf &leaf_fin = *winner.leaf_index; + const double sp_fin 
= winner.split_point; + winner.I_s.clear(); winner.I_b.clear(); + for (int ind : leaf_fin.individuals) { + if (X[ind][kfin] < sp_fin) winner.I_s.push_back(ind); else winner.I_b.push_back(ind); + } + winner.M_s.assign(value_size, 0.0); + winner.M_b.assign(value_size, 0.0); + if (!winner.I_s.empty()) for (size_t p = 0; p < value_size; ++p) + winner.M_s[p] = winner.sum_s[p] / static_cast(winner.I_s.size()); + if (!winner.I_b.empty()) for (size_t p = 0; p < value_size; ++p) + winner.M_b[p] = winner.sum_b[p] / static_cast(winner.I_b.size()); +} + +std::vector sample_weighted_indices_filtered( + const std::vector &weights, + size_t n_candidates) +{ + std::vector pos_idx; pos_idx.reserve(weights.size()); + std::vector pos_w; pos_w.reserve(weights.size()); + for (size_t i = 0; i < weights.size(); ++i) if (weights[i] > 0.0) { pos_idx.push_back(i); pos_w.push_back(weights[i]); } + const size_t P = pos_idx.size(); + std::vector sample_idxs; sample_idxs.reserve(n_candidates); + if (P == 0) { + std::vector all(weights.size()); std::iota(all.begin(), all.end(), 0); + size_t k = std::min(n_candidates, all.size()); + for (size_t i = 0; i < k; ++i) { + size_t j = i + static_cast(rng_runif01() * (double)(all.size() - i)); + if (j >= all.size()) j = all.size() - 1; std::swap(all[i], all[j]); + } + for (size_t i = 0; i < k; ++i) sample_idxs.push_back(all[i]); + } else { + size_t k = std::min(n_candidates, P); + std::vector> keys; keys.reserve(P); + for (size_t i = 0; i < P; ++i) { + double u = rng_runif01(); if (u <= 0.0) u = std::numeric_limits::min(); + double key = -std::log(u) / pos_w[i]; keys.emplace_back(key, pos_idx[i]); + } + if (k < keys.size()) { + std::nth_element(keys.begin(), keys.begin() + k, keys.end(), [](const auto& a, const auto& b){ return a.first < b.first; }); + keys.resize(k); + } + for (auto &kv : keys) sample_idxs.push_back(kv.second); + } + return sample_idxs; +} + +std::vector compute_even_spread_indices(int left_inclusive, int right_exclusive, size_t 
max_draws) +{ + std::vector result; + int range = right_exclusive - left_inclusive; if (range <= 0) return result; + size_t draws = std::min(max_draws, static_cast(range)); + if (draws == 0) return result; + result.reserve(draws); + for (size_t j = 1; j <= draws; ++j) { + int pos = left_inclusive + static_cast(std::floor((static_cast(j) * range) / static_cast(draws + 1))); + if (pos < left_inclusive) pos = left_inclusive; + if (pos >= right_exclusive) pos = right_exclusive - 1; + if (!result.empty() && pos <= result.back()) pos = std::min(right_exclusive - 1, result.back() + 1); + result.push_back(pos); + } + return result; +} + +std::vector sample_unique_ints_uniform_R(int left_inclusive, int right_exclusive, size_t k) +{ + std::vector result; int range = right_exclusive - left_inclusive; if (range <= 0) return result; + k = std::min(k, static_cast(range)); + if (k == 0) return result; + if (k * 4 >= static_cast(range)) { + std::vector all(range); std::iota(all.begin(), all.end(), left_inclusive); + for (size_t i = 0; i < k; ++i) { + size_t j = i + static_cast(rng_runif01() * (double)(all.size() - i)); if (j >= all.size()) j = all.size() - 1; + std::swap(all[i], all[j]); + } + result.assign(all.begin(), all.begin() + static_cast(k)); + std::sort(result.begin(), result.end()); + return result; + } + std::unordered_set used; result.reserve(k); + while (result.size() < k) { + int s = rng_randint(left_inclusive, right_exclusive); if (s >= right_exclusive) s = right_exclusive - 1; + if (used.insert(s).second) result.push_back(s); + } + std::sort(result.begin(), result.end()); + return result; +} + +} // namespace rpf_utils + + diff --git a/src/lib/losses_exponential.cpp b/src/lib/losses_exponential.cpp new file mode 100644 index 0000000..68f5cd3 --- /dev/null +++ b/src/lib/losses_exponential.cpp @@ -0,0 +1,94 @@ +// Classification losses: Exponential family variants. Extracted from cpf.cpp. 
+#include "cpf.hpp" + +void ClassificationRPF::exponential_loss(Split &split) +{ + split.min_sum = 0; + split.M_s = std::vector(value_size, 0); + split.M_b = std::vector(value_size, 0); + std::vector W_s_sum(value_size, 0), W_b_sum(value_size, 0), sum_s(value_size, 0), sum_b(value_size, 0); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) W_s_sum[p] += (*split.W)[individual][p]; + for (auto individual : split.I_b) W_b_sum[p] += (*split.W)[individual][p]; + for (auto individual : split.I_s) sum_s[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_s_sum[p]); + for (auto individual : split.I_b) sum_b[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_b_sum[p]); + split.M_s[p] = sum_s[p]; split.M_b[p] = sum_b[p]; + sum_s[p] = std::min(std::max(delta, sum_s[p]), 1 - delta); + sum_b[p] = std::min(std::max(delta, sum_b[p]), 1 - delta); + } + split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); + split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); + double sum_sp = std::min(std::max(delta, split.M_sp), 1 - delta); + double sum_bp = std::min(std::max(delta, split.M_bp), 1 - delta); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_s[p] / sum_sp)); + for (auto individual : split.I_b) split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_b[p] / sum_bp)); + split.min_sum -= W_s_sum[p] + W_b_sum[p]; + } + for (const auto &s : W_s_sum) if (s == 0) split.min_sum = INF; + for (const auto &s : W_b_sum) if (s == 0) split.min_sum = INF; + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + +void ClassificationRPF::exponential_loss_2(Split &split) +{ + split.min_sum = 0; + std::vector W_s_sum(value_size, 0), W_b_sum(value_size, 0), sum_s(value_size, 0), sum_b(value_size, 0), sum_s2(value_size, 
0), sum_b2(value_size, 0); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) W_s_sum[p] += (*split.W)[individual][p]; + for (auto individual : split.I_b) W_b_sum[p] += (*split.W)[individual][p]; + for (auto individual : split.I_s) sum_s[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_s_sum[p]); + for (auto individual : split.I_b) sum_b[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_b_sum[p]); + split.M_s[p] = sum_s[p]; split.M_b[p] = sum_b[p]; + sum_s2[p] = std::max(delta, 1 - sum_s[p]); sum_b2[p] = std::max(delta, 1 - sum_b[p]); + sum_s[p] = std::max(delta, sum_s[p]); sum_b[p] = std::max(delta, sum_b[p]); + } + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_s[p] / sum_s2[p])); + for (auto individual : split.I_b) split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_b[p] / sum_b2[p])); + split.min_sum -= W_s_sum[p] + W_b_sum[p]; + } + for (const auto &s : W_s_sum) if (s == 0) split.min_sum = INF; + for (const auto &s : W_b_sum) if (s == 0) split.min_sum = INF; + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + +void ClassificationRPF::exponential_loss_3(Split &split) +{ + split.min_sum = 0; + split.M_s = std::vector(value_size, 0); + split.M_b = std::vector(value_size, 0); + std::vector W_s_sum(value_size, 0), W_b_sum(value_size, 0), sum_s(value_size, 0), sum_b(value_size, 0); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) W_s_sum[p] += (*split.W)[individual][p]; + for (auto individual : split.I_b) W_b_sum[p] += (*split.W)[individual][p]; + for (auto individual : split.I_s) sum_s[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_s_sum[p]); + for (auto individual : split.I_b) sum_b[p] += (((*split.Y)[individual][p] + 1) / 2) * 
((*split.W)[individual][p] / W_b_sum[p]); + split.M_s[p] = sum_s[p]; split.M_b[p] = sum_b[p]; + sum_s[p] = std::max(delta, sum_s[p]); sum_b[p] = std::max(delta, sum_b[p]); + sum_s[p] = log(sum_s[p]); sum_b[p] = log(sum_b[p]); + } + split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); + split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); + double sum_sp = std::max(delta, split.M_sp), sum_bp = std::max(delta, split.M_bp); + sum_sp = log(sum_sp); sum_bp = log(sum_bp); + sum_sp += std::accumulate(sum_s.begin(), sum_s.end(), 0.0); + sum_bp += std::accumulate(sum_b.begin(), sum_b.end(), 0.0); + sum_sp = sum_sp / (sum_s.size() + 1); sum_bp = sum_bp / (sum_b.size() + 1); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * (sum_s[p] - sum_sp)); + for (auto individual : split.I_b) split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * (sum_b[p] - sum_bp)); + split.min_sum -= W_s_sum[p] + W_b_sum[p]; + } + for (const auto &s : W_s_sum) if (s == 0) split.min_sum = INF; + for (const auto &s : W_b_sum) if (s == 0) split.min_sum = INF; + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + + diff --git a/src/lib/losses_l1_l2_median.cpp b/src/lib/losses_l1_l2_median.cpp new file mode 100644 index 0000000..ded3202 --- /dev/null +++ b/src/lib/losses_l1_l2_median.cpp @@ -0,0 +1,32 @@ +// Classification losses: L1 and Median. Extracted from cpf.cpp. 
+#include "cpf.hpp" + +void ClassificationRPF::L1_loss(Split &split) +{ + split.min_sum = 0; + split.M_s = split.sum_s / split.I_s.size(); + split.M_b = split.sum_b / split.I_b.size(); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) + split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_s[p]) - std::fabs((*split.Y)[individual][p]); + for (auto individual : split.I_b) + split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_b[p]) - std::fabs((*split.Y)[individual][p]); + } +} + +void ClassificationRPF::median_loss(Split &split) +{ + split.min_sum = 0; + split.M_s = calcMedian(*split.Y, split.I_s); + split.M_b = calcMedian(*split.Y, split.I_b); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) + split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_s[p]) - std::fabs((*split.Y)[individual][p]); + for (auto individual : split.I_b) + split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_b[p]) - std::fabs((*split.Y)[individual][p]); + } +} + + diff --git a/src/lib/losses_logit.cpp b/src/lib/losses_logit.cpp new file mode 100644 index 0000000..6eccdd7 --- /dev/null +++ b/src/lib/losses_logit.cpp @@ -0,0 +1,130 @@ +// Classification losses: Logit family variants. Extracted from cpf.cpp. 
+#include "cpf.hpp" + +void ClassificationRPF::logit_loss(Split &split) +{ + split.min_sum = 0; + split.M_s = split.sum_s / split.I_s.size(); + split.M_b = split.sum_b / split.I_b.size(); + split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); + split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); + std::vector M_s = split.M_s, M_b = split.M_b; + std::for_each(M_s.begin(), M_s.end(), [this](double &M){ M = std::min(std::max(delta, M), 1 - delta); }); + std::for_each(M_b.begin(), M_b.end(), [this](double &M){ M = std::min(std::max(delta, M), 1 - delta); }); + double M_sp = std::min(std::max(delta, split.M_sp), 1 - delta); + double M_bp = std::min(std::max(delta, split.M_bp), 1 - delta); + std::vector W_s_mean = calcMean(*split.W, split.I_s); + std::vector W_b_mean = calcMean(*split.W, split.I_b); + std::vector> W = *split.W, W_new = *split.W; + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p] + log(M_s[p] / M_sp) - W_s_mean[p]); } + for (auto individual : split.I_b) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p] + log(M_b[p] / M_bp) - W_b_mean[p]); } + } + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); } + for (auto individual : split.I_b) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); } + } 
+ for (auto individual : split.I_s) { split.min_sum += (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); split.min_sum -= (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); } + for (auto individual : split.I_b) { split.min_sum += (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); split.min_sum -= (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); } + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + +void ClassificationRPF::logit_loss_2(Split &split) +{ + split.min_sum = 0; + split.M_s = split.sum_s / split.I_s.size(); + split.M_b = split.sum_b / split.I_b.size(); + std::vector M_s = split.M_s, M_b = split.M_b; + std::vector M_s2 = split.M_s, M_b2 = split.M_b; + std::for_each(M_s.begin(), M_s.end(), [this](double &M){ M = std::max(delta, M); }); + std::for_each(M_b.begin(), M_b.end(), [this](double &M){ M = std::max(delta, M); }); + std::for_each(M_s2.begin(), M_s2.end(), [this](double &M){ M = std::max(delta, 1 - M); }); + std::for_each(M_b2.begin(), M_b2.end(), [this](double &M){ M = std::max(delta, 1 - M); }); + std::vector W_s_mean = calcMean(*split.W, split.I_s); + std::vector W_b_mean = calcMean(*split.W, split.I_b); + std::vector> W = *split.W, W_new = *split.W; + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p] + log(M_s[p] / M_s2[p]) - W_s_mean[p]); } + for (auto individual : split.I_b) { W[individual][p] = exp(W[individual][p]); 
W_new[individual][p] = exp(W_new[individual][p] + log(M_b[p] / M_b2[p]) - W_b_mean[p]); } + } + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); } + for (auto individual : split.I_b) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); } + } + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + +void ClassificationRPF::logit_loss_3(Split &split) +{ + split.min_sum = 0; + split.M_s = split.sum_s / split.I_s.size(); + split.M_b = split.sum_b / split.I_b.size(); + split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); + split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); + std::vector M_s = split.M_s, M_b = split.M_b; + std::for_each(M_s.begin(), M_s.end(), [this](double &M){ M = std::max(delta, M); }); + std::for_each(M_b.begin(), M_b.end(), [this](double &M){ M = std::max(delta, M); }); + std::for_each(M_s.begin(), M_s.end(), [&](double &M){ M = log(M); }); + std::for_each(M_b.begin(), M_b.end(), [&](double &M){ M = log(M); }); + double M_sp = std::max(delta, split.M_sp); + double M_bp = std::max(delta, split.M_bp); + M_sp = log(M_sp); + M_bp = log(M_bp); + double sum_s = (std::accumulate(M_s.begin(), M_s.end(), 0.0) + M_sp) / (M_s.size() + 1); + double sum_b = (std::accumulate(M_b.begin(), M_b.end(), 0.0) + M_bp) / (M_b.size() + 1); + std::vector W_s_mean = calcMean(*split.W, split.I_s); + std::vector W_b_mean = calcMean(*split.W, split.I_b); + std::vector> W = *split.W, W_new = *split.W; + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { W_new[individual][p] = W_new[individual][p] + M_s[p] - sum_s - W_s_mean[p]; } + for 
(auto individual : split.I_b) { W_new[individual][p] = W_new[individual][p] + M_b[p] - sum_b - W_b_mean[p]; } + } + std::vector W_sp, W_bp, W_sp_new, W_bp_new, Y_sp, Y_bp; + for (auto individual : split.I_s) { W_sp.push_back(-accumulate(W[individual].begin(), W[individual].end(), 0.0)); W_sp_new.push_back(-accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0)); Y_sp.push_back(1 - accumulate(Y[individual].begin(), Y[individual].end(), 0.0)); } + for (auto individual : split.I_b) { W_bp.push_back(-accumulate(W[individual].begin(), W[individual].end(), 0.0)); W_bp_new.push_back(-accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0)); Y_bp.push_back(1 - accumulate(Y[individual].begin(), Y[individual].end(), 0.0)); } + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p]); } + for (auto individual : split.I_b) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p]); } + } + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); } + for (auto individual : split.I_b) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); } + } + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + +void ClassificationRPF::logit_loss_4(Split &split) +{ + split.min_sum = 0; + split.M_s = split.sum_s / split.I_s.size(); + split.M_b = split.sum_b / split.I_b.size(); + 
std::vector M_s = split.M_s, M_b = split.M_b; + std::vector M_s2 = split.M_s, M_b2 = split.M_b; + std::for_each(M_s.begin(), M_s.end(), [this](double &M){ M = std::max(delta, M); }); + std::for_each(M_b.begin(), M_b.end(), [this](double &M){ M = std::max(delta, M); }); + std::for_each(M_s2.begin(), M_s2.end(), [this](double &M){ M = std::max(delta, 1 - M); }); + std::for_each(M_b2.begin(), M_b2.end(), [this](double &M){ M = std::max(delta, 1 - M); }); + std::vector W_s_mean = calcMean(*split.W, split.I_s); + std::vector W_b_mean = calcMean(*split.W, split.I_b); + std::vector> W = *split.W, W_new = *split.W; + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p] + log(M_s[p] / M_s2[p]) - W_s_mean[p]); } + for (auto individual : split.I_b) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p] + log(M_b[p] / M_b2[p]) - W_b_mean[p]); } + } + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); } + for (auto individual : split.I_b) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); } + } + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + + diff --git a/src/lib/predict.cpp b/src/lib/predict.cpp new file mode 100644 index 0000000..a158fd1 --- /dev/null +++ b/src/lib/predict.cpp @@ -0,0 +1,232 @@ +// Prediction entry points split out from rpf.cpp for readability and reuse. 
+#include "rpf.hpp" +#include +#include + +// predict single feature vector +std::vector RandomPlantedForest::predict_single(const std::vector &X, std::set component_index) +{ + std::vector total_res = std::vector(value_size, 0); + + if (!purified) + { + // consider all components + if (component_index == std::set{0}) + { + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) + { + for (auto &leaf : tree.second->leaves) + { + bool valid = true; + for (auto &dim : tree.first) + { + if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) && (leaf.intervals[std::max(0, dim - 1)].second > X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) + { + valid = false; + break; + } + } + if (valid) + { + for (size_t p = 0; p < value_size && p < leaf.value.size(); ++p) + { + total_res[p] += leaf.value[p]; + } + } + } + } + } + } + else + { // choose components for prediction + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) + { + // only consider trees with same dimensions as component_index + if (tree.first != component_index) + continue; + + std::vector dims; + for (auto dim : tree.first) + { + dims.push_back(dim); + } + + for (auto &leaf : tree.second->leaves) + { + bool valid = true; + for (unsigned int i = 0; i < dims.size(); ++i) + { + int dim = dims[i]; + if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[i] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) && (leaf.intervals[std::max(0, dim - 1)].second > X[i] || leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) + { + valid = false; + break; + } + } + if (valid) + { + for (size_t p = 0; p < value_size && p < leaf.value.size(); ++p) + { + total_res[p] += leaf.value[p]; + } + } + } + } + } + } + } + else + { + if 
(component_index == std::set{-1}) + { + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) + { + std::vector leaf_index(tree.first.size(), -1); + if (tree.first == std::set{0}) + { + leaf_index = std::vector(tree.first.size(), 0); + + const auto &vals = tree.second->GridLeaves.values[leaf_index]; + for (size_t p = 0; p < value_size && p < vals.size(); ++p) + { + total_res[p] += vals[p]; + } + } + } + } + } + else if (component_index == std::set{0}) + { + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) + { + std::vector leaf_index(tree.first.size(), -1); + if (tree.first == std::set{0}) + { + leaf_index = std::vector(tree.first.size(), 0); + } + else + { + for (size_t dim_index = 0; dim_index < tree.first.size(); ++dim_index) + { + int dim = 0; + { + auto dim_pnt = tree.first.begin(); + std::advance(dim_pnt, dim_index); + dim = *dim_pnt; + --dim; // convert to 0-based original feature index + } + auto &bounds = tree.second->GridLeaves.lim_list[dim]; + if (bounds.size() < 2) + { + leaf_index[dim_index] = 0; + continue; + } + // Use the original feature index into X, not the position within the tree's dim set + auto it = std::upper_bound(bounds.begin(), bounds.end(), X[dim]); + int c = static_cast(std::distance(bounds.begin(), it)); + leaf_index[dim_index] = std::min(std::max(0, c - 1), (int)bounds.size() - 2); + } + } + for (int &index : leaf_index) index = std::max(0, index); + { + const auto &vals = tree.second->GridLeaves.values[leaf_index]; + for (size_t p = 0; p < value_size && p < vals.size(); ++p) + { + total_res[p] += vals[p]; + } + } + } + } + } + else + { + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) + { + if (tree.first != component_index) + continue; + std::vector leaf_index(tree.first.size(), -1); + if (tree.first == std::set{0}) + { + leaf_index = std::vector(tree.first.size(), 0); + } + else + { + for (size_t dim_index = 0; dim_index < 
tree.first.size(); ++dim_index) + { + int dim = 0; + { + auto dim_pnt = tree.first.begin(); + std::advance(dim_pnt, dim_index); + dim = *dim_pnt; + --dim; // 0-based original feature index for bounds lookup only + } + auto &bounds = tree.second->GridLeaves.lim_list[dim]; + if (bounds.size() < 2) + { + leaf_index[dim_index] = 0; + continue; + } + // For component-specific prediction, X contains only the selected dims in ascending order. + // Use the position within the selected dims (dim_index) to read the value. + auto it = std::upper_bound(bounds.begin(), bounds.end(), X[dim_index]); + int c = static_cast(std::distance(bounds.begin(), it)); + leaf_index[dim_index] = std::min(std::max(0, c - 1), (int)bounds.size() - 2); + } + } + for (int &index : leaf_index) index = std::max(0, index); + { + const auto &vals = tree.second->GridLeaves.values[leaf_index]; + for (size_t p = 0; p < value_size && p < vals.size(); ++p) + { + total_res[p] += vals[p]; + } + } + } + } + } + } + + return total_res / n_trees; +} + +// predict multiple feature vectors +Rcpp::NumericMatrix RandomPlantedForest::predict_matrix(const NumericMatrix &X, const NumericVector components) +{ + std::vector> feature_vec = to_std_vec(X); + std::set component_index = to_std_set(components); + std::vector> predictions; + if (feature_vec.empty()) + throw std::invalid_argument("Feature vector is empty."); + if (component_index == std::set{0} && this->feature_size >= 0 && feature_vec[0].size() != (size_t)this->feature_size) + throw std::invalid_argument("Feature vector has wrong dimension."); + if (component_index != std::set{0} && component_index != std::set{-1} && component_index.size() != feature_vec[0].size()) + throw std::invalid_argument("The input X has the wrong dimension in order to calculate f_i(x)"); + for (auto &vec : feature_vec) + { + predictions.push_back(predict_single(vec, component_index)); + } + return from_std_vec(predictions); +} + +Rcpp::NumericMatrix 
RandomPlantedForest::predict_vector(const NumericVector &X, const NumericVector components) +{ + std::vector feature_vec = to_std_vec(X); + std::set component_index = to_std_set(components); + std::vector> predictions; Rcpp::NumericMatrix res; + if (feature_vec.empty()) { Rcout << "Feature vector is empty." << std::endl; return res; } + if (component_index == std::set{0} && this->feature_size >= 0 && feature_vec.size() != (size_t)this->feature_size) { Rcout << "Feature vector has wrong dimension." << std::endl; return res; } + if (component_index == std::set{0}) { predictions.push_back(predict_single(feature_vec, component_index)); } + else { for (auto vec : feature_vec) predictions.push_back(predict_single(std::vector{vec}, component_index)); } + res = from_std_vec(predictions); return res; +} + diff --git a/src/lib/purify.cpp b/src/lib/purify.cpp new file mode 100644 index 0000000..db0a4da --- /dev/null +++ b/src/lib/purify.cpp @@ -0,0 +1,1297 @@ +#include "rpf.hpp" +#include "kdtree.hpp" +#include "diffbuf.hpp" +#include +#include +#include + +// Generates the next combination of k indices from a set of n elements. 
+static inline bool next_combination(std::vector &p, int n) +{ + int k = (int)p.size(); + for (int i = k - 1; i >= 0; --i) + { + if (p[i] < n - k + i) + { + p[i]++; + for (int j = i + 1; j < k; ++j) + { + p[j] = p[j - 1] + 1; + } + return true; + } + } + return false; +} + +void RandomPlantedForest::purify_1() +{ + + // go through all n_trees families + for (auto &curr_family : this->tree_families) + { + + // recap maximum number of dimensions of current family + unsigned int curr_max = 0; + for (auto tree : curr_family) + { + if (tree.first.size() > curr_max) + curr_max = tree.first.size(); + } + + while (curr_max >= 1) + { + + // go through split dimensions of all trees + auto keys = getKeys(curr_family); + std::vector>::reverse_iterator key = keys.rbegin(); + while (key != keys.rend()) + { + + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + + // check if number of dims same as current max_interaction + if (curr_dims.size() == curr_max) + { + + // go through feature dims + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + + // continue only if dim in current tree + if (curr_tree->split_dims.count(feature_dim) != 0) + { + + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree + + // check if tree with dimensions exists, if not create + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (curr_max == 1) + { + tree = curr_family[std::set{0}]; + } + else + { + if (!tree) + { + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + tree = curr_family[tree_dims]; + } + } + + // go through leaves of current tree + int n_leaves = curr_tree->leaves.size(); + for (int l = 0; l < n_leaves; ++l) + { + auto &curr_leaf = curr_tree->leaves[l]; + + double multiplier = (curr_leaf.intervals[feature_dim - 1].second - curr_leaf.intervals[feature_dim - 1].first) / 
(upper_bounds[feature_dim - 1] - lower_bounds[feature_dim - 1]); + + // new leaf including intervals and value + Leaf new_leaf = curr_leaf; // initialize intervals with first leaf + new_leaf.intervals[feature_dim - 1].first = lower_bounds[feature_dim - 1]; + new_leaf.intervals[feature_dim - 1].second = upper_bounds[feature_dim - 1]; + for (size_t i = 0; i < value_size; ++i) + new_leaf.value[i] = -curr_leaf.value[i] * multiplier; // update value of new leaf + + // append new leaf + if (!leafExists(new_leaf.intervals, curr_tree)) + curr_tree->leaves.push_back(new_leaf); + for (size_t i = 0; i < value_size; ++i) + new_leaf.value[i] = curr_leaf.value[i] * multiplier; // update value of new leaf + if (!leafExists(new_leaf.intervals, tree)) + tree->leaves.push_back(new_leaf); + } + } + } + } + key++; + } + + // update currently considered dimension size + --curr_max; + } + } + + purified = true; +} + + +void RandomPlantedForest::purify_fast_exact_family(TreeFamily &curr_family, int maxp_interaction) +{ + // Normalize cap: treat 0 (or out-of-range) as full order p = feature_size + if (maxp_interaction <= 0 || maxp_interaction > feature_size) maxp_interaction = feature_size; + + // Portable 32-bit popcount to avoid compiler-specific builtins + auto popcount32 = [](unsigned int x) -> int { + x = x - ((x >> 1) & 0x55555555u); + x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u); + return (int)((((x + (x >> 4)) & 0x0F0F0F0Fu) * 0x01010101u) >> 24); + }; + auto nextDown = [](double x) { return std::nextafter(x, -std::numeric_limits::infinity()); }; + + // 0) Ensure all subset components exist in the family (sources and targets) + { + auto base_keys = getKeys(curr_family); + for (const auto &T : base_keys) { + if (T == std::set{0}) continue; + std::vector dims; dims.reserve(T.size()); + for (int d : T) dims.push_back(d); + int k = (int)dims.size(); + for (int mask = 1; mask < (1 << k); ++mask) { + if (maxp_interaction > 0) { + int bits = popcount32((unsigned)mask); + if (bits > 
maxp_interaction) continue; + } + std::set S; + for (int b = 0; b < k; ++b) if (mask & (1 << b)) S.insert(dims[b]); + if (curr_family.find(S) == curr_family.end()) { + curr_family.insert({S, std::make_shared(DecisionTree(S))}); + } + } + } + if (curr_family.find(std::set{0}) == curr_family.end()) { + curr_family.insert({std::set{0}, std::make_shared(DecisionTree(std::set{0}))}); + } + } + + // 1) Build lim_list (unique cut endpoints per feature) + std::vector> lim_list(feature_size); + for (int d = 1; d <= feature_size; ++d) { + std::vector bounds; + for (const auto &kv : curr_family) { + if (!kv.first.count(d)) continue; + for (const auto &leaf : kv.second->leaves) { + bounds.push_back(leaf.intervals[d - 1].first); + bounds.push_back(leaf.intervals[d - 1].second); + } + } + std::sort(bounds.begin(), bounds.end()); + bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); + lim_list[d - 1] = bounds; + } + + // Precompute number of cells per feature (endpoints - 1), clamped at 0 + std::vector cells_by_dim(feature_size + 1, 0); + for (int d = 1; d <= feature_size; ++d) cells_by_dim[d] = std::max(0, (int)lim_list[d - 1].size() - 1); + + // 2) Prepare per-S diff buffers (emit only S with |S|<=maxp; keep intercept) + auto keys = getKeys(curr_family); + std::vector> S_vars; S_vars.reserve(keys.size()); + std::vector>> diff_S; diff_S.reserve(keys.size()); + std::vector intercept(value_size, 0.0); + for (const auto &S : keys) { + if (S != std::set{0} && maxp_interaction > 0 && (int)S.size() > maxp_interaction) continue; + S_vars.push_back(S); + if (S == std::set{0}) diff_S.emplace_back(rpf_diff::NDArray>(std::vector{1}, std::vector(value_size, 0))); + else { + std::vector diff_dims; diff_dims.reserve(S.size()); + for (int d : S) { int K = (int)lim_list[d - 1].size(); int cells = std::max(0, K - 1); diff_dims.push_back(cells + 1); } + diff_S.emplace_back(rpf_diff::NDArray>(diff_dims, std::vector(value_size, 0))); + } + } + + std::map, int, utils::setComp> 
s_index_map; for (size_t i = 0; i < S_vars.size(); ++i) s_index_map[S_vars[i]] = (int)i; + auto set_to_vec = [](const std::set &S){ std::vector v; v.reserve(S.size()); for (int x : S) v.push_back(x); return v; }; + + // 3) KD-tree over all samples + std::vector all_idx(sample_size); for (int i = 0; i < sample_size; ++i) all_idx[i] = i; + rpf_kd::KDTree kdt(&X, all_idx, feature_size); + + // Precompute tot(U) with half-open domain [front, back) + std::map, double, utils::setComp> tot_cache; + auto get_tot_for_U = [&](const std::set& U)->double { + auto it = tot_cache.find(U); if (it != tot_cache.end()) return it->second; + for (int u : U) if ((int)lim_list[u - 1].size() < 2) { tot_cache.insert({U, 0.0}); return 0.0; } + std::vector consU; consU.reserve(U.size()); + for (int u : U) { const auto &lims = lim_list[u - 1]; double lo = lims.front(); double hi = nextDown(lims.back()); consU.push_back({u - 1, lo, hi}); } + size_t cnt = consU.empty() ? (size_t)sample_size : kdt.range_count(consU); + double tot = (double)cnt; tot_cache.insert({U, tot}); return tot; + }; + + // Exact cache for KD range_count queries keyed by (dim, lo_idx, hi_idx) triples per constrained dim + // Key construction: 64-bit hash mixed from ordered triples to avoid building strings/sets + auto mix64 = [](unsigned long long x){ + x ^= x >> 33; x *= 0xff51afd7ed558ccdULL; x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL; x ^= x >> 33; return x; + }; + auto pack3 = [&](unsigned long long acc, int d, int lo, int hi){ + unsigned long long k = ((unsigned long long)(unsigned int)d << 32) ^ ((unsigned long long)(unsigned int)lo << 16) ^ (unsigned long long)(unsigned int)hi; + acc ^= mix64(k + 0x9e3779b97f4a7c15ULL + (acc<<6) + (acc>>2)); + return acc; + }; + std::unordered_map kd_cache; kd_cache.reserve(1u << 15); + + // 4) Accumulate leaf contributions from ALL trees T (any order), enumerating only S up to maxp + for (const auto &kv : curr_family) { + const std::set &T = kv.first; if (T == std::set{0}) continue; 
+ const auto &leaves = kv.second->get_leaves(); + std::vector Tvec = set_to_vec(T); const int tdim = (int)Tvec.size(); + // map from dimension id -> position index in Tvec + std::vector pos_in_T(feature_size + 1, -1); + for (int i = 0; i < tdim; ++i) pos_in_T[Tvec[i]] = i; + + for (const auto &leaf : leaves) { + // Pre-cache per-dim grid cell ranges for this leaf + std::vector lo_cached(feature_size + 1, 0), hi_cached(feature_size + 1, 0); + for (int d : T) { + const auto &lims = lim_list[d - 1]; int cells = std::max(0, (int)lims.size() - 1); + int k_low = (int)(std::lower_bound(lims.begin(), lims.end(), leaf.intervals[d - 1].first) - lims.begin()); + int ub = (int)(std::upper_bound(lims.begin(), lims.end(), leaf.intervals[d - 1].second) - lims.begin()); + int k_high_cell = std::max(0, ub - 2); + lo_cached[d] = std::max(0, k_low); + hi_cached[d] = std::min(cells, k_high_cell + 1); + } + + // Precompute per-dimension KD constraints for this leaf + std::vector rc_by_dim(feature_size + 1); + std::vector rc_ok(feature_size + 1, 0); + // Also store lim_list boundary indices for exact caching + std::vector rc_lo_idx(feature_size + 1, -1); + std::vector rc_hi_idx(feature_size + 1, -1); + for (int d : T) { + const auto &lims = lim_list[d - 1]; + if ((int)lims.size() < 2) { rc_ok[d] = 0; continue; } + double l = std::max(leaf.intervals[d - 1].first, lims.front()); + double r = std::min(leaf.intervals[d - 1].second, lims.back()); + double hi = nextDown(r); + if (!(hi >= l)) { rc_ok[d] = 0; } + else { + rc_by_dim[d] = {d - 1, l, hi}; rc_ok[d] = 1; + int lidx = (int)(std::lower_bound(lims.begin(), lims.end(), l) - lims.begin()); + int ridx = (int)(std::lower_bound(lims.begin(), lims.end(), r) - lims.begin()); + rc_lo_idx[d] = lidx; rc_hi_idx[d] = ridx; + } + } + + // Precompute E[f_T | X_j] only for j with |j| <= maxp_interaction by enumerating combinations + // Store by mask over positions in T (0..tdim-1) to avoid building a full 2^tdim table + std::unordered_map> 
contrib_by_mask; + contrib_by_mask.reserve(32u); + std::vector cons; cons.reserve((size_t)tdim); + + auto compute_for_j = [&](const std::vector &j_pos){ + // Build complement U = T \ j and corresponding KD constraints + std::set U; + U.clear(); + int jmask = 0; + std::vector is_in_j((size_t)tdim, 0); + for (int pos : j_pos) { if (pos >= 0 && pos < tdim) { is_in_j[(size_t)pos] = 1; jmask |= (1 << pos); } } + for (int b = 0; b < tdim; ++b) if (!is_in_j[(size_t)b]) U.insert(Tvec[b]); + cons.clear(); cons.reserve(U.size()); + bool empty_range = false; + // Build exact cache key from ordered (dim, lo_idx, hi_idx) + unsigned long long key = 1469598103934665603ULL; // FNV offset basis-ish seed + for (int u : U) { + if (!rc_ok[u]) { empty_range = true; break; } + cons.push_back(rc_by_dim[u]); + key = pack3(key, u - 1, rc_lo_idx[u], rc_hi_idx[u]); + } + size_t cnt = 0; + if (empty_range) cnt = 0; + else if (cons.empty()) cnt = (size_t)sample_size; + else { + auto kIt = kd_cache.find(key); + if (kIt != kd_cache.end()) cnt = kIt->second; + else { cnt = kdt.range_count(cons); kd_cache.emplace(key, cnt); } + } + double totU = get_tot_for_U(U); if (totU <= 0.0) return; + contrib_by_mask[jmask] = ((double)cnt / totU) * leaf.value; + }; + + // j size = 0 + compute_for_j(std::vector{}); + // j sizes 1..min(tdim, maxp_interaction) + int maxk = std::min(tdim, maxp_interaction); + for (int k = 1; k <= maxk; ++k) { + std::vector p(k); for (int i = 0; i < k; ++i) p[i] = i; + do { compute_for_j(p); } while (next_combination(p, tdim)); + } + + // Efficiently iterate directly over target subsets S up to size maxp_interaction + for (int k = 0; k <= std::min(tdim, maxp_interaction); ++k) { + std::vector p(k); for (int i = 0; i < k; ++i) p[i] = i; + if (k == 0) { + // j = {} corresponds to mask 0 + auto it0 = contrib_by_mask.find(0); + if (it0 != contrib_by_mask.end()) intercept += it0->second; + } else { + do { + std::set S; for (int idx : p) S.insert(Tvec[idx]); + // Inclusion-exclusion 
over all j subset S, writing per-term rectangles + auto itS = s_index_map.find(S); if (itS == s_index_map.end()) { /* nothing to write */ } + else { + int s_idx = itS->second; + std::vector Svec = set_to_vec(S); const int s_dim = (int)Svec.size(); + for (int sm = 0; sm < (1 << s_dim); ++sm) { + int jmask_on_T = 0; int jcount = 0; + for (int b = 0; b < s_dim; ++b) { + if (sm & (1 << b)) { ++jcount; int d = Svec[b]; int pos = pos_in_T[d]; if (pos >= 0) jmask_on_T |= (1 << pos); } + } + auto jit = contrib_by_mask.find(jmask_on_T); + if (jit == contrib_by_mask.end()) continue; + const std::vector &contrib_j = jit->second; + int sign_flip = ((int)S.size() - jcount) % 2; + std::vector signed_contrib = sign_flip ? (contrib_j * (-1)) : contrib_j; + // Build rectangle: restrict dims in j to leaf's range; others span entire domain + std::vector lo; lo.reserve(S.size()); std::vector hi; hi.reserve(S.size()); + for (int di = 0; di < s_dim; ++di) { + int d = Svec[di]; + if (sm & (1 << di)) { lo.push_back(lo_cached[d]); hi.push_back(hi_cached[d]); } + else { lo.push_back(0); hi.push_back(cells_by_dim[d]); } + } + rpf_diff::add_rect(diff_S[s_idx], lo, hi, signed_contrib); + } + } + } while (next_combination(p, tdim)); + } + } + } + } + + // 5) Finalize per S + for (size_t i = 0; i < S_vars.size(); ++i) { + const auto &S = S_vars[i]; LeafGrid gl; gl.lim_list = lim_list; + if (S == std::set{0}) { + gl.grid = grid::NDGrid(); gl.values = utils::Matrix>(std::vector{1}, std::vector(value_size, 0)); gl.individuals = utils::Matrix(std::vector{1}, 0); + std::vector idx0{0}; gl.values[idx0] = intercept; + } else { + std::vector dims_end; std::vector cells_dims; for (int d : S) { int K = (int)lim_list[d - 1].size(); dims_end.push_back(std::max(1, K)); cells_dims.push_back(std::max(0, K - 1)); } + rpf_diff::inclusive_scan_inplace(diff_S[i]); gl.grid = grid::NDGrid(dims_end); + gl.values = utils::Matrix>(dims_end, std::vector(value_size, 0)); gl.individuals = utils::Matrix(dims_end, 0); + 
auto g = grid::NDGrid(cells_dims); while (!g.nextPoint()) { auto point = g.getPoint(); gl.values[point] = diff_S[i].at(point); } + } + curr_family[S]->GridLeaves = gl; + } + + // 6) Overwrite high orders with zeros if capped + if (maxp_interaction > 0) { + for (const auto &S : keys) { + if (S == std::set{0} || (int)S.size() <= maxp_interaction) continue; + LeafGrid gl; gl.lim_list = lim_list; std::vector dims_end; for (int d : S) { int K = (int)lim_list[d - 1].size(); dims_end.push_back(std::max(1, K)); } + gl.grid = grid::NDGrid(dims_end); gl.values = utils::Matrix>(dims_end, std::vector(value_size, 0)); gl.individuals = utils::Matrix(dims_end, 0); + curr_family[S]->GridLeaves = gl; + } + } +} + + + + + +// Unified purifier entry: mode 1 = grid path, mode 2 = fast exact path +void RandomPlantedForest::purify(int maxp_interaction, int nthreads_param, int mode) +{ + // Determine threads: if user provided >0, use it; otherwise default to + // min(object-configured nthreads, hardware concurrency) + unsigned int threads_to_use = 0; + if (nthreads_param > 0) { + threads_to_use = static_cast(nthreads_param); + } else { + unsigned int avail = std::thread::hardware_concurrency(); + unsigned int obj = static_cast(std::max(1, nthreads)); + unsigned int eff_avail = (avail > 0 ? 
avail : 1u); + threads_to_use = std::min(obj, eff_avail); + } + + auto worker = [this, maxp_interaction, mode](TreeFamily &fam){ + if (mode == 2) this->purify_fast_exact_family(fam, maxp_interaction); + else this->purify_3_family(fam, maxp_interaction); + }; + + if (threads_to_use > 1) + { + unsigned int avail = std::thread::hardware_concurrency(); + if (avail > 0 && threads_to_use > avail) + { + Rcout << "Requested " << threads_to_use << " threads but only " << avail << " available" << std::endl; + } + for (size_t start = 0; start < this->tree_families.size(); start += (size_t)threads_to_use) + { + size_t batch = std::min((size_t)threads_to_use, this->tree_families.size() - start); + if (batch == 0) break; + std::vector threads(batch); + for (size_t i = 0; i < batch; ++i) + { + size_t fam_index = start + i; + threads[i] = std::thread([&worker](TreeFamily *fam_ptr){ worker(*fam_ptr); }, &this->tree_families[fam_index]); + } + for (auto &th : threads) + { + if (th.joinable()) th.join(); + } + } + purified = true; + return; + } + + for (auto &fam : this->tree_families) worker(fam); + purified = true; +} + + + +// Purify a single family, but only materialize outputs up to maxp_interaction. +// Higher-order trees (|dims| > maxp_interaction) are left with zero-valued grids, +// but are still used as sources during purification so that lower-order components +// are computed correctly. 
+void RandomPlantedForest::purify_3_family(TreeFamily &curr_family, int maxp_interaction) +{ + // Normalize cap: treat 0 (or out-of-range) as full order p = feature_size + if (maxp_interaction <= 0 || maxp_interaction > feature_size) maxp_interaction = feature_size; + + // lim_list is a list giving for each variable all interval end-points + std::vector> lim_list(feature_size); + + // go through all variables of the component + for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) + { + std::vector bounds; + + // go through trees of family + for (const auto &curr_tree : curr_family) + { + // consider only relevant trees that have current dimension as variable + if (!curr_tree.first.count(curr_dim)) + continue; + // go through leaves of tree + for (const auto &curr_leaf : curr_tree.second->leaves) + { + // get interval ends of variable + bounds.push_back(curr_leaf.intervals[curr_dim - 1].first); + bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); + } + } + std::sort(bounds.begin(), bounds.end()); + bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); + lim_list[curr_dim - 1] = bounds; + } + + // Precompute per-sample bin indices for each feature based on lim_list + // -1 means the sample falls outside the covered bounds for that feature + std::vector> sample_bins; + if (sample_size > 0 && feature_size > 0) + { + sample_bins.assign(sample_size, std::vector(feature_size, -1)); + for (int s = 0; s < sample_size; ++s) + { + const auto &xrow = X[s]; + for (int d = 1; d <= feature_size; ++d) + { + const auto &lims = lim_list[d - 1]; + if (lims.empty()) continue; + const double val = xrow[d - 1]; + auto it = std::upper_bound(lims.begin(), lims.end(), val); + int pos = static_cast(it - lims.begin()); + if (pos == 0 || pos >= static_cast(lims.size())) + { + sample_bins[s][d - 1] = -1; // outside + } + else + { + sample_bins[s][d - 1] = pos - 1; // interval index in [0, lims.size()-2] + } + } + } + } + + // initialize values and individuals 
for each tree in family + std::vector grids(curr_family.size() - 1); + std::vector> individuals(curr_family.size() - 1); + std::vector>> values(curr_family.size() - 1); + std::vector>> values_old(curr_family.size() - 1); + std::vector> variables(curr_family.size() - 1); + + // ------------- setup finer grid ------------- + int tree_index = 0; + for (const auto &curr_tree : curr_family) + { + if (curr_tree.first == std::set{0}) + { + continue; // ignore null tree + } + + // fill space with dimensions + std::vector dimensions; + dimensions.reserve(curr_tree.first.size()); + for (const auto &dim : curr_tree.first) + { + dimensions.push_back(lim_list[dim - 1].size()); + } + + // setup grid for leaf indices + auto grid = grid::NDGrid(dimensions); + + // initialize data for current tree + grids[tree_index] = grid; + individuals[tree_index] = utils::Matrix(dimensions, 0); + values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); + values_old[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); + variables[tree_index] = curr_tree.first; + + // 1) Fill individuals using precomputed sample bins + if (!curr_tree.first.empty()) + { + std::vector point; point.reserve(curr_tree.first.size()); + for (int s = 0; s < sample_size; ++s) + { + point.clear(); bool outside = false; + for (const auto &dim : curr_tree.first) + { + int b = sample_bins.empty() ? 
-1 : sample_bins[s][dim - 1]; + if (b < 0) { outside = true; break; } + point.push_back(b); + } + if (!outside) { individuals[tree_index][point] += 1; } + } + } + + // 2) Values accumulation: leaf-centric rectangular updates over the grid + if (!curr_tree.first.empty()) + { + const size_t nd = curr_tree.first.size(); + // For each leaf, determine covered index ranges along each dim, then add leaf.value to all covered grid cells + for (const auto &leaf : curr_tree.second->get_leaves()) + { + std::vector start(nd, 0), stop(nd, -1); + size_t idx_dim = 0; + bool empty = false; + for (const auto &dim : curr_tree.first) + { + const auto &lims = lim_list[dim - 1]; + const int dim_len = static_cast(grids[tree_index].dimensions[idx_dim]); + const int cell_max = (dim_len >= 2) ? (dim_len - 2) : -1; + const double left = leaf.intervals[dim - 1].first; + const double right = leaf.intervals[dim - 1].second; + int k_low = static_cast(std::lower_bound(lims.begin(), lims.end(), left) - lims.begin()); + int ub = static_cast(std::upper_bound(lims.begin(), lims.end(), right) - lims.begin()); + int k_high = ub - 2; // we need lims[k+1] <= right + if (k_low < 0) k_low = 0; + if (k_high > cell_max) k_high = cell_max; + if (k_low > k_high) { empty = true; break; } + start[idx_dim] = k_low; + stop[idx_dim] = k_high; + ++idx_dim; + } + if (empty) continue; + + // Iterate over cartesian product of [start[d], stop[d]] for all dims d + std::vector gridPoint = start; + while (true) + { + values[tree_index][gridPoint] += leaf.value; + values_old[tree_index][gridPoint] += leaf.value; + // increment like odometer + if (nd == 0) break; + size_t pos = nd; + while (pos > 0) + { + --pos; + if (gridPoint[pos] < stop[pos]) { ++gridPoint[pos]; break; } + gridPoint[pos] = start[pos]; + } + if (pos == 0 && gridPoint[pos] == start[pos]) break; // finished full cycle + } + } + } + + ++tree_index; + } + + // ------------- create new trees ------------- + grids.insert(grids.begin(), grid::NDGrid()); + 
values.insert(values.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); + values_old.insert(values_old.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); + individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); + variables.insert(variables.begin(), std::set{0}); + + unsigned int curr_max = curr_family.rbegin()->first.size(); + while (curr_max > 1) + { + auto keys = getKeys(curr_family); + for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) + { + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + if (curr_dims.size() == curr_max) + { + int dim_index2 = 0; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + if (curr_tree->split_dims.count(feature_dim) != 0) + { + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (!tree) + { + auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + auto tree_index2 = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); + std::vector matrix_dimensions = values[old_tree_index].dims; + matrix_dimensions.erase(matrix_dimensions.begin() + dim_index2); + auto grid = grid::NDGrid(matrix_dimensions); + grids.insert(grids.begin() + tree_index2, grid); + values.insert(values.begin() + tree_index2, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + values_old.insert(values_old.begin() + tree_index2, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + individuals.insert(individuals.begin() + tree_index2, utils::Matrix(matrix_dimensions)); + variables.insert(variables.begin() + tree_index2, tree_dims); + // fill individuals of new trees using precomputed sample bins + if (!tree_dims.empty()) + { + 
std::vector point2; point2.reserve(tree_dims.size()); + for (int s = 0; s < sample_size; ++s) + { + point2.clear(); bool outside2 = false; + for (const auto &dim2 : tree_dims) + { + int b2 = sample_bins.empty() ? -1 : sample_bins[s][dim2 - 1]; + if (b2 < 0) { outside2 = true; break; } + point2.push_back(b2); + } + if (!outside2) { individuals[tree_index2][point2] += 1; } + } + } + } + dim_index2++; + } + } + } + } + --curr_max; + } + + // ------------- purify ------------- + std::vector> dim_to_pos(variables.size(), std::vector(feature_size + 1, -1)); + for (size_t idx = 0; idx < variables.size(); ++idx) + { + int pos = 0; + for (const auto dim : variables[idx]) + { + if (dim >= 0 && dim <= feature_size) dim_to_pos[idx][dim] = pos++; + } + } + + std::vector total_individuals(variables.size(), 0.0); + for (size_t idx = 0; idx < variables.size(); ++idx) + { + double tot = 0.0; + if (variables[idx] == std::set{0}) + { + std::vector only{0}; + tot += individuals[idx][only]; + } + else + { + auto grid_sum = grids[idx]; + while (!grid_sum.nextPoint()) + { + auto gp = grid_sum.getPoint(); + tot += individuals[idx][gp]; + } + } + total_individuals[idx] = tot; + } + + int tree_index_t = curr_family.size() - 1; + for (auto tree_t = variables.rbegin(); tree_t != variables.rend(); ++tree_t) + { + std::set curr_dims = *tree_t; + if (curr_dims == std::set{0}) + continue; + + auto grid = grids[tree_index_t]; + int tree_index_u = variables.size(); + for (auto tree_u = variables.rbegin(); tree_u != variables.rend(); ++tree_u) + { + --tree_index_u; + std::set j_dims = curr_dims; + if (tree_u->size() > curr_dims.size()) + continue; + bool subset = true; + for (const auto dim : *tree_u) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + j_dims.erase(dim); + } + if (!subset) + continue; + + double tot_sum = total_individuals[tree_index_u]; + if (tot_sum == 0.0) + continue; + const double inv_tot_sum = 1.0 / tot_sum; + + grid = grids[tree_index_u]; + std::vector 
update(value_size, 0); + + if (j_dims.size() == 0) + { + while (!grid.nextPoint()) + { + auto gridPoint_i = grid.getPoint(); + double curr_sum = individuals[tree_index_u][gridPoint_i]; + update += (curr_sum * inv_tot_sum) * values_old[tree_index_t][gridPoint_i]; + } + + int tree_index_s = variables.size(); + for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) + { + --tree_index_s; + if (*tree_s == std::set{0}) + { + auto gridPoint_0 = std::vector{0}; + values[tree_index_s][gridPoint_0] += update; + } + else + { + bool subset2 = true; + for (const auto dim : *tree_s) + { + if (tree_t->count(dim) == 0) + { + subset2 = false; + break; + } + } + if (!subset2) + continue; + if (maxp_interaction > 0 && tree_s->size() > (size_t)maxp_interaction) continue; // skip materializing > cap + auto grid_k = grids[tree_index_s]; + while (!grid_k.nextPoint()) + { + auto gridPoint_k = grid_k.getPoint(); + int sign0 = ((*tree_s).size() % 2 == 0) ? 1 : -1; + values[tree_index_s][gridPoint_k] += sign0 * update; + } + } + } + } + else + { + std::vector j_sizes(j_dims.size(), 0); + for (size_t j = 0; j < j_dims.size(); ++j) + { + auto tmp = j_dims.begin(); + std::advance(tmp, j); + int j_index = dim_to_pos[tree_index_t][*tmp]; + j_sizes[j] = grids[tree_index_t].dimensions[j_index]; + } + grid::NDGrid grid_j = grid::NDGrid(j_sizes); + while (!grid_j.nextPoint()) + { + std::vector update(value_size, 0); + auto gridPoint_j = grid_j.getPoint(); + grid = grids[tree_index_u]; + while (!grid.nextPoint()) + { + auto gridPoint_i = grid.getPoint(); + double curr_sum = individuals[tree_index_u][gridPoint_i]; + std::vector gridPoint_ij(tree_t->size(), 0); + for (size_t j = 0; j < gridPoint_j.size(); ++j) + { + auto j_dim = j_dims.begin(); + std::advance(j_dim, j); + int j_index = dim_to_pos[tree_index_t][*j_dim]; + gridPoint_ij[j_index] = gridPoint_j[j]; + } + for (size_t i = 0; i < gridPoint_i.size(); ++i) + { + auto i_dim = tree_u->begin(); + std::advance(i_dim, i); + int 
i_index = dim_to_pos[tree_index_t][*i_dim]; + gridPoint_ij[i_index] = gridPoint_i[i]; + } + update += (curr_sum * inv_tot_sum) * values_old[tree_index_t][gridPoint_ij]; + } + + int tree_index_s = variables.size(); + for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) + { + --tree_index_s; + bool subset2 = true; + for (const auto dim : j_dims) + { + if (tree_s->count(dim) == 0) + { + subset2 = false; + break; + } + } + for (const auto dim : *tree_s) + { + if (tree_t->count(dim) == 0) + { + subset2 = false; + break; + } + } + if (!subset2) + continue; + // Skip writing for components above the cap + if (maxp_interaction > 0 && tree_s->size() > (size_t)maxp_interaction) + continue; + + std::set k_dims = *tree_s; + std::set k_dims_h1 = *tree_s; + std::set k_dims_h2 = *tree_u; + for (const auto dim : *tree_u) + k_dims.insert(dim); + for (const auto dim : *tree_s) + k_dims_h2.erase(dim); + for (const auto dim : *tree_u) + k_dims_h1.erase(dim); + for (const auto dim : k_dims_h1) + k_dims.erase(dim); + for (const auto dim : k_dims_h2) + k_dims.erase(dim); + + if (k_dims.size() == 0) + { + size_t diff = (*tree_s).size() - j_dims.size(); + int sign = (diff % 2 == 0) ? 
1 : -1; + values[tree_index_s][gridPoint_j] += sign * update; + } + else + { + std::vector k_sizes(k_dims.size(), 0); + for (size_t k = 0; k < k_dims.size(); ++k) + { + auto tmp = k_dims.begin(); + std::advance(tmp, k); + int k_index = dim_to_pos[tree_index_t][*tmp]; + k_sizes[k] = grids[tree_index_t].dimensions[k_index]; + } + grid::NDGrid grid_k = grid::NDGrid(k_sizes); + while (!grid_k.nextPoint()) + { + auto gridPoint_k = grid_k.getPoint(); + std::vector gridPoint_jk(tree_s->size(), 0); + for (size_t j = 0; j < gridPoint_j.size(); ++j) + { + auto j_dim = j_dims.begin(); + std::advance(j_dim, j); + int j_index = dim_to_pos[tree_index_s][*j_dim]; + gridPoint_jk[j_index] = gridPoint_j[j]; + } + for (size_t k = 0; k < gridPoint_k.size(); ++k) + { + auto k_dim = k_dims.begin(); + std::advance(k_dim, k); + int k_index = dim_to_pos[tree_index_s][*k_dim]; + gridPoint_jk[k_index] = gridPoint_k[k]; + } + size_t diff = (*tree_s).size() - j_dims.size(); + int sign2 = (diff % 2 == 0) ? 1 : -1; + values[tree_index_s][gridPoint_jk] += sign2 * update; + } + } + } + } + } + } + --tree_index_t; + } + + // ------------- attach to rpf class ------------- + for (size_t tree_index3 = 0; tree_index3 < variables.size(); ++tree_index3) + { + LeafGrid curr_gridLeaf; + curr_gridLeaf.grid = grids[tree_index3]; + curr_gridLeaf.individuals = individuals[tree_index3]; + curr_gridLeaf.lim_list = lim_list; + // If this tree exceeds the cap, attach a zero-valued matrix of the correct shape + if (maxp_interaction > 0 && variables[tree_index3] != std::set{0} && variables[tree_index3].size() > (size_t)maxp_interaction) + { + curr_gridLeaf.values = utils::Matrix>(grids[tree_index3].dimensions, std::vector(value_size, 0)); + } + else + { + curr_gridLeaf.values = values[tree_index3]; + } + curr_family[variables[tree_index3]]->GridLeaves = curr_gridLeaf; + } +} + + + + +void RandomPlantedForest::purify_2() +{ + + // go through all n_trees families + for (auto &curr_family : this->tree_families) + { + 
+ // lim_list is a list giving for each variable all interval end-points + std::vector> lim_list(feature_size); + + // go through all variables of the component + for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) + { + std::vector bounds; + + // go through trees of family + for (const auto &curr_tree : curr_family) + { + + // consider only relevant trees that have current dimension as variable + if (!curr_tree.first.count(curr_dim)) + continue; + + // go through leaves of tree + for (const auto &curr_leaf : curr_tree.second->leaves) + { + // get interval ends of variable + bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); + } + } + std::sort(bounds.begin(), bounds.end()); + bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); + lim_list[curr_dim - 1] = bounds; + } + + // initialize values and individuals for each tree in family + std::vector grids(curr_family.size() - 1); + std::vector> individuals(curr_family.size() - 1); + std::vector>> values(curr_family.size() - 1); + std::vector> variables(curr_family.size() - 1); + + // ------------- setup finer grid ------------- + + int tree_index = 0; + for (const auto &curr_tree : curr_family) + { + + if (curr_tree.first == std::set{0}) + continue; // ignore null tree + + // fill space with dimensions + std::vector dimensions; + for (const auto &dim : curr_tree.first) + { + dimensions.push_back(lim_list[dim - 1].size() - 1); // size - 1 ? 
+ } + + // setup grid for leaf indices + auto grid = grid::NDGrid(dimensions); + + // initialize data for current tree + grids[tree_index] = grid; + individuals[tree_index] = utils::Matrix(dimensions, 0); + values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + variables[tree_index] = curr_tree.first; + + // fill grid points with individuals and values + while (!grid.nextPoint()) + { + + std::vector gridPoint = grid.getPoint(); + + bool in_leaf = true; + + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index = 0; + in_leaf = true; + for (const auto &dim : curr_tree.first) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) + { + in_leaf = false; + break; + } + ++dim_index; + } + + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + + // go through leaves of tree to sum up values + for (const auto &leaf : curr_tree.second->get_leaves()) + { + + in_leaf = true; + int dim_index = 0; + for (const auto &dim : curr_tree.first) + { + // consider values only if all in + if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) + { + in_leaf = false; + break; + } + ++dim_index; + } + + // sum up values + if (in_leaf) + values[tree_index][gridPoint] += leaf.value; // todo: multiclass + } + } + + ++tree_index; + } + + // ------------- create new trees ------------- + + // insert null tree + grids.insert(grids.begin(), grid::NDGrid()); + values.insert(values.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); + individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); + variables.insert(variables.begin(), std::set{0}); + + // recap maximum number of dimensions of current family + unsigned int curr_max 
= 0; + for (const auto &tree : curr_family) + { + if (tree.first.size() > curr_max) + curr_max = tree.first.size(); + } + + auto keys = getKeys(curr_family); + while (curr_max > 1) + { + + // go through split dimensions of all trees + for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) + { + + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + + // check if number of dims same as current max_interaction + if (curr_dims.size() == curr_max) + { + + // go through feature dims + int dim_index = 0; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + + // continue only if dim in current tree + if (curr_tree->split_dims.count(feature_dim) != 0) + { + + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree + + // check if tree with dimensions exists, if not create + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (!tree) + { + + // get index of old and new tree + auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); + + // remove matrix dimension of respective variable + std::vector matrix_dimensions = values[old_tree_index].dims; + matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); + + // initialize data for new tree + auto grid = grid::NDGrid(matrix_dimensions); + grids.insert(grids.begin() + tree_index, grid); + values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(0, value_size))); + individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); + variables.insert(variables.begin() + tree_index, tree_dims); + + // fill individuals of new trees + while (!grid.nextPoint()) + 
{ + + std::vector gridPoint = grid.getPoint(); + bool in_leaf = true; + + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index = 0; + in_leaf = true; + for (const auto &dim : tree_dims) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; + ++dim_index; + } + + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + } + } + + dim_index++; + } + } + } + } + + // update currently considered dimension size + --curr_max; + } + + // ------------- purify ------------- + + // measure tolerance and number of iterations + std::vector tol(curr_family.size(), 1); + int iter; + + // iterate backwards through tree family + int curr_tree_index = curr_family.size() - 1; + for (TreeFamily::reverse_iterator curr_tree = curr_family.rbegin(); curr_tree != curr_family.rend(); ++curr_tree) + { + iter = 0; + std::set curr_dims = curr_tree->second->get_split_dims(); + + // do not purify null + if (curr_dims == std::set{0}) + continue; + + // repeat until tolerance small enough and (?) 
maximum number of iterations reached + while ((tol[curr_tree_index] > 0.00000000001) && (iter < 100)) + { + + // go through feature dims + int curr_dim_index = 0; + for (const auto &feature_dim : curr_dims) + { + + // get tree that has same variables as curr_tree minus j-variable + std::set tree_dims = curr_dims; + tree_dims.erase(tree_dims.find(feature_dim)); + int tree_index = 0; // if tree not exist, set to null tree + if (curr_family.find(tree_dims) != curr_family.end()) + tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)) - 1; + + // update values + if (grids[curr_tree_index].dimensions.size() == 1) + { // one dimensional case + + int sum_ind = 0; + std::vector avg(value_size, 0); + + // get sum of individuals + for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) + { + std::vector tmp{i}; + sum_ind += individuals[curr_tree_index][tmp]; + } + if (sum_ind == 0) + continue; + + // calc avg + for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) + { + std::vector tmp{i}; + avg += (individuals[curr_tree_index][tmp] * values[curr_tree_index][tmp]) / sum_ind; + } + + // update values of one dimensional and null tree + for (int i = 0; i < values[curr_tree_index].n_entries; ++i) + { + std::vector tmp{i}; + values[curr_tree_index][tmp] -= avg; + } + std::vector tmp{0}; + values[tree_index][tmp] += avg; + } + else + { // higher dimensional case + + // setup new grid without dimension j + std::vector new_dimensions = grids[curr_tree_index].dimensions; + int j_dim = new_dimensions[curr_dim_index]; + new_dimensions.erase(new_dimensions.begin() + curr_dim_index); + grid::NDGrid grid = grid::NDGrid(new_dimensions); + + // go through values without dimension j + while (!grid.nextPoint()) + { + auto gridPoint = grid.getPoint(); + gridPoint.push_back(0); + + int sum_ind = 0; + std::vector avg(value_size, 0); + + // go through slice to sum up individuals + for (int j = 0; j < j_dim; ++j) + { + gridPoint.back() = j; + + // get 
sum of individuals + sum_ind += individuals[curr_tree_index][gridPoint]; + } + + // go through slice to calc avg + for (int j = 0; j < j_dim; ++j) + { + gridPoint.back() = j; + + // calc avg + avg += (individuals[curr_tree_index][gridPoint] * values[curr_tree_index][gridPoint]) / sum_ind; + } + + // go through slice to update values + for (int j = 0; j < j_dim; ++j) + { + gridPoint.back() = j; + + // update values of current slice + values[curr_tree_index][gridPoint] -= avg; + } + + // update lower dimensional tree + gridPoint.pop_back(); + values[tree_index][gridPoint] += avg; + } + } + + ++curr_dim_index; + } + + // update tolerance + if (variables[curr_tree_index].size() == 1) + { + tol[curr_tree_index] = 1; // todo + } + else + { + tol[curr_tree_index] = 1; + } + + ++iter; + } + + --curr_tree_index; + } + + // ------------- attach to rpf class ------------- + + // fill with new trees + for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) + { + LeafGrid curr_gridLeaf; + curr_gridLeaf.grid = grids[tree_index]; + curr_gridLeaf.individuals = individuals[tree_index]; + curr_gridLeaf.lim_list = lim_list; + curr_gridLeaf.values = values[tree_index]; + curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; + } + } + + purified = true; +} + diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index 085df8c..b9ce125 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -1,5 +1,48 @@ #include "rpf.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal_utils.hpp" + +// Use utilities via namespace alias +using namespace rpf_utils; + +// Thread-local cache for histogram mode per working set (per tree-family build) +// Avoids races on the class member when building families in parallel +thread_local std::vector> tls_working_bin_id; + +// Utilities shared across modes +bool RandomPlantedForest::possibleExists( + int dim, + const std::vector& possible_splits, + const std::set& 
resulting_dims) +{ + for (const auto& c : possible_splits) { + if (c.dim == dim && c.tree && c.tree->split_dims == resulting_dims) + return true; + } + return false; +} +bool RandomPlantedForest::leafCandidateExists( + const std::vector& possible_splits, + const std::shared_ptr& tree, + size_t leaf_idx, + int dim) +{ + for (const auto& c : possible_splits) { + if (c.dim == dim && c.tree.get() == tree.get() && c.leaf_idx == leaf_idx) + return true; + } + return false; +} bool RandomPlantedForest::is_purified() { @@ -8,41 +51,27 @@ bool RandomPlantedForest::is_purified() void RandomPlantedForest::L2_loss(Split &split) { - - // new meanq split.M_s = split.sum_s / split.I_s.size(); split.M_b = split.sum_b / split.I_b.size(); - split.min_sum = 0; for (size_t p = 0; p < value_size; ++p) { - split.min_sum += -2 * split.M_s[p] * split.sum_s[p] + split.I_s.size() * pow(split.M_s[p], 2); - split.min_sum += -2 * split.M_b[p] * split.sum_b[p] + split.I_b.size() * pow(split.M_b[p], 2); + const double Ms = split.M_s[p]; + const double Mb = split.M_b[p]; + split.min_sum += -2 * Ms * split.sum_s[p] + split.I_s.size() * (Ms * Ms); + split.min_sum += -2 * Mb * split.sum_b[p] + split.I_b.size() * (Mb * Mb); } } -// constructor +// constructor (parsing includes split_structure) RandomPlantedForest::RandomPlantedForest(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, const NumericVector parameters) { - - // Ensure correct Rcpp RNG state Rcpp::RNGScope scope; - - // initialize class members std::vector pars = to_std_vec(parameters); - if (pars.size() != 9) + if (pars.size() != 12 && pars.size() != 13) { - Rcout << "Wrong number of parameters - set to default." 
<< std::endl; - this->max_interaction = 1; - this->n_trees = 50; - this->n_splits = 30; - this->split_try = 10; - this->t_try = 0.4; - this->purify_forest = 0; - this->deterministic = 0; - this->nthreads = 1; - this->cross_validate = 0; + Rcpp::stop("RandomPlantedForest requires 12 or 13 parameters, got %d", pars.size()); } else { @@ -55,475 +84,574 @@ RandomPlantedForest::RandomPlantedForest(const NumericMatrix &samples_Y, const N this->deterministic = pars[6]; this->nthreads = pars[7]; this->cross_validate = pars[8]; + this->split_decay_rate_ = pars[9]; + this->max_candidates_ = static_cast(pars[10]); + this->delete_leaves = (pars[11] != 0); + // map: 0=res_trees, 1=cur_trees_2, 2=cur_trees_1, 3=leaves, 4=hist + this->split_structure_mode_ = (pars.size() >= 13) ? static_cast(pars[12]) : 3; } - - // set data and data related members this->set_data(samples_Y, samples_X); } -// determine optimal split -Split RandomPlantedForest::calcOptimalSplit(const std::vector> &Y, const std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family) -{ - - Split curr_split, min_split; - curr_split.Y = &Y; - std::set tree_dims; - std::vector unique_samples; - int k; - unsigned int n = 0; - double leaf_size, sample_point; - - // sample possible splits - unsigned int n_candidates = ceil(t_try * possible_splits.size()); // number of candidates that will be considered - std::vector split_candidates(possible_splits.size()); - std::iota(split_candidates.begin(), split_candidates.end(), 0); // consecutive indices of possible candidates - - if (!deterministic) - { - shuffle_vector(split_candidates.begin(), split_candidates.end()); // shuffle for random order - } - - // consider a fraction of possible splits - while (n < n_candidates) - { - - if (possible_splits.empty()) - break; - if (split_candidates[n] >= 0 && (size_t)split_candidates[n] >= possible_splits.size()) - continue; - - auto candidate = possible_splits.begin(); - std::advance(candidate, split_candidates[n]); // 
get random split candidate without replacement - k = candidate->first - 1; // split dim of current candidate, converted to index starting at 0 - leaf_size = n_leaves[k]; - - // Test if splitting in the current tree w.r.t. the coordinate "k" is an element of candidate tree - tree_dims = candidate->second->split_dims; - tree_dims.erase(k + 1); - tree_dims.erase(0); - - std::vector> curr_trees; - if (tree_dims.size() == 0) - curr_trees.push_back(curr_family[std::set{0}]); - if (curr_family.find(tree_dims) != curr_family.end()) - curr_trees.push_back(curr_family[tree_dims]); - if (curr_family.find(candidate->second->split_dims) != curr_family.end()) - curr_trees.push_back(curr_family[candidate->second->split_dims]); - - // go through all trees in current family - for (auto &curr_tree : curr_trees) - { - - // skip if tree has no leaves - if (curr_tree->leaves.size() == 0) - continue; - - // go through all leaves of current tree - for (auto &leaf : curr_tree->leaves) - { - - std::vector tot_sum(value_size, 0); - - // extract sample points according to individuals from X and Y - unique_samples = std::vector(leaf.individuals.size()); - for (unsigned int i = 0; i < leaf.individuals.size(); ++i) - { - unique_samples[i] = X[leaf.individuals[i]][k]; - } - std::sort(unique_samples.begin(), unique_samples.end()); - unique_samples.erase(std::unique(unique_samples.begin(), unique_samples.end()), unique_samples.end()); - - // check if number of sample points is within limit - if (unique_samples.size() < 2 * leaf_size) - continue; - - // consider split_try-number of samples - std::vector samples; - if (deterministic) - { // sequential samples if deterministic - samples = std::vector(std::min((int)unique_samples.size() - 1, 9)); - std::iota(samples.begin(), samples.end(), 1); - } - else - { // randomly picked samples otherwise - samples = std::vector(split_try); - for (size_t i = 0; i < samples.size(); ++i) - samples[i] = R::runif(leaf_size, unique_samples.size() - leaf_size); - 
std::sort(samples.begin(), samples.end()); - } - - // go through samples - for (size_t sample_pos = 0; sample_pos < samples.size(); ++sample_pos) - { - - // get samplepoint - sample_point = unique_samples[samples[sample_pos]]; - - // clear current split - { - curr_split.I_s.clear(); - curr_split.I_b.clear(); - curr_split.I_s.reserve(leaf.individuals.size()); - curr_split.I_b.reserve(leaf.individuals.size()); - curr_split.M_s = std::vector(value_size, 0); - curr_split.M_b = std::vector(value_size, 0); - } - - // get samples greater/smaller than samplepoint - if (sample_pos == 0) - { - curr_split.sum_s = std::vector(value_size, 0); - curr_split.sum_b = std::vector(value_size, 0); +// --------------- calcOptimalSplit per mode --------------- - for (int individual : leaf.individuals) - { - if (X[individual][k] < sample_point) - { - curr_split.I_s.push_back(individual); - curr_split.sum_s += Y[individual]; - } - else - { - curr_split.I_b.push_back(individual); - curr_split.sum_b += Y[individual]; - } - } +// Mode 3: leaves implementation moved to lib/splits_leaves.cpp - tot_sum = curr_split.sum_s + curr_split.sum_b; - } - else - { +// Mode 1: cur_trees_2 moved to lib/splits_cur_trees_2.cpp - for (int individual : leaf.individuals) - { - if (X[individual][k] < sample_point) - { - if (X[individual][k] >= unique_samples[samples[sample_pos - 1]]) - { - curr_split.sum_s += Y[individual]; - } - curr_split.I_s.push_back(individual); - } - else - { - curr_split.I_b.push_back(individual); - } - } +// Mode 2: cur_trees_1 (pair-sampling within predecessor/current trees) +// Mode 2: cur_trees_1 moved to lib/splits_cur_trees_1.cpp - curr_split.sum_b = tot_sum - curr_split.sum_s; - } +// Mode 0: res_trees (operate on resulting trees pool) +bool RandomPlantedForest::resultingTreeExists(const std::vector& pool, const std::set& dims) { + for (const auto &c : pool) if (c.tree->get_split_dims() == dims) return true; return false; +} - // accumulate squared mean and get mean - 
L2_loss(curr_split); +// Mode 0: res_trees moved to lib/splits_res_trees.cpp - // update split if squared sum is smaller - if (curr_split.min_sum < min_split.min_sum) - { - min_split = curr_split; - min_split.tree_index = curr_tree; - min_split.leaf_index = &leaf; - min_split.split_coordinate = k + 1; - min_split.split_point = sample_point; - } - } - } - } +// moved to lib/splits_hist.cpp - ++n; +// Dispatcher used by create_tree_family +Split RandomPlantedForest::calcOptimalSplit(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family) +{ + if (split_structure_mode_ == 3) { + return this->calcOptimalSplit_leaves(Y, X, possible_splits, curr_family); + } else if (split_structure_mode_ == 2) { + return this->calcOptimalSplit_curTrees1(Y, X, possible_splits, curr_family); + } else if (split_structure_mode_ == 1) { + return this->calcOptimalSplit_curTrees2(Y, X, possible_splits, curr_family); + } else if (split_structure_mode_ == 4) { + return this->calcOptimalSplit_hist(Y, X, possible_splits, curr_family); + } else { + // Not used for res_trees; a separate path below uses its own pool type + return Split{}; } - - return min_split; } void RandomPlantedForest::set_data(const NumericMatrix &samples_Y, const NumericMatrix &samples_X) { - this->Y = to_std_vec(samples_Y); this->X = to_std_vec(samples_X); - - // Check for correct input - if (Y.size() == 0) - throw std::invalid_argument("Y empty - no data provided."); - if (X.size() == 0) - throw std::invalid_argument("X empty - no data provided."); + if (Y.empty()) throw std::invalid_argument("Y empty - no data provided."); + if (X.empty()) throw std::invalid_argument("X empty - no data provided."); this->feature_size = X[0].size(); - this->value_size = Y[0].size(); // multiclass - for (const auto &vec : X) - { - if (vec.size() != (size_t)feature_size) - throw std::invalid_argument("Feature dimensions of X not uniform."); - } - if (Y.size() != X.size()) - throw 
std::invalid_argument("X and Y are not of the same length!"); - + this->value_size = Y[0].size(); + for (const auto &vec : X) if (vec.size() != (size_t)feature_size) throw std::invalid_argument("Feature dimensions of X not uniform."); + if (Y.size() != X.size()) throw std::invalid_argument("X and Y are not of the same length!"); this->n_leaves = std::vector(feature_size, 1); this->sample_size = X.size(); this->upper_bounds = std::vector(feature_size); this->lower_bounds = std::vector(feature_size); - - // get upper/lower bounds - double minVal, maxVal, currVal; - for (int i = 0; i < feature_size; ++i) - { - minVal = maxVal = X[0][i]; - for (size_t j = 0; j < sample_size; ++j) - { - currVal = X[j][i]; - if (currVal < minVal) - minVal = currVal; - if (currVal > maxVal) - maxVal = currVal; + for (int i = 0; i < feature_size; ++i) { + double minVal = X[0][i], maxVal = X[0][i]; + for (size_t j = 0; j < sample_size; ++j) { double currVal = X[j][i]; if (currVal < minVal) minVal = currVal; if (currVal > maxVal) maxVal = currVal; } + this->upper_bounds[i] = maxVal + 2 * eps; this->lower_bounds[i] = minVal; + } + // Prepare histogram bins if histogram mode is requested + if (this->split_structure_mode_ == 4) { + const size_t K = std::max(2, std::min(num_bins_, static_cast(std::max(2, (int)std::sqrt((double)sample_size))))); + this->num_bins_ = K; + feature_cut_points_.assign((size_t)feature_size, std::vector()); + sample_bin_id_.assign((size_t)feature_size, std::vector(sample_size, 0)); + // For each feature, compute quantile cuts using sorted sample values + for (int k = 0; k < feature_size; ++k) { + std::vector vals(sample_size); + for (size_t i = 0; i < sample_size; ++i) vals[i] = X[i][k]; + std::sort(vals.begin(), vals.end()); + vals.erase(std::unique(vals.begin(), vals.end()), vals.end()); + size_t unique_n = vals.size(); + size_t cuts = (K >= 2) ? 
(K - 1) : 1; + if (unique_n <= 1 || cuts == 0) { feature_cut_points_[k].clear(); feature_cut_points_[k].shrink_to_fit(); continue; } + feature_cut_points_[k].resize(cuts); + for (size_t c = 1; c <= cuts; ++c) { + double q = (double)c / (double)K; size_t idx = static_cast(std::floor(q * (double)(unique_n - 1))); + if (idx >= unique_n) idx = unique_n - 1; feature_cut_points_[k][c - 1] = vals[idx]; + } + // Assign bin ids for all samples in original X for this feature + for (size_t i = 0; i < sample_size; ++i) { + double v = X[i][k]; + auto &cuts_k = feature_cut_points_[k]; + int bin = 0; + if (!cuts_k.empty()) { + auto itb = std::upper_bound(cuts_k.begin(), cuts_k.end(), v); + bin = (int)std::distance(cuts_k.begin(), itb); + } + sample_bin_id_[k][i] = bin; + } } - this->upper_bounds[i] = maxVal + 2 * eps; // to consider samples at max value - this->lower_bounds[i] = minVal; } - this->fit(); - - if (cross_validate) - { - this->cross_validation(); - } + if (cross_validate) { this->cross_validation(); } } void RandomPlantedForest::create_tree_family(std::vector initial_leaves, size_t n) { - + TreeFamily curr_family; - curr_family.insert(std::make_pair(std::set{0}, std::make_shared(DecisionTree(std::set{0}, initial_leaves)))); // save tree with one leaf in the beginning - // store possible splits in map with splitting variable as key and pointer to resulting tree - std::multimap> possible_splits; - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - // add pointer to resulting tree with split dimension as key - curr_family.insert(std::make_pair(std::set{feature_dim}, std::make_shared(DecisionTree(std::set{feature_dim})))); - possible_splits.insert(std::make_pair(feature_dim, curr_family[std::set{feature_dim}])); - } - - // sample data points with replacement - int sample_index; - std::vector> samples_X; - std::vector> samples_Y; - - // deterministic - if (deterministic) - { - samples_X = X; - samples_Y = Y; - this->t_try = 1; - } - else - { - 
samples_X = std::vector>(sample_size); - samples_Y = std::vector>(sample_size); - - for (size_t i = 0; i < sample_size; ++i) - { - - sample_index = R::runif(0, sample_size - 1); - samples_Y[i] = Y[sample_index]; - samples_X[i] = X[sample_index]; - } - } - - // modify existing or add new trees through splitting - Split curr_split; - for (int split_count = 0; split_count < n_splits; ++split_count) - { - - // find optimal split - curr_split = calcOptimalSplit(samples_Y, samples_X, possible_splits, curr_family); - - // continue only if we get a significant result - if (!std::isinf(curr_split.min_sum)) - { - - // update possible splits - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { // consider all possible dimensions - - // create union of split coord, feature dim and dimensions of old tree - std::set curr_dims = curr_split.tree_index->split_dims; - curr_dims.insert(curr_split.split_coordinate); - curr_dims.insert(feature_dim); - curr_dims.erase(0); - - // skip if possible_split already exists - if (possibleExists(feature_dim, possible_splits, curr_dims)) - continue; - - // do not exceed maximum level of interaction - if (max_interaction >= 0 && curr_dims.size() > (size_t)max_interaction) - continue; - - // check if resulting tree already exists in family - std::shared_ptr found_tree = treeExists(curr_dims, curr_family); - - // update possible_splits if not already existing - if (found_tree) - { // if yes add pointer - possible_splits.insert(std::make_pair(feature_dim, found_tree)); + curr_family.insert({std::set{0}, std::make_shared(DecisionTree(std::set{0}, initial_leaves))}); + + // res_trees uses a separate pool + if (split_structure_mode_ == 0) { + std::vector possible_trees; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + auto treePtr = std::make_shared(DecisionTree({feature_dim})); + curr_family.insert({{feature_dim}, treePtr}); + possible_trees.emplace_back(treePtr); + } + + // Bootstrap samples + int 
sample_index; std::vector> samples_X, samples_Y; + if (deterministic) { samples_X = X; samples_Y = Y; this->t_try = 1; } + else { + samples_X = std::vector>(sample_size); samples_Y = std::vector>(sample_size); + for (size_t i = 0; i < sample_size; ++i) { sample_index = rng_randint(0, (int)sample_size); samples_Y[i] = Y[sample_index]; samples_X[i] = X[sample_index]; } + } + + Split curr_split; + for (int split_count = 0; split_count < n_splits; ++split_count) { + curr_split = this->calcOptimalSplit_resTrees(samples_Y, samples_X, possible_trees, curr_family); + if (!std::isinf(curr_split.min_sum)) { + // ensure D' and its one-step supersets are in pool + std::set Dprime = curr_split.tree_index->split_dims; Dprime.insert(curr_split.split_coordinate); Dprime.erase(0); + if (!resultingTreeExists(possible_trees, Dprime)) { if (auto found = treeExists(Dprime, curr_family)) possible_trees.emplace_back(found); else { curr_family.insert({Dprime, std::make_shared(DecisionTree(Dprime))}); possible_trees.emplace_back(curr_family[Dprime]); } } + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set U = Dprime; U.insert(feature_dim); if (U.size() == Dprime.size()) continue; if (max_interaction >= 0 && U.size() > (size_t)max_interaction) continue; if (resultingTreeExists(possible_trees, U)) continue; if (auto found = treeExists(U, curr_family)) possible_trees.emplace_back(found); else { curr_family.insert({U, std::make_shared(DecisionTree(U))}); possible_trees.emplace_back(curr_family[U]); } + } + + // Mutate residuals (restore old behavior) + for (int individual : curr_split.leaf_index->individuals) { + if (samples_X[individual][curr_split.split_coordinate - 1] < curr_split.split_point) + samples_Y[individual] -= curr_split.M_s; + else + samples_Y[individual] -= curr_split.M_b; } - else - { // if not create new tree - curr_family.insert(std::make_pair(curr_dims, std::make_shared(DecisionTree(curr_dims)))); - 
possible_splits.insert(std::make_pair(feature_dim, curr_family[curr_dims])); + Leaf leaf_s, leaf_b; leaf_s.individuals = curr_split.I_s; leaf_b.individuals = curr_split.I_b; leaf_s.value = curr_split.M_s; leaf_b.value = curr_split.M_b; leaf_s.intervals = curr_split.leaf_index->intervals; leaf_b.intervals = curr_split.leaf_index->intervals; leaf_s.intervals[curr_split.split_coordinate - 1].second = curr_split.split_point; leaf_b.intervals[curr_split.split_coordinate - 1].first = curr_split.split_point; + std::set resulting_dims = curr_split.tree_index->split_dims; resulting_dims.insert(curr_split.split_coordinate); resulting_dims.erase(0); + std::shared_ptr found_tree = treeExists(resulting_dims, curr_family); + if (!found_tree) { + curr_family.insert({resulting_dims, std::make_shared(DecisionTree(resulting_dims))}); + found_tree = curr_family[resulting_dims]; } + if ((curr_split.tree_index->split_dims.count(curr_split.split_coordinate)) && delete_leaves) { leaf_s.value += curr_split.leaf_index->value; leaf_b.value += curr_split.leaf_index->value; *curr_split.leaf_index = leaf_b; curr_split.tree_index->leaves.push_back(leaf_s); } + else { found_tree->leaves.push_back(leaf_s); found_tree->leaves.push_back(leaf_b); } } + } - // update values of individuals of split interval with mean - for (int individual : curr_split.leaf_index->individuals) - { // todo: loop directly over I_s I_b - if (samples_X[individual][curr_split.split_coordinate - 1] < curr_split.split_point) - { - samples_Y[individual] -= curr_split.M_s; - } - else + // Final memory cleanup: drop training-only buffers and shrink containers + auto keys = getKeys(curr_family); + for (auto &key : keys) { + auto itTree = curr_family.find(key); + if (itTree == curr_family.end()) continue; + auto &treePtr = itTree->second; + if (treePtr->leaves.size() == 0) { curr_family.erase(itTree); continue; } + for (auto &leaf : treePtr->leaves) { + // Individuals are not used after training; caches are training-only + 
leaf.individuals.clear(); + leaf.individuals.shrink_to_fit(); + // Free per-leaf caches decisively { - samples_Y[individual] -= curr_split.M_b; + std::unordered_map>().swap(leaf.order_cache); + std::unordered_map>().swap(leaf.sorted_vals_cache); + std::unordered_map>().swap(leaf.unique_vals_cache); + std::unordered_map() .swap(leaf.unique_count_cache); + } + // Keep intervals and value but release spare capacity + leaf.intervals.shrink_to_fit(); + leaf.value.shrink_to_fit(); + } + // Clear per-dimension sampling caches (used only during training) + for (auto &v : treePtr->fenwick_by_dim_v) { v.clear(); v.shrink_to_fit(); } + treePtr->fenwick_by_dim_v.clear(); + treePtr->fenwick_by_dim_v.shrink_to_fit(); + for (auto &v : treePtr->leaf_weights_by_dim_v) { v.clear(); v.shrink_to_fit(); } + treePtr->leaf_weights_by_dim_v.clear(); + treePtr->leaf_weights_by_dim_v.shrink_to_fit(); + treePtr->weights_total_by_dim_v.clear(); + treePtr->weights_total_by_dim_v.shrink_to_fit(); + treePtr->weights_epoch_by_dim_v.clear(); + treePtr->weights_epoch_by_dim_v.shrink_to_fit(); + treePtr->leaves.shrink_to_fit(); + } + tree_families[n] = curr_family; return; + } + + // Non-res_trees modes use SplitCandidate pool + std::vector possible_splits; + if (split_structure_mode_ == 3 || split_structure_mode_ == 4) { + // leaves: seed with leaf-level candidates from null tree (single leaf at index 0) + auto add_leaf_candidates = [&](const std::shared_ptr& T, size_t li) { + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set res_dims = T->split_dims; res_dims.insert(feature_dim); res_dims.erase(0); + if (max_interaction >= 0 && res_dims.size() > (size_t)max_interaction) continue; + if (!leafCandidateExists(possible_splits, T, li, feature_dim)) possible_splits.emplace_back(feature_dim, T, li); + } + }; + auto null_tree = curr_family[{0}]; + if (!null_tree->leaves.empty()) add_leaf_candidates(null_tree, 0); + + + // bootstrap + int sample_index; std::vector> 
samples_X, samples_Y; std::vector boot_idx(sample_size); + if (deterministic) { samples_X = X; samples_Y = Y; this->t_try = 1; for (size_t i=0;i(i); } + else { + samples_X = std::vector>(sample_size); samples_Y = std::vector>(sample_size); + for (size_t i=0;i(sample_size, 0)); + for (int k = 0; k < feature_size; ++k) { + // Reuse global precomputed bin ids via bootstrap index mapping + if (!feature_cut_points_.empty() && (size_t)k < sample_bin_id_.size()) { + for (size_t i = 0; i < sample_size; ++i) tls_working_bin_id[k][i] = sample_bin_id_[k][(size_t)boot_idx[i]]; + } else { + // Fallback: compute on-the-fly (should be rare if cuts are available) + const auto &cuts_k = (k >= 0 && k < (int)feature_cut_points_.size()) ? feature_cut_points_[k] : std::vector{}; + for (size_t i = 0; i < sample_size; ++i) { + int bin = 0; if (!cuts_k.empty()) { auto itb = std::upper_bound(cuts_k.begin(), cuts_k.end(), samples_X[i][k]); bin = (int)std::distance(cuts_k.begin(), itb); } + tls_working_bin_id[k][i] = bin; + } } } + } - // construct new leaves - Leaf leaf_s, leaf_b; - { - leaf_s.individuals = curr_split.I_s; - leaf_b.individuals = curr_split.I_b; - - leaf_s.value = curr_split.M_s; - leaf_b.value = curr_split.M_b; - - // initialize interval with split interval - leaf_s.intervals = curr_split.leaf_index->intervals; - leaf_b.intervals = curr_split.leaf_index->intervals; + Split curr_split; + for (int split_count = 0; split_count < n_splits; ++split_count) { + + if (split_structure_mode_ == 4) curr_split = this->calcOptimalSplit_hist(samples_Y, samples_X, possible_splits, curr_family); + else curr_split = this->calcOptimalSplit_leaves(samples_Y, samples_X, possible_splits, curr_family); + + if (!std::isinf(curr_split.min_sum)) { + + // Mutate residuals (restore old behavior) + for (int individual : curr_split.leaf_index->individuals) { + if (samples_X[individual][curr_split.split_coordinate - 1] < curr_split.split_point) samples_Y[individual] -= curr_split.M_s; else 
samples_Y[individual] -= curr_split.M_b; + } + + Leaf leaf_s, leaf_b; leaf_s.individuals = curr_split.I_s; leaf_b.individuals = curr_split.I_b; leaf_s.value = curr_split.M_s; leaf_b.value = curr_split.M_b; leaf_s.intervals = curr_split.leaf_index->intervals; leaf_b.intervals = curr_split.leaf_index->intervals; leaf_s.intervals[curr_split.split_coordinate - 1].second = curr_split.split_point; leaf_b.intervals[curr_split.split_coordinate - 1].first = curr_split.split_point; + + std::set resulting_dims = curr_split.tree_index->split_dims; resulting_dims.insert(curr_split.split_coordinate); resulting_dims.erase(0); + + std::shared_ptr found_tree = treeExists(resulting_dims, curr_family); + if (!found_tree) { + curr_family.insert({resulting_dims, std::make_shared(DecisionTree(resulting_dims))}); + found_tree = curr_family[resulting_dims]; + } + + auto add_leaf_candidates = [&](const std::shared_ptr& T, size_t li) { + if (!T) return; + // Re-add per-leaf candidate entries for all dimensions, respecting max_interaction and dedup + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set res_dims = T->split_dims; + res_dims.insert(feature_dim); + res_dims.erase(0); + if (max_interaction >= 0 && res_dims.size() > (size_t)max_interaction) continue; + if (!leafCandidateExists(possible_splits, T, li, feature_dim)) { + possible_splits.emplace_back(feature_dim, T, li); + } + } + }; + if ((curr_split.tree_index->split_dims.count(curr_split.split_coordinate)) && delete_leaves) { + + leaf_s.value += curr_split.leaf_index->value; leaf_b.value += curr_split.leaf_index->value; + // Compute index BEFORE any push_back that may reallocate + size_t idx_b = static_cast(curr_split.leaf_index - &curr_split.tree_index->leaves[0]); + // Assign by value to avoid aliasing issues if vector reallocates later + *curr_split.leaf_index = leaf_b; + curr_split.tree_index->leaves.push_back(leaf_s); + size_t idx_s = curr_split.tree_index->leaves.size() - 1; + 
add_leaf_candidates(curr_split.tree_index, idx_b); + add_leaf_candidates(curr_split.tree_index, idx_s); + // invalidate per-leaf unique caches for new structure (affects cur_trees_2 only for this tree) + if (!curr_split.tree_index->leaves.empty()) { + for (auto &lf : curr_split.tree_index->leaves) { lf.unique_count_cache.clear(); lf.unique_vals_cache.clear(); } + } + } else { + + // Append by value; avoid referencing invalidated addresses + found_tree->leaves.push_back(leaf_s); + found_tree->leaves.push_back(leaf_b); + // Add candidates for both new leaves + size_t idx_s = found_tree->leaves.size() - 2; + size_t idx_b = found_tree->leaves.size() - 1; + add_leaf_candidates(found_tree, idx_s); + add_leaf_candidates(found_tree, idx_b); + // invalidate unique caches on the receiving tree (cur_trees_2) + if (!found_tree->leaves.empty()) { + for (auto &lf : found_tree->leaves) { lf.unique_count_cache.clear(); lf.unique_vals_cache.clear(); } + } + } + } + } + // Release histogram working buffers (thread-local) if used + tls_working_bin_id.clear(); + tls_working_bin_id.shrink_to_fit(); - // interval of leaf with smaller individuals has new upper bound in splitting dimension - leaf_s.intervals[curr_split.split_coordinate - 1].second = curr_split.split_point; - // interval of leaf with bigger individuals has new lower bound in splitting dimension - leaf_b.intervals[curr_split.split_coordinate - 1].first = curr_split.split_point; + // Final memory cleanup: drop training-only buffers and shrink containers + auto keys = getKeys(curr_family); + for (auto &key : keys) { + auto itTree = curr_family.find(key); + if (itTree == curr_family.end()) continue; + auto &treePtr = itTree->second; + if (treePtr->leaves.size() == 0) { curr_family.erase(itTree); continue; } + for (auto &leaf : treePtr->leaves) { + leaf.individuals.clear(); + leaf.individuals.shrink_to_fit(); + std::unordered_map>().swap(leaf.order_cache); + std::unordered_map>().swap(leaf.sorted_vals_cache); + 
std::unordered_map>().swap(leaf.unique_vals_cache); + std::unordered_map() .swap(leaf.unique_count_cache); + leaf.intervals.shrink_to_fit(); + leaf.value.shrink_to_fit(); } + for (auto &v : treePtr->fenwick_by_dim_v) { v.clear(); v.shrink_to_fit(); } + treePtr->fenwick_by_dim_v.clear(); + treePtr->fenwick_by_dim_v.shrink_to_fit(); + for (auto &v : treePtr->leaf_weights_by_dim_v) { v.clear(); v.shrink_to_fit(); } + treePtr->leaf_weights_by_dim_v.clear(); + treePtr->leaf_weights_by_dim_v.shrink_to_fit(); + treePtr->weights_total_by_dim_v.clear(); + treePtr->weights_total_by_dim_v.shrink_to_fit(); + treePtr->weights_epoch_by_dim_v.clear(); + treePtr->weights_epoch_by_dim_v.shrink_to_fit(); + treePtr->leaves.shrink_to_fit(); + } + tree_families[n] = curr_family; return; + } + + // cur_trees_1 and cur_trees_2: initialize with {j} trees + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + auto treePtr = std::make_shared(DecisionTree({feature_dim})); + curr_family.insert({{feature_dim}, treePtr}); + // leaf_idx unused for these modes + possible_splits.emplace_back(feature_dim, treePtr, static_cast(0)); + } + + // bootstrap + int sample_index; std::vector> samples_X, samples_Y; + if (deterministic) { samples_X = X; samples_Y = Y; this->t_try = 1; } + else { samples_X = std::vector>(sample_size); samples_Y = std::vector>(sample_size); for (size_t i=0;i resulting_dims = curr_split.tree_index->split_dims; - resulting_dims.insert(curr_split.split_coordinate); - resulting_dims.erase(0); + Split curr_split; + for (int split_count = 0; split_count < n_splits; ++split_count) { + if (split_structure_mode_ == 2) curr_split = this->calcOptimalSplit_curTrees1(samples_Y, samples_X, possible_splits, curr_family); + else curr_split = this->calcOptimalSplit_curTrees2(samples_Y, samples_X, possible_splits, curr_family); + if (!std::isinf(curr_split.min_sum)) { + // Update possible_splits like tryeveryleaf/splittrynew + for (int feature_dim = 1; feature_dim <= 
feature_size; ++feature_dim) { + std::set curr_dims = curr_split.tree_index->split_dims; curr_dims.insert(curr_split.split_coordinate); curr_dims.insert(feature_dim); curr_dims.erase(0); + if (possibleExists(feature_dim, possible_splits, curr_dims)) continue; + if (max_interaction >= 0 && curr_dims.size() > (size_t)max_interaction) continue; + if (auto found = treeExists(curr_dims, curr_family)) possible_splits.emplace_back(feature_dim, found, static_cast(0)); + else { curr_family.insert({curr_dims, std::make_shared(DecisionTree(curr_dims))}); possible_splits.emplace_back(feature_dim, curr_family[curr_dims], static_cast(0)); } + } - // check if resulting tree already exists in family + for (int individual : curr_split.leaf_index->individuals) { + if (samples_X[individual][curr_split.split_coordinate - 1] < curr_split.split_point) samples_Y[individual] -= curr_split.M_s; else samples_Y[individual] -= curr_split.M_b; + } + Leaf leaf_s, leaf_b; leaf_s.individuals = curr_split.I_s; leaf_b.individuals = curr_split.I_b; leaf_s.value = curr_split.M_s; leaf_b.value = curr_split.M_b; leaf_s.intervals = curr_split.leaf_index->intervals; leaf_b.intervals = curr_split.leaf_index->intervals; leaf_s.intervals[curr_split.split_coordinate - 1].second = curr_split.split_point; leaf_b.intervals[curr_split.split_coordinate - 1].first = curr_split.split_point; + std::set resulting_dims = curr_split.tree_index->split_dims; resulting_dims.insert(curr_split.split_coordinate); resulting_dims.erase(0); std::shared_ptr found_tree = treeExists(resulting_dims, curr_family); - - // determine which tree is modified - if (curr_split.tree_index->split_dims.count(curr_split.split_coordinate)) - { // if split variable is already in tree to be split - // change values - { - leaf_s.value += curr_split.leaf_index->value; - leaf_b.value += curr_split.leaf_index->value; + if (!found_tree) { curr_family.insert({resulting_dims, std::make_shared(DecisionTree(resulting_dims))}); found_tree = 
curr_family[resulting_dims]; } + if ((curr_split.tree_index->split_dims.count(curr_split.split_coordinate)) && delete_leaves) { + leaf_s.value += curr_split.leaf_index->value; leaf_b.value += curr_split.leaf_index->value; + // index of the replaced leaf BEFORE push_back + size_t idx_b = static_cast(curr_split.leaf_index - &curr_split.tree_index->leaves[0]); + *curr_split.leaf_index = leaf_b; + curr_split.tree_index->leaves.push_back(leaf_s); + size_t idx_s = curr_split.tree_index->leaves.size() - 1; + // Incrementally update sampling caches if initialized + if ((int)curr_split.tree_index->fenwick_by_dim_v.size() >= this->feature_size) { + for (int kdim = 0; kdim < this->feature_size; ++kdim) { + auto &bit = curr_split.tree_index->fenwick_by_dim_v[(size_t)kdim]; + auto &wts = curr_split.tree_index->leaf_weights_by_dim_v[(size_t)kdim]; + if (!bit.empty() && wts.size() == bit.size()) { + // update replaced leaf + double m_b = (double)curr_split.tree_index->leaves[idx_b].individuals.size(); + int leaf_min = this->n_leaves[kdim]; + double w_new_b = std::max(0.0, m_b - 2.0 * (double)leaf_min); + double delta_b = w_new_b - (idx_b < wts.size() ? 
wts[idx_b] : 0.0); + if ((size_t)idx_b < wts.size()) wts[idx_b] = w_new_b; + if (delta_b != 0.0) { rpf_utils::fenwick_add(bit, idx_b + 1, delta_b); curr_split.tree_index->weights_total_by_dim_v[(size_t)kdim] += delta_b; } + // append new leaf + double m_s = (double)curr_split.tree_index->leaves[idx_s].individuals.size(); + double w_new_s = std::max(0.0, m_s - 2.0 * (double)leaf_min); + bit.push_back(0.0); + wts.push_back(0.0); + if (w_new_s != 0.0) rpf_utils::fenwick_add(bit, bit.size(), w_new_s); + wts[wts.size() - 1] = w_new_s; + curr_split.tree_index->weights_total_by_dim_v[(size_t)kdim] += w_new_s; + } + } } - *curr_split.leaf_index = leaf_b; // replace old interval - curr_split.tree_index->leaves.push_back(leaf_s); // add new leaf } - else - { // otherwise - found_tree->leaves.push_back(leaf_s); // append new leaves - found_tree->leaves.push_back(leaf_b); + else { + found_tree->leaves.push_back(leaf_s); found_tree->leaves.push_back(leaf_b); + size_t idx_s = found_tree->leaves.size() - 2; size_t idx_b = found_tree->leaves.size() - 1; + // Incrementally update sampling caches if initialized + if ((int)found_tree->fenwick_by_dim_v.size() >= this->feature_size) { + for (int kdim = 0; kdim < this->feature_size; ++kdim) { + auto &bit = found_tree->fenwick_by_dim_v[(size_t)kdim]; + auto &wts = found_tree->leaf_weights_by_dim_v[(size_t)kdim]; + if (!bit.empty() && wts.size() == bit.size()) { + int leaf_min = this->n_leaves[kdim]; + // append s + double m_s = (double)found_tree->leaves[idx_s].individuals.size(); + double w_new_s = std::max(0.0, m_s - 2.0 * (double)leaf_min); + bit.push_back(0.0); wts.push_back(0.0); + if (w_new_s != 0.0) rpf_utils::fenwick_add(bit, bit.size(), w_new_s); + wts[wts.size() - 1] = w_new_s; + found_tree->weights_total_by_dim_v[(size_t)kdim] += w_new_s; + // append b + double m_b = (double)found_tree->leaves[idx_b].individuals.size(); + double w_new_b = std::max(0.0, m_b - 2.0 * (double)leaf_min); + bit.push_back(0.0); wts.push_back(0.0); + 
if (w_new_b != 0.0) rpf_utils::fenwick_add(bit, bit.size(), w_new_b); + wts[wts.size() - 1] = w_new_b; + found_tree->weights_total_by_dim_v[(size_t)kdim] += w_new_b; + } + } + } } } } - // remove empty trees & clear individuals of each tree + // Final memory cleanup: drop training-only buffers and shrink containers auto keys = getKeys(curr_family); - for (auto &key : keys) - { - if (curr_family[key]->leaves.size() == 0) - { - curr_family.erase(key); - continue; - } - for (auto &leaf : curr_family[key]->leaves) - { + for (auto &key : keys) { + auto itTree = curr_family.find(key); + if (itTree == curr_family.end()) continue; + auto &treePtr = itTree->second; + if (treePtr->leaves.size() == 0) { curr_family.erase(itTree); continue; } + for (auto &leaf : treePtr->leaves) { leaf.individuals.clear(); - } + leaf.individuals.shrink_to_fit(); + std::unordered_map>().swap(leaf.order_cache); + std::unordered_map>().swap(leaf.sorted_vals_cache); + std::unordered_map>().swap(leaf.unique_vals_cache); + std::unordered_map() .swap(leaf.unique_count_cache); + leaf.intervals.shrink_to_fit(); + leaf.value.shrink_to_fit(); + } + for (auto &v : treePtr->fenwick_by_dim_v) { v.clear(); v.shrink_to_fit(); } + treePtr->fenwick_by_dim_v.clear(); + treePtr->fenwick_by_dim_v.shrink_to_fit(); + for (auto &v : treePtr->leaf_weights_by_dim_v) { v.clear(); v.shrink_to_fit(); } + treePtr->leaf_weights_by_dim_v.clear(); + treePtr->leaf_weights_by_dim_v.shrink_to_fit(); + treePtr->weights_total_by_dim_v.clear(); + treePtr->weights_total_by_dim_v.shrink_to_fit(); + treePtr->weights_epoch_by_dim_v.clear(); + treePtr->weights_epoch_by_dim_v.shrink_to_fit(); + treePtr->leaves.shrink_to_fit(); } - tree_families[n] = curr_family; } // fit forest to new data -void RandomPlantedForest::fit() -{ +// fit() moved to lib/training.cpp - // setup initial set of individuals - std::vector initial_individuals(sample_size); - std::iota(initial_individuals.begin(), initial_individuals.end(), 0); +// predict single 
feature vector (from leaves variant) +// predict_single moved to lib/predict.cpp - // initialize intervals with lower and upper bounds - std::vector initial_intervals(feature_size); - for (int i = 0; i < feature_size; ++i) - initial_intervals[i] = Interval{lower_bounds[i], upper_bounds[i]}; +// predict_matrix moved to lib/predict.cpp +// predict_vector moved to lib/predict.cpp - // set properties of first leaf - Leaf initial_leaf; - { - initial_leaf.value = std::vector(value_size, 0); - initial_leaf.individuals = initial_individuals; - initial_leaf.intervals = initial_intervals; - } - std::vector initial_leaves{initial_leaf}; // vector with initial leaf +double RandomPlantedForest::MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true) +{ return sum(Rcpp::pow(Y_true - Y_predicted, 2)) / Y_true.size(); } - // initialize tree families - this->tree_families = std::vector(n_trees); +double RandomPlantedForest::MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) +{ + double sumv = 0; int Y_size = Y_predicted.size(); + for (int i = 0; i < Y_size; ++i) sumv += MSE_vec(Y_predicted(i, _), Y_true(i, _)); + return sumv / Y_size; +} - // Loop over number of tree families and dispatch threads in batches - // of nhreads at once - if (nthreads > 1) +void RandomPlantedForest::print() +{ + for (int n = 0; n < n_trees; ++n) { - if (nthreads > std::thread::hardware_concurrency()) - { - Rcout << "Requested " << nthreads << " threads but only " << std::thread::hardware_concurrency() << " available" << std::endl; - } - // Create local thread count to not overwrite nthreads, - // would get reported wrongly by get_parameters() - unsigned int current_threads = nthreads; - for (int n = 0; n < n_trees; n += current_threads) + TreeFamily family = tree_families[n]; auto keys = getKeys(family); + for (size_t m = 0; m < keys.size(); ++m) { - if (n >= (n_trees - current_threads + 1)) - { - current_threads = n_trees % current_threads; - } - - std::vector 
threads(current_threads); - for (int t = 0; t < current_threads; ++t) - { - // Rcout << "Dispatching thread " << (n + t + 1) << "/" << n_trees << std::endl; - threads[t] = std::thread(&RandomPlantedForest::create_tree_family, this, std::ref(initial_leaves), n + t); - } - for (auto &t : threads) + DecisionTree tree = *(family[keys[m]]); + Rcout << m + 1 << " Tree: "; Rcout << "Dims="; for (const auto &dim : tree.split_dims) Rcout << dim << ","; + Rcout << std::endl << "Leaves: (" << tree.leaves.size() << ")" << std::endl; + for (const auto &leaf : tree.leaves) { - if (t.joinable()) - t.join(); + Rcout << "Intervals="; for (const auto &interval : leaf.intervals) { Rcout << interval.first << "," << interval.second << "/"; } + Rcout << " Value="; for (const auto &val : leaf.value) Rcout << val << ", "; Rcout << std::endl; } + Rcout << std::endl; } + Rcout << std::endl << std::endl; } - else - { - for (int n = 0; n < n_trees; ++n) - { - create_tree_family(initial_leaves, n); - } - } +} + +void RandomPlantedForest::get_parameters() +{ + Rcout << "Parameters: n_trees=" << n_trees << ", n_splits=" << n_splits << ", max_interaction=" << max_interaction << ", t_try=" << t_try + << ", split_decay_rate=" << split_decay_rate_<< ", max_candidates=" << max_candidates_ + << ", split_try=" << split_try << ", purified=" << purified << ", deterministic=" << deterministic << ", nthreads=" << nthreads + << ", feature_size=" << feature_size << ", sample_size=" << sample_size + << ", split_structure_mode=" << split_structure_mode_ << std::endl; +} - // optionally purify tree - if (purify_forest) +void RandomPlantedForest::set_parameters(StringVector keys, NumericVector values) +{ + if (keys.size() != values.size()) { Rcout << "Size of input vectors is not the same. 
" << std::endl; return; } + for (unsigned int i = 0; i < keys.size(); ++i) { - this->purify_3(); + if (keys[i] == "deterministic") this->deterministic = values[i]; + else if (keys[i] == "nthreads") this->nthreads = values[i]; + else if (keys[i] == "purify") this->purify_forest = values[i]; + else if (keys[i] == "n_trees") this->n_trees = values[i]; + else if (keys[i] == "n_splits") this->n_splits = values[i]; + else if (keys[i] == "t_try") this->t_try = values[i]; + else if (keys[i] == "split_try") this->split_try = values[i]; + else if (keys[i] == "max_interaction") this->max_interaction = values[i]; + else if (keys[i] == "cv") this->cross_validate = values[i]; + else if (keys[i] == "split_decay_rate") this->split_decay_rate_ = values[i]; + else if (keys[i] == "max_candidates") this->max_candidates_ = static_cast(values[i]); + else if (keys[i] == "delete_leaves") this->delete_leaves = static_cast(values[i]); + else if (keys[i] == "leaf_feature_cache_cap") this->leaf_feature_cache_cap_ = static_cast(values[i]); + + else if (keys[i] == "split_structure_mode") this->split_structure_mode_ = static_cast(values[i]); + else Rcout << "Unkown parameter key '" << keys[i] << "' ." 
<< std::endl; } - else + this->fit(); +} + +List RandomPlantedForest::get_model() +{ + List model; + for (const auto &family : tree_families) { - purified = false; + List variables, family_values, family_intervals; + for (const auto &tree : family) + { + List tree_values; List tree_intervals; variables.push_back(from_std_set(tree.first)); + for (const auto &leaf : tree.second->leaves) + { + NumericMatrix leaf_values; for (const auto &val : leaf.value) leaf_values.push_back(val); + tree_values.push_back(leaf_values); + NumericVector intervals; for (const auto &interval : leaf.intervals) { intervals.push_back(interval.first); intervals.push_back(interval.second); } + NumericMatrix leaf_intervals(2, feature_size, intervals.begin()); tree_intervals.push_back(leaf_intervals); + } + family_intervals.push_back(tree_intervals); family_values.push_back(tree_values); + } + model.push_back(List::create(Named("variables") = variables, _["values"] = family_values, _["intervals"] = family_intervals)); } + return (model); } + + void RandomPlantedForest::cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) { @@ -616,1471 +744,10 @@ void RandomPlantedForest::cross_validation(int n_sets, IntegerVector splits, Num */ } -// predict single feature vector -std::vector RandomPlantedForest::predict_single(const std::vector &X, std::set component_index) -{ - - std::vector total_res = std::vector(value_size, 0); - - if (!purified) - { - // consider all components - if (component_index == std::set{0}) - { - for (auto &tree_family : this->tree_families) - { - for (auto &tree : tree_family) - { - for (auto &leaf : tree.second->leaves) - { - bool valid = true; - for (auto &dim : tree.first) - { - if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) && (leaf.intervals[std::max(0, dim - 1)].second > X[std::max(0, dim - 1)] || 
leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) - { - valid = false; - } - } - if (valid) - { - - // Rcout << leaf.value[0] << "\n"; - total_res += leaf.value; - } - } - } - } - } - else - { // choose components for prediction - for (auto &tree_family : this->tree_families) - { - for (auto &tree : tree_family) - { - - // only consider trees with same dimensions as component_index - if (tree.first != component_index) - continue; - - std::vector dims; - for (auto dim : tree.first) - { - dims.push_back(dim); - } - - for (auto &leaf : tree.second->leaves) - { - bool valid = true; - for (unsigned int i = 0; i < dims.size(); ++i) - { - - int dim = dims[i]; - - if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[i] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) && (leaf.intervals[std::max(0, dim - 1)].second > X[i] || leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) - { - valid = false; - } - } - if (valid) - total_res += leaf.value; - } - } - } - } - } - else - { - if (component_index == std::set{-1}) - { - for (auto &tree_family : this->tree_families) - { - for (auto &tree : tree_family) - { - std::vector leaf_index(tree.first.size(), -1); - // add value of null tree - if (tree.first == std::set{0}) - { - - // Rcout << tree.first.size() ; - leaf_index = std::vector(tree.first.size(), 0); - total_res += tree.second->GridLeaves.values[leaf_index]; - } - } - } - } - else if (component_index == std::set{0}) - { - for (auto &tree_family : this->tree_families) - { - for (auto &tree : tree_family) - { - std::vector leaf_index(tree.first.size(), -1); - - // add value of null tree - if (tree.first == std::set{0}) - { - - // Rcout << tree.first.size() ; - leaf_index = std::vector(tree.first.size(), 0); - } - else - { - - // go through limits of grid - for (size_t dim_index = 0; dim_index < tree.first.size(); ++dim_index) - { - // get dim at dim_index - int dim = 0; - 
{ - auto dim_pnt = tree.first.begin(); - std::advance(dim_pnt, dim_index); - dim = *dim_pnt; - --dim; // transform into index - } - - auto bounds = tree.second->GridLeaves.lim_list[dim]; - for (double bound : bounds) - { - - // check if sample in leaf at dimension - if (X[dim] < bound) - break; // changed - - // if no interval smaller, set to end of bounds, otherwise set to leaf index - leaf_index[dim_index] = std::min(leaf_index[dim_index] + 1, (int)bounds.size() - 2); - } - } - } - - // if interval of first leaf smaller smaller - for (int &index : leaf_index) - index = std::max(0, index); - - total_res += tree.second->GridLeaves.values[leaf_index]; - } - } - } - else - { - - for (auto &tree_family : this->tree_families) - { - for (auto &tree : tree_family) - { - - // only consider trees with same dimensions as component_index - if (tree.first != component_index) - continue; - - std::vector leaf_index(tree.first.size(), -1); - // add value of null tree - if (tree.first == std::set{0}) - { - leaf_index = std::vector(tree.first.size(), 0); - } - else - { - - // go through limits of grid - for (size_t dim_index = 0; dim_index < tree.first.size(); ++dim_index) - { - // get dim at dim_index - int dim = 0; - { - auto dim_pnt = tree.first.begin(); - std::advance(dim_pnt, dim_index); - dim = *dim_pnt; - --dim; // transform into index - } - - auto bounds = tree.second->GridLeaves.lim_list[dim]; - for (double bound : bounds) - { - // check if sample in leaf at dimension - if (X[dim_index] < bound) - break; // changed +// purify_1 moved to lib/purify.cpp - // if no interval smaller, set to end of bounds, otherwise set to leaf index - leaf_index[dim_index] = std::min(leaf_index[dim_index] + 1, (int)bounds.size() - 2); - } - } - } +// purify_2 moved to lib/purify.cpp - // if interval of first leaf smaller smaller - for (int &index : leaf_index) - index = std::max(0, index); - - total_res += tree.second->GridLeaves.values[leaf_index]; - } - } - } - } - - return total_res / 
n_trees; -} - -// predict multiple feature vectors -Rcpp::NumericMatrix RandomPlantedForest::predict_matrix(const NumericMatrix &X, const NumericVector components) -{ - std::vector> feature_vec = to_std_vec(X); - std::set component_index = to_std_set(components); - std::vector> predictions; - - // todo: sanity check for X - if (feature_vec.empty()) - throw std::invalid_argument("Feature vector is empty."); - if (component_index == std::set{0} && this->feature_size >= 0 && feature_vec[0].size() != (size_t)this->feature_size) - throw std::invalid_argument("Feature vector has wrong dimension."); - if (component_index != std::set{0} && component_index != std::set{-1} && component_index.size() != feature_vec[0].size()) - throw std::invalid_argument("The input X has the wrong dimension in order to calculate f_i(x)"); - - for (auto &vec : feature_vec) - { - predictions.push_back(predict_single(vec, component_index)); - } - - return from_std_vec(predictions); -} - -Rcpp::NumericMatrix RandomPlantedForest::predict_vector(const NumericVector &X, const NumericVector components) -{ - std::vector feature_vec = to_std_vec(X); - std::set component_index = to_std_set(components); - std::vector> predictions; - Rcpp::NumericMatrix res; - - // todo: sanity check for X - if (feature_vec.empty()) - { - Rcout << "Feature vector is empty." << std::endl; - return res; - } - - if (component_index == std::set{0} && this->feature_size >= 0 && feature_vec.size() != (size_t)this->feature_size) - { - Rcout << "Feature vector has wrong dimension." 
<< std::endl; - return res; - } - - if (component_index == std::set{0}) - { - predictions.push_back(predict_single(feature_vec, component_index)); - } - else - { - for (auto vec : feature_vec) - { - predictions.push_back(predict_single(std::vector{vec}, component_index)); - } - } - - res = from_std_vec(predictions); - return res; -} - -double RandomPlantedForest::MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true) -{ - return sum(Rcpp::pow(Y_true - Y_predicted, 2)) / Y_true.size(); -} - -double RandomPlantedForest::MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) -{ - // todo: multiclass - double sum = 0; - int Y_size = Y_predicted.size(); - - for (int i = 0; i < Y_size; ++i) - { - sum += MSE_vec(Y_predicted(i, _), Y_true(i, _)); - } - - return sum / Y_size; -} - -void RandomPlantedForest::purify_1() -{ - - // go through all n_trees families - for (auto &curr_family : this->tree_families) - { - - // recap maximum number of dimensions of current family - unsigned int curr_max = 0; - for (auto tree : curr_family) - { - if (tree.first.size() > curr_max) - curr_max = tree.first.size(); - } - - while (curr_max >= 1) - { - - // go through split dimensions of all trees - auto keys = getKeys(curr_family); - std::vector>::reverse_iterator key = keys.rbegin(); - while (key != keys.rend()) - { - - auto &curr_tree = curr_family[(*key)]; - std::set curr_dims = curr_tree->split_dims; - - // check if number of dims same as current max_interaction - if (curr_dims.size() == curr_max) - { - - // go through feature dims - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - - // continue only if dim in current tree - if (curr_tree->split_dims.count(feature_dim) != 0) - { - - std::set tree_dims = curr_tree->split_dims; - tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree - - // check if tree with dimensions exists, if not create - std::shared_ptr tree = treeExists(tree_dims, 
curr_family); - if (curr_max == 1) - { - tree = curr_family[std::set{0}]; - } - else - { - if (!tree) - { - curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); - tree = curr_family[tree_dims]; - } - } - - // go through leaves of current tree - int n_leaves = curr_tree->leaves.size(); - for (int l = 0; l < n_leaves; ++l) - { - auto &curr_leaf = curr_tree->leaves[l]; - - double multiplier = (curr_leaf.intervals[feature_dim - 1].second - curr_leaf.intervals[feature_dim - 1].first) / (upper_bounds[feature_dim - 1] - lower_bounds[feature_dim - 1]); - - // new leaf including intervals and value - Leaf new_leaf = curr_leaf; // initialize intervals with first leaf - new_leaf.intervals[feature_dim - 1].first = lower_bounds[feature_dim - 1]; - new_leaf.intervals[feature_dim - 1].second = upper_bounds[feature_dim - 1]; - for (size_t i = 0; i < value_size; ++i) - new_leaf.value[i] = -curr_leaf.value[i] * multiplier; // update value of new leaf - - // append new leaf - if (!leafExists(new_leaf.intervals, curr_tree)) - curr_tree->leaves.push_back(new_leaf); - for (size_t i = 0; i < value_size; ++i) - new_leaf.value[i] = curr_leaf.value[i] * multiplier; // update value of new leaf - if (!leafExists(new_leaf.intervals, tree)) - tree->leaves.push_back(new_leaf); - } - } - } - } - key++; - } - - // update currently considered dimension size - --curr_max; - } - } - - purified = true; -} - -void RandomPlantedForest::purify_2() -{ - - // go through all n_trees families - for (auto &curr_family : this->tree_families) - { - - // lim_list is a list giving for each variable all interval end-points - std::vector> lim_list(feature_size); - - // go through all variables of the component - for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) - { - std::vector bounds; - - // go through trees of family - for (const auto &curr_tree : curr_family) - { - - // consider only relevant trees that have current dimension as variable - if 
(!curr_tree.first.count(curr_dim)) - continue; - - // go through leaves of tree - for (const auto &curr_leaf : curr_tree.second->leaves) - { - // get interval ends of variable - bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); - } - } - std::sort(bounds.begin(), bounds.end()); - bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); - lim_list[curr_dim - 1] = bounds; - } - - // initialize values and individuals for each tree in family - std::vector grids(curr_family.size() - 1); - std::vector> individuals(curr_family.size() - 1); - std::vector>> values(curr_family.size() - 1); - std::vector> variables(curr_family.size() - 1); - - // ------------- setup finer grid ------------- - - int tree_index = 0; - for (const auto &curr_tree : curr_family) - { - - if (curr_tree.first == std::set{0}) - continue; // ignore null tree - - // fill space with dimensions - std::vector dimensions; - for (const auto &dim : curr_tree.first) - { - dimensions.push_back(lim_list[dim - 1].size() - 1); // size - 1 ? 
- } - - // setup grid for leaf indices - auto grid = grid::NDGrid(dimensions); - - // initialize data for current tree - grids[tree_index] = grid; - individuals[tree_index] = utils::Matrix(dimensions, 0); - values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed - variables[tree_index] = curr_tree.first; - - // fill grid points with individuals and values - while (!grid.nextPoint()) - { - - std::vector gridPoint = grid.getPoint(); - - bool in_leaf = true; - - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index = 0; - in_leaf = true; - for (const auto &dim : curr_tree.first) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - - // go through leaves of tree to sum up values - for (const auto &leaf : curr_tree.second->get_leaves()) - { - - in_leaf = true; - int dim_index = 0; - for (const auto &dim : curr_tree.first) - { - // consider values only if all in - if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // sum up values - if (in_leaf) - values[tree_index][gridPoint] += leaf.value; // todo: multiclass - } - } - - ++tree_index; - } - - // ------------- create new trees ------------- - - // insert null tree - grids.insert(grids.begin(), grid::NDGrid()); - values.insert(values.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); - individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); - variables.insert(variables.begin(), std::set{0}); - - // recap maximum number of dimensions of current family - unsigned int curr_max = 0; - for (const auto &tree : 
curr_family) - { - if (tree.first.size() > curr_max) - curr_max = tree.first.size(); - } - - auto keys = getKeys(curr_family); - while (curr_max > 1) - { - - // go through split dimensions of all trees - for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) - { - - auto &curr_tree = curr_family[(*key)]; - std::set curr_dims = curr_tree->split_dims; - - // check if number of dims same as current max_interaction - if (curr_dims.size() == curr_max) - { - - // go through feature dims - int dim_index = 0; - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - - // continue only if dim in current tree - if (curr_tree->split_dims.count(feature_dim) != 0) - { - - std::set tree_dims = curr_tree->split_dims; - tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree - - // check if tree with dimensions exists, if not create - std::shared_ptr tree = treeExists(tree_dims, curr_family); - if (!tree) - { - - // get index of old and new tree - auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); - curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); - auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); - - // remove matrix dimension of respective variable - std::vector matrix_dimensions = values[old_tree_index].dims; - matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); - - // initialize data for new tree - auto grid = grid::NDGrid(matrix_dimensions); - grids.insert(grids.begin() + tree_index, grid); - values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(0, value_size))); - individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); - variables.insert(variables.begin() + tree_index, tree_dims); - - // fill individuals of new trees - while (!grid.nextPoint()) - { - - std::vector gridPoint = 
grid.getPoint(); - bool in_leaf = true; - - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index = 0; - in_leaf = true; - for (const auto &dim : tree_dims) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - } - } - - dim_index++; - } - } - } - } - - // update currently considered dimension size - --curr_max; - } - - // ------------- purify ------------- - - // measure tolerance and number of iterations - std::vector tol(curr_family.size(), 1); - int iter; - - // iterate backwards through tree family - int curr_tree_index = curr_family.size() - 1; - for (TreeFamily::reverse_iterator curr_tree = curr_family.rbegin(); curr_tree != curr_family.rend(); ++curr_tree) - { - iter = 0; - std::set curr_dims = curr_tree->second->get_split_dims(); - - // do not purify null - if (curr_dims == std::set{0}) - continue; - - // repeat until tolerance small enough and (?) 
maximum number of iterations reached - while ((tol[curr_tree_index] > 0.00000000001) && (iter < 100)) - { - - // go through feature dims - int curr_dim_index = 0; - for (const auto &feature_dim : curr_dims) - { - - // get tree that has same variables as curr_tree minus j-variable - std::set tree_dims = curr_dims; - tree_dims.erase(tree_dims.find(feature_dim)); - int tree_index = 0; // if tree not exist, set to null tree - if (curr_family.find(tree_dims) != curr_family.end()) - tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)) - 1; - - // update values - if (grids[curr_tree_index].dimensions.size() == 1) - { // one dimensional case - - int sum_ind = 0; - std::vector avg(value_size, 0); - - // get sum of individuals - for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) - { - std::vector tmp{i}; - sum_ind += individuals[curr_tree_index][tmp]; - } - if (sum_ind == 0) - continue; - - // calc avg - for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) - { - std::vector tmp{i}; - avg += (individuals[curr_tree_index][tmp] * values[curr_tree_index][tmp]) / sum_ind; - } - - // update values of one dimensional and null tree - for (int i = 0; i < values[curr_tree_index].n_entries; ++i) - { - std::vector tmp{i}; - values[curr_tree_index][tmp] -= avg; - } - std::vector tmp{0}; - values[tree_index][tmp] += avg; - } - else - { // higher dimensional case - - // setup new grid without dimension j - std::vector new_dimensions = grids[curr_tree_index].dimensions; - int j_dim = new_dimensions[curr_dim_index]; - new_dimensions.erase(new_dimensions.begin() + curr_dim_index); - grid::NDGrid grid = grid::NDGrid(new_dimensions); - - // go through values without dimension j - while (!grid.nextPoint()) - { - auto gridPoint = grid.getPoint(); - gridPoint.push_back(0); - - int sum_ind = 0; - std::vector avg(value_size, 0); - - // go through slice to sum up individuals - for (int j = 0; j < j_dim; ++j) - { - gridPoint.back() = j; - - // get 
sum of individuals - sum_ind += individuals[curr_tree_index][gridPoint]; - } - - // go through slice to calc avg - for (int j = 0; j < j_dim; ++j) - { - gridPoint.back() = j; - - // calc avg - avg += (individuals[curr_tree_index][gridPoint] * values[curr_tree_index][gridPoint]) / sum_ind; - } - - // go through slice to update values - for (int j = 0; j < j_dim; ++j) - { - gridPoint.back() = j; - - // update values of current slice - values[curr_tree_index][gridPoint] -= avg; - } - - // update lower dimensional tree - gridPoint.pop_back(); - values[tree_index][gridPoint] += avg; - } - } - - ++curr_dim_index; - } - - // update tolerance - if (variables[curr_tree_index].size() == 1) - { - tol[curr_tree_index] = 1; // todo - } - else - { - tol[curr_tree_index] = 1; - } - - ++iter; - } - - --curr_tree_index; - } - - // ------------- attach to rpf class ------------- - - // fill with new trees - for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) - { - LeafGrid curr_gridLeaf; - curr_gridLeaf.grid = grids[tree_index]; - curr_gridLeaf.individuals = individuals[tree_index]; - curr_gridLeaf.lim_list = lim_list; - curr_gridLeaf.values = values[tree_index]; - curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; - } - } - - purified = true; -} - -void RandomPlantedForest::purify_3() -{ - - // go through all n_trees families - for (auto &curr_family : this->tree_families) - { - - // lim_list is a list giving for each variable all interval end-points - std::vector> lim_list(feature_size); - - // go through all variables of the component - for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) - { - std::vector bounds; - - // go through trees of family - for (const auto &curr_tree : curr_family) - { - - // consider only relevant trees that have current dimension as variable - if (!curr_tree.first.count(curr_dim)) - continue; - - // go through leaves of tree - for (const auto &curr_leaf : curr_tree.second->leaves) - { - // get interval ends of 
variable - bounds.push_back(curr_leaf.intervals[curr_dim - 1].first); - bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); - } - } - std::sort(bounds.begin(), bounds.end()); - bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); - // int i_last = bounds.size()-1; - // double bibi = bounds[i_last] + 0.0001; - // bounds[i_last] = bounds[i_last] + 0.0001; - lim_list[curr_dim - 1] = bounds; - } - - // initialize values and individuals for each tree in family - std::vector grids(curr_family.size() - 1); - std::vector> individuals(curr_family.size() - 1); - std::vector>> values(curr_family.size() - 1); - std::vector>> values_old(curr_family.size() - 1); - std::vector> variables(curr_family.size() - 1); - - // ------------- setup finer grid ------------- - - int tree_index = 0; - for (const auto &curr_tree : curr_family) - { - - if (curr_tree.first == std::set{0}) - { - - // values[tree_index] = rpf::Matrix>(dimensions, std::vector(value_size, 0)); // changed - continue; // ignore null tree - } - - // fill space with dimensions - std::vector dimensions; - for (const auto &dim : curr_tree.first) - { - dimensions.push_back(lim_list[dim - 1].size()); // size - 1 ? 
WICHTIG - } - - // setup grid for leaf indices - auto grid = grid::NDGrid(dimensions); - - // initialize data for current tree - grids[tree_index] = grid; - individuals[tree_index] = utils::Matrix(dimensions, 0); - values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed - values_old[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed - variables[tree_index] = curr_tree.first; - - // fill grid points with individuals and values - while (!grid.nextPoint()) - { - - std::vector gridPoint = grid.getPoint(); - - bool in_leaf = true; - - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index = 0; - in_leaf = true; - for (const auto &dim : curr_tree.first) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - - // go through leaves of tree to sum up values - for (const auto &leaf : curr_tree.second->get_leaves()) - { - - in_leaf = true; - int dim_index = 0; - for (const auto &dim : curr_tree.first) - { - // consider values only if all in - if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // sum up values - if (in_leaf) - { - - values[tree_index][gridPoint] += leaf.value; // todo: multiclass - values_old[tree_index][gridPoint] += leaf.value; // todo: multiclass - } - } - } - - ++tree_index; - } - - // Rcout << variables.size(); - // for(int i = 0; i>(std::vector{1}, std::vector(value_size, 0))); - values_old.insert(values_old.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); - individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); 
- variables.insert(variables.begin(), std::set{0}); - - // recap maximum number of dimensions of current family - unsigned int curr_max = curr_family.rbegin()->first.size(); - - while (curr_max > 1) - { - - auto keys = getKeys(curr_family); - // go through split dimensions of all trees - for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) - { - auto &curr_tree = curr_family[(*key)]; - std::set curr_dims = curr_tree->split_dims; - // check if number of dims same as current max_interaction - if (curr_dims.size() == curr_max) - { - // go through feature dims - int dim_index = 0; - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - // continue only if dim in current tree - if (curr_tree->split_dims.count(feature_dim) != 0) - { - std::set tree_dims = curr_tree->split_dims; - tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree - // check if tree with dimensions exists, if not create - std::shared_ptr tree = treeExists(tree_dims, curr_family); - if (!tree) - { - // get index of old and new tree - auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); - curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); - auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); - // remove matrix dimension of respective variable - std::vector matrix_dimensions = values[old_tree_index].dims; - // std::vector matrix_dimensions = values_old[old_tree_index].dims; - - // Rcout << typeof(matrix_dimensions.begin()) << std::endl; - - matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); - // initialize data for new tree - auto grid = grid::NDGrid(matrix_dimensions); - grids.insert(grids.begin() + tree_index, grid); - values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); - values_old.insert(values_old.begin() + 
tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); - individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); - variables.insert(variables.begin() + tree_index, tree_dims); - // fill individuals of new trees - while (!grid.nextPoint()) - { - std::vector gridPoint = grid.getPoint(); - bool in_leaf = true; - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index2 = 0; - in_leaf = true; - for (const auto &dim : tree_dims) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index2]]) && (val < lim_list[dim - 1][gridPoint[dim_index2] + 1]))) - in_leaf = false; - ++dim_index2; - } - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - } - } - dim_index++; - } - } - } - } - // update currently considered dimension size - --curr_max; - } - - // Rcout << std::endl; - // Rcout << std::endl; - // Rcout << std::endl; - // - // for(int i = 0; i curr_dims = *tree_t; - // do not purify null - if (curr_dims == std::set{0}) - continue; - // Rcout << std::endl << tree_index_t << " - T: "; - // Rcout << "tree_t:"; - // for(auto dim: curr_dims) Rcout << dim << ", "; - // Rcout << std::endl; - - auto grid = grids[tree_index_t]; - // Rcout << "Grid dimensions of T: "; - // for(auto dim: grid.dimensions) Rcout << dim << ", "; - // Rcout << std::endl; - // go through subtrees of t - int tree_index_u = variables.size(); - for (auto tree_u = variables.rbegin(); tree_u != variables.rend(); ++tree_u) - { - --tree_index_u; - // j_dims = dims of t without u - std::set j_dims = curr_dims; - if (tree_u->size() > curr_dims.size()) - continue; - // check if subset - bool subset = true; - for (const auto dim : *tree_u) - { - if (tree_t->count(dim) == 0) - { - subset = false; - break; - } - j_dims.erase(dim); - } - if (!subset) - continue; - - // Rcout << "Hello"; - // Rcout << " " << tree_index_u << " - U: "; 
- // for(auto dim: *tree_u) Rcout << dim << ", "; - // Rcout << std::endl; - // Rcout << " Individuals: "; - - double tot_sum = 0; - grid = grids[tree_index_u]; - while (!grid.nextPoint()) - { - auto gridPoint = grid.getPoint(); - // Rcout << individuals[tree_index_u][gridPoint] << ", "; - tot_sum += individuals[tree_index_u][gridPoint]; - } - // Rcout << "Total sum: " << tot_sum << std::endl; - // Rcout << std::endl; - - grid = grids[tree_index_u]; - // Rcout << " Grid dimensions of U: "; - // for(auto dim: grid.dimensions) Rcout << dim << ", "; - // Rcout << std::endl; - - // Rcout<< "j_dims: "< update(value_size, 0); - - if (j_dims.size() == 0) - { - - // grid = grids[tree_index_u]; - while (!grid.nextPoint()) - { - auto gridPoint_i = grid.getPoint(); - // Rcout << " " << "i: "; - // for(auto p: gridPoint_i) Rcout << p << ", "; - // Rcout << std::endl << " "; - double curr_sum = individuals[tree_index_u][gridPoint_i]; - // Rcout << ", Current Sum: " << curr_sum << std::endl; - // Rcout << std::endl << " " << "i, j: "; - update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_i]; - // Rcout << std::endl; - } - - int tree_index_s = variables.size(); - for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) - { - - // Rcout << "tree_s:"; - // for(auto dim: *tree_s) Rcout << dim << ", "; - // Rcout << std::endl; - - --tree_index_s; - if (*tree_s == std::set{0}) - { - - auto gridPoint_0 = std::vector{0}; - values[tree_index_s][gridPoint_0] += update; - // Rcout << std::endl; - //} - - /* - for(auto tree_0: curr_family){ - - if(tree_0.first == std::set{0}){ - - Rcout << tree_0.first.size(); - std::vector leaf_index(tree_0.first.size(), 0); - std::vector leaf_index(tree_0.second->GridLeaves.values.size(), 0); - - int Test = tree_0.second->GridLeaves.values.size(); - Rcout << Test; - tree_0.second->GridLeaves.values[leaf_index] += update; - } - } - */ - } - else - { - - // check if S subset of T - - bool subset = true; - for (const auto 
dim : *tree_s) - { - if (tree_t->count(dim) == 0) - { - subset = false; - break; - } - } - if (!subset) - continue; - - // Rcout << pow(-1, (*tree_s).size()) << std::endl; - - auto grid_k = grids[tree_index_s]; - while (!grid_k.nextPoint()) - { - auto gridPoint_k = grid_k.getPoint(); - // - // if((*tree_s).size()>2){ - // Rcout << std::endl << " " << "j, k: "; - // for(auto p: gridPoint_k) Rcout << p << ", "; - // Rcout << std::endl; - // } - // - // Rcout << pow(-1, (*tree_s).size()) * update << std::endl; - values[tree_index_s][gridPoint_k] += pow(-1, (*tree_s).size()) * update; - } - } - } - // Rcout << std::endl; - } - else - { - - std::vector j_sizes(j_dims.size(), 0); - for (size_t j = 0; j < j_dims.size(); ++j) - { - auto tmp = j_dims.begin(); - std::advance(tmp, j); - int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); - j_sizes[j] = grids[tree_index_t].dimensions[j_index]; - } - - // Rcout<<"Hello 1"; - - grid::NDGrid grid_j = grid::NDGrid(j_sizes); - while (!grid_j.nextPoint()) - { - - std::vector update(value_size, 0); - auto gridPoint_j = grid_j.getPoint(); - // Rcout << " " << "j: "; - // for(auto p: gridPoint_j) Rcout << p << ", "; - // Rcout << std::endl; - // calc update - grid = grids[tree_index_u]; - while (!grid.nextPoint()) - { - auto gridPoint_i = grid.getPoint(); - // Rcout << " " << "i: "; - // for(auto p: gridPoint_i) Rcout << p << ", "; - // Rcout << std::endl << " "; - double curr_sum = individuals[tree_index_u][gridPoint_i]; - // Rcout << ", Current Sum: " << curr_sum << std::endl; - std::vector gridPoint_ij(tree_t->size(), 0); - for (size_t j = 0; j < gridPoint_j.size(); ++j) - { - auto j_dim = j_dims.begin(); - std::advance(j_dim, j); - int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*j_dim)); - // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; - gridPoint_ij[j_index] = gridPoint_j[j]; - } - for (size_t i = 0; i < gridPoint_i.size(); ++i) - 
{ - auto i_dim = tree_u->begin(); - std::advance(i_dim, i); - int i_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*i_dim)); - // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; - gridPoint_ij[i_index] = gridPoint_i[i]; - } - // Rcout << std::endl << " " << "i, j: "; - // for(auto p: gridPoint_ij) Rcout << p << ", "; - // Rcout << std::endl; - update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_ij]; - // Rcout << std::endl; - } - - // Rcout << "Hello_2"; - // update trees - int tree_index_s = variables.size(); - for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) - { - --tree_index_s; - // check if T\U=j_dims subset of S and S subset of T - bool subset = true; - for (const auto dim : j_dims) - { - if (tree_s->count(dim) == 0) - { - subset = false; - break; - } - } - for (const auto dim : *tree_s) - { - if (tree_t->count(dim) == 0) - { - subset = false; - break; - } - } - if (!subset) - continue; - // Rcout << " " << "S: "; - // for(auto dim: *tree_s) Rcout << dim << ", "; - // Rcout << std::endl; - // S cap U - std::set k_dims = *tree_s; - std::set k_dims_h1 = *tree_s; - std::set k_dims_h2 = *tree_u; - for (const auto dim : *tree_u) - k_dims.insert(dim); - for (const auto dim : *tree_s) - k_dims_h2.erase(dim); - for (const auto dim : *tree_u) - k_dims_h1.erase(dim); - for (const auto dim : k_dims_h1) - k_dims.erase(dim); - for (const auto dim : k_dims_h2) - k_dims.erase(dim); - - // std::set k_dims = *tree_s; - // for(const auto dim: *tree_t) k_dims.erase(dim); - // for(const auto dim: *tree_u) k_dims.insert(dim); - - // Rcout << " " << "k_dims: "; - // for(auto dim: k_dims) Rcout << dim << ", "; - // Rcout << std::endl; - - if (k_dims.size() == 0) - { - - values[tree_index_s][gridPoint_j] += pow(-1, (*tree_s).size() - j_dims.size()) * update; - } - else - { - - // Rcout <<"k_dims :"; - // for(auto dim: k_dims) Rcout << dim << ", "; - // Rcout << std::endl; - - std::vector 
k_sizes(k_dims.size(), 0); - for (size_t k = 0; k < k_dims.size(); ++k) - { - auto tmp = k_dims.begin(); - std::advance(tmp, k); - int k_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); - k_sizes[k] = grids[tree_index_t].dimensions[k_index]; - } - // Rcout << " " << "k_sizes: "; - // for(auto dim: k_sizes) Rcout << dim << ", "; - // Rcout << std::endl; - grid::NDGrid grid_k = grid::NDGrid(k_sizes); - while (!grid_k.nextPoint()) - { - auto gridPoint_k = grid_k.getPoint(); - // Rcout << " " << "k: "; - // for(auto p: gridPoint_k) Rcout << p << ", "; - // Rcout << std::endl << " "; - std::vector gridPoint_jk(tree_s->size(), 0); - for (size_t j = 0; j < gridPoint_j.size(); ++j) - { - auto j_dim = j_dims.begin(); - std::advance(j_dim, j); - int j_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*j_dim)); - // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; - gridPoint_jk[j_index] = gridPoint_j[j]; - } - for (size_t k = 0; k < gridPoint_k.size(); ++k) - { - auto k_dim = k_dims.begin(); - std::advance(k_dim, k); - int k_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*k_dim)); - // Rcout << " k_dim=" << *k_dim << ", k_index=" << k_index; - gridPoint_jk[k_index] = gridPoint_k[k]; - } - // Rcout << std::endl << " " << "j, k: "; - // for(auto p: gridPoint_jk) Rcout << p << ", "; - // Rcout << std::endl; - - // Rcout << pow(-1, (*tree_s).size() - j_dims.size()) * update[0]; - values[tree_index_s][gridPoint_jk] += pow(-1, (*tree_s).size() - j_dims.size()) * update; - } - } - } - } - } - } - --tree_index_t; - } - - // ------------- attach to rpf class ------------- - - // fill with new trees - for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) - { - LeafGrid curr_gridLeaf; - curr_gridLeaf.grid = grids[tree_index]; - curr_gridLeaf.individuals = individuals[tree_index]; - curr_gridLeaf.lim_list = lim_list; - curr_gridLeaf.values = 
values[tree_index]; - curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; - } - } - - purified = true; -} - -void RandomPlantedForest::print() -{ - for (int n = 0; n < n_trees; ++n) - { - TreeFamily family = tree_families[n]; - auto keys = getKeys(family); - for (size_t m = 0; m < keys.size(); ++m) - { - DecisionTree tree = *(family[keys[m]]); - Rcout << m + 1 << " Tree: "; - Rcout << "Dims="; - for (const auto &dim : tree.split_dims) - Rcout << dim << ","; - Rcout << std::endl - << "Leaves: (" << tree.leaves.size() << ")" << std::endl; - for (const auto &leaf : tree.leaves) - { - Rcout << "Intervals="; - for (const auto &interval : leaf.intervals) - { - Rcout << interval.first << "," << interval.second << "/"; - } - Rcout << " Value="; - for (const auto &val : leaf.value) - Rcout << val << ", "; - Rcout << std::endl; - } - Rcout << std::endl; - } - Rcout << std::endl - << std::endl; - } -} - -// print parameters of the model to the console -void RandomPlantedForest::get_parameters() -{ - Rcout << "Parameters: n_trees=" << n_trees << ", n_splits=" << n_splits << ", max_interaction=" << max_interaction << ", t_try=" << t_try - << ", split_try=" << split_try << ", purified=" << purified << ", deterministic=" << deterministic << ", nthreads=" << nthreads - << ", feature_size=" << feature_size << ", sample_size=" << sample_size << std::endl; -} - -/* retrospectively change parameters of existing class object, - updates the model, so far only single valued parameters supported, - for replacing training data use 'set_data', - note that changing cv does not trigger cross validation */ -void RandomPlantedForest::set_parameters(StringVector keys, NumericVector values) -{ - if (keys.size() != values.size()) - { - Rcout << "Size of input vectors is not the same. 
" << std::endl; - return; - } - - for (unsigned int i = 0; i < keys.size(); ++i) - { - if (keys[i] == "deterministic") - { - this->deterministic = values[i]; - } - else if (keys[i] == "nthreads") - { - this->nthreads = values[i]; - } - else if (keys[i] == "purify") - { - this->purify_forest = values[i]; - } - else if (keys[i] == "n_trees") - { - this->n_trees = values[i]; - } - else if (keys[i] == "n_splits") - { - this->n_splits = values[i]; - } - else if (keys[i] == "t_try") - { - this->t_try = values[i]; - } - else if (keys[i] == "split_try") - { - this->split_try = values[i]; - } - else if (keys[i] == "max_interaction") - { - this->max_interaction = values[i]; - } - else if (keys[i] == "cv") - { - this->cross_validate = values[i]; - } - else - { - Rcout << "Unkown parameter key '" << keys[i] << "' ." << std::endl; - } - } - this->fit(); -} - -List RandomPlantedForest::get_model() -{ - List model; - for (const auto &family : tree_families) - { - List variables, family_values, family_intervals; - for (const auto &tree : family) - { - List tree_values; - List tree_intervals; - variables.push_back(from_std_set(tree.first)); - for (const auto &leaf : tree.second->leaves) - { - NumericMatrix leaf_values; - for (const auto &val : leaf.value) - { - leaf_values.push_back(val); - } - tree_values.push_back(leaf_values); - - NumericVector intervals; - for (const auto &interval : leaf.intervals) - { - intervals.push_back(interval.first); - intervals.push_back(interval.second); - } - NumericMatrix leaf_intervals(2, feature_size, intervals.begin()); - tree_intervals.push_back(leaf_intervals); - } - family_intervals.push_back(tree_intervals); - family_values.push_back(tree_values); - } - model.push_back(List::create(Named("variables") = variables, _["values"] = family_values, _["intervals"] = family_intervals)); - } - return (model); -} +// purify_3 moved to lib/purify.cpp diff --git a/src/lib/splits_cur_trees_1.cpp b/src/lib/splits_cur_trees_1.cpp new file mode 100644 index 
0000000..5658a44 --- /dev/null +++ b/src/lib/splits_cur_trees_1.cpp @@ -0,0 +1,203 @@ +// Split-mode: cur_trees_1. Samples feasible leaves proportionally to their +// number of candidate thresholds, then evaluates a single threshold. +#include "rpf.hpp" +#include "internal_utils.hpp" + +using namespace rpf_utils; + +Split RandomPlantedForest::calcOptimalSplit_curTrees1(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family) +{ + Split curr_split, min_split; min_split.min_sum = std::numeric_limits::infinity(); curr_split.Y = &Y; + + unsigned int raw = (unsigned int)std::ceil(this->t_try * possible_splits.size()); + unsigned int upper = std::min((unsigned int)this->max_candidates_, (unsigned int)possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw, upper)); + std::vector weights(possible_splits.size()); + + for (size_t i = 0; i < possible_splits.size(); ++i) weights[i] = std::exp(-this->split_decay_rate_ * possible_splits[i].age); + + size_t positive_count_ = 0; for (double w : weights) if (w > 0.0) ++positive_count_; + + if (positive_count_ == 0) { n_candidates = 1; } + else { if (n_candidates > positive_count_) n_candidates = static_cast(positive_count_); } + + std::vector sample_idxs; sample_idxs.reserve(n_candidates); + if (!this->deterministic) { + std::vector pos_idx; pos_idx.reserve(possible_splits.size()); + std::vector pos_w; pos_w.reserve(possible_splits.size()); + for (size_t i = 0; i < weights.size(); ++i) if (weights[i] > 0.0) { pos_idx.push_back(i); pos_w.push_back(weights[i]); } + const size_t P = pos_idx.size(); + if (P == 0) { + std::vector all(possible_splits.size()); std::iota(all.begin(), all.end(), 0); + size_t k = std::min(n_candidates, all.size()); + for (size_t i = 0; i < k; ++i) { size_t j = i + static_cast(rng_runif01() * (double)(all.size() - i)); if (j >= all.size()) j = all.size() - 1; std::swap(all[i], all[j]); } + for (size_t i = 0; i < k; ++i) 
sample_idxs.push_back(all[i]); + } else if (n_candidates * 8 < P) { + size_t k2 = std::min(n_candidates, P); + std::vector> keys; keys.reserve(P); + for (size_t i = 0; i < P; ++i) { double u = rng_runif01(); if (u <= 0.0) u = std::numeric_limits::min(); double key = -std::log(u) / pos_w[i]; keys.emplace_back(key, pos_idx[i]); } + if (k2 < keys.size()) { std::nth_element(keys.begin(), keys.begin() + k2, keys.end(), [](const auto& a, const auto& b){ return a.first < b.first; }); keys.resize(k2); } + for (auto &kv : keys) sample_idxs.push_back(kv.second); + } else { + size_t k = std::min(n_candidates, P); + std::vector> keys; keys.reserve(P); + for (size_t i = 0; i < P; ++i) { double u = rng_runif01(); if (u <= 0.0) u = std::numeric_limits::min(); double key = -std::log(u) / pos_w[i]; keys.emplace_back(key, pos_idx[i]); } + if (k < keys.size()) { std::nth_element(keys.begin(), keys.begin() + k, keys.end(), [](const auto& a, const auto& b){ return a.first < b.first; }); keys.resize(k); } + for (auto &kv : keys) sample_idxs.push_back(kv.second); + } + } else { for (size_t i=0;i+n_candidates<=possible_splits.size() && i< n_candidates;++i) sample_idxs.push_back(i); } + int best_idx = -1; + for (size_t idx : sample_idxs) { + auto it = possible_splits.begin(); std::advance(it, idx); int k = it->dim - 1; int leaf_size = this->n_leaves[k]; + std::set Dprime_minus_k = it->tree->split_dims; Dprime_minus_k.erase(k + 1); Dprime_minus_k.erase(0); + std::vector> sources; sources.reserve(2); + if (Dprime_minus_k.empty()) { if (auto itZero = curr_family.find(std::set{0}); itZero != curr_family.end()) sources.push_back(itZero->second); } + else { if (auto itS = curr_family.find(Dprime_minus_k); itS != curr_family.end()) sources.push_back(itS->second); } + if (auto itD = curr_family.find(it->tree->split_dims); itD != curr_family.end()) if (sources.empty() || sources.back().get() != itD->second.get()) sources.push_back(itD->second); + + if (!this->deterministic) { + auto 
ensure_weights_cache = [&](const std::shared_ptr& tree, int kdim){ + // Lazy-size vectors to feature_size once + if ((int)tree->weights_epoch_by_dim_v.size() < this->feature_size) { + tree->weights_epoch_by_dim_v.assign((size_t)this->feature_size, -1); + tree->fenwick_by_dim_v.assign((size_t)this->feature_size, std::vector()); + tree->leaf_weights_by_dim_v.assign((size_t)this->feature_size, std::vector()); + tree->weights_total_by_dim_v.assign((size_t)this->feature_size, 0.0); + } + // Recompute BIT if epoch mismatches or size changed + bool need = true; + if (tree->weights_epoch_by_dim_v[(size_t)kdim] == tree->weights_epoch) { + if (tree->fenwick_by_dim_v[(size_t)kdim].size() == tree->leaves.size()) need = false; + } + if (!need) return; // cache fresh + const size_t L = tree->leaves.size(); + std::vector bit(L, 0.0), wts(L, 0.0); + double total = 0.0; + for (size_t li = 0; li < L; ++li) { + auto &leaf = tree->leaves[li]; + // Determine number of unique thresholds available in this leaf for kdim + size_t unique_count = 0; + auto it_uc = leaf.unique_count_cache.find(kdim); + if (it_uc != leaf.unique_count_cache.end()) { + unique_count = it_uc->second; + } else { + // Build or reuse sorted values, then count uniques + std::vector order_cf; std::vector sorted_vals_cf; + ensure_order_and_sorted_vals_for_leaf(X, leaf, kdim, order_cf, sorted_vals_cf); + if (!sorted_vals_cf.empty()) { + unique_count = 1; + for (size_t i = 1; i < sorted_vals_cf.size(); ++i) + if (sorted_vals_cf[i] != sorted_vals_cf[i - 1]) ++unique_count; + } + leaf.unique_count_cache[kdim] = unique_count; + } + // Weight = number of unique thresholds that respect min leaf size + const long width_unique = (long)unique_count - 2L * (long)leaf_size; + const double w = (width_unique > 0L) ? 
static_cast(width_unique) : 0.0; + wts[li] = w; total += w; if (w != 0.0) rpf_utils::fenwick_add(bit, li + 1, w); + } + tree->fenwick_by_dim_v[(size_t)kdim] = std::move(bit); + tree->leaf_weights_by_dim_v[(size_t)kdim] = std::move(wts); + tree->weights_total_by_dim_v[(size_t)kdim] = total; + tree->weights_epoch_by_dim_v[(size_t)kdim] = tree->weights_epoch; + }; + + struct SourceInfo { std::shared_ptr tree; double total; }; + std::vector src_info; src_info.reserve(sources.size()); + double total_all = 0.0; + for (const auto &src_tree : sources) { + if (!src_tree || src_tree->leaves.empty()) continue; + ensure_weights_cache(src_tree, k); + double tot = src_tree->weights_total_by_dim_v[(size_t)k]; + if (tot <= 0.0) continue; + src_info.push_back({src_tree, tot}); + total_all += tot; + } + if (src_info.empty() || total_all <= 0.0) continue; + + for (size_t t = 0; t < (size_t)this->split_try; ++t) { + // Sample a source tree proportionally to its total weight + double r_src = rng_runif(0.0, total_all); + size_t si = 0; + while (si + 1 < src_info.size() && r_src > src_info[si].total) { r_src -= src_info[si].total; ++si; } + auto &sel_tree = src_info[si].tree; + + // Sample a leaf within the selected tree using prefix sums + const auto &bit = sel_tree->fenwick_by_dim_v[(size_t)k]; + double tot_leaf = src_info[si].total; + if (bit.empty() || tot_leaf <= 0.0) continue; + double r_leaf = rng_runif(0.0, tot_leaf); + size_t leaf_idx_sel = rpf_utils::fenwick_find_by_prefix(bit, r_leaf); + if (leaf_idx_sel == 0) continue; // safety + leaf_idx_sel -= 1; // to 0-based + if (leaf_idx_sel >= sel_tree->leaves.size()) continue; + Leaf *leaf_ptr = &sel_tree->leaves[leaf_idx_sel]; + // Sample by unique thresholds: build/reuse unique values for this leaf and dim + std::vector order_cf; std::vector sorted_vals_cf; + ensure_order_and_sorted_vals_for_leaf(X, *leaf_ptr, k, order_cf, sorted_vals_cf); + size_t unique_count = 0; std::vector* unique_ptr = nullptr; + if 
(leaf_ptr->unique_vals_cache.count(k)) { + unique_ptr = &leaf_ptr->unique_vals_cache[k]; + unique_count = unique_ptr->size(); + leaf_ptr->unique_count_cache[k] = unique_count; + } else { + auto uniques = compute_unique_sorted_values(sorted_vals_cf); + unique_count = uniques.size(); + leaf_ptr->unique_count_cache[k] = unique_count; + leaf_ptr->unique_vals_cache[k] = std::move(uniques); + unique_ptr = &leaf_ptr->unique_vals_cache[k]; + } + const int left = leaf_size; const int right_exclusive = (int)unique_count - leaf_size; + if (right_exclusive - left <= 1) continue; + int s_idx = rng_randint(left, right_exclusive); + double sp = (*unique_ptr)[(size_t)s_idx]; + + size_t ns = 0, nb = 0; std::vector sum_s_adj(this->value_size, 0.0), sum_b_adj(this->value_size, 0.0); + for (int ind : leaf_ptr->individuals) { + if (X[ind][k] < sp) { ++ns; for (size_t p = 0; p < this->value_size; ++p) sum_s_adj[p] += Y[ind][p]; } + else { ++nb; for (size_t p = 0; p < this->value_size; ++p) sum_b_adj[p] += Y[ind][p]; } + } + if (ns == 0 || nb == 0) continue; double loss = 0.0; + for (size_t p = 0; p < this->value_size; ++p) { loss -= (sum_s_adj[p] * sum_s_adj[p]) / (double)ns; loss -= (sum_b_adj[p] * sum_b_adj[p]) / (double)nb; } + if (loss < min_split.min_sum) { min_split.min_sum = loss; min_split.tree_index = sel_tree; min_split.leaf_index = leaf_ptr; min_split.split_coordinate = k + 1; min_split.split_point = sp; best_idx = (int)idx; min_split.sum_s = sum_s_adj; min_split.sum_b = sum_b_adj; } + } + } else { + for (const auto &src_tree : sources) { + if (src_tree->leaves.empty()) continue; + for (auto &leaf : src_tree->leaves) { + std::vector order_cf; std::vector sorted_vals_cf; ensure_order_and_sorted_vals_for_leaf(X, leaf, k, order_cf, sorted_vals_cf); + // Build/reuse unique values for deterministic sampling across unique thresholds + size_t unique_count = 0; std::vector* unique_ptr = nullptr; + if (leaf.unique_vals_cache.count(k)) { + unique_ptr = &leaf.unique_vals_cache[k]; + 
unique_count = unique_ptr->size(); + leaf.unique_count_cache[k] = unique_count; + } else { + auto uniques = compute_unique_sorted_values(sorted_vals_cf); + unique_count = uniques.size(); + leaf.unique_count_cache[k] = unique_count; + leaf.unique_vals_cache[k] = std::move(uniques); + unique_ptr = &leaf.unique_vals_cache[k]; + } + if ((int)unique_count <= 2 * leaf_size) continue; int left = leaf_size; int right = (int)unique_count - leaf_size; + std::vector samples = compute_even_spread_indices(left, right, (size_t)this->split_try); + for (int s_idx : samples) { + const double sp = (*unique_ptr)[(size_t)s_idx]; size_t ns = 0, nb = 0; std::vector sum_s_adj(this->value_size, 0.0), sum_b_adj(this->value_size, 0.0); + for (int ind : leaf.individuals) { if (X[ind][k] < sp) { ++ns; for (size_t p = 0; p < this->value_size; ++p) sum_s_adj[p] += Y[ind][p]; } else { ++nb; for (size_t p = 0; p < this->value_size; ++p) sum_b_adj[p] += Y[ind][p]; } } + if (ns == 0 || nb == 0) continue; double loss = 0.0; for (size_t p = 0; p < this->value_size; ++p) { loss -= (sum_s_adj[p] * sum_s_adj[p]) / (double)ns; loss -= (sum_b_adj[p] * sum_b_adj[p]) / (double)nb; } + if (loss < min_split.min_sum) { min_split.min_sum = loss; min_split.tree_index = src_tree; min_split.leaf_index = &leaf; min_split.split_coordinate = k + 1; min_split.split_point = sp; best_idx = (int)idx; min_split.sum_s = sum_s_adj; min_split.sum_b = sum_b_adj; } + } + } + } + } + } + + rpf_utils::age_pool_by_sample(sample_idxs, best_idx, possible_splits); + finalize_split_from_sums(min_split, X, this->value_size); + return min_split; +} + + diff --git a/src/lib/splits_cur_trees_2.cpp b/src/lib/splits_cur_trees_2.cpp new file mode 100644 index 0000000..8bfdfe1 --- /dev/null +++ b/src/lib/splits_cur_trees_2.cpp @@ -0,0 +1,165 @@ +// Split-mode: cur_trees_2. Tries random thresholds across all leaves of +// predecessor/current trees, using age decay for candidate sampling. 
+#include "rpf.hpp" +#include "internal_utils.hpp" + +using namespace rpf_utils; + +Split RandomPlantedForest::calcOptimalSplit_curTrees2(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family) +{ + Split curr_split, min_split; + min_split.min_sum = std::numeric_limits::infinity(); + curr_split.Y = &Y; + + unsigned int raw_candidates = static_cast(std::ceil(this->t_try * possible_splits.size())); + unsigned int upper = std::min(this->max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + + std::vector weights(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) weights[i] = std::exp(-this->split_decay_rate_ * possible_splits[i].age); + std::vector sample_idxs; sample_idxs.reserve(n_candidates); + if (!this->deterministic) { + std::vector pos_idx; pos_idx.reserve(possible_splits.size()); + std::vector pos_w; pos_w.reserve(possible_splits.size()); + for (size_t i = 0; i < weights.size(); ++i) if (weights[i] > 0.0) { pos_idx.push_back(i); pos_w.push_back(weights[i]); } + const size_t P = pos_idx.size(); + if (P == 0) { + std::vector all(possible_splits.size()); std::iota(all.begin(), all.end(), 0); + size_t k = std::min(n_candidates, all.size()); + for (size_t i = 0; i < k; ++i) { size_t j = i + static_cast(rng_runif01() * (double)(all.size() - i)); if (j >= all.size()) j = all.size() - 1; std::swap(all[i], all[j]); } + for (size_t i = 0; i < k; ++i) sample_idxs.push_back(all[i]); + } else if (n_candidates * 8 < P) { + size_t k2 = std::min(n_candidates, P); + std::vector> keys; keys.reserve(P); + for (size_t i = 0; i < P; ++i) { double u = rng_runif01(); if (u <= 0.0) u = std::numeric_limits::min(); double key = -std::log(u) / pos_w[i]; keys.emplace_back(key, pos_idx[i]); } + if (k2 < keys.size()) { std::nth_element(keys.begin(), keys.begin() + k2, keys.end(), [](const auto& a, const auto& b){ return a.first < b.first; }); 
keys.resize(k2); } + for (auto &kv : keys) sample_idxs.push_back(kv.second); + } else { + size_t k = std::min(n_candidates, P); + std::vector> keys; keys.reserve(P); + for (size_t i = 0; i < P; ++i) { double u = rng_runif01(); if (u <= 0.0) u = std::numeric_limits::min(); double key = -std::log(u) / pos_w[i]; keys.emplace_back(key, pos_idx[i]); } + if (k < keys.size()) { std::nth_element(keys.begin(), keys.begin() + k, keys.end(), [](const auto& a, const auto& b){ return a.first < b.first; }); keys.resize(k); } + for (auto &kv : keys) sample_idxs.push_back(kv.second); + } + } else { for (size_t i=0;idim - 1; + int leaf_size = this->n_leaves[k]; + + std::set tree_dims = it->tree->split_dims; + tree_dims.erase(k + 1); tree_dims.erase(0); + + std::vector> curr_trees; + if (tree_dims.empty()) { + auto itZero = curr_family.find(std::set{0}); + if (itZero != curr_family.end() && itZero->second) curr_trees.push_back(itZero->second); + } + if (auto itS = curr_family.find(tree_dims); itS != curr_family.end() && itS->second) curr_trees.push_back(itS->second); + if (auto itD = curr_family.find(it->tree->split_dims); itD != curr_family.end() && itD->second) { + if (curr_trees.empty() || curr_trees.back().get() != itD->second.get()) curr_trees.push_back(itD->second); + } + + for (auto &curr_tree : curr_trees) { + if (curr_tree->leaves.empty()) continue; + for (auto &leaf : curr_tree->leaves) { + // Reuse cached order and sorted values + std::vector order_cf; std::vector sorted_vals_cf; + ensure_order_and_sorted_vals_for_leaf(X, leaf, k, order_cf, sorted_vals_cf); + // Unique count & values caching + std::vector *unique_ptr = nullptr; + size_t unique_count = 0; + if (leaf.unique_count_cache.count(k)) { + unique_count = leaf.unique_count_cache[k]; + if (unique_count != 0 && leaf.unique_vals_cache.count(k)) unique_ptr = &leaf.unique_vals_cache[k]; + } + if (!unique_ptr) { + auto uniques = compute_unique_sorted_values(sorted_vals_cf); + unique_count = uniques.size(); + 
leaf.unique_count_cache[k] = unique_count; + leaf.unique_vals_cache[k] = std::move(uniques); + unique_ptr = &leaf.unique_vals_cache[k]; + } + if (unique_count < 2 * static_cast(leaf_size)) continue; + + const size_t m = leaf.individuals.size(); + std::vector samples; + if (this->deterministic) { + int maxp = std::min((int)unique_count - 1, 9); + samples.resize(maxp); std::iota(samples.begin(), samples.end(), 1); + } else { + samples.resize(this->split_try); + for (size_t i = 0; i < samples.size(); ++i) samples[i] = rng_randint(leaf_size, (int)unique_count - leaf_size); + std::sort(samples.begin(), samples.end()); + } + const bool single_eval = (samples.size() == 1); + std::vector> prefix_cf; // [value_size][m] + std::vector total_cf; // [value_size] + if (!single_eval) build_prefix_and_total_given_order(Y, leaf, order_cf, this->value_size, prefix_cf, total_cf); + + for (size_t si = 0; si < samples.size(); ++si) { + const double sp = (*unique_ptr)[samples[si]]; + size_t pos = static_cast(std::lower_bound(sorted_vals_cf.begin(), sorted_vals_cf.end(), sp) - sorted_vals_cf.begin()); + if (pos == 0 || pos >= m) continue; + if (pos < static_cast(leaf_size) || (m - pos) < static_cast(leaf_size)) continue; + double loss = 0.0; + if (!single_eval) { + for (size_t p = 0; p < this->value_size; ++p) { + const double sum_s_base = prefix_cf[p][pos - 1]; + const double sum_b_base = total_cf[p] - sum_s_base; + loss -= (sum_s_base * sum_s_base) / static_cast(pos); + loss -= (sum_b_base * sum_b_base) / static_cast(m - pos); + } + } else { + size_t ns = 0, nb = 0; + std::vector sum_s_adj(this->value_size, 0.0), sum_b_adj(this->value_size, 0.0); + for (int ind : leaf.individuals) { + const bool left_side = (X[ind][k] < sp); + if (left_side) { ++ns; for (size_t p = 0; p < this->value_size; ++p) { double v = Y[ind][p]; sum_s_adj[p] += v; } } + else { ++nb; for (size_t p = 0; p < this->value_size; ++p) { double v = Y[ind][p]; sum_b_adj[p] += v; } } + } + if (ns == 0 || nb == 0) { 
continue; } + for (size_t p = 0; p < this->value_size; ++p) { + loss -= (sum_s_adj[p] * sum_s_adj[p]) / static_cast(ns); + loss -= (sum_b_adj[p] * sum_b_adj[p]) / static_cast(nb); + } + } + if (loss < min_split.min_sum) { + min_split.min_sum = loss; + min_split.tree_index = curr_tree; + min_split.leaf_index = &leaf; + min_split.split_coordinate = k + 1; + min_split.split_point = sp; + best_idx = (int)idx; + min_split.sum_s.assign(this->value_size, 0.0); + min_split.sum_b.assign(this->value_size, 0.0); + if (!single_eval) { + for (size_t p = 0; p < this->value_size; ++p) { + const double sum_s_base = prefix_cf[p][pos - 1]; + const double sum_b_base = total_cf[p] - sum_s_base; + min_split.sum_s[p] = sum_s_base; + min_split.sum_b[p] = sum_b_base; + } + } else { + for (int ind : leaf.individuals) { + if (X[ind][k] < sp) { for (size_t p = 0; p < this->value_size; ++p) { double v = Y[ind][p]; min_split.sum_s[p] += v; } } + else { for (size_t p = 0; p < this->value_size; ++p) { double v = Y[ind][p]; min_split.sum_b[p] += v; } } + } + } + } + } + } + } + } + + rpf_utils::age_pool_by_sample(sample_idxs, best_idx, possible_splits); + finalize_split_from_sums(min_split, X, this->value_size); + return min_split; +} + + diff --git a/src/lib/splits_hist.cpp b/src/lib/splits_hist.cpp new file mode 100644 index 0000000..0077ca1 --- /dev/null +++ b/src/lib/splits_hist.cpp @@ -0,0 +1,137 @@ +// Split-mode: histogram-binned evaluation (mode 4). Mirrors leaves mode but +// evaluates candidate thresholds at per-feature global bin boundaries. 
+#include "rpf.hpp" +#include "internal_utils.hpp" + +using namespace rpf_utils; + +Split RandomPlantedForest::calcOptimalSplit_hist(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family) +{ + Split min_split; min_split.min_sum = std::numeric_limits::infinity(); + if (possible_splits.empty()) return min_split; + + unsigned int raw_candidates = static_cast(std::ceil(this->t_try * possible_splits.size())); + unsigned int upper = std::min(this->max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + std::vector weights(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) weights[i] = std::exp(-this->split_decay_rate_ * possible_splits[i].age); + std::vector sample_idxs = this->deterministic ? std::vector() : sample_weighted_indices_filtered(weights, n_candidates); + if (this->deterministic) { for (size_t i = 0; i < n_candidates && i < possible_splits.size(); ++i) sample_idxs.push_back(i); } + + // Use per-feature effective bin count based on actual cut count for stability + int best_idx = -1; + for (size_t idx : sample_idxs) { + auto it = possible_splits.begin(); std::advance(it, idx); + if (!it->tree || it->leaf_idx >= it->tree->leaves.size()) continue; + const int k_dim = it->dim; // 1-based + const int k = k_dim - 1; + Leaf* leafPtr = &it->tree->leaves[it->leaf_idx]; + const int leaf_min = this->n_leaves[k]; + const size_t m = leafPtr->individuals.size(); + if (m == 0) continue; + + // Build histogram for this leaf and feature k using cached working bin ids + const auto &cuts_k = (k >= 0 && k < (int)feature_cut_points_.size()) ? 
feature_cut_points_[k] : std::vector{}; + size_t Kf = cuts_k.size() + 1; if (Kf < 2) continue; // cannot split without at least 2 bins + std::vector cnt(Kf, 0); + std::vector> sum(Kf, std::vector(this->value_size, 0.0)); + const bool have_cached = ((size_t)k < tls_working_bin_id.size()); + if (have_cached) { + const std::vector &bin_k = tls_working_bin_id[(size_t)k]; + for (int ind : leafPtr->individuals) { + int b = bin_k[(size_t)ind]; + cnt[(size_t)b] += 1; + for (size_t p = 0; p < this->value_size; ++p) sum[(size_t)b][p] += Y[ind][p]; + } + } else { + for (int ind : leafPtr->individuals) { + double v = X[ind][k]; + int b = 0; + if (!cuts_k.empty()) { + auto itb = std::upper_bound(cuts_k.begin(), cuts_k.end(), v); + b = (int)std::distance(cuts_k.begin(), itb); + if (b < 0) b = 0; if ((size_t)b >= Kf) b = (int)Kf - 1; + } + cnt[(size_t)b] += 1; + for (size_t p = 0; p < this->value_size; ++p) sum[(size_t)b][p] += Y[ind][p]; + } + } + + // Build prefix across bins then sample only split_try boundaries + const int total_n = (int)m; + std::vector total_sum(this->value_size, 0.0); + for (size_t b = 0; b < Kf; ++b) { + for (size_t p = 0; p < this->value_size; ++p) total_sum[p] += sum[b][p]; + } + std::vector prefix_cnt(Kf, 0); + std::vector> prefix_sum(Kf, std::vector(this->value_size, 0.0)); + for (size_t b = 0; b < Kf; ++b) { + prefix_cnt[b] = cnt[b] + (b > 0 ? prefix_cnt[b - 1] : 0); + for (size_t p = 0; p < this->value_size; ++p) + prefix_sum[b][p] = sum[b][p] + (b > 0 ? 
prefix_sum[b - 1][p] : 0.0); + } + + // Valid boundaries are b_left in [0, Kf-2] such that both sides satisfy leaf_min + int first_valid = -1, last_valid = -1; + for (size_t b_left = 0; b_left + 1 <= Kf - 1; ++b_left) { + int ln = prefix_cnt[b_left]; + int rn = total_n - ln; + if (ln >= leaf_min && rn >= leaf_min) { + if (first_valid < 0) first_valid = (int)b_left; + last_valid = (int)b_left; + } + } + if (first_valid < 0 || last_valid < first_valid) continue; + + // Sample boundary indices within [first_valid, last_valid] + std::vector samples = this->deterministic + ? compute_even_spread_indices(first_valid, last_valid + 1, (size_t)this->split_try) + : sample_unique_ints_uniform_R(first_valid, last_valid + 1, (size_t)this->split_try); + + for (size_t si = 0; si < samples.size(); ++si) { + int b_left = samples[si]; + if (b_left < first_valid || b_left > last_valid) continue; + int left_n = prefix_cnt[(size_t)b_left]; + int right_n = total_n - left_n; + if (left_n < leaf_min || right_n < leaf_min) continue; + + double loss = 0.0; + for (size_t p = 0; p < this->value_size; ++p) { + double ls = prefix_sum[(size_t)b_left][p]; + double rs = total_sum[p] - ls; + loss -= (ls * ls) / (double)left_n; + loss -= (rs * rs) / (double)right_n; + } + if (loss < min_split.min_sum) { + min_split.min_sum = loss; + min_split.tree_index = it->tree; + min_split.leaf_index = leafPtr; + min_split.split_coordinate = k + 1; + // Map boundary index to actual split point using precomputed cuts + double sp = 0.0; + if (k >= 0 && k < (int)feature_cut_points_.size() && !feature_cut_points_[k].empty()) { + const auto &cuts = feature_cut_points_[k]; + size_t cp_idx = (size_t)std::min((size_t)b_left, cuts.size() - 1); + sp = cuts[cp_idx]; + } else { + sp = 0.5 * (leafPtr->intervals[k].first + leafPtr->intervals[k].second); + } + min_split.split_point = sp; + best_idx = (int)idx; + // Store sums for this boundary + min_split.sum_s.assign(this->value_size, 0.0); + 
min_split.sum_b.assign(this->value_size, 0.0); + for (size_t p = 0; p < this->value_size; ++p) { min_split.sum_s[p] = prefix_sum[(size_t)b_left][p]; min_split.sum_b[p] = total_sum[p] - prefix_sum[(size_t)b_left][p]; } + } + } + } + + rpf_utils::age_pool_by_sample(sample_idxs, best_idx, possible_splits); + finalize_split_from_sums(min_split, X, this->value_size); + return min_split; +} + + diff --git a/src/lib/splits_leaves.cpp b/src/lib/splits_leaves.cpp new file mode 100644 index 0000000..75771ac --- /dev/null +++ b/src/lib/splits_leaves.cpp @@ -0,0 +1,105 @@ +// Split-mode: leaves. Evaluates per-leaf candidate splits using cached +// per-leaf orders and prefix sums, with age-weighted candidate sampling. +#include "rpf.hpp" +#include "internal_utils.hpp" + +using namespace rpf_utils; + +Split RandomPlantedForest::calcOptimalSplit_leaves(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family) +{ + Split curr_split, min_split; + min_split.min_sum = std::numeric_limits::infinity(); + curr_split.Y = &Y; + + if (possible_splits.empty()) return min_split; + + unsigned int raw_candidates = static_cast(std::ceil(this->t_try * possible_splits.size())); + unsigned int upper = std::min(this->max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + std::vector weights(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) weights[i] = std::exp(-this->split_decay_rate_ * possible_splits[i].age); + std::vector sample_idxs = this->deterministic ? 
std::vector() : sample_weighted_indices_filtered(weights, n_candidates); + if (this->deterministic) { for (size_t i = 0; i < n_candidates && i < possible_splits.size(); ++i) sample_idxs.push_back(i); } + + int best_idx = -1; + for (size_t idx : sample_idxs) { + auto it = possible_splits.begin(); std::advance(it, idx); + int k = it->dim - 1; + if (!it->tree || it->leaf_idx >= it->tree->leaves.size()) continue; + Leaf* leafPtr = &it->tree->leaves[it->leaf_idx]; + + const int leaf_size = this->n_leaves[k]; + const size_t m = leafPtr->individuals.size(); + if (m == 0) continue; + // Quick infeasibility check: cannot split if fewer than 2*leaf_size individuals + if (m < static_cast(2 * leaf_size)) continue; + + std::vector order; std::vector sorted_vals; + ensure_order_and_sorted_vals_for_leaf(X, *leafPtr, k, order, sorted_vals); + std::vector unique = compute_unique_sorted_values(sorted_vals); + // Build first positions of each unique value (same length/order as `unique`) + std::vector first_pos; + first_pos.reserve(unique.size()); + if (!sorted_vals.empty()) { + first_pos.push_back(0); + for (size_t i = 1; i < sorted_vals.size(); ++i) { + if (sorted_vals[i] != sorted_vals[i - 1]) first_pos.push_back(i); + } + } + + if (unique.size() < 2 * static_cast(leaf_size)) continue; + + std::vector samples; + int left = leaf_size; int right_exclusive = (int)unique.size() - leaf_size + 1; + samples = this->deterministic ? 
compute_even_spread_indices(left, right_exclusive, (size_t)this->split_try) + : sample_unique_ints_uniform_R(left, right_exclusive, (size_t)this->split_try); + + // Build prefix sums once per candidate evaluation + std::vector> prefix; // [p][i] + std::vector total; // [p] + build_prefix_and_total_given_order(Y, *leafPtr, order, this->value_size, prefix, total); + + for (size_t si = 0; si < samples.size(); ++si) { + const size_t uidx = static_cast(samples[si]); + if (uidx >= unique.size() || uidx >= first_pos.size()) continue; + const double sp = unique[uidx]; + const size_t pos = first_pos[uidx]; + if (pos == 0 || pos >= m) continue; + if (pos < static_cast(leaf_size) || (m - pos) < static_cast(leaf_size)) continue; + + double loss = 0.0; + for (size_t p = 0; p < this->value_size; ++p) { + const double sum_s_base = prefix[p][pos - 1]; + const double sum_b_base = total[p] - sum_s_base; + loss -= (sum_s_base * sum_s_base) / static_cast(pos); + loss -= (sum_b_base * sum_b_base) / static_cast(m - pos); + } + + if (loss < min_split.min_sum) { + min_split.min_sum = loss; + min_split.tree_index = it->tree; + min_split.leaf_index = leafPtr; + min_split.split_coordinate = k + 1; + min_split.split_point = sp; + best_idx = (int)idx; + min_split.sum_s.assign(this->value_size, 0.0); + min_split.sum_b.assign(this->value_size, 0.0); + for (size_t p = 0; p < this->value_size; ++p) { + const double sum_s_base = prefix[p][pos - 1]; + const double sum_b_base = total[p] - sum_s_base; + min_split.sum_s[p] = sum_s_base; + min_split.sum_b[p] = sum_b_base; + } + } + } + } + + rpf_utils::age_pool_by_sample(sample_idxs, best_idx, possible_splits); + finalize_split_from_sums(min_split, X, this->value_size); + return min_split; +} + + diff --git a/src/lib/splits_res_trees.cpp b/src/lib/splits_res_trees.cpp new file mode 100644 index 0000000..3c11180 --- /dev/null +++ b/src/lib/splits_res_trees.cpp @@ -0,0 +1,258 @@ +// Split-mode: res_trees. 
Operates on the pool of resulting trees constructed +// by expanding dimension sets, evaluating one threshold per leaf via prefix sums. +#include "rpf.hpp" +#include "internal_utils.hpp" + +using namespace rpf_utils; + +Split RandomPlantedForest::calcOptimalSplit_resTrees(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_trees, + TreeFamily &curr_family) +{ + Split curr_split, min_split; min_split.min_sum = std::numeric_limits::infinity(); curr_split.Y = &Y; + + if (possible_trees.empty()) return min_split; + unsigned int raw_candidates = (unsigned int)std::ceil(this->t_try * possible_trees.size()); + unsigned int upper = std::min((unsigned int)this->max_candidates_, (unsigned int)possible_trees.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + + std::vector weights(possible_trees.size()); + for (size_t i = 0; i < possible_trees.size(); ++i) weights[i] = std::exp(-this->split_decay_rate_ * possible_trees[i].age); + std::vector sample_idxs; sample_idxs.reserve(n_candidates); + if (!this->deterministic) { + std::vector pos_idx; pos_idx.reserve(possible_trees.size()); + std::vector pos_w; pos_w.reserve(possible_trees.size()); + for (size_t i = 0; i < weights.size(); ++i) if (weights[i] > 0.0) { pos_idx.push_back(i); pos_w.push_back(weights[i]); } + const size_t P = pos_idx.size(); + if (P == 0) { + std::vector all(possible_trees.size()); std::iota(all.begin(), all.end(), 0); + size_t k = std::min(n_candidates, all.size()); + for (size_t i = 0; i < k; ++i) { size_t j = i + static_cast(rng_runif01() * (double)(all.size() - i)); if (j >= all.size()) j = all.size() - 1; std::swap(all[i], all[j]); } + for (size_t i = 0; i < k; ++i) sample_idxs.push_back(all[i]); + } else { + size_t k = std::min(n_candidates, P); + std::vector> keys; keys.reserve(P); + for (size_t i = 0; i < P; ++i) { double u = rng_runif01(); if (u <= 0.0) u = std::numeric_limits::min(); double key = -std::log(u) / pos_w[i]; keys.emplace_back(key, 
pos_idx[i]); } + if (k < keys.size()) { std::nth_element(keys.begin(), keys.begin() + k, keys.end(), [](const auto& a, const auto& b){ return a.first < b.first; }); keys.resize(k); } + for (auto &kv : keys) sample_idxs.push_back(kv.second); + } + } else { for (size_t i=0;i= possible_trees.size()) continue; + auto &cand = possible_trees[idx]; + auto treePtr = cand.tree; if (!treePtr) continue; + + // Ensure per-tree, per-dimension weight caches (Fenwick + totals) like cur_trees_1 + auto ensure_weights_cache = [&](const std::shared_ptr& tree, int kdim){ + if (!tree) return; + // Lazy-size vectors to feature_size once + if ((int)tree->weights_epoch_by_dim_v.size() < this->feature_size) { + tree->weights_epoch_by_dim_v.assign((size_t)this->feature_size, -1); + tree->fenwick_by_dim_v.assign((size_t)this->feature_size, std::vector()); + tree->leaf_weights_by_dim_v.assign((size_t)this->feature_size, std::vector()); + tree->weights_total_by_dim_v.assign((size_t)this->feature_size, 0.0); + } + bool need = true; + if (tree->weights_epoch_by_dim_v[(size_t)kdim] == tree->weights_epoch) { + if (tree->fenwick_by_dim_v[(size_t)kdim].size() == tree->leaves.size()) need = false; + } + if (!need) return; + const int k = kdim; const size_t L = tree->leaves.size(); + std::vector bit(L, 0.0), wts(L, 0.0); + double total = 0.0; + const int leaf_size_local = this->n_leaves[k]; + for (size_t li = 0; li < L; ++li) { + auto &leaf = tree->leaves[li]; + size_t unique_count = 0; + auto it_uc = leaf.unique_count_cache.find(kdim); + if (it_uc != leaf.unique_count_cache.end()) { + unique_count = it_uc->second; + } else { + std::vector order_cf; std::vector sorted_vals_cf; + ensure_order_and_sorted_vals_for_leaf(X, leaf, k, order_cf, sorted_vals_cf); + if (!sorted_vals_cf.empty()) { + unique_count = 1; + for (size_t i = 1; i < sorted_vals_cf.size(); ++i) + if (sorted_vals_cf[i] != sorted_vals_cf[i - 1]) ++unique_count; + } + leaf.unique_count_cache[kdim] = unique_count; + } + const long 
width_unique = (long)unique_count - 2L * (long)leaf_size_local; + const double w = (width_unique > 0L) ? static_cast(width_unique) : 0.0; + wts[li] = w; total += w; if (w != 0.0) rpf_utils::fenwick_add(bit, li + 1, w); + } + tree->fenwick_by_dim_v[(size_t)kdim] = std::move(bit); + tree->leaf_weights_by_dim_v[(size_t)kdim] = std::move(wts); + tree->weights_total_by_dim_v[(size_t)kdim] = total; + tree->weights_epoch_by_dim_v[(size_t)kdim] = tree->weights_epoch; + }; + + // Per-candidate local state for leaves and dimensions used + struct LeafDimState { + bool initialized=false; int left=0; int right=0; size_t used_count=0; std::vector used_flags; + std::vector order_cf; std::vector sorted_vals; std::vector* unique_ptr=nullptr; std::vector> prefix_cf; std::vector total_cf; + }; + // Keyed by Leaf* then by k (dimension index) + std::unordered_map> local_states; + + // Buckets over (kdim, src_tree) with lazy local mutable copies of weights + struct Bucket { int kdim; std::shared_ptr tree; const std::vector* bit_src=nullptr; const std::vector* wts_src=nullptr; std::vector bit; std::vector wts; bool has_local=false; double total=0.0; }; + std::vector buckets; + size_t grand_total_remaining = 0; + + for (int kdim : treePtr->split_dims) { + if (kdim == 0) continue; const int k = kdim - 1; const int leaf_size = this->n_leaves[k]; + + std::vector> sources; sources.reserve(2); + std::set S = treePtr->split_dims; S.erase(kdim); + if (S.empty()) { if (auto itZero = curr_family.find(std::set{0}); itZero != curr_family.end()) sources.push_back(itZero->second); } + else { if (auto itS = curr_family.find(S); itS != curr_family.end()) sources.push_back(itS->second); } + if (auto itD = curr_family.find(treePtr->split_dims); itD != curr_family.end()) if (sources.empty() || sources.back().get() != itD->second.get()) sources.push_back(itD->second); + + for (const auto &src_tree : sources) { + if (!src_tree || src_tree->leaves.empty()) continue; + ensure_weights_cache(src_tree, k); + 
double tot = src_tree->weights_total_by_dim_v[(size_t)k]; + if (tot <= 0.0) continue; + Bucket b; b.kdim = kdim; b.tree = src_tree; b.total = tot; + b.bit_src = &src_tree->fenwick_by_dim_v[(size_t)k]; + b.wts_src = &src_tree->leaf_weights_by_dim_v[(size_t)k]; + buckets.push_back(std::move(b)); + grand_total_remaining += (size_t)std::llround(std::max(0.0, tot)); + (void)leaf_size; // silence unused if compiled with warnings + } + } + if (buckets.empty() || grand_total_remaining == 0) continue; + + // Fenwick over bucket totals for O(log B) selection and updates + std::vector bucket_bit(buckets.size(), 0.0); + for (size_t i=0;i 0.0) rpf_utils::fenwick_add(bucket_bit, i+1, buckets[i].total); + auto fenwick_prefix_sum = [&](const std::vector& bit, size_t idx1)->double { double s=0.0; while (idx1>0) { s += bit[idx1-1]; idx1 -= idx1 & (~idx1 + 1); } return s; }; + + const double total_all0 = std::accumulate(buckets.begin(), buckets.end(), 0.0, [](double s, const Bucket& b){ return s + std::max(0.0, b.total); }); + double bucket_total_all = total_all0; + size_t draws = std::min((size_t)this->split_try, (size_t)std::llround(total_all0)); + + for (size_t t=0; tdeterministic) { + if (total_all0 <= 0.0) break; + double step = total_all0 / (double)draws; + double target = step * (t + 0.5); if (target >= total_all0) target = std::max(0.0, total_all0 - 1.0); + b_idx = rpf_utils::fenwick_find_by_prefix(bucket_bit, target); + if (b_idx == 0) continue; --b_idx; + } else { + if (bucket_total_all <= 0.0) break; + double r = rng_runif(0.0, bucket_total_all); + b_idx = rpf_utils::fenwick_find_by_prefix(bucket_bit, r); + if (b_idx == 0) continue; --b_idx; + } + + auto &bucket = buckets[b_idx]; + const int kdim = bucket.kdim; const int k = kdim - 1; + if (bucket.total <= 0.0) { continue; } + + // Sample a leaf in the bucket via local Fenwick + double r_leaf; + if (this->deterministic) { + double step = (total_all0 <= 0.0) ? 
0.0 : (total_all0 / (double)draws); + double target_global = step * (t + 0.5); if (target_global >= total_all0) target_global = std::max(0.0, total_all0 - 1.0); + double before = fenwick_prefix_sum(bucket_bit, b_idx); + double inside = target_global - before; if (inside < 0.0) inside = 0.0; if (inside >= bucket.total) inside = std::max(0.0, bucket.total - 1.0); + r_leaf = inside; + } else { + r_leaf = rng_runif(0.0, std::max(0.0, bucket.total)); + } + const std::vector& bit_view = bucket.has_local ? bucket.bit : *(bucket.bit_src); + size_t leaf_idx_sel = rpf_utils::fenwick_find_by_prefix(bit_view, r_leaf); + if (leaf_idx_sel == 0) continue; leaf_idx_sel -= 1; size_t wts_size = bucket.has_local ? bucket.wts.size() : (bucket.wts_src ? bucket.wts_src->size() : 0); if (leaf_idx_sel >= wts_size || leaf_idx_sel >= bucket.tree->leaves.size()) continue; + Leaf *leaf_ptr = &bucket.tree->leaves[leaf_idx_sel]; + + // Prepare local per-leaf-per-dim state lazily + auto &state = local_states[leaf_ptr][k]; + if (!state.initialized) { + // Build order and sorted values + ensure_order_and_sorted_vals_for_leaf(X, *leaf_ptr, k, state.order_cf, state.sorted_vals); + // Compute or reuse unique values and left/right bounds + size_t unique_count = 0; + if (leaf_ptr->unique_vals_cache.count(k)) { + state.unique_ptr = &leaf_ptr->unique_vals_cache[k]; + unique_count = state.unique_ptr->size(); + leaf_ptr->unique_count_cache[k] = unique_count; + } else { + auto uniques = compute_unique_sorted_values(state.sorted_vals); + unique_count = uniques.size(); + leaf_ptr->unique_count_cache[k] = unique_count; + leaf_ptr->unique_vals_cache[k] = std::move(uniques); + state.unique_ptr = &leaf_ptr->unique_vals_cache[k]; + } + const int leaf_size_here = this->n_leaves[k]; + state.left = leaf_size_here; state.right = (int)unique_count - leaf_size_here; + if (state.right < state.left) { state.left = 0; state.right = 0; } + if (state.right > state.left) state.used_flags.assign((size_t)(state.right - 
state.left), 0); + // Build prefix sums for fast evaluation + build_prefix_and_total_given_order(Y, *leaf_ptr, state.order_cf, this->value_size, state.prefix_cf, state.total_cf); + state.initialized = true; + } + const std::vector& wts_view = bucket.has_local ? bucket.wts : *(bucket.wts_src); + if (state.right <= state.left || wts_view[leaf_idx_sel] <= 0.0) { continue; } + + // Select threshold index within [left, right) avoiding repeats + int s_idx; + if (this->deterministic) { + int range = state.right - state.left; + int remaining_here = (int)wts_view[leaf_idx_sel]; + int guess = state.left + (int)(((double)state.used_count + 0.5) / ((double)remaining_here + 0.5) * range); + if (guess >= state.right) guess = state.right - 1; + int lo = guess, hi = guess; bool found = false; + while (lo >= state.left || hi < state.right) { + if (lo >= state.left && (state.used_flags.empty() || !state.used_flags[lo - state.left])) { s_idx = lo; found = true; break; } + if (hi < state.right && (state.used_flags.empty() || !state.used_flags[hi - state.left])) { s_idx = hi; found = true; break; } + --lo; ++hi; + } + if (!found) { + for (int p = state.left; p < state.right; ++p) { + if (state.used_flags.empty() || !state.used_flags[p - state.left]) { s_idx = p; break; } + } + } + } else { + do { s_idx = rng_randint(state.left, state.right); } while (!state.used_flags.empty() && state.used_flags[s_idx - state.left]); + } + if (!state.used_flags.empty()) state.used_flags[(size_t)(s_idx - state.left)] = 1; + state.used_count += 1; + + if (!bucket.has_local) { bucket.bit = *(bucket.bit_src); bucket.wts = *(bucket.wts_src); bucket.has_local = true; } + // Evaluate loss at chosen threshold + double sp = (*state.unique_ptr)[(size_t)s_idx]; + const size_t m_eval = leaf_ptr->individuals.size(); + size_t pos_in_sorted = static_cast(std::lower_bound(state.sorted_vals.begin(), state.sorted_vals.end(), sp) - state.sorted_vals.begin()); + if (pos_in_sorted == 0 || pos_in_sorted >= m_eval) { 
continue; } + + double loss = 0.0; std::vector sum_s_adj(this->value_size, 0.0), sum_b_adj(this->value_size, 0.0); + for (size_t p = 0; p < this->value_size; ++p) { + const double sum_s_base = state.prefix_cf[p][pos_in_sorted - 1]; const double sum_b_base = state.total_cf[p] - sum_s_base; + sum_s_adj[p] = sum_s_base; sum_b_adj[p] = sum_b_base; + loss -= (sum_s_adj[p] * sum_s_adj[p]) / static_cast(pos_in_sorted); + loss -= (sum_b_adj[p] * sum_b_adj[p]) / static_cast(m_eval - pos_in_sorted); + } + if (loss < min_split.min_sum) { + min_split.min_sum = loss; min_split.tree_index = bucket.tree; min_split.leaf_index = leaf_ptr; min_split.split_coordinate = kdim; min_split.split_point = sp; best_idx = (int)idx; min_split.sum_s = sum_s_adj; min_split.sum_b = sum_b_adj; + } + + // Consume one threshold from this leaf locally: update BIT, wts, totals + bucket.wts[leaf_idx_sel] -= 1.0; if (bucket.wts[leaf_idx_sel] < 0.0) bucket.wts[leaf_idx_sel] = 0.0; + rpf_utils::fenwick_add(bucket.bit, leaf_idx_sel + 1, -1.0); + bucket.total -= 1.0; if (bucket.total < 0.0) bucket.total = 0.0; + rpf_utils::fenwick_add(bucket_bit, b_idx + 1, -1.0); + bucket_total_all -= 1.0; if (bucket_total_all < 0.0) bucket_total_all = 0.0; + } + } + + rpf_utils::age_pool_by_sample(sample_idxs, best_idx, possible_trees); + finalize_split_from_sums(min_split, X, this->value_size); + return min_split; +} diff --git a/src/lib/training.cpp b/src/lib/training.cpp new file mode 100644 index 0000000..fa2898a --- /dev/null +++ b/src/lib/training.cpp @@ -0,0 +1,102 @@ +// Training orchestration split out from rpf.cpp. Builds tree families, +// manages bootstrapping and threading, and handles optional purification. 
+#include "rpf.hpp" +#include "internal_utils.hpp" + +using namespace rpf_utils; + +void RandomPlantedForest::fit() +{ + std::vector initial_individuals(sample_size); + std::iota(initial_individuals.begin(), initial_individuals.end(), 0); + + std::vector initial_intervals(feature_size); + for (int i = 0; i < feature_size; ++i) + initial_intervals[i] = Interval{lower_bounds[i], upper_bounds[i]}; + + Leaf initial_leaf; + { + initial_leaf.value = std::vector(value_size, 0); + initial_leaf.individuals = initial_individuals; + initial_leaf.intervals = initial_intervals; + } + std::vector initial_leaves{initial_leaf}; + + this->tree_families = std::vector(n_trees); + + // Generate per-tree seeds from R's RNG to ensure reproducibility across runs + // when the user sets the R seed. These seeds will be used regardless of + // threading mode. + tree_seeds_.assign((size_t)std::max(0, n_trees), 0ULL); + for (int i = 0; i < n_trees; ++i) { + // Two 32-bit chunks composed into a 64-bit seed using R's RNG + unsigned long long hi = static_cast(R::runif(0.0, 4294967296.0)); + unsigned long long lo = static_cast(R::runif(0.0, 4294967296.0)); + tree_seeds_[(size_t)i] = (hi << 32) ^ lo ^ static_cast(i); + } + + unsigned int threads_to_use = static_cast(nthreads); + if (threads_to_use == 0) threads_to_use = 1; + if (threads_to_use > 1) + { + if (threads_to_use > std::thread::hardware_concurrency()) + { + Rcout << "Requested " << threads_to_use << " threads but only " << std::thread::hardware_concurrency() << " available" << std::endl; + } + for (int start = 0; start < n_trees; start += (int)threads_to_use) + { + int batch = std::min((int)threads_to_use, n_trees - start); + if (batch <= 0) break; + std::vector threads((size_t)batch); + for (int i = 0; i < batch; ++i) + { + int tree_index = start + i; + threads[(size_t)i] = std::thread([this, &initial_leaves](int tree_index_inner){ + std::mt19937_64 rng_local; + std::mt19937_64* prev_ptr = rpf_utils::swap_tls_rng(nullptr); + if 
(!tree_seeds_.empty() && (size_t)tree_index_inner < tree_seeds_.size()) { + rng_local.seed(tree_seeds_[(size_t)tree_index_inner]); + } else { + rng_local.seed(88172645463393265ULL ^ (unsigned long long)tree_index_inner); + } + rpf_utils::swap_tls_rng(&rng_local); + this->create_tree_family(initial_leaves, (size_t)tree_index_inner); + rpf_utils::swap_tls_rng(prev_ptr); + }, tree_index); + } + for (auto &th : threads) + { + if (th.joinable()) th.join(); + } + } + } + else + { + // Single-threaded: still drive randomness from per-tree seeds + std::mt19937_64 rng_local; + std::mt19937_64* prev_ptr = rpf_utils::swap_tls_rng(nullptr); + for (int n = 0; n < n_trees; ++n) + { + if (!tree_seeds_.empty() && (size_t)n < tree_seeds_.size()) { + rng_local.seed(tree_seeds_[(size_t)n]); + } else { + rng_local.seed(88172645463393265ULL ^ (unsigned long long)n); + } + rpf_utils::swap_tls_rng(&rng_local); + create_tree_family(initial_leaves, n); + } + rpf_utils::swap_tls_rng(prev_ptr); + } + + if (purify_forest) + { + // Default: cap=0 (uncapped), nthreads=0 (auto; min(object nthreads, available)), mode=2 (fast exact) + this->purify(0, 0, 2); + } + else + { + purified = false; + } +} + + diff --git a/src/randomPlantedForest.cpp b/src/randomPlantedForest.cpp index 9fd3630..4bb7b5a 100644 --- a/src/randomPlantedForest.cpp +++ b/src/randomPlantedForest.cpp @@ -10,13 +10,13 @@ RCPP_MODULE(mod_rpf) class_("RandomPlantedForest") .constructor() .method("set_data", &RandomPlantedForest::set_data) + .method("get_parameters", &RandomPlantedForest::get_parameters) .method("cross_validation", &RandomPlantedForest::cross_validation) .method("predict_matrix", &RandomPlantedForest::predict_matrix) .method("predict_vector", &RandomPlantedForest::predict_vector) .method("MSE", &RandomPlantedForest::MSE) - .method("purify", &RandomPlantedForest::purify_3) + .method("purify_threads", static_cast(&RandomPlantedForest::purify)) .method("print", &RandomPlantedForest::print) - .method("get_parameters", 
&RandomPlantedForest::get_parameters) .method("set_parameters", &RandomPlantedForest::set_parameters) .method("get_model", &RandomPlantedForest::get_model) .method("is_purified", &RandomPlantedForest::is_purified); diff --git a/tests/testthat/test-predict-components.R b/tests/testthat/test-predict-components.R index a775f09..d0aa2a6 100644 --- a/tests/testthat/test-predict-components.R +++ b/tests/testthat/test-predict-components.R @@ -81,8 +81,8 @@ test_that(".predict_single_component is consistent with predictor order", { # Internal data preprocessing only done in predict_components to save time processed <- hardhat::forge(mtcars, rp$blueprint) - new_data <- preprocess_predictors_predict(rp, processed$predictors) - + new_data <- randomPlantedForest::preprocess_predictors_predict(rp, processed$predictors) + expect_equal( .predict_single_component(rp, new_data, c("cyl", "am")), .predict_single_component(rp, new_data, c("am", "cyl")) diff --git a/tests/testthat/test-purify-modes-equivalence.R b/tests/testthat/test-purify-modes-equivalence.R new file mode 100644 index 0000000..33f858c --- /dev/null +++ b/tests/testthat/test-purify-modes-equivalence.R @@ -0,0 +1,49 @@ +set.seed(2025) + +test_that("single component predictions match across purify modes (non-capped)", { + rp1 <- rpf(mpg ~ cyl + disp + hp + wt, data = mtcars, + max_interaction = 3, ntrees = 30, deterministic = TRUE) + rp2 <- rpf(mpg ~ cyl + disp + hp + wt, data = mtcars, + max_interaction = 3, ntrees = 30, deterministic = TRUE) + + expect_false(is_purified(rp1)) + expect_false(is_purified(rp2)) + + purify(rp1, mode = 1L) + purify(rp2, mode = 2L) + + expect_true(is_purified(rp1)) + expect_true(is_purified(rp2)) + + m1 <- predict_components(rp1, mtcars) + m2 <- predict_components(rp2, mtcars) + + expect_equal(colnames(m1$m), colnames(m2$m)) + expect_equal(as.matrix(m1$m), as.matrix(m2$m), tolerance = 1e-8) + expect_equal(m1$intercept, m2$intercept, tolerance = 1e-10) +}) + +test_that("single component 
predictions match across purify modes (capped)", { + rp1 <- rpf(mpg ~ cyl + disp + hp + wt, data = mtcars, + max_interaction = 3, ntrees = 30, deterministic = TRUE) + rp2 <- rpf(mpg ~ cyl + disp + hp + wt, data = mtcars, + max_interaction = 3, ntrees = 30, deterministic = TRUE) + + expect_false(is_purified(rp1)) + expect_false(is_purified(rp2)) + + purify(rp1, maxp_interaction = 2L, mode = 1L) + purify(rp2, maxp_interaction = 2L, mode = 2L) + + expect_true(is_purified(rp1)) + expect_true(is_purified(rp2)) + + m1 <- predict_components(rp1, mtcars, max_interaction = 2L) + m2 <- predict_components(rp2, mtcars, max_interaction = 2L) + + expect_equal(colnames(m1$m), colnames(m2$m)) + expect_equal(as.matrix(m1$m), as.matrix(m2$m), tolerance = 1e-8) + expect_equal(m1$intercept, m2$intercept, tolerance = 1e-10) +}) + + diff --git a/tests/testthat/test-purify.R b/tests/testthat/test-purify.R index f305e16..146ad19 100644 --- a/tests/testthat/test-purify.R +++ b/tests/testthat/test-purify.R @@ -31,7 +31,7 @@ test_that("purification does not alter predictions (null effect)", { pred_post <- predict(bin_fit, new_data = xdat, type = "numeric") - expect_equal(pred_pre, pred_post, tolerance = 1e-14) + expect_equal(pred_pre, pred_post, tolerance = 1e-10) }) test_that("purification does not alter predictions (with effect)", { @@ -52,5 +52,5 @@ test_that("purification does not alter predictions (with effect)", { pred_post <- predict(rpfit, test) - expect_equal(pred_pre, pred_post, tolerance = 1e-15) + expect_equal(pred_pre, pred_post, tolerance = 1e-10) })