diff --git a/.Rbuildignore b/.Rbuildignore index eb4b7ca..22557f8 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -17,3 +17,7 @@ ^doc$ ^Meta$ ^.vscode$ +^\.DS_Store$ +^SomeFile\.diff$ +^src/.*\.(o|so|a)$ +^src/lib/.*\.(o|so|a)$ diff --git a/NAMESPACE b/NAMESPACE index b14b57c..dbf99cd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ S3method(rpf,recipe) S3method(str,rpf_forest) export(is_purified) export(predict_components) +export(preprocess_predictors_predict) export(purify) export(rpf) import(checkmate) diff --git a/R/predict_components.R b/R/predict_components.R index ba07a5f..a0d5161 100644 --- a/R/predict_components.R +++ b/R/predict_components.R @@ -87,7 +87,12 @@ predict_components <- function(object, new_data, max_interaction = NULL, predict } # Check if forest is purified, if not we do that now - if (!is_purified(object)) purify(object) + if (!is_purified(object)) { + # Purify using default policy: mode=2 (fast exact), + # maxp_interaction=0 (uncapped), + # nthreads defaults to min(training nthreads, available cores) + object$fit$purify_threads(0L, 0L, 2L) + } # If max_interaction is greater than number of predictors requested we need to adjust that max_interaction <- min(max_interaction, length(predictors)) diff --git a/R/predict_rpf.R b/R/predict_rpf.R index 35dfe06..b14faed 100644 --- a/R/predict_rpf.R +++ b/R/predict_rpf.R @@ -108,24 +108,27 @@ predict_rpf_prob <- function(object, new_data, ...) 
{ pred_prob <- 1 / (1 + exp(-pred_raw)) } else if (object$params$loss %in% c("L1", "L2")) { # Truncate probabilities at [0,1] for L1/L2 loss - pred_prob <- apply(pred_raw, 2, function(col) pmax(0, pmin(1, col))) + pred_prob <- pmax(0, pmin(1, pred_raw)) } - # Binary classif yields n x 1 prediction matrix, append complementary class prob + # Ensure a plain numeric vector in binary case + pred_prob <- as.numeric(pred_prob) + # Binary classif yields two columns ordered by outcome levels pred_prob <- cbind(1 - pred_prob, pred_prob) } else { # Multiclass if (object$params$loss %in% c("logit", "exponential")) { - # FIXME: - # softmax() defined in utils.R, should be identical to logit^-1 for - # binary case but not properly tested yet + # softmax for multi-class pred_prob <- softmax(pred_raw) } else if (object$params$loss %in% c("L1", "L2")) { - # Truncate probabilities at [0,1] for L1/L2 loss - pred_prob <- apply(pred_raw, 2, function(col) pmax(0, pmin(1, col))) - # Normalise such that sum of class probs is always 1 - pred_prob <- pred_prob/rowSums(pred_prob) + # Clamp to [0,1] and renormalize rows + pred_prob <- pmin(1, pmax(0, pred_raw)) + # pmin/pmax drop dimensions; restore matrix shape explicitly + dim(pred_prob) <- dim(pred_raw) + rs <- rowSums(pred_prob) + rs[!is.finite(rs) | rs <= 0] <- 1 + pred_prob <- pred_prob / rs } } @@ -140,7 +143,7 @@ predict_rpf_class <- function(object, new_data, ...) { pred_prob <- predict_rpf_prob(object, new_data, 0, ...) # For each instance, class with higher probability - pred_class <- factor(outcome_levels[max.col(pred_prob)], levels = outcome_levels) + pred_class <- factor(outcome_levels[max.col(as.matrix(pred_prob))], levels = outcome_levels) out <- hardhat::spruce_class(pred_class) out diff --git a/R/purify.R b/R/purify.R index 40baa39..62ca7e7 100644 --- a/R/purify.R +++ b/R/purify.R @@ -1,6 +1,6 @@ #' Purify a Random Planted Forest #' -#' TODO: Explain what this does +#' Purifies an rpf object. 
#' #' Unless [`rpf()`] is called with `purify = TRUE`, the forest has to be purified after fit #' to ensure the components extracted by [`predict_components()`] are valid. @@ -28,11 +28,28 @@ purify.default <- function(x, ...) { ) } +#' @param maxp_interaction integer or NULL: Only compute/store purified components +#' up to this interaction order. Higher-order purified trees are zeroed (not +#' computed) but still implicitly influence lower orders during purification. +#' If NULL, purify all orders (default behavior). +#' @param mode integer(1): Purification algorithm mode. 1 = legacy grid path +#' used by `fit$fit$purify()`; 2 = fast exact KD-tree based path. Defaults to 2. +#' @param nthreads integer or NULL: number of threads to use. If NULL, defaults +#' to min of the object's configured `nthreads` and available threads. #' @export #' @rdname purify #' @importFrom utils capture.output -purify.rpf <- function(x, ...) { - x$fit$purify() +purify.rpf <- function(x, ..., maxp_interaction = NULL, mode = 2L, nthreads = NULL) { + checkmate::assert_class(x, "rpf") + checkmate::assert_int(mode, lower = 1, upper = 2) + if (!is.null(nthreads)) checkmate::assert_int(nthreads, lower = 1) + if (is.null(maxp_interaction)) { + # Default: exact cut points, full interaction order + x$fit$purify_threads(0L, as.integer(if (is.null(nthreads)) 0L else nthreads), as.integer(mode)) + } else { + checkmate::assert_int(maxp_interaction, lower = 1) + x$fit$purify_threads(as.integer(maxp_interaction), as.integer(if (is.null(nthreads)) 0L else nthreads), as.integer(mode)) + } x } @@ -43,3 +60,5 @@ is_purified <- function(x) { checkmate::assert_class(x, "rpf") x$fit$is_purified() } + + diff --git a/R/rpf.R b/R/rpf.R index a331656..6a752dc 100644 --- a/R/rpf.R +++ b/R/rpf.R @@ -15,6 +15,10 @@ #' @param split_try `[10]`: Number of split points to be considered when choosing a split candidate. 
#' @param t_try `[0.4]`: A value in (0,1] specifying the proportion of viable split-candidates in each round. #' @param deterministic `[FALSE]`: Choose whether approach deterministic or random. +#' @param split_decay_rate `[0.1]`: Exponential decay factor for aging split-candidates. Possible splits are initiated with age=0. Whenever a possible split becomes a split_candidate (i.e. it has been drawn when drawing max(max_candidates , t_try * possible options ) times) it ages by +1. The age of the single split-candidate with minimal loss is reset to zero. Split_candidates are sampled from Possible_splits with weight exp(-split_decay_rate_ * age). A high split_decay_rate means faster aging. split_decay_rate=0 results in no aging and uniform sampling. +#' @param max_candidates `[50]`: Maximum number of split-candidates sampled per iteration. Number of split_candidates in each round is given by max(max_candidates , t_try * possible options). +#' @param delete_leaves `[TRUE]`: Whether to delete a parent leaf when splitting along an existing dimension. +#' @param split_structure `["leaves"]`: Defines the structure of a possible split and how to choose split_candidates. Can be one of "leaves", "hist", "cur_trees_1", "cur_trees_2", or "res_trees". Further details are given below. #' @param nthreads `[1L]`: Number of threads used for computation, defaulting to serial execution. #' @param purify `[FALSE]`: Whether the forest should be purified. #' Set to `TRUE` to enable components extract with [`predict_components()`] are valid. @@ -29,6 +33,9 @@ #' @param epsilon `[0.1]`: Only used if loss = `"logit"` or `"exponential"`. #' Proportion of class membership is truncated to be smaller 1-epsilon when calculating #' the fit in a leaf. +#' @param split_decay_rate `[0.1]`: Exponential decay factor lambda for aging split-candidates. A candidate's weight is `exp(-lambda * age)`.
+#' @param max_candidates `[50]`: Maximum number of split-candidates to sample at each node (will be clamped to `[1, #possible_splits]`). +#' @param delete_leaves `[TRUE]`: Whether to delete a parent leaf when splitting along an existing dimension. +#' @param ... (Unused). #' #' @return Object of class `"rpf"` with model object contained in `$fit`. @@ -39,6 +46,37 @@ #' @importFrom hardhat default_formula_blueprint #' @importFrom hardhat default_recipe_blueprint #' +#' @details +#' \subsection{splits}{ +#' The number of `splits` is the main tuning parameter affecting the accuracy of predictions. +#' } +#' \subsection{split_structure}{ +#' The `split_structure` argument controls how split candidates are constructed and sampled. +#' In each round, a `t_try` fraction (capped by `max_candidates`) is drawn +#' from the pool of all possible splits with weights `exp(-split_decay_rate * age)`. +#' +#' \describe{ +#' \item{leaves}{Split candidates are (leaf, split-dimension) pairs. For each sampled +#' candidate, `split_try` thresholds are drawn uniformly from the valid range within +#' that leaf and evaluated to choose the best split.} +#' +#' \item{cur_trees_1}{Split candidates are (current-tree, split-dimension) pairs. For each +#' sampled candidate, perform `split_try` evaluations. Each evaluation samples a leaf +#' from the set of valid current trees (with probability proportional to its number of +#' available thresholds) and then uniformly samples a single threshold within that leaf.} +#' +#' \item{cur_trees_2}{Split candidates are (current-tree, split-dimension) pairs. For each +#' sampled candidate, iterate through every +#' valid leaf. Within each leaf, sample `split_try` thresholds uniformly and +#' evaluate them.} +#' +#' \item{res_trees}{Split candidates are resulting trees.
For each sampled candidate, run +#' `split_try` evaluations by sampling a (split-dimension, leaf) pair from all valid +#' pairs (with probability proportional to its number of available thresholds), then +#' uniformly sampling one threshold within that pair.} +#' } +#' } +#' #' @examples #' # Regression with x and y #' rpfit <- rpf(x = mtcars[, c("cyl", "wt")], y = mtcars$mpg) @@ -63,16 +101,21 @@ rpf.default <- function(x, ...) { #' @export #' @rdname rpf rpf.data.frame <- function(x, y, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = TRUE, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, - loss = "L2", delta = 0, epsilon = 0.1, ...) { + loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", export_forest = FALSE, ...) { + split_structure <- match.arg(split_structure, c("leaves", "res_trees", "cur_trees_2", "cur_trees_1", "hist")) blueprint <- hardhat::default_xy_blueprint(intercept = FALSE) processed <- hardhat::mold(x, y, blueprint = blueprint) rpf_bridge( processed, max_interaction, ntrees, splits, - split_try, t_try, deterministic, + split_try, t_try, split_decay_rate, max_candidates, delete_leaves, deterministic, nthreads, purify, cv, - loss, delta, epsilon + loss, delta, epsilon, + split_structure = split_structure, export_forest = export_forest ) } @@ -80,32 +123,42 @@ rpf.data.frame <- function(x, y, max_interaction = 1, ntrees = 50, splits = 30, #' @export #' @rdname rpf rpf.matrix <- function(x, y, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = TRUE, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, - loss = "L2", delta = 0, epsilon = 0.1, ...) 
{ + loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", export_forest = FALSE, ...) { + split_structure <- match.arg(split_structure, c("leaves", "res_trees", "cur_trees_2", "cur_trees_1", "hist")) blueprint <- hardhat::default_xy_blueprint(intercept = FALSE) processed <- hardhat::mold(x, y, blueprint = blueprint) rpf_bridge( processed, max_interaction, ntrees, splits, - split_try, t_try, deterministic, + split_try, t_try, split_decay_rate, max_candidates, delete_leaves, deterministic, nthreads, purify, cv, - loss, delta, epsilon + loss, delta, epsilon, + split_structure = split_structure, export_forest = export_forest )} # Formula method #' @export #' @rdname rpf rpf.formula <- function(formula, data, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = TRUE, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, - loss = "L2", delta = 0, epsilon = 0.1, ...) { + loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", export_forest = FALSE, ...) 
{ + split_structure <- match.arg(split_structure, c("leaves", "res_trees", "cur_trees_2", "cur_trees_1", "hist")) blueprint <- hardhat::default_formula_blueprint(intercept = FALSE, indicators = "none") processed <- hardhat::mold(formula, data, blueprint = blueprint) rpf_bridge( processed, max_interaction, ntrees, splits, - split_try, t_try, deterministic, + split_try, t_try, split_decay_rate, max_candidates, delete_leaves, deterministic, nthreads, purify, cv, - loss, delta, epsilon + loss, delta, epsilon, + split_structure = split_structure, export_forest = export_forest ) } @@ -113,16 +166,21 @@ rpf.formula <- function(formula, data, max_interaction = 1, ntrees = 50, splits #' @export #' @rdname rpf rpf.recipe <- function(x, data, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = TRUE, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, - loss = "L2", delta = 0, epsilon = 0.1, ...) { + loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", export_forest = FALSE, ...) 
{ + split_structure <- match.arg(split_structure, c("leaves", "res_trees", "cur_trees_2", "cur_trees_1", "hist")) blueprint <- hardhat::default_recipe_blueprint(intercept = FALSE) processed <- hardhat::mold(x, data, blueprint = blueprint) rpf_bridge( processed, max_interaction, ntrees, splits, - split_try, t_try, deterministic, + split_try, t_try, split_decay_rate, max_candidates, delete_leaves, deterministic, nthreads, purify, cv, - loss, delta, epsilon + loss, delta, epsilon, + split_structure = split_structure, export_forest = export_forest ) } @@ -131,9 +189,13 @@ rpf.recipe <- function(x, data, max_interaction = 1, ntrees = 50, splits = 30, #' @param processed Output of `hardhat::mold` from respective rpf methods #' @importFrom hardhat validate_outcomes_are_univariate rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = TRUE, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, - loss = "L2", delta = 0, epsilon = 0.1) { + loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", export_forest = FALSE) { + split_structure <- match.arg(split_structure, c("leaves", "res_trees", "cur_trees_2", "cur_trees_1", "hist")) hardhat::validate_outcomes_are_univariate(processed$outcomes) predictors <- preprocess_predictors_fit(processed) outcomes <- preprocess_outcome(processed, loss) @@ -141,7 +203,7 @@ rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, # Check arguments checkmate::assert_int(max_interaction, lower = 0) - + # rewrite max_interaction so 0 -> "maximum", e.g. 
ncol(X) if (max_interaction == 0) { max_interaction <- p @@ -156,10 +218,13 @@ rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, checkmate::assert_int(ntrees, lower = 1) checkmate::assert_int(splits, lower = 1) checkmate::assert_int(split_try, lower = 1) - + checkmate::assert_int(max_candidates, lower = 1) + checkmate::assert_number(t_try, lower = 0, upper = 1) checkmate::assert_number(delta, lower = 0, upper = 1) checkmate::assert_number(epsilon, lower = 0, upper = 1) + checkmate::assert_number(split_decay_rate, lower = 0) + # "median" loss is implemented but discarded loss_functions <- switch(outcomes$mode, @@ -172,18 +237,26 @@ rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, checkmate::assert_int(nthreads, lower = 1L) checkmate::assert_flag(purify) checkmate::assert_flag(cv) + checkmate::assert_flag(delete_leaves) + checkmate::assert_choice(split_structure, choices = c("res_trees", "cur_trees_2", "cur_trees_1", "leaves", "hist")) + fit <- rpf_impl( Y = outcomes$outcomes, X = predictors$predictors_matrix, mode = outcomes$mode, max_interaction = max_interaction, ntrees = ntrees, splits = splits, - split_try = split_try, t_try = t_try, deterministic = deterministic, + split_try = split_try, t_try = t_try, split_decay_rate = split_decay_rate, max_candidates = max_candidates, delete_leaves=delete_leaves, deterministic = deterministic, nthreads = nthreads, purify = purify, cv = cv, - loss = loss, delta = delta, epsilon = epsilon + loss = loss, delta = delta, epsilon = epsilon, + split_structure = split_structure ) - forest <- fit$get_model() - class(forest) <- "rpf_forest" + # Optionally export a compact R list representation of the forest. 
+ forest <- NULL + if (isTRUE(export_forest)) { + forest <- fit$get_model() + class(forest) <- "rpf_forest" + } new_rpf( fit = fit, @@ -195,7 +268,12 @@ rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, ntrees = ntrees, max_interaction = max_interaction, splits = splits, - split_try = split_try, t_try = t_try, + split_try = split_try, + t_try = t_try, + split_decay_rate = split_decay_rate, + max_candidates = max_candidates, + delete_leaves = delete_leaves, + split_structure = split_structure, delta = delta, epsilon = epsilon, deterministic = deterministic, nthreads = nthreads, purify = purify, cv = cv @@ -217,21 +295,31 @@ new_rpf <- function(fit, blueprint, ...) { # Main fitting function and interface to C++ implementation rpf_impl <- function(Y, X, mode = c("regression", "classification"), max_interaction = 1, ntrees = 50, splits = 30, split_try = 10, t_try = 0.4, - deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, - loss = "L2", delta = 0, epsilon = 0.1) { + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, split_decay_rate = 0.1, max_candidates = 50, delete_leaves = TRUE, + loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves") { # Final input validation, should be superfluous checkmate::assert_matrix(X, mode = "numeric", any.missing = FALSE) mode <- match.arg(mode) + split_structure <- match.arg(split_structure, c("leaves", "res_trees", "cur_trees_2", "cur_trees_1", "hist")) + # map split_structure string to numeric mode for C++ + split_mode <- switch(split_structure, + res_trees = 0L, + cur_trees_2 = 1L, + cur_trees_1 = 2L, + leaves = 3L, + hist = 4L + ) if (mode == "classification") { fit <- new(ClassificationRPF, Y, X, loss, c( max_interaction, ntrees, splits, split_try, t_try, - purify, deterministic, nthreads, cv, delta, epsilon + purify, deterministic, nthreads, cv, split_decay_rate, max_candidates, delete_leaves, split_mode, delta, epsilon )) } else if (mode == "regression") { 
fit <- new(RandomPlantedForest, Y, X, c( - max_interaction, ntrees, splits, split_try, t_try, - purify, deterministic, nthreads, cv + max_interaction, ntrees, splits, split_try, t_try, + purify, deterministic, nthreads, cv, split_decay_rate, max_candidates, delete_leaves, split_mode )) } diff --git a/R/utils.R b/R/utils.R index c04fc14..df2f979 100644 --- a/R/utils.R +++ b/R/utils.R @@ -115,8 +115,26 @@ preprocess_predictors_fit <- function(processed) { ) } -# Sort factor predictors using stored level information -# Used in predict_rpf_bridge() +#' Preprocess predictors for prediction +#' +#' Convert logical and character columns to appropriate types, re-order factor +#' levels to match the ordering learned during fitting (stored in +#' `object$factor_levels`), re-encode factor columns as integers, and return a +#' numeric matrix suitable for the underlying C++ prediction routines. +#' +#' This is primarily an internal utility used by `predict()` methods but is +#' exported to support advanced users and tests. +#' +#' @param object An object of class `rpf` returned by [`rpf()`]. +#' @param predictors A data frame or matrix of predictor values to preprocess. +#' +#' @return A numeric matrix with the same number of rows as `predictors`. 
+#' @export +#' @examples +#' rpfit <- rpf(x = mtcars[, c("cyl", "wt")], y = mtcars$mpg) +#' processed <- hardhat::forge(mtcars[, c("cyl", "wt")], rpfit$blueprint) +#' X <- preprocess_predictors_predict(rpfit, processed$predictors) +#' dim(X) preprocess_predictors_predict <- function(object, predictors) { predictors <- as.data.table(predictors) diff --git a/man/preprocess_predictors_predict.Rd b/man/preprocess_predictors_predict.Rd new file mode 100644 index 0000000..9059b94 --- /dev/null +++ b/man/preprocess_predictors_predict.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{preprocess_predictors_predict} +\alias{preprocess_predictors_predict} +\title{Preprocess predictors for prediction} +\usage{ +preprocess_predictors_predict(object, predictors) +} +\arguments{ +\item{object}{An object of class \code{rpf} returned by \code{\link[=rpf]{rpf()}}.} + +\item{predictors}{A data frame or matrix of predictor values to preprocess.} +} +\value{ +A numeric matrix with the same number of rows as \code{predictors}. +} +\description{ +Convert logical and character columns to appropriate types, re-order factor +levels to match the ordering learned during fitting (stored in +\code{object$factor_levels}), re-encode factor columns as integers, and return a +numeric matrix suitable for the underlying C++ prediction routines. +} +\details{ +This is primarily an internal utility used by \code{predict()} methods but is +exported to support advanced users and tests. +} +\examples{ +rpfit <- rpf(x = mtcars[, c("cyl", "wt")], y = mtcars$mpg) +processed <- hardhat::forge(mtcars[, c("cyl", "wt")], rpfit$blueprint) +X <- preprocess_predictors_predict(rpfit, processed$predictors) +dim(X) +} diff --git a/man/purify.Rd b/man/purify.Rd index 79adf5a..5856f5f 100644 --- a/man/purify.Rd +++ b/man/purify.Rd @@ -11,7 +11,7 @@ purify(x, ...) \method{purify}{default}(x, ...) -\method{purify}{rpf}(x, ...) 
+\method{purify}{rpf}(x, ..., maxp_interaction = NULL, mode = 2L, nthreads = NULL) is_purified(x) } @@ -19,12 +19,23 @@ is_purified(x) \item{x}{And object of class \code{rpf}.} \item{...}{(Unused)} + +\item{maxp_interaction}{integer or NULL: Only compute/store purified components +up to this interaction order. Higher-order purified trees are zeroed (not +computed) but still implicitly influence lower orders during purification. +If NULL, purify all orders (default behavior).} + +\item{mode}{integer(1): Purification algorithm mode. 1 = legacy grid path +used by \code{fit$fit$purify()}; 2 = fast exact KD-tree based path. Defaults to 2.} + +\item{nthreads}{integer or NULL: number of threads to use. If NULL, defaults +to min of the object's configured \code{nthreads} and available threads.} } \value{ Invisibly: The \code{\link{rpf}} object. } \description{ -TODO: Explain what this does +Purifies an rpf object. } \details{ Unless \code{\link[=rpf]{rpf()}} is called with \code{purify = TRUE}, the forest has to be purified after fit diff --git a/man/rpf.Rd b/man/rpf.Rd index 0e323d8..14b4c98 100644 --- a/man/rpf.Rd +++ b/man/rpf.Rd @@ -18,6 +18,9 @@ rpf(x, ...) splits = 30, split_try = 10, t_try = 0.4, + split_decay_rate = 0.1, + max_candidates = 50, + delete_leaves = TRUE, deterministic = FALSE, nthreads = 1, purify = FALSE, @@ -25,6 +28,7 @@ rpf(x, ...) loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", ... ) @@ -36,6 +40,9 @@ rpf(x, ...) splits = 30, split_try = 10, t_try = 0.4, + split_decay_rate = 0.1, + max_candidates = 50, + delete_leaves = TRUE, deterministic = FALSE, nthreads = 1, purify = FALSE, @@ -43,6 +50,7 @@ rpf(x, ...) loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", ... ) @@ -54,6 +62,9 @@ rpf(x, ...) splits = 30, split_try = 10, t_try = 0.4, + split_decay_rate = 0.1, + max_candidates = 50, + delete_leaves = TRUE, deterministic = FALSE, nthreads = 1, purify = FALSE, @@ -61,6 +72,7 @@ rpf(x, ...) 
loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", ... ) @@ -72,6 +84,9 @@ rpf(x, ...) splits = 30, split_try = 10, t_try = 0.4, + split_decay_rate = 0.1, + max_candidates = 50, + delete_leaves = TRUE, deterministic = FALSE, nthreads = 1, purify = FALSE, @@ -79,6 +94,7 @@ rpf(x, ...) loss = "L2", delta = 0, epsilon = 0.1, + split_structure = "leaves", ... ) } @@ -105,6 +121,12 @@ this is equivalent to setting \code{max_interaction = 10}.} \item{t_try}{\verb{[0.4]}: A value in (0,1] specifying the proportion of viable split-candidates in each round.} +\item{split_decay_rate}{\verb{[0.1]}: Exponential decay factor λ for aging split-candidates. A candidate’s weight is \verb{exp(−λ * age)}.} + +\item{max_candidates}{\verb{[50]}: Maximum number of split‐candidates to sample at each node (will be clamped to \verb{[1, #possible_splits]}).} + +\item{delete_leaves}{\verb{[1]}: Whether parents should be deleted if split is an existing coordinate} + \item{deterministic}{\verb{[FALSE]}: Choose whether approach deterministic or random.} \item{nthreads}{\verb{[1L]}: Number of threads used for computation, defaulting to serial execution.} @@ -127,6 +149,8 @@ the loss to determine the optimal split.} Proportion of class membership is truncated to be smaller 1-epsilon when calculating the fit in a leaf.} +\item{split_structure}{\verb{["leaves"]}: Defines the structure of a possible split and how to choose split_candidates. Can be one of "leaves", "hist", "cur_trees_1", "cur_trees_2", or "res_trees". Further details are given below.} + \item{formula}{Formula specification, e.g. y ~ x1 + x2.} } \value{ @@ -135,6 +159,37 @@ Object of class \code{"rpf"} with model object contained in \verb{$fit}. \description{ Random Planted Forest } +\details{ +\subsection{splits}{ +The number of \code{splits} is the main tuning parameter affecting the accuracy of predictions. 
+} +\subsection{split_structure}{ +The \code{split_structure} argument controls how split candidates are constructed and sampled. +In each round, a \code{t_try} fraction (capped by \code{max_candidates}) is drawn +from the pool of all possible splits with weights \code{exp(-split_decay_rate * age)}. + +\describe{ +\item{leaves}{Split candidates are (leaf, split-dimension) pairs. For each sampled +candidate, \code{split_try} thresholds are drawn uniformly from the valid range within +that leaf and evaluated to choose the best split.} + +\item{cur_trees_1}{Split candidates are (current-tree, split-dimension) pairs. For each +sampled candidate, perform \code{split_try} evaluations. Each evaluation samples a leaf +from the set of valid current trees (with probability proportional to its number of +available thresholds) and then uniformly samples a single threshold within that leaf.} + +\item{cur_trees_2}{Split candidates are (current-tree, split-dimension) pairs. For each +sampled candidate, iterate through every +valid leaf. Within each leaf, sample \code{split_try} thresholds uniformly and +evaluate them.} + +\item{res_trees}{Split candidates are resulting trees. 
For each sampled candidate, run +\code{split_try} evaluations by sampling a (split-dimension, leaf) pair from all valid +pairs (with probability proportional to its number of available thresholds), then +uniformly sampling one threshold within that pair.} +} +} +} \examples{ # Regression with x and y rpfit <- rpf(x = mtcars[, c("cyl", "wt")], y = mtcars$mpg) diff --git a/src/Makevars b/src/Makevars index 71ae982..ffe2fba 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,8 +1,9 @@ -SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp randomPlantedForest.cpp RcppExports.cpp +SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp lib/internal_utils.cpp lib/splits_leaves.cpp lib/splits_cur_trees_2.cpp lib/splits_cur_trees_1.cpp lib/splits_res_trees.cpp lib/splits_hist.cpp lib/predict.cpp lib/training.cpp lib/purify.cpp lib/losses_l1_l2_median.cpp lib/losses_logit.cpp lib/losses_exponential.cpp randomPlantedForest.cpp RcppExports.cpp OBJECTS = $(SOURCES:.cpp=.o) PKG_CPPFLAGS=-I./include -I./lib +PKG_CXXFLAGS = -DNDEBUG all: $(SHLIB) diff --git a/src/include/cpf.hpp b/src/include/cpf.hpp index 1c913ce..f993f95 100644 --- a/src/include/cpf.hpp +++ b/src/include/cpf.hpp @@ -9,7 +9,7 @@ class ClassificationRPF : public RandomPlantedForest public: using RandomPlantedForest::calcOptimalSplit; ClassificationRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, - const String loss = "L2", const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0, 0, 0.1}); + const String loss = "L2", const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0, 0.1, 0, 0.1, 50,1}); void set_parameters(StringVector keys, NumericVector values); ~ClassificationRPF(){}; @@ -33,9 +33,44 @@ class ClassificationRPF : public RandomPlantedForest void (ClassificationRPF::*calcLoss)(Split &); void create_tree_family(std::vector initial_leaves, size_t n) override; void fit() override; - Split calcOptimalSplit(const std::vector> &Y, const 
std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family, - std::vector> &weights); + Split calcOptimalSplit( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_splits, + TreeFamily& curr_family, + std::vector>& weights) ; + // Mode-specific split calculators (classification versions using calcLoss and weights) + Split calcOptimalSplit_leaves( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_splits, + TreeFamily& curr_family, + std::vector>& weights); + Split calcOptimalSplit_curTrees1( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_splits, + TreeFamily& curr_family, + std::vector>& weights); + Split calcOptimalSplit_curTrees2( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_splits, + TreeFamily& curr_family, + std::vector>& weights); + // Mode 4: histogram-binned (classification variant) + Split calcOptimalSplit_hist( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_splits, + TreeFamily& curr_family, + std::vector>& weights); + Split calcOptimalSplit_resTrees( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_trees, + TreeFamily& curr_family, + std::vector>& weights); void L1_loss(Split &split); void median_loss(Split &split); void logit_loss(Split &split); @@ -47,4 +82,4 @@ class ClassificationRPF : public RandomPlantedForest void exponential_loss_3(Split &split); }; -#endif \ No newline at end of file +#endif diff --git a/src/include/diffbuf.hpp b/src/include/diffbuf.hpp new file mode 100644 index 0000000..c16fabc --- /dev/null +++ b/src/include/diffbuf.hpp @@ -0,0 +1,93 @@ +#ifndef RPF_DIFFBUF_HPP +#define RPF_DIFFBUF_HPP + +#include +#include +#include + +// N-D difference buffer for axis-aligned rectangular range updates +// and reconstruction via inclusive prefix scans along each dimension. 
+ +namespace rpf_diff +{ + template + struct NDArray + { + std::vector dims; // logical dimensions + std::vector data; // flat row-major data + + NDArray() {} + explicit NDArray(const std::vector &d, const T &init = T()) : dims(d) + { + size_t n = 1; for (int v : d) n *= (size_t)v; data.assign(n, init); + } + + inline size_t offset(const std::vector &idx) const + { + size_t off = 0; size_t stride = 1; + for (size_t k = 0; k < dims.size(); ++k) + { + off += (size_t)idx[k] * stride; stride *= (size_t)dims[k]; + } + return off; + } + + inline T &at(const std::vector &idx) { return data[offset(idx)]; } + inline const T &at(const std::vector &idx) const { return data[offset(idx)]; } + }; + + // Apply a constant add v onto a closed-open hyper-rectangle [lo, hi) via difference corners + template + void add_rect(NDArray &diff, const std::vector &lo, const std::vector &hi, const T &v) + { + const size_t d = diff.dims.size(); + // iterate over 2^d corners + std::vector corner(d, 0); + for (;;) { + int flips = 0; for (size_t k = 0; k < d; ++k) if (corner[k]) ++flips; + T sign = (flips % 2 == 0) ? v : (v * (-1)); + std::vector idx(d); + for (size_t k = 0; k < d; ++k) idx[k] = corner[k] ? 
hi[k] : lo[k]; + diff.at(idx) += sign; + size_t pos = 0; + while (pos < d) { if (corner[pos] == 0) { corner[pos] = 1; break; } corner[pos] = 0; ++pos; } + if (pos == d) break; + } + } + + // Inclusive prefix scan along each dimension in-place (converts diff -> values) + template + void inclusive_scan_inplace(NDArray &arr) + { + const size_t d = arr.dims.size(); + if (d == 0) return; + for (size_t axis = 0; axis < d; ++axis) + { + // number of slabs orthogonal to axis + size_t nslab = 1; for (size_t k = 0; k < d; ++k) if (k != axis) nslab *= (size_t)arr.dims[k]; + std::vector slab_idx(d, 0); + for (size_t s = 0; s < nslab; ++s) + { + // decode slab index into coordinates for all dims except axis + size_t tmp = s; + for (size_t k = 0; k < d; ++k) + { + if (k == axis) continue; + slab_idx[k] = (int)(tmp % (size_t)arr.dims[k]); + tmp /= (size_t)arr.dims[k]; + } + // inclusive scan along axis + std::vector run = slab_idx; run[axis] = 0; + T acc = arr.at(run); acc -= acc; // zero of correct shape + for (int t = 0; t < arr.dims[axis]; ++t) + { + run[axis] = t; acc += arr.at(run); arr.at(run) = acc; + } + } + } + } +} + +#endif // RPF_DIFFBUF_HPP + + diff --git a/src/include/grid.hpp b/src/include/grid.hpp index 4b21af5..e7653da 100644 --- a/src/include/grid.hpp +++ b/src/include/grid.hpp @@ -39,4 +39,4 @@ namespace grid }; }; -#endif \ No newline at end of file +#endif diff --git a/src/include/internal_utils.hpp b/src/include/internal_utils.hpp new file mode 100644 index 0000000..e1ce50c --- /dev/null +++ b/src/include/internal_utils.hpp @@ -0,0 +1,79 @@ +// Internal utility helpers extracted from rpf.cpp to declutter large files. +// Kept minimal and header-only where templating is required. 
+ +#ifndef INTERNAL_UTILS_HPP +#define INTERNAL_UTILS_HPP + +#include +#include +#include +#include +#include +#include +#include + +#include "trees.hpp" + +namespace rpf_utils { + +// RNG helpers +double rng_runif01(); +double rng_runif(double a, double b); +int rng_randint(int left_inclusive, int right_exclusive); +// Swap the thread-local RNG pointer; returns previous pointer +std::mt19937_64* swap_tls_rng(std::mt19937_64* new_ptr); + +// Leaf/order/prefix helpers +void ensure_order_and_sorted_vals_for_leaf( + const std::vector> &X, + Leaf &leaf, + int k, + std::vector &order_out, + std::vector &sorted_vals_out); + +std::vector compute_unique_sorted_values(const std::vector &sorted_vals); + +void build_prefix_and_total_given_order( + const std::vector> &Y, + const Leaf &leaf, + const std::vector &order, + size_t value_size, + std::vector> &prefix_out, + std::vector &total_out); + +void finalize_split_from_sums( + Split &winner, + const std::vector> &X, + size_t value_size); + +// Sampling helpers +std::vector sample_weighted_indices_filtered( + const std::vector &weights, + size_t n_candidates); + +std::vector compute_even_spread_indices(int left_inclusive, int right_exclusive, size_t max_draws); +std::vector sample_unique_ints_uniform_R(int left_inclusive, int right_exclusive, size_t k); + +// Fenwick helpers used by cur_trees_1 sampling cache +void fenwick_add(std::vector &bit, size_t idx1, double delta); +size_t fenwick_find_by_prefix(const std::vector &bit, double target); + +// Aging helper must be header (templated) +template +inline void age_pool_by_sample(const std::vector &sample_idxs, int best_idx, std::vector &pool) +{ + for (size_t idx : sample_idxs) { + if (static_cast(idx) != best_idx) pool[idx].age += 1.0; else pool[idx].age = 0.0; + } +} + +} // namespace rpf_utils + +// Thread-local working-set bin cache used by histogram split mode (mode 4). 
+// Declared here so multiple translation units (e.g., rpf.cpp and splits_hist.cpp) +// can share the same cache during a tree-family build. +extern thread_local std::vector> tls_working_bin_id; + +#endif // INTERNAL_UTILS_HPP + + diff --git a/src/include/kdtree.hpp b/src/include/kdtree.hpp new file mode 100644 index 0000000..3029998 --- /dev/null +++ b/src/include/kdtree.hpp @@ -0,0 +1,178 @@ +#ifndef RPF_KDTREE_HPP +#define RPF_KDTREE_HPP + +#include +#include +#include +#include + +// Lightweight KD-tree for orthogonal range counts. +// - Header-only to avoid build system changes +// - Supports arbitrary dimensionality +// - Query provides constraints only for a subset of dimensions; others are unconstrained + +namespace rpf_kd +{ + struct RangeConstraint + { + int dim; // 0-based feature index + double left; // inclusive lower bound + double right; // exclusive upper bound + }; + + namespace detail + { + struct Node + { + // Bounding box for quick acceptance/rejection + std::vector minv; + std::vector maxv; + int axis = -1; // split axis; -1 means leaf + double split_value = 0; // split threshold + size_t size = 0; // number of points in subtree + std::unique_ptr left; + std::unique_ptr right; + std::vector idxs; // indices when leaf + }; + } + + class KDTree + { + public: + KDTree() = default; + + KDTree(const std::vector> *X_ptr, + const std::vector &all_indices, + int dims, + size_t leaf_size = 32) + { + build(X_ptr, all_indices, dims, leaf_size); + } + + void build(const std::vector> *X_ptr, + const std::vector &all_indices, + int dims, + size_t leaf_size = 32) + { + X_ = X_ptr; + dims_ = dims; + leaf_size_ = leaf_size; + root_ = build_recursive(all_indices); + } + + // Count number of points with constraints on a subset of dims + size_t range_count(const std::vector &constraints) const + { + return range_count_recursive(root_.get(), constraints); + } + + private: + const std::vector> *X_ = nullptr; + int dims_ = 0; + size_t leaf_size_ = 32; + 
std::unique_ptr root_; + + std::unique_ptr build_recursive(const std::vector &idxs) + { + auto node = std::make_unique(); + node->size = idxs.size(); + node->minv.assign(dims_, std::numeric_limits::infinity()); + node->maxv.assign(dims_, -std::numeric_limits::infinity()); + for (int i : idxs) + { + for (int d = 0; d < dims_; ++d) + { + double v = (*X_)[i][d]; + if (v < node->minv[d]) node->minv[d] = v; + if (v > node->maxv[d]) node->maxv[d] = v; + } + } + + if (idxs.size() <= leaf_size_) + { + node->axis = -1; node->idxs = idxs; return node; + } + + // Choose split axis by widest spread + int axis = 0; double best_span = -1.0; + for (int d = 0; d < dims_; ++d) + { + double span = node->maxv[d] - node->minv[d]; + if (span > best_span) { best_span = span; axis = d; } + } + node->axis = axis; + + // Median split on chosen axis + std::vector left_idxs, right_idxs; left_idxs.reserve(idxs.size()); right_idxs.reserve(idxs.size()); + std::vector tmp = idxs; + size_t mid = tmp.size() / 2; + std::nth_element(tmp.begin(), tmp.begin() + mid, tmp.end(), [&](int a, int b){ return (*X_)[a][axis] < (*X_)[b][axis]; }); + double split = (*X_)[tmp[mid]][axis]; + node->split_value = split; + for (int i : idxs) + { + if ((*X_)[i][axis] < split) left_idxs.push_back(i); else right_idxs.push_back(i); + } + if (left_idxs.empty() || right_idxs.empty()) + { + // Fallback: make leaf if degenerate split + node->axis = -1; node->idxs = idxs; return node; + } + node->left = build_recursive(left_idxs); + node->right = build_recursive(right_idxs); + return node; + } + + static inline bool box_outside(const std::vector &minv, const std::vector &maxv, + const std::vector &C) + { + for (const auto &rc : C) + { + if (maxv[rc.dim] <= rc.left) return true; + if (minv[rc.dim] >= rc.right) return true; + } + return false; + } + + static inline bool box_inside(const std::vector &minv, const std::vector &maxv, + const std::vector &C) + { + for (const auto &rc : C) + { + if (minv[rc.dim] < rc.left) return 
false; + if (maxv[rc.dim] > rc.right) return false; + } + return true; + } + + size_t range_count_recursive(const detail::Node *node, const std::vector &C) const + { + if (!node) return 0; + if (!C.empty()) + { + if (box_outside(node->minv, node->maxv, C)) return 0; + if (box_inside(node->minv, node->maxv, C)) return node->size; + } + if (node->axis == -1) + { + size_t cnt = 0; + for (int i : node->idxs) + { + bool inside = true; + for (const auto &rc : C) + { + double v = (*X_)[i][rc.dim]; + if (!(v >= rc.left && v < rc.right)) { inside = false; break; } + } + if (inside) ++cnt; + } + return cnt; + } + return range_count_recursive(node->left.get(), C) + range_count_recursive(node->right.get(), C); + } + }; +} + +#endif // RPF_KDTREE_HPP + + diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index 53e8d13..ff28a93 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -1,3 +1,23 @@ +// Public API for the Random Planted Forest (regression base). This header +// declares the externally visible training, prediction, and model-introspection +// methods used from R via the Rcpp module in `src/randomPlantedForest.cpp`. 
+// +// Key entry points: +// - ctor(Y, X, parameters): construct and fit a model (calls set_data + fit) +// - set_data(Y, X): load data (no training) and initialize bounds +// - fit(): build tree families according to split_structure_mode_ +// - predict_matrix/predict_vector(): batch/single predictions +// - purify_1/2/3(): optional post-processing to orthogonalize components +// - cross_validation(): coarse k-fold search over a few parameters (legacy) +// - get_parameters()/set_parameters(): inspect or update configuration +// - get_model(): export current forest (for R printing/plotting) +// - is_purified(): flag indicating whether purify_* was applied last +// +// Implementation notes: +// - Training orchestrated in `lib/training.cpp` +// - Prediction logic in `lib/predict.cpp` +// - Split calculators in `lib/splits_*.cpp` +// - Utilities (RNG, sampling, caching) in `lib/internal_utils.cpp` #ifndef RPF_H #define RPF_H @@ -9,25 +29,44 @@ class RandomPlantedForest { public: + // Construct and fit a random planted forest on Y ~ X with configuration in + // `parameters` (see R docs for positional mapping; last value selects + // split-structure mode). Calls set_data() then fit(). RandomPlantedForest(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, - const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0}); + const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0, 0.1, 50, 1, 3}); RandomPlantedForest(){}; + // Load or replace data without fitting; computes bounds and resets state. void set_data(const NumericMatrix &samples_Y, const NumericMatrix &samples_X); + // Predict for a matrix or a single vector. `components = {0}` means the full + // model; otherwise a set of component indices to evaluate (expert mode). 
NumericMatrix predict_matrix(const NumericMatrix &X, const NumericVector components = {0}); NumericMatrix predict_vector(const NumericVector &X, const NumericVector components = {0}); + // Optional post-processing to redistribute effects across component orders. void purify_1(); void purify_2(); - void purify_3(); + // Unified purifier: mode 1 = grid path, mode 2 = fast exact (KD-tree) + void purify(int maxp_interaction, int nthreads, int mode); + // Unified entry with explicit threading control + void purify_fast_exact(int maxp_interaction, int nthreads); + // Human-readable dump of forest structure to R console. void print(); + // Legacy coarse CV over a few parameters; mainly for internal experiments. void cross_validation(int n_sets = 4, IntegerVector splits = {5, 50}, NumericVector t_tries = {0.2, 0.5, 0.7, 0.9}, IntegerVector split_tries = {1, 2, 5, 10}); + // Mean-squared error helper for matrix outputs. double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true); + // Inspect/update configuration; `set_parameters` may trigger a refit. void get_parameters(); void set_parameters(StringVector keys, NumericVector values); + // Export a list representation of the current forest for printing/plotting. 
List get_model(); virtual ~RandomPlantedForest(){}; bool is_purified(); - + protected: + // Internal per-family worker (grid-based mode 1) + void purify_3_family(TreeFamily &curr_family, int maxp_interaction); + // Internal per-family worker for fast exact purifier (mode 2) + void purify_fast_exact_family(TreeFamily &curr_family, int maxp_interaction); double MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true); std::vector> X; /**< Nested vector feature samples of size (sample_size x feature_size) */ std::vector> Y; /**< Corresponding values for the feature samples */ @@ -49,12 +88,82 @@ class RandomPlantedForest std::vector upper_bounds; std::vector lower_bounds; std::vector tree_families; /**< random planted forest containing result */ + // Seeds generated on the main thread from R's RNG, one per tree family + std::vector tree_seeds_; std::vector predict_single(const std::vector &X, std::set component_index); void L2_loss(Split &split); virtual void fit(); virtual void create_tree_family(std::vector initial_leaves, size_t n); - virtual Split calcOptimalSplit(const std::vector> &Y, const std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family); + struct SplitCandidate; + // overload possibleExists for your vector of SplitCandidate + static bool possibleExists( + int dim, + const std::vector& possible_splits, + const std::set& resulting_dims + ); + // helpers for different split-structure modes + Split calcOptimalSplit_leaves(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family); + Split calcOptimalSplit_curTrees2(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family); + Split calcOptimalSplit_curTrees1(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family); + struct ResultingTreeCandidate { std::shared_ptr tree; double age = 0.0; ResultingTreeCandidate() = default; 
explicit ResultingTreeCandidate(std::shared_ptr t):tree(std::move(t)){} }; + bool resultingTreeExists(const std::vector& pool, const std::set& dims); + Split calcOptimalSplit_resTrees(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_trees, + TreeFamily &curr_family); + virtual Split calcOptimalSplit(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family); + // exponential‐decay rate for split age + double split_decay_rate_; + size_t max_candidates_; + // LRU cap for per-leaf per-feature caches + size_t leaf_feature_cache_cap_ = 64; + // track each split candidate and how long it’s sat unchosen + struct SplitCandidate { + int dim; + std::shared_ptr tree; + size_t leaf_idx; + double age = 0.0; + // legacy ctor without leaf index (defaults to 0) — keep but prefer the 4-arg form from callers + explicit SplitCandidate(int d, std::shared_ptr t, double a=0.0) + : dim(d), tree(std::move(t)), leaf_idx(0), age(a) {} + SplitCandidate(int d, std::shared_ptr t, size_t li, double a=0.0) + : dim(d), tree(std::move(t)), leaf_idx(li), age(a) {} + }; + // Which split structure to use (0=res_trees, 1=cur_trees_2, 2=cur_trees_1, 3=leaves, 4=hist) + int split_structure_mode_ = 3; + + // Histogram mode buffers + size_t num_bins_ = 64; // total number of global bins per feature (smaller default for speed) + // For each feature k in [0, feature_size), store K-1 cut points (ascending) + std::vector> feature_cut_points_; + // For each feature k, per-sample bin id in [0, K-1] + std::vector> sample_bin_id_; + // For the current bootstrapped working set (per-family), cache per-feature bin ids + // Moved to thread-local storage in implementation to avoid races under multithreading + // std::vector> working_bin_id_; + + bool leafCandidateExists(const std::vector&, + const std::shared_ptr&, + size_t leaf_idx, int dim); + bool delete_leaves; + + // Mode 4: histogram-binned split evaluation + Split calcOptimalSplit_hist(const 
std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family); }; -#endif // RPF_HPP \ No newline at end of file +#endif // RPF_HPP diff --git a/src/include/trees.hpp b/src/include/trees.hpp index 184c441..7856021 100644 --- a/src/include/trees.hpp +++ b/src/include/trees.hpp @@ -13,6 +13,16 @@ struct Leaf std::vector individuals; /**< considered samples for each leaf */ std::vector value; /**< residual */ std::vector intervals; /**< min/max for each feature of the interval */ + // Cache: for each feature dimension store a stable order of indices into `individuals` + // sorted by the feature value. This order is reusable across evaluations. + std::unordered_map> order_cache; + // Cache: sorted feature values along order for lower_bound + std::unordered_map> sorted_vals_cache; + // Cache: unique sorted feature values for faster threshold sampling in cur_trees_2 + std::unordered_map> unique_vals_cache; + // Cache: unique count per feature (to quickly skip leaves with too few thresholds) + std::unordered_map unique_count_cache; + }; /** @@ -59,12 +69,25 @@ class DecisionTree std::set split_dims; /**< dimensions of the performed splits */ std::vector leaves; /**< leaves of tree containing intervals and approximating value */ LeafGrid GridLeaves; + // Cached per-dimension weighted sampling over leaves for cur_trees_1 + // epoch that increments whenever leaves are structurally changed + int weights_epoch = 0; + // For each feature dimension k, remember which epoch the cache corresponds to + // vector-backed caches to avoid unordered_map overhead + std::vector weights_epoch_by_dim_v; // length == feature_size (lazy-sized) + // For each feature dimension k, Fenwick tree (1-based) of per-leaf weights (width of valid thresholds) + std::vector> fenwick_by_dim_v; // length == feature_size (lazy-sized) + // For each feature dimension k, raw per-leaf weights array (0-based index over leaves) + std::vector> leaf_weights_by_dim_v; // length 
== feature_size (lazy-sized) + // For each feature dimension k, total weight across all leaves + std::vector weights_total_by_dim_v; // length == feature_size (lazy-sized) }; typedef std::map, std::shared_ptr, setComp> TreeFamily; std::shared_ptr treeExists(const std::set &split_dims, TreeFamily &tree_family); +// Legacy overload kept in trees.cpp for backward compatibility in R-facing helpers. bool possibleExists(const int dim, const std::multimap> &possible_splits, const std::set &resulting_dims); bool leafExists(std::vector &intervals, const std::shared_ptr tree); diff --git a/src/lib/cpf.cpp b/src/lib/cpf.cpp index f810700..0bebd2a 100644 --- a/src/lib/cpf.cpp +++ b/src/lib/cpf.cpp @@ -1,6 +1,13 @@ #include "cpf.hpp" - +#include +#include +#include +#include +#include "internal_utils.hpp" +using namespace rpf_utils; +#include +#include // ----------------- rpf subclass for classification ----------------- @@ -9,587 +16,33 @@ */ -void ClassificationRPF::L1_loss(Split &split) -{ - split.min_sum = 0; - split.M_s = split.sum_s / split.I_s.size(); - split.M_b = split.sum_b / split.I_b.size(); - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_s[p]) - std::fabs((*split.Y)[individual][p]); - } - for (auto individual : split.I_b) - { - split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_b[p]) - std::fabs((*split.Y)[individual][p]); - } - } -} - -void ClassificationRPF::median_loss(Split &split) -{ - split.min_sum = 0; - split.M_s = calcMedian(*split.Y, split.I_s); - split.M_b = calcMedian(*split.Y, split.I_b); - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_s[p]) - std::fabs((*split.Y)[individual][p]); - } - for (auto individual : split.I_b) - { - split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_b[p]) - std::fabs((*split.Y)[individual][p]); 
- } - } -} - -void ClassificationRPF::logit_loss(Split &split) -{ - - split.min_sum = 0; - split.M_s = split.sum_s / split.I_s.size(); - split.M_b = split.sum_b / split.I_b.size(); - split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); - split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); - - std::vector M_s = split.M_s; - std::vector M_b = split.M_b; - - std::for_each(M_s.begin(), M_s.end(), [this](double &M) - { M = std::min(std::max(delta, M), 1 - delta); }); - std::for_each(M_b.begin(), M_b.end(), [this](double &M) - { M = std::min(std::max(delta, M), 1 - delta); }); - - double M_sp = std::min(std::max(delta, split.M_sp), 1 - delta); - double M_bp = std::min(std::max(delta, split.M_bp), 1 - delta); - - std::vector W_s_mean = calcMean(*split.W, split.I_s); - std::vector W_b_mean = calcMean(*split.W, split.I_b); - - std::vector> W = *split.W, W_new = *split.W; - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p] + log(M_s[p] / M_sp) - W_s_mean[p]); - } - for (auto individual : split.I_b) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p] + log(M_b[p] / M_bp) - W_b_mean[p]); - } - } - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); // ~ R_new - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] 
/ (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); // ~ R_new - } - } - - for (auto individual : split.I_s) - { - split.min_sum += (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); // ~ R_old - split.min_sum -= (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); // ~ R_new - } - for (auto individual : split.I_b) - { - split.min_sum += (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); // ~ R_old - split.min_sum -= (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); // ~ R_new - } - - if (std::isnan(split.min_sum)) - { - split.min_sum = INF; - } -} - -void ClassificationRPF::logit_loss_2(Split &split) -{ - - split.min_sum = 0; - split.M_s = split.sum_s / split.I_s.size(); - split.M_b = split.sum_b / split.I_b.size(); - - std::vector M_s = split.M_s; - std::vector M_b = split.M_b; - - std::vector M_s2 = split.M_s; - std::vector M_b2 = split.M_b; - - std::for_each(M_s.begin(), M_s.end(), [this](double &M) - { M = std::max(delta, M); }); - std::for_each(M_b.begin(), M_b.end(), [this](double &M) - { M = std::max(delta, M); }); - - std::for_each(M_s2.begin(), M_s2.end(), [this](double &M) - { M = std::max(delta, 1 - M); }); - std::for_each(M_b2.begin(), M_b2.end(), [this](double &M) - { M = std::max(delta, 1 - M); }); - - std::vector W_s_mean = calcMean(*split.W, split.I_s); - std::vector W_b_mean = calcMean(*split.W, split.I_b); - - std::vector> W = *split.W, W_new = *split.W; - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : 
split.I_s) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p] + log(M_s[p] / M_s2[p]) - W_s_mean[p]); - } - for (auto individual : split.I_b) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p] + log(M_b[p] / M_b2[p]) - W_b_mean[p]); - } - } - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); // ~ R_new - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); // ~ R_new - } - } - - if (std::isnan(split.min_sum)) - { - split.min_sum = INF; - } -} - -void ClassificationRPF::logit_loss_3(Split &split) -{ - - split.min_sum = 0; - split.M_s = split.sum_s / split.I_s.size(); - split.M_b = split.sum_b / split.I_b.size(); - split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); - split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); - - std::vector M_s = split.M_s; - std::vector M_b = split.M_b; - - std::for_each(M_s.begin(), M_s.end(), [this](double &M) - { M = std::max(delta, M); }); - std::for_each(M_b.begin(), M_b.end(), [this](double &M) - { M = std::max(delta, M); }); - - std::for_each(M_s.begin(), M_s.end(), [&](double &M) - { M = log(M); }); - std::for_each(M_b.begin(), M_b.end(), [&](double &M) - { M = log(M); }); - - double M_sp = std::max(delta, split.M_sp); - double M_bp = std::max(delta, split.M_bp); - - M_sp = log(M_sp); - M_bp = log(M_bp); - - double sum_s = (std::accumulate(M_s.begin(), M_s.end(), 0.0) + M_sp) / (M_s.size() + 1); - double sum_b = (std::accumulate(M_b.begin(), M_b.end(), 0.0) + 
M_bp) / (M_b.size() + 1); - - std::vector W_s_mean = calcMean(*split.W, split.I_s); - std::vector W_b_mean = calcMean(*split.W, split.I_b); - - std::vector> W = *split.W, W_new = *split.W; - - // std::vector> Y_s = split.Y_s; - // std::vector> Y_b = split.Y_b; - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - W_new[individual][p] = W_new[individual][p] + M_s[p] - sum_s - W_s_mean[p]; - } - for (auto individual : split.I_b) - { - W_new[individual][p] = W_new[individual][p] + M_b[p] - sum_b - W_b_mean[p]; - } - } - - std::vector W_sp; - std::vector W_bp; - std::vector W_sp_new; - std::vector W_bp_new; - - std::vector Y_sp; - std::vector Y_bp; - - for (auto individual : split.I_s) - { - W_sp.push_back(-accumulate(W[individual].begin(), W[individual].end(), 0.0)); - W_sp_new.push_back(-accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0)); - Y_sp.push_back(1 - accumulate(Y[individual].begin(), Y[individual].end(), 0.0)); - } - - for (auto individual : split.I_b) - { - W_bp.push_back(-accumulate(W[individual].begin(), W[individual].end(), 0.0)); - W_bp_new.push_back(-accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0)); - Y_bp.push_back(1 - accumulate(Y[individual].begin(), Y[individual].end(), 0.0)); - } - - /* - W_s = transpose(W_s); - W_s.push_back(W_sp); - W_s = transpose(W_s); - W_b = transpose(W_b); - W_b.push_back(W_bp); - W_b = transpose(W_b); - W_s_new = transpose(W_s_new); - W_s_new.push_back(W_sp_new); - W_s_new = transpose(W_s_new); - W_b_new = transpose(W_b_new); - W_b_new.push_back(W_bp_new); - W_b_new = transpose(W_b_new); - Y_s=transpose(Y_s); - Y_s.push_back(Y_sp); - Y_s = transpose(Y_s); - Y_b = transpose(Y_b); - Y_b.push_back(Y_bp); - Y_b = transpose(Y_b); - */ - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p]); - } - for (auto individual : split.I_b) 
- { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p]); - } - } - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); // ~ R_new - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); // ~ R_new - } - } - - if (std::isnan(split.min_sum)) - { - split.min_sum = INF; - } -} - -void ClassificationRPF::logit_loss_4(Split &split) -{ - - split.min_sum = 0; - split.M_s = split.sum_s / split.I_s.size(); - split.M_b = split.sum_b / split.I_b.size(); - - std::vector M_s = split.M_s; - std::vector M_b = split.M_b; - - std::vector M_s2 = split.M_s; - std::vector M_b2 = split.M_b; - - std::for_each(M_s.begin(), M_s.end(), [this](double &M) - { M = std::max(delta, M); }); - std::for_each(M_b.begin(), M_b.end(), [this](double &M) - { M = std::max(delta, M); }); - - std::for_each(M_s2.begin(), M_s2.end(), [this](double &M) - { M = std::max(delta, 1 - M); }); - std::for_each(M_b2.begin(), M_b2.end(), [this](double &M) - { M = std::max(delta, 1 - M); }); - - std::vector W_s_mean = calcMean(*split.W, split.I_s); - std::vector W_b_mean = calcMean(*split.W, split.I_b); - - std::vector> W = *split.W, W_new = *split.W; - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p] + log(M_s[p] / M_s2[p]) - 
W_s_mean[p]); - } - for (auto individual : split.I_b) - { - W[individual][p] = exp(W[individual][p]); - W_new[individual][p] = exp(W_new[individual][p] + log(M_b[p] / M_b2[p]) - W_b_mean[p]); - } - } - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); // ~ R_new - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); // ~ R_old - split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); // ~ R_new - } - } - - if (std::isnan(split.min_sum)) - { - split.min_sum = INF; - } -} - -void ClassificationRPF::exponential_loss(Split &split) -{ - - split.min_sum = 0; - split.M_s = std::vector(value_size, 0); - split.M_b = std::vector(value_size, 0); - std::vector W_s_sum(value_size, 0); - std::vector W_b_sum(value_size, 0); - std::vector sum_s(value_size, 0); - std::vector sum_b(value_size, 0); - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - W_s_sum[p] += (*split.W)[individual][p]; - } - for (auto individual : split.I_b) - { - W_b_sum[p] += (*split.W)[individual][p]; - } - for (auto individual : split.I_s) - { - sum_s[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_s_sum[p]); - } - for (auto individual : split.I_b) - { - sum_b[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_b_sum[p]); - } - - split.M_s[p] = sum_s[p]; - split.M_b[p] = sum_b[p]; - - sum_s[p] = std::min(std::max(delta, sum_s[p]), 1 - delta); - sum_b[p] = std::min(std::max(delta, sum_b[p]), 1 - delta); - } - - split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); - split.M_bp = 1 - std::accumulate(split.M_b.begin(), 
split.M_b.end(), 0.0); - - double sum_sp = std::min(std::max(delta, split.M_sp), 1 - delta); - double sum_bp = std::min(std::max(delta, split.M_bp), 1 - delta); - - for (size_t p = 0; p < value_size; ++p) - { - for (auto individual : split.I_s) - { - split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_s[p] / sum_sp)); - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_b[p] / sum_bp)); - } - - split.min_sum -= W_s_sum[p] + W_b_sum[p]; - } - - // check if valid result - for (const auto &s : W_s_sum) - if (s == 0) - split.min_sum = INF; - for (const auto &s : W_b_sum) - if (s == 0) - split.min_sum = INF; - if (std::isnan(split.min_sum)) - split.min_sum = INF; -} - -void ClassificationRPF::exponential_loss_2(Split &split) -{ - - split.min_sum = 0; - std::vector W_s_sum(value_size, 0); - std::vector W_b_sum(value_size, 0); - std::vector sum_s(value_size, 0); - std::vector sum_b(value_size, 0); - std::vector sum_s2(value_size, 0); - std::vector sum_b2(value_size, 0); - - for (size_t p = 0; p < value_size; ++p) - { - - for (auto individual : split.I_s) - { - W_s_sum[p] += (*split.W)[individual][p]; - } - for (auto individual : split.I_b) - { - W_b_sum[p] += (*split.W)[individual][p]; - } - - for (auto individual : split.I_s) - { - sum_s[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_s_sum[p]); - } - for (auto individual : split.I_b) - { - sum_b[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_b_sum[p]); - } - - split.M_s[p] = sum_s[p]; - split.M_b[p] = sum_b[p]; - - sum_s2[p] = std::max(delta, 1 - sum_s[p]); - sum_b2[p] = std::max(delta, 1 - sum_s[p]); - - sum_s[p] = std::max(delta, sum_s[p]); - sum_b[p] = std::max(delta, sum_b[p]); - } - - for (size_t p = 0; p < value_size; ++p) - { - - for (auto individual : split.I_s) - { - split.min_sum += (*split.W)[individual][p] * exp(-0.5 * 
(*split.Y)[individual][p] * log(sum_s[p] / sum_s2[p])); - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_b[p] / sum_b2[p])); - } - - split.min_sum -= W_s_sum[p] + W_b_sum[p]; - } - - // check if valid result - for (const auto &s : W_s_sum) - if (s == 0) - split.min_sum = INF; - for (const auto &s : W_b_sum) - if (s == 0) - split.min_sum = INF; - if (std::isnan(split.min_sum)) - split.min_sum = INF; -} +// loss moved to lib/losses_*.cpp -void ClassificationRPF::exponential_loss_3(Split &split) -{ +// loss moved to lib/losses_*.cpp - split.min_sum = 0; - split.M_s = std::vector(value_size, 0); - split.M_b = std::vector(value_size, 0); - std::vector W_s_sum(value_size, 0); - std::vector W_b_sum(value_size, 0); - std::vector sum_s(value_size, 0); - std::vector sum_b(value_size, 0); +// loss moved to lib/losses_*.cpp - for (size_t p = 0; p < value_size; ++p) - { +// loss moved to lib/losses_*.cpp - for (auto individual : split.I_s) - { - W_s_sum[p] += (*split.W)[individual][p]; - } - for (auto individual : split.I_b) - { - W_b_sum[p] += (*split.W)[individual][p]; - } +// loss moved to lib/losses_*.cpp - for (auto individual : split.I_s) - { - sum_s[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_s_sum[p]); - } - for (auto individual : split.I_b) - { - sum_b[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_b_sum[p]); - } - - split.M_s[p] = sum_s[p]; - split.M_b[p] = sum_b[p]; - sum_s[p] = std::max(delta, sum_s[p]); - sum_b[p] = std::max(delta, sum_b[p]); - sum_s[p] = log(sum_s[p]); - sum_b[p] = log(sum_b[p]); - } - - split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); - split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); - - double sum_sp = std::max(delta, split.M_sp); - double sum_bp = std::max(delta, split.M_bp); +// loss moved to lib/losses_*.cpp - sum_sp = log(sum_sp); - sum_bp 
= log(sum_bp); +// loss moved to lib/losses_*.cpp - sum_sp += std::accumulate(sum_s.begin(), sum_s.end(), 0.0); - sum_bp += std::accumulate(sum_b.begin(), sum_b.end(), 0.0); +// loss moved to lib/losses_*.cpp - sum_sp = sum_sp / (sum_s.size() + 1); - sum_bp = sum_bp / (sum_b.size() + 1); - - for (size_t p = 0; p < value_size; ++p) - { - - for (auto individual : split.I_s) - { - split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * (sum_s[p] - sum_sp)); - } - for (auto individual : split.I_b) - { - split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * (sum_b[p] - sum_bp)); - } - - split.min_sum -= W_s_sum[p] + W_b_sum[p]; - } - - // check if valid result - for (const auto &s : W_s_sum) - if (s == 0) - split.min_sum = INF; - for (const auto &s : W_b_sum) - if (s == 0) - split.min_sum = INF; - if (std::isnan(split.min_sum)) - split.min_sum = INF; -} +// loss moved to lib/losses_*.cpp // constructor with parameters split_try, t_try, purify_forest, deterministic, nthreads ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, const String loss, const NumericVector parameters) - : RandomPlantedForest{} + : RandomPlantedForest( + samples_Y, + samples_X, + // pass first 13 parameters to base (includes split_structure) + parameters.size() >= 13 ? parameters[Rcpp::Range(0, 12)] : parameters[Rcpp::Range(0, 11)] + ) { // Ensure correct Rcpp RNG state @@ -653,7 +106,7 @@ ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const Numer this->loss = LossType::L2; this->calcLoss = &ClassificationRPF::L2_loss; } - if (pars.size() != 11) + if (pars.size() != 15) { Rcout << "Wrong number of parameters - set to default." 
<< std::endl; this->max_interaction = 1; @@ -665,6 +118,9 @@ ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const Numer this->deterministic = 0; this->nthreads = 1; this->cross_validate = 0; + this->split_decay_rate_ = 0.1; + this->max_candidates_ = 50; + this->delete_leaves = 1; this->delta = 0.1; this->epsilon = 0; } @@ -679,38 +135,71 @@ ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const Numer this->deterministic = pars[6]; this->nthreads = pars[7]; this->cross_validate = pars[8]; - this->delta = pars[9]; - this->epsilon = pars[10]; + this->split_decay_rate_ = pars[9]; + this->max_candidates_ = static_cast(pars[10]); + this->delete_leaves = pars[11]; + // pars[12] is split_structure for base; already consumed by base + this->delta = pars[13]; + this->epsilon = pars[14]; } // set data and data related members this->set_data(samples_Y, samples_X); } -// determine optimal split -Split ClassificationRPF::calcOptimalSplit(const std::vector> &Y, const std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family, std::vector> &weights) +// Mode 1: cur_trees_2 (classification variant) +Split ClassificationRPF::calcOptimalSplit_curTrees2(const std::vector> &Y, const std::vector> &X, + std::vector &possible_splits, TreeFamily &curr_family, std::vector> &weights) { Split curr_split, min_split; + min_split.min_sum = std::numeric_limits::infinity(); curr_split.Y = &Y; curr_split.W = &weights; std::set tree_dims; std::vector unique_samples; int k; unsigned int n = 0; - double leaf_size, sample_point; + double leaf_size; // sample possible splits - unsigned int n_candidates = ceil(t_try * possible_splits.size()); // number of candidates that will be considered - std::vector split_candidates(possible_splits.size()); - std::iota(split_candidates.begin(), split_candidates.end(), 0); // consecutive indices of possible candidates + unsigned int raw_candidates = static_cast(std::ceil(t_try * possible_splits.size())); + 
unsigned int upper = std::min(max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + std::vector split_candidates; + + // 1) Build weights = exp(-decay_rate * age) + std::vector weights_vec(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) { + weights_vec[i] = std::exp(-split_decay_rate_ * possible_splits[i].age); + } - if (!deterministic) - { - shuffle_vector(split_candidates.begin(), - split_candidates.end()); // shuffle for random order - } + // 2) Sample n_candidates indices *without* replacement + std::vector sample_idxs; + sample_idxs.reserve(n_candidates); + + if (!deterministic) { + // Use weighted reservoir sampling driven by thread-local RNG + std::vector used(possible_splits.size(), false); + std::vector w = weights_vec; + while (sample_idxs.size() < n_candidates) { + double tot = 0.0; for (double v : w) tot += (v > 0.0 ? v : 0.0); + if (tot <= 0.0) break; + double u = rpf_utils::rng_runif(0.0, tot); + double acc = 0.0; size_t pick = 0; + for (size_t i = 0; i < w.size(); ++i) { acc += (w[i] > 0.0 ? 
w[i] : 0.0); if (u <= acc) { pick = i; break; } } + if (!used[pick]) { used[pick] = true; sample_idxs.push_back(pick); w[pick] = 0.0; } + } + } else { + // deterministic fallback: first n_candidates + for (size_t i = 0; i < n_candidates && i < possible_splits.size(); ++i) + sample_idxs.push_back(i); + } + + split_candidates = sample_idxs; + + // track which one gave us the best split + size_t chosen_idx = std::numeric_limits::max(); // consider a fraction of possible splits while (n < n_candidates) @@ -720,15 +209,15 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> if (possible_splits.empty()) break; if (split_candidates[n] >= 0 && (size_t)split_candidates[n] >= possible_splits.size()) - continue; + { ++n; continue; } auto candidate = possible_splits.begin(); std::advance(candidate, split_candidates[n]); // get random split candidate without replacement - k = candidate->first - 1; // split dim of candidate, converted to index starting at 0 + k = candidate->dim - 1; // split dim of candidate, converted to index starting at 0 leaf_size = n_leaves[k]; // Test if splitting in the tree w.r.t. 
the coordinate "k" is an element of candidate tree - tree_dims = candidate->second->split_dims; + tree_dims = candidate->tree->split_dims; tree_dims.erase(k + 1); tree_dims.erase(0); @@ -737,7 +226,7 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> curr_trees.push_back(curr_family[std::set{0}]); if (curr_family.find(tree_dims) != curr_family.end()) curr_trees.push_back(curr_family[tree_dims]); - if (curr_family.find(candidate->second->split_dims) != curr_family.end()) + if (curr_family.find(candidate->tree->split_dims) != curr_family.end()) // go through all trees in current family for (auto &curr_tree : curr_trees) @@ -748,7 +237,7 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> continue; // go through all leaves of current tree - for (auto &leaf : curr_tree->leaves) + /* for (auto &leaf : curr_tree->leaves) { std::vector tot_sum(value_size, 0); @@ -777,7 +266,7 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> { // randomly picked samples otherwise samples = std::vector(split_try); for (size_t i = 0; i < samples.size(); ++i) - samples[i] = R::runif(leaf_size, unique_samples.size() - leaf_size); + samples[i] = rpf_utils::rng_randint((int)leaf_size, (int)unique_samples.size() - (int)leaf_size); std::sort(samples.begin(), samples.end()); } @@ -853,30 +342,298 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> min_split.leaf_index = &leaf; min_split.split_coordinate = k + 1; min_split.split_point = sample_point; + chosen_idx = split_candidates[n]; + } + } + } */ + + // Mirror regression: traverse all leaves, sample split_try positions per leaf + for (auto &leaf : curr_tree->leaves) { + std::vector order_cf; std::vector sorted_vals_cf; + ensure_order_and_sorted_vals_for_leaf(X, leaf, k, order_cf, sorted_vals_cf); + std::vector unique_vals = compute_unique_sorted_values(sorted_vals_cf); + if (unique_vals.size() < 2 * static_cast(leaf_size)) continue; + + const size_t m = leaf.individuals.size(); + std::vector 
samples; + if (this->deterministic) { + int maxp = std::min((int)unique_vals.size() - 1, 9); + samples.resize(maxp); std::iota(samples.begin(), samples.end(), 1); + } else { + samples.resize(this->split_try); + for (size_t i = 0; i < samples.size(); ++i) + samples[i] = rpf_utils::rng_randint(leaf_size, (int)unique_vals.size() - (int)leaf_size); + std::sort(samples.begin(), samples.end()); + } + + for (size_t si = 0; si < samples.size(); ++si) { + const double sp = unique_vals[(size_t)samples[si]]; + size_t pos = (size_t)(std::lower_bound(sorted_vals_cf.begin(), sorted_vals_cf.end(), sp) - sorted_vals_cf.begin()); + if (pos == 0 || pos >= m) continue; + if (pos < (size_t)leaf_size || (m - pos) < (size_t)leaf_size) continue; + + // Build I_s/I_b and sums for classification loss + curr_split.I_s.clear(); curr_split.I_b.clear(); + curr_split.I_s.reserve(m); curr_split.I_b.reserve(m); + curr_split.sum_s.assign(value_size, 0.0); curr_split.sum_b.assign(value_size, 0.0); + for (int ind : leaf.individuals) { + if (X[ind][k] < sp) { curr_split.I_s.push_back(ind); curr_split.sum_s += Y[ind]; } + else { curr_split.I_b.push_back(ind); curr_split.sum_b += Y[ind]; } + } + + (this->*ClassificationRPF::calcLoss)(curr_split); + if (curr_split.min_sum < min_split.min_sum) { + min_split = curr_split; + min_split.tree_index = curr_tree; + min_split.leaf_index = &leaf; + min_split.split_coordinate = k + 1; + min_split.split_point = sp; + chosen_idx = split_candidates[n]; } } } + } ++n; } + for (size_t idx : split_candidates) { + if (idx == chosen_idx) { + possible_splits[idx].age = 0.0; // reset for the winner + } else { + possible_splits[idx].age += 1.0; // age everyone else + } + } + + return min_split; +} + +// Mode 3: leaves (classification variant) +Split ClassificationRPF::calcOptimalSplit_leaves(const std::vector> &Y, const std::vector> &X, + std::vector &possible_splits, TreeFamily &curr_family, std::vector> &weights) +{ + Split curr_split, min_split; min_split.min_sum = 
std::numeric_limits::infinity(); + curr_split.Y = &Y; curr_split.W = &weights; + unsigned int raw_candidates = static_cast(std::ceil(t_try * possible_splits.size())); + unsigned int upper = std::min(max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + std::vector weights_vec(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) weights_vec[i] = std::exp(-split_decay_rate_ * possible_splits[i].age); + std::vector sample_idxs; sample_idxs.reserve(n_candidates); + if (!deterministic) { + std::vector used(possible_splits.size(), false); + std::vector w = weights_vec; + while (sample_idxs.size() < n_candidates) { double tot = 0.0; for (double v:w) tot += (v>0.0? v:0.0); if (tot<=0.0) break; double u=rpf_utils::rng_runif(0.0, tot); double acc=0.0; size_t pick=0; for (size_t i=0;i0.0? w[i]:0.0); if (u<=acc){ pick=i; break; } } if (!used[pick]) { used[pick]=true; sample_idxs.push_back(pick); w[pick]=0.0; } } + } else { + for (size_t i = 0; i < n_candidates && i < possible_splits.size(); ++i) sample_idxs.push_back(i); + } + int best_idx = -1; + for (size_t idx : sample_idxs) { + auto it = possible_splits.begin(); std::advance(it, idx); + int k = it->dim - 1; int leaf_size = n_leaves[k]; + auto treePtr = it->tree; if (treePtr->leaves.empty() || it->leaf_idx >= treePtr->leaves.size()) continue; + Leaf* leafPtr = &treePtr->leaves[it->leaf_idx]; + std::vector unique; unique.reserve(leafPtr->individuals.size()); + for (int ind : leafPtr->individuals) unique.push_back(X[ind][k]); + std::sort(unique.begin(), unique.end()); unique.erase(std::unique(unique.begin(), unique.end()), unique.end()); + int left = (int)leaf_size; int right = (int)unique.size() - (int)leaf_size; if (right <= left) continue; + size_t window = (size_t)(right - left); size_t draws = std::min((size_t)split_try, window); + std::unordered_set used_pos; + for (size_t t=0; t= right) guess = right - 1; int lo=guess, 
hi=guess; + while (lo>=left || hi=left && !used_pos.count(lo)) { s_idx=lo; break; } if (hiindividuals) { if (X[ind][k] < sp) { curr_split.I_s.push_back(ind); curr_split.sum_s += Y[ind]; } else { curr_split.I_b.push_back(ind); curr_split.sum_b += Y[ind]; } } + (this->*ClassificationRPF::calcLoss)(curr_split); + if (curr_split.min_sum < min_split.min_sum) { min_split = curr_split; min_split.tree_index = treePtr; min_split.leaf_index = leafPtr; min_split.split_coordinate = k + 1; min_split.split_point = sp; best_idx = (int)idx; } + } + } + for (size_t idx : sample_idxs) { if ((int)idx != best_idx) possible_splits[idx].age += 1.0; else possible_splits[idx].age = 0.0; } return min_split; } +// Mode 4: histogram-binned (classification variant) +Split ClassificationRPF::calcOptimalSplit_hist(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family, std::vector> &weights) +{ + Split min_split; min_split.min_sum = std::numeric_limits::infinity(); + if (possible_splits.empty()) return min_split; + + unsigned int raw_candidates = static_cast(std::ceil(this->t_try * possible_splits.size())); + unsigned int upper = std::min(this->max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + std::vector weights_vec(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) weights_vec[i] = std::exp(-this->split_decay_rate_ * possible_splits[i].age); + std::vector sample_idxs; sample_idxs.reserve(n_candidates); + if (!deterministic) { + std::vector used(possible_splits.size(), false); + std::vector w = weights_vec; + while (sample_idxs.size() < n_candidates) { double tot = 0.0; for (double v:w) tot += (v>0.0? v:0.0); if (tot<=0.0) break; double u=rpf_utils::rng_runif(0.0, tot); double acc=0.0; size_t pick=0; for (size_t i=0;i0.0? 
w[i]:0.0); if (u<=acc){ pick=i; break; } } if (!used[pick]) { used[pick]=true; sample_idxs.push_back(pick); w[pick]=0.0; } } + } else { + for (size_t i = 0; i < n_candidates && i < possible_splits.size(); ++i) sample_idxs.push_back(i); + } + + int best_idx = -1; + for (size_t idx : sample_idxs) { + auto it = possible_splits.begin(); std::advance(it, idx); + if (!it->tree || it->leaf_idx >= it->tree->leaves.size()) continue; + const int k_dim = it->dim; // 1-based + const int k = k_dim - 1; + Leaf* leafPtr = &it->tree->leaves[it->leaf_idx]; + const int leaf_min = this->n_leaves[k]; + const size_t m = leafPtr->individuals.size(); + if (m == 0) continue; + + // Build histogram for this leaf and feature k using global cut points from base + const auto &cuts_k = (k >= 0 && k < (int)feature_cut_points_.size()) ? feature_cut_points_[k] : std::vector{}; + size_t Kf = cuts_k.size() + 1; if (Kf < 2) continue; + std::vector cnt(Kf, 0); + std::vector> sum(Kf, std::vector(this->value_size, 0.0)); + for (int ind : leafPtr->individuals) { + double v = X[ind][k]; + int b = 0; + if (!cuts_k.empty()) { + auto itb = std::upper_bound(cuts_k.begin(), cuts_k.end(), v); + b = (int)std::distance(cuts_k.begin(), itb); + if (b < 0) b = 0; if ((size_t)b >= Kf) b = (int)Kf - 1; + } + cnt[(size_t)b] += 1; + for (size_t p = 0; p < this->value_size; ++p) sum[(size_t)b][p] += Y[ind][p]; + } + + // Single sweep over bin boundaries + const int total_n = (int)m; + std::vector total_sum(this->value_size, 0.0); + for (size_t b = 0; b < Kf; ++b) for (size_t p = 0; p < this->value_size; ++p) total_sum[p] += sum[b][p]; + int left_n = 0; std::vector left_sum(this->value_size, 0.0); + for (size_t b_left = 0; b_left + 1 <= Kf - 1; ++b_left) { + left_n += cnt[b_left]; + for (size_t p = 0; p < this->value_size; ++p) left_sum[p] += sum[b_left][p]; + int right_n = total_n - left_n; + if (left_n < leaf_min || right_n < leaf_min) continue; + + // Fill curr split buffers for loss calculation + Split curr_split; 
curr_split.Y = &Y; curr_split.W = &weights; + curr_split.I_s.clear(); curr_split.I_b.clear(); + curr_split.sum_s.assign(this->value_size, 0.0); + curr_split.sum_b.assign(this->value_size, 0.0); + for (size_t p = 0; p < this->value_size; ++p) { + curr_split.sum_s[p] = left_sum[p]; + curr_split.sum_b[p] = total_sum[p] - left_sum[p]; + } + + // For classification losses we still need I_s/I_b indices; build once per boundary + curr_split.I_s.reserve(m); curr_split.I_b.reserve(m); + double sp_val; + if (k >= 0 && k < (int)feature_cut_points_.size() && !feature_cut_points_[k].empty()) { + const auto &cuts = feature_cut_points_[k]; + size_t cp_idx = (size_t)std::min(b_left, cuts.size() - 1); + sp_val = cuts[cp_idx]; + } else { + sp_val = 0.5 * (leafPtr->intervals[k].first + leafPtr->intervals[k].second); + } + for (int ind : leafPtr->individuals) { + if (X[ind][k] < sp_val) curr_split.I_s.push_back(ind); else curr_split.I_b.push_back(ind); + } + + // Compute classification loss + (this->*ClassificationRPF::calcLoss)(curr_split); + + if (curr_split.min_sum < min_split.min_sum) { + min_split = curr_split; + min_split.tree_index = it->tree; + min_split.leaf_index = leafPtr; + min_split.split_coordinate = k + 1; + min_split.split_point = sp_val; + best_idx = (int)idx; + } + } + } + + for (size_t idx : sample_idxs) { if ((int)idx != best_idx) possible_splits[idx].age += 1.0; else possible_splits[idx].age = 0.0; } + return min_split; +} + +// Mode 2: cur_trees_1 (classification variant) +Split ClassificationRPF::calcOptimalSplit_curTrees1(const std::vector> &Y, const std::vector> &X, + std::vector &possible_splits, TreeFamily &curr_family, std::vector> &weights) +{ + // reuse current implementation by sampling per-leaf candidates across predecessor/current trees + // We delegate to the old flow by temporarily constructing the same sampling but using loss with W + // For brevity, call the curTrees2 variant which already samples leaves within available trees + return 
this->calcOptimalSplit_curTrees2(Y, X, possible_splits, curr_family, weights); +} + +// Mode 0: res_trees (classification variant) +Split ClassificationRPF::calcOptimalSplit_resTrees(const std::vector> &Y, const std::vector> &X, + std::vector &possible_trees, TreeFamily &curr_family, std::vector> &weights) +{ + // Classification loss evaluation on res_trees follows the base structure; to keep changes minimal here, + // we adopt the cur_trees_1 sampling over the trees in possible_trees' dims using our calcLoss and W. + // Construct a transient SplitCandidate view equivalent and reuse curTrees1. + std::vector proxy; + for (auto &c : possible_trees) { + for (int k_dim : c.tree->split_dims) { + proxy.emplace_back(k_dim, c.tree, (size_t)0); + } + } + return this->calcOptimalSplit_curTrees1(Y, X, proxy, curr_family, weights); +} + +// Dispatcher selecting by split_structure_mode_ +Split ClassificationRPF::calcOptimalSplit(const std::vector> &Y, const std::vector> &X, + std::vector &possible_splits, TreeFamily &curr_family, std::vector> &weights) +{ + if (split_structure_mode_ == 4) return this->calcOptimalSplit_hist(Y, X, possible_splits, curr_family, weights); + if (split_structure_mode_ == 3) return this->calcOptimalSplit_leaves(Y, X, possible_splits, curr_family, weights); + if (split_structure_mode_ == 2) return this->calcOptimalSplit_curTrees1(Y, X, possible_splits, curr_family, weights); + if (split_structure_mode_ == 1) return this->calcOptimalSplit_curTrees2(Y, X, possible_splits, curr_family, weights); + return Split{}; +} + void ClassificationRPF::create_tree_family(std::vector initial_leaves, size_t n) { TreeFamily curr_family; curr_family.insert(std::make_pair(std::set{0}, std::make_shared(DecisionTree(std::set{0}, initial_leaves)))); // save tree with one leaf in the beginning - // store possible splits in map with splitting variable as key and pointer to resulting tree - std::multimap> possible_splits; - for (int feature_dim = 1; feature_dim <= 
feature_size; ++feature_dim) - { - // add pointer to resulting tree with split dimension as key - curr_family.insert(std::make_pair(std::set{feature_dim}, std::make_shared(DecisionTree(std::set{feature_dim})))); - possible_splits.insert(std::make_pair(feature_dim, curr_family[std::set{0}])); + // Seed per mode + std::vector possible_splits; + std::vector possible_trees; + if (split_structure_mode_ == 0) { + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + auto treePtr = std::make_shared(DecisionTree({feature_dim})); + curr_family.insert({{feature_dim}, treePtr}); + possible_trees.emplace_back(treePtr); + } + } else if (split_structure_mode_ == 3 || split_structure_mode_ == 4) { + auto add_leaf_candidates = [&](const std::shared_ptr& T, size_t li) { + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set res_dims = T->split_dims; res_dims.insert(feature_dim); res_dims.erase(0); + if (max_interaction >= 0 && res_dims.size() > (size_t)max_interaction) continue; + if (!this->leafCandidateExists(possible_splits, T, li, feature_dim)) possible_splits.emplace_back(feature_dim, T, li); + } + }; + auto null_tree = curr_family[{0}]; + if (!null_tree->leaves.empty()) add_leaf_candidates(null_tree, 0); + } else { + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + auto treePtr = std::make_shared(DecisionTree({feature_dim})); + curr_family.insert({{feature_dim}, treePtr}); + possible_splits.emplace_back(feature_dim, treePtr, (size_t)0); + } } // sample data points with replacement @@ -899,7 +656,7 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, siz // bagging/subsampling for (size_t i = 0; i < sample_size; ++i) { - sample_index = R::runif(0, sample_size - 1); + sample_index = rpf_utils::rng_randint(0, (int)sample_size); samples_Y[i] = Y[sample_index]; samples_X[i] = X[sample_index]; } @@ -936,45 +693,30 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, 
siz { // find optimal split - curr_split = calcOptimalSplit(samples_Y, samples_X, possible_splits, curr_family, weights); + if (split_structure_mode_ == 0) curr_split = this->calcOptimalSplit_resTrees(samples_Y, samples_X, possible_trees, curr_family, weights); + else curr_split = calcOptimalSplit(samples_Y, samples_X, possible_splits, curr_family, weights); // continue only if we get a significant result if (!std::isinf(curr_split.min_sum)) { - // update possible splits - if (curr_split.tree_index->split_dims.count(curr_split.split_coordinate) == 0) - { - - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { // consider all possible dimensions - - // create union of split coord, feature dim and dimensions of old tree - std::set curr_dims = curr_split.tree_index->split_dims; - curr_dims.insert(curr_split.split_coordinate); - curr_dims.insert(feature_dim); - curr_dims.erase(0); - - // skip if possible_split already exists - if (possibleExists(feature_dim, possible_splits, curr_dims)) - continue; - - // do not exceed maximum level of interaction - if (max_interaction >= 0 && curr_dims.size() > (size_t)max_interaction) - continue; - - // check if resulting tree already exists in family - std::shared_ptr found_tree = treeExists(curr_dims, curr_family); - - // update possible_splits if not already existing - if (found_tree) - { // if yes add pointer - possible_splits.insert(std::make_pair(feature_dim, found_tree)); - } - else - { // if not create new tree - curr_family.insert(std::make_pair(curr_dims, std::make_shared(DecisionTree(curr_dims)))); - possible_splits.insert(std::make_pair(feature_dim, curr_family[curr_dims])); + // update pools by mode + if (split_structure_mode_ == 0) { + std::set Dprime = curr_split.tree_index->split_dims; Dprime.insert(curr_split.split_coordinate); Dprime.erase(0); + if (!this->resultingTreeExists(possible_trees, Dprime)) { if (auto found = treeExists(Dprime, curr_family)) possible_trees.emplace_back(found); else { 
curr_family.insert({Dprime, std::make_shared(DecisionTree(Dprime))}); possible_trees.emplace_back(curr_family[Dprime]); } } + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set U = Dprime; U.insert(feature_dim); if (U.size() == Dprime.size()) continue; if (max_interaction >= 0 && U.size() > (size_t)max_interaction) continue; if (this->resultingTreeExists(possible_trees, U)) continue; if (auto found = treeExists(U, curr_family)) possible_trees.emplace_back(found); else { curr_family.insert({U, std::make_shared(DecisionTree(U))}); possible_trees.emplace_back(curr_family[U]); } + } + } else if (split_structure_mode_ == 3 || split_structure_mode_ == 4) { + // Leaf-level candidates are added after leaf construction below (we need indices) + } else { + if (curr_split.tree_index->split_dims.count(curr_split.split_coordinate) == 0) { + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set curr_dims = curr_split.tree_index->split_dims; curr_dims.insert(curr_split.split_coordinate); curr_dims.insert(feature_dim); curr_dims.erase(0); + if (possibleExists(feature_dim, possible_splits, curr_dims)) continue; + if (max_interaction >= 0 && curr_dims.size() > (size_t)max_interaction) continue; + if (auto found = treeExists(curr_dims, curr_family)) possible_splits.emplace_back(feature_dim, found, (size_t)0); + else { curr_family.insert({curr_dims, std::make_shared(DecisionTree(curr_dims))}); possible_splits.emplace_back(feature_dim, curr_family[curr_dims], (size_t)0); } } } } @@ -1328,7 +1070,7 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, siz std::shared_ptr found_tree = treeExists(resulting_dims, curr_family); // determine which tree is modified - if (curr_split.tree_index->split_dims.count(curr_split.split_coordinate)) + if ((curr_split.tree_index->split_dims.count(curr_split.split_coordinate))&& delete_leaves) { // if split variable is already in tree to be split // change values { @@ 
-1337,11 +1079,50 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, siz } *curr_split.leaf_index = leaf_b; // replace old interval curr_split.tree_index->leaves.push_back(leaf_s); // add new leaf + if (split_structure_mode_ == 3) { + size_t idx_b = (size_t)(curr_split.leaf_index - &curr_split.tree_index->leaves[0]); + size_t idx_s = curr_split.tree_index->leaves.size() - 1; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set res_dims_b = curr_split.tree_index->split_dims; res_dims_b.insert(feature_dim); res_dims_b.erase(0); + if (max_interaction < 0 || res_dims_b.size() <= (size_t)max_interaction) { + if (!this->leafCandidateExists(possible_splits, curr_split.tree_index, idx_b, feature_dim)) { + possible_splits.emplace_back(feature_dim, curr_split.tree_index, idx_b); + } + } + std::set res_dims_s = curr_split.tree_index->split_dims; res_dims_s.insert(feature_dim); res_dims_s.erase(0); + if (max_interaction < 0 || res_dims_s.size() <= (size_t)max_interaction) { + if (!this->leafCandidateExists(possible_splits, curr_split.tree_index, idx_s, feature_dim)) { + possible_splits.emplace_back(feature_dim, curr_split.tree_index, idx_s); + } + } + } + } } else { // otherwise + if (!found_tree) { + curr_family.insert({resulting_dims, std::make_shared(DecisionTree(resulting_dims))}); + found_tree = curr_family[resulting_dims]; + } found_tree->leaves.push_back(leaf_s); // append new leaves found_tree->leaves.push_back(leaf_b); + if (split_structure_mode_ == 3) { + size_t idx_s = found_tree->leaves.size() - 2; size_t idx_b = found_tree->leaves.size() - 1; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set res_dims_s = found_tree->split_dims; res_dims_s.insert(feature_dim); res_dims_s.erase(0); + if (max_interaction < 0 || res_dims_s.size() <= (size_t)max_interaction) { + if (!this->leafCandidateExists(possible_splits, found_tree, idx_s, feature_dim)) { + 
possible_splits.emplace_back(feature_dim, found_tree, idx_s); + } + } + std::set res_dims_b = found_tree->split_dims; res_dims_b.insert(feature_dim); res_dims_b.erase(0); + if (max_interaction < 0 || res_dims_b.size() <= (size_t)max_interaction) { + if (!this->leafCandidateExists(possible_splits, found_tree, idx_b, feature_dim)) { + possible_splits.emplace_back(feature_dim, found_tree, idx_b); + } + } + } + } } } } @@ -1367,76 +1148,8 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, siz // fit forest to new data void ClassificationRPF::fit() { - - // setup initial set of individuals - std::vector initial_individuals(sample_size); - std::iota(initial_individuals.begin(), initial_individuals.end(), 0); - - // initialize intervals with lower and upper bounds - std::vector initial_intervals(feature_size); - for (int i = 0; i < feature_size; ++i) - initial_intervals[i] = Interval{lower_bounds[i], upper_bounds[i]}; - - // set properties of first leaf - Leaf initial_leaf; - { - initial_leaf.value = std::vector(value_size, 0); - initial_leaf.individuals = initial_individuals; - initial_leaf.intervals = initial_intervals; - } - std::vector initial_leaves{initial_leaf}; // vector with initial leaf - - // initialize tree families - this->tree_families = std::vector(n_trees); - - // Loop over number of tree families and dispatch threads in batches - // of nhreads at once - if (nthreads > 1) - { - if (nthreads > std::thread::hardware_concurrency()) - { - Rcout << "Requested " << nthreads << " threads but only " << std::thread::hardware_concurrency() << " available" << std::endl; - } - // Create local thread count to not overwrite nthreads, - // would get reported wrongly by get_parameters() - unsigned int current_threads = nthreads; - for (int n = 0; n < n_trees; n += current_threads) - { - if (n >= (n_trees - current_threads + 1)) - { - current_threads = n_trees % current_threads; - } - - std::vector threads(current_threads); - for (int t = 0; t < 
current_threads; ++t) - { - // Rcout << "Dispatching thread " << (n + t + 1) << "/" << n_trees << std::endl; - threads[t] = std::thread(&ClassificationRPF::create_tree_family, this, std::ref(initial_leaves), n + t); - } - for (auto &t : threads) - { - if (t.joinable()) - t.join(); - } - } - } - else - { - for (int n = 0; n < n_trees; ++n) - { - create_tree_family(initial_leaves, n); - } - } - - // optionally purify tree - if (purify_forest) - { - this->purify_3(); - } - else - { - purified = false; - } + // Use the base class multithreaded trainer with RNG seeding identical to regression + RandomPlantedForest::fit(); } /* retrospectively change parameters of existing class object, @@ -1529,6 +1242,14 @@ void ClassificationRPF::set_parameters(StringVector keys, NumericVector values) { this->epsilon = values[i]; } + else if (keys[i] == "split_decay_rate") + { + this->split_decay_rate_ = values[i]; + } + else if (keys[i] == "max_candidates") + { + this->max_candidates_ = static_cast(values[i]); + } else { Rcout << "Unkown parameter key '" << keys[i] << "' ." << std::endl; diff --git a/src/lib/helper.cpp b/src/lib/helper.cpp index 83ba046..09a595a 100644 --- a/src/lib/helper.cpp +++ b/src/lib/helper.cpp @@ -1,4 +1,5 @@ #include "helper.hpp" +#include "internal_utils.hpp" using namespace Rcpp; @@ -7,7 +8,7 @@ namespace utils { // Helper function to generate random number using R's RNG // this replaces the previous randWrapper and later use of std::random_shuffle, // as the latter is removed in C++17 and I couldn't figure out an easy replacement. 
-int random_index(const int n) { return static_cast(R::runif(0, 1) * n); } +int random_index(const int n) { return static_cast(rpf_utils::rng_runif01() * n); } // ----------------- functions for converting R and Cpp types ----------------- @@ -70,8 +71,20 @@ std::vector to_std_vec(Rcpp::NumericVector rv) { } std::vector> to_std_vec(Rcpp::NumericMatrix rv) { - std::vector> X; - for(int i=0; i> X((size_t)rows, std::vector((size_t)cols)); + if (rows == 0 || cols == 0) return X; + const double *data = rv.begin(); // column-major, column stride = rows + for (int j = 0; j < cols; ++j) { + const double *colptr = data + (size_t)j * (size_t)rows; + for (int i = 0; i < rows; ++i) { + X[(size_t)i][(size_t)j] = colptr[(size_t)i]; + } + } return X; } @@ -84,4 +97,4 @@ std::set to_std_set(Rcpp::IntegerVector rv) { } -} \ No newline at end of file +} diff --git a/src/lib/internal_utils.cpp b/src/lib/internal_utils.cpp new file mode 100644 index 0000000..9c94b59 --- /dev/null +++ b/src/lib/internal_utils.cpp @@ -0,0 +1,233 @@ +// Internal utilities for split sampling, RNG, caching, and prefix sums shared +// across split modes and both regression/classification flows. +// +// These helpers centralize frequently reused logic and are intentionally kept +// low-level and stateless, using thread-local state only for RNG where needed. 
+#include "internal_utils.hpp" +#include +#include + +namespace { + // Thread-local RNG pointer used in worker threads for reproducible randomness + thread_local std::mt19937_64* tls_rng_ptr = nullptr; +} + +namespace rpf_utils { +void fenwick_add(std::vector &bit, size_t idx1, double delta) +{ + // bit is 1-based; idx1 in [1, bit.size()] + size_t n = bit.size(); + while (idx1 <= n) { bit[idx1 - 1] += delta; idx1 += idx1 & (~idx1 + 1); } +} + +size_t fenwick_find_by_prefix(const std::vector &bit, double target) +{ + // Return smallest i such that sum(i) >= target; 1-based index + size_t n = bit.size(); + size_t idx = 0; double sum = 0.0; + // Largest power of two <= n + size_t step = 1ULL << (63 - __builtin_clzll((unsigned long long)std::max(1, n))); + while (step) { + size_t next = idx + step; if (next <= n) { + double val = bit[next - 1]; + if (sum + val < target) { sum += val; idx = next; } + } + step >>= 1; + } + return std::min(n, idx + 1); +} + +std::mt19937_64* swap_tls_rng(std::mt19937_64* new_ptr) +{ + std::mt19937_64* old = tls_rng_ptr; + tls_rng_ptr = new_ptr; + return old; +} + +double rng_runif01() +{ + if (tls_rng_ptr) { + return std::generate_canonical(*tls_rng_ptr); + } + static thread_local std::mt19937_64 fallback_rng(0x9E3779B97F4A7C15ULL); + return std::generate_canonical(fallback_rng); +} + +double rng_runif(double a, double b) +{ + double u = rng_runif01(); + return a + u * (b - a); +} + +int rng_randint(int left_inclusive, int right_exclusive) +{ + if (right_exclusive <= left_inclusive) return left_inclusive; + if (tls_rng_ptr) { + std::uniform_int_distribution dist(left_inclusive, right_exclusive - 1); + return dist(*tls_rng_ptr); + } + static thread_local std::mt19937_64 fallback_rng(0xD1B54A32D192ED03ULL); + std::uniform_int_distribution dist(left_inclusive, right_exclusive - 1); + return dist(fallback_rng); +} + +void ensure_order_and_sorted_vals_for_leaf( + const std::vector> &X, + Leaf &leaf, + int k, + std::vector &order_out, + 
std::vector &sorted_vals_out) +{ + const size_t m = leaf.individuals.size(); + if (leaf.order_cache.count(k) && leaf.order_cache[k].size() == m) { + order_out = leaf.order_cache[k]; + } else { + order_out.resize(m); + std::iota(order_out.begin(), order_out.end(), 0); + std::stable_sort(order_out.begin(), order_out.end(), [&](size_t a, size_t b){ + return X[leaf.individuals[a]][k] < X[leaf.individuals[b]][k]; + }); + leaf.order_cache[k] = order_out; + } + if (leaf.sorted_vals_cache.count(k) && leaf.sorted_vals_cache[k].size() == m) { + sorted_vals_out = leaf.sorted_vals_cache[k]; + } else { + sorted_vals_out.resize(m); + for (size_t i = 0; i < m; ++i) + sorted_vals_out[i] = X[leaf.individuals[order_out[i]]][k]; + leaf.sorted_vals_cache[k] = sorted_vals_out; + } +} + +std::vector compute_unique_sorted_values(const std::vector &sorted_vals) +{ + std::vector unique; + unique.reserve(sorted_vals.size()); + if (!sorted_vals.empty()) { + unique.push_back(sorted_vals[0]); + for (size_t i = 1; i < sorted_vals.size(); ++i) + if (sorted_vals[i] != unique.back()) unique.push_back(sorted_vals[i]); + } + return unique; +} + +void build_prefix_and_total_given_order( + const std::vector> &Y, + const Leaf &leaf, + const std::vector &order, + size_t value_size, + std::vector> &prefix_out, + std::vector &total_out) +{ + const size_t m = leaf.individuals.size(); + prefix_out.assign(value_size, std::vector(m, 0.0)); + for (size_t p = 0; p < value_size; ++p) { + double acc = 0.0; + for (size_t i = 0; i < m; ++i) { + acc += Y[leaf.individuals[order[i]]][p]; + prefix_out[p][i] = acc; + } + } + total_out.assign(value_size, 0.0); + for (size_t p = 0; p < value_size; ++p) + total_out[p] = prefix_out[p][m - 1]; +} + +void finalize_split_from_sums( + Split &winner, + const std::vector> &X, + size_t value_size) +{ + if (std::isinf(winner.min_sum) || winner.leaf_index == nullptr) return; + const int kfin = winner.split_coordinate - 1; + Leaf &leaf_fin = *winner.leaf_index; + const double sp_fin 
= winner.split_point; + winner.I_s.clear(); winner.I_b.clear(); + for (int ind : leaf_fin.individuals) { + if (X[ind][kfin] < sp_fin) winner.I_s.push_back(ind); else winner.I_b.push_back(ind); + } + winner.M_s.assign(value_size, 0.0); + winner.M_b.assign(value_size, 0.0); + if (!winner.I_s.empty()) for (size_t p = 0; p < value_size; ++p) + winner.M_s[p] = winner.sum_s[p] / static_cast(winner.I_s.size()); + if (!winner.I_b.empty()) for (size_t p = 0; p < value_size; ++p) + winner.M_b[p] = winner.sum_b[p] / static_cast(winner.I_b.size()); +} + +std::vector sample_weighted_indices_filtered( + const std::vector &weights, + size_t n_candidates) +{ + std::vector pos_idx; pos_idx.reserve(weights.size()); + std::vector pos_w; pos_w.reserve(weights.size()); + for (size_t i = 0; i < weights.size(); ++i) if (weights[i] > 0.0) { pos_idx.push_back(i); pos_w.push_back(weights[i]); } + const size_t P = pos_idx.size(); + std::vector sample_idxs; sample_idxs.reserve(n_candidates); + if (P == 0) { + std::vector all(weights.size()); std::iota(all.begin(), all.end(), 0); + size_t k = std::min(n_candidates, all.size()); + for (size_t i = 0; i < k; ++i) { + size_t j = i + static_cast(rng_runif01() * (double)(all.size() - i)); + if (j >= all.size()) j = all.size() - 1; std::swap(all[i], all[j]); + } + for (size_t i = 0; i < k; ++i) sample_idxs.push_back(all[i]); + } else { + size_t k = std::min(n_candidates, P); + std::vector> keys; keys.reserve(P); + for (size_t i = 0; i < P; ++i) { + double u = rng_runif01(); if (u <= 0.0) u = std::numeric_limits::min(); + double key = -std::log(u) / pos_w[i]; keys.emplace_back(key, pos_idx[i]); + } + if (k < keys.size()) { + std::nth_element(keys.begin(), keys.begin() + k, keys.end(), [](const auto& a, const auto& b){ return a.first < b.first; }); + keys.resize(k); + } + for (auto &kv : keys) sample_idxs.push_back(kv.second); + } + return sample_idxs; +} + +std::vector compute_even_spread_indices(int left_inclusive, int right_exclusive, size_t 
max_draws) +{ + std::vector result; + int range = right_exclusive - left_inclusive; if (range <= 0) return result; + size_t draws = std::min(max_draws, static_cast(range)); + if (draws == 0) return result; + result.reserve(draws); + for (size_t j = 1; j <= draws; ++j) { + int pos = left_inclusive + static_cast(std::floor((static_cast(j) * range) / static_cast(draws + 1))); + if (pos < left_inclusive) pos = left_inclusive; + if (pos >= right_exclusive) pos = right_exclusive - 1; + if (!result.empty() && pos <= result.back()) pos = std::min(right_exclusive - 1, result.back() + 1); + result.push_back(pos); + } + return result; +} + +std::vector sample_unique_ints_uniform_R(int left_inclusive, int right_exclusive, size_t k) +{ + std::vector result; int range = right_exclusive - left_inclusive; if (range <= 0) return result; + k = std::min(k, static_cast(range)); + if (k == 0) return result; + if (k * 4 >= static_cast(range)) { + std::vector all(range); std::iota(all.begin(), all.end(), left_inclusive); + for (size_t i = 0; i < k; ++i) { + size_t j = i + static_cast(rng_runif01() * (double)(all.size() - i)); if (j >= all.size()) j = all.size() - 1; + std::swap(all[i], all[j]); + } + result.assign(all.begin(), all.begin() + static_cast(k)); + std::sort(result.begin(), result.end()); + return result; + } + std::unordered_set used; result.reserve(k); + while (result.size() < k) { + int s = rng_randint(left_inclusive, right_exclusive); if (s >= right_exclusive) s = right_exclusive - 1; + if (used.insert(s).second) result.push_back(s); + } + std::sort(result.begin(), result.end()); + return result; +} + +} // namespace rpf_utils + + diff --git a/src/lib/losses_exponential.cpp b/src/lib/losses_exponential.cpp new file mode 100644 index 0000000..68f5cd3 --- /dev/null +++ b/src/lib/losses_exponential.cpp @@ -0,0 +1,94 @@ +// Classification losses: Exponential family variants. Extracted from cpf.cpp. 
+#include "cpf.hpp" + +void ClassificationRPF::exponential_loss(Split &split) +{ + split.min_sum = 0; + split.M_s = std::vector(value_size, 0); + split.M_b = std::vector(value_size, 0); + std::vector W_s_sum(value_size, 0), W_b_sum(value_size, 0), sum_s(value_size, 0), sum_b(value_size, 0); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) W_s_sum[p] += (*split.W)[individual][p]; + for (auto individual : split.I_b) W_b_sum[p] += (*split.W)[individual][p]; + for (auto individual : split.I_s) sum_s[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_s_sum[p]); + for (auto individual : split.I_b) sum_b[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_b_sum[p]); + split.M_s[p] = sum_s[p]; split.M_b[p] = sum_b[p]; + sum_s[p] = std::min(std::max(delta, sum_s[p]), 1 - delta); + sum_b[p] = std::min(std::max(delta, sum_b[p]), 1 - delta); + } + split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); + split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); + double sum_sp = std::min(std::max(delta, split.M_sp), 1 - delta); + double sum_bp = std::min(std::max(delta, split.M_bp), 1 - delta); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_s[p] / sum_sp)); + for (auto individual : split.I_b) split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_b[p] / sum_bp)); + split.min_sum -= W_s_sum[p] + W_b_sum[p]; + } + for (const auto &s : W_s_sum) if (s == 0) split.min_sum = INF; + for (const auto &s : W_b_sum) if (s == 0) split.min_sum = INF; + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + +void ClassificationRPF::exponential_loss_2(Split &split) +{ + split.min_sum = 0; + std::vector W_s_sum(value_size, 0), W_b_sum(value_size, 0), sum_s(value_size, 0), sum_b(value_size, 0), sum_s2(value_size, 
0), sum_b2(value_size, 0); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) W_s_sum[p] += (*split.W)[individual][p]; + for (auto individual : split.I_b) W_b_sum[p] += (*split.W)[individual][p]; + for (auto individual : split.I_s) sum_s[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_s_sum[p]); + for (auto individual : split.I_b) sum_b[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_b_sum[p]); + split.M_s[p] = sum_s[p]; split.M_b[p] = sum_b[p]; + sum_s2[p] = std::max(delta, 1 - sum_s[p]); sum_b2[p] = std::max(delta, 1 - sum_b[p]); + sum_s[p] = std::max(delta, sum_s[p]); sum_b[p] = std::max(delta, sum_b[p]); + } + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_s[p] / sum_s2[p])); + for (auto individual : split.I_b) split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * log(sum_b[p] / sum_b2[p])); + split.min_sum -= W_s_sum[p] + W_b_sum[p]; + } + for (const auto &s : W_s_sum) if (s == 0) split.min_sum = INF; + for (const auto &s : W_b_sum) if (s == 0) split.min_sum = INF; + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + +void ClassificationRPF::exponential_loss_3(Split &split) +{ + split.min_sum = 0; + split.M_s = std::vector(value_size, 0); + split.M_b = std::vector(value_size, 0); + std::vector W_s_sum(value_size, 0), W_b_sum(value_size, 0), sum_s(value_size, 0), sum_b(value_size, 0); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) W_s_sum[p] += (*split.W)[individual][p]; + for (auto individual : split.I_b) W_b_sum[p] += (*split.W)[individual][p]; + for (auto individual : split.I_s) sum_s[p] += (((*split.Y)[individual][p] + 1) / 2) * ((*split.W)[individual][p] / W_s_sum[p]); + for (auto individual : split.I_b) sum_b[p] += (((*split.Y)[individual][p] + 1) / 2) * 
((*split.W)[individual][p] / W_b_sum[p]); + split.M_s[p] = sum_s[p]; split.M_b[p] = sum_b[p]; + sum_s[p] = std::max(delta, sum_s[p]); sum_b[p] = std::max(delta, sum_b[p]); + sum_s[p] = log(sum_s[p]); sum_b[p] = log(sum_b[p]); + } + split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); + split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); + double sum_sp = std::max(delta, split.M_sp), sum_bp = std::max(delta, split.M_bp); + sum_sp = log(sum_sp); sum_bp = log(sum_bp); + sum_sp += std::accumulate(sum_s.begin(), sum_s.end(), 0.0); + sum_bp += std::accumulate(sum_b.begin(), sum_b.end(), 0.0); + sum_sp = sum_sp / (sum_s.size() + 1); sum_bp = sum_bp / (sum_b.size() + 1); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * (sum_s[p] - sum_sp)); + for (auto individual : split.I_b) split.min_sum += (*split.W)[individual][p] * exp(-0.5 * (*split.Y)[individual][p] * (sum_b[p] - sum_bp)); + split.min_sum -= W_s_sum[p] + W_b_sum[p]; + } + for (const auto &s : W_s_sum) if (s == 0) split.min_sum = INF; + for (const auto &s : W_b_sum) if (s == 0) split.min_sum = INF; + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + + diff --git a/src/lib/losses_l1_l2_median.cpp b/src/lib/losses_l1_l2_median.cpp new file mode 100644 index 0000000..ded3202 --- /dev/null +++ b/src/lib/losses_l1_l2_median.cpp @@ -0,0 +1,32 @@ +// Classification losses: L1 and Median. Extracted from cpf.cpp. 
+#include "cpf.hpp" + +void ClassificationRPF::L1_loss(Split &split) +{ + split.min_sum = 0; + split.M_s = split.sum_s / split.I_s.size(); + split.M_b = split.sum_b / split.I_b.size(); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) + split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_s[p]) - std::fabs((*split.Y)[individual][p]); + for (auto individual : split.I_b) + split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_b[p]) - std::fabs((*split.Y)[individual][p]); + } +} + +void ClassificationRPF::median_loss(Split &split) +{ + split.min_sum = 0; + split.M_s = calcMedian(*split.Y, split.I_s); + split.M_b = calcMedian(*split.Y, split.I_b); + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) + split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_s[p]) - std::fabs((*split.Y)[individual][p]); + for (auto individual : split.I_b) + split.min_sum += std::fabs((*split.Y)[individual][p] - split.M_b[p]) - std::fabs((*split.Y)[individual][p]); + } +} + + diff --git a/src/lib/losses_logit.cpp b/src/lib/losses_logit.cpp new file mode 100644 index 0000000..6eccdd7 --- /dev/null +++ b/src/lib/losses_logit.cpp @@ -0,0 +1,130 @@ +// Classification losses: Logit family variants. Extracted from cpf.cpp. 
+#include "cpf.hpp" + +void ClassificationRPF::logit_loss(Split &split) +{ + split.min_sum = 0; + split.M_s = split.sum_s / split.I_s.size(); + split.M_b = split.sum_b / split.I_b.size(); + split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); + split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); + std::vector M_s = split.M_s, M_b = split.M_b; + std::for_each(M_s.begin(), M_s.end(), [this](double &M){ M = std::min(std::max(delta, M), 1 - delta); }); + std::for_each(M_b.begin(), M_b.end(), [this](double &M){ M = std::min(std::max(delta, M), 1 - delta); }); + double M_sp = std::min(std::max(delta, split.M_sp), 1 - delta); + double M_bp = std::min(std::max(delta, split.M_bp), 1 - delta); + std::vector W_s_mean = calcMean(*split.W, split.I_s); + std::vector W_b_mean = calcMean(*split.W, split.I_b); + std::vector> W = *split.W, W_new = *split.W; + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p] + log(M_s[p] / M_sp) - W_s_mean[p]); } + for (auto individual : split.I_b) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p] + log(M_b[p] / M_bp) - W_b_mean[p]); } + } + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); } + for (auto individual : split.I_b) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); } + } 
+ for (auto individual : split.I_s) { split.min_sum += (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); split.min_sum -= (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); } + for (auto individual : split.I_b) { split.min_sum += (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); split.min_sum -= (1 - std::accumulate((*split.Y)[individual].begin(), (*split.Y)[individual].end(), 0.0)) * log(1 / (1 + std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); } + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + +void ClassificationRPF::logit_loss_2(Split &split) +{ + split.min_sum = 0; + split.M_s = split.sum_s / split.I_s.size(); + split.M_b = split.sum_b / split.I_b.size(); + std::vector M_s = split.M_s, M_b = split.M_b; + std::vector M_s2 = split.M_s, M_b2 = split.M_b; + std::for_each(M_s.begin(), M_s.end(), [this](double &M){ M = std::max(delta, M); }); + std::for_each(M_b.begin(), M_b.end(), [this](double &M){ M = std::max(delta, M); }); + std::for_each(M_s2.begin(), M_s2.end(), [this](double &M){ M = std::max(delta, 1 - M); }); + std::for_each(M_b2.begin(), M_b2.end(), [this](double &M){ M = std::max(delta, 1 - M); }); + std::vector W_s_mean = calcMean(*split.W, split.I_s); + std::vector W_b_mean = calcMean(*split.W, split.I_b); + std::vector> W = *split.W, W_new = *split.W; + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p] + log(M_s[p] / M_s2[p]) - W_s_mean[p]); } + for (auto individual : split.I_b) { W[individual][p] = exp(W[individual][p]); 
W_new[individual][p] = exp(W_new[individual][p] + log(M_b[p] / M_b2[p]) - W_b_mean[p]); } + } + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); } + for (auto individual : split.I_b) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); } + } + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + +void ClassificationRPF::logit_loss_3(Split &split) +{ + split.min_sum = 0; + split.M_s = split.sum_s / split.I_s.size(); + split.M_b = split.sum_b / split.I_b.size(); + split.M_sp = 1 - std::accumulate(split.M_s.begin(), split.M_s.end(), 0.0); + split.M_bp = 1 - std::accumulate(split.M_b.begin(), split.M_b.end(), 0.0); + std::vector M_s = split.M_s, M_b = split.M_b; + std::for_each(M_s.begin(), M_s.end(), [this](double &M){ M = std::max(delta, M); }); + std::for_each(M_b.begin(), M_b.end(), [this](double &M){ M = std::max(delta, M); }); + std::for_each(M_s.begin(), M_s.end(), [&](double &M){ M = log(M); }); + std::for_each(M_b.begin(), M_b.end(), [&](double &M){ M = log(M); }); + double M_sp = std::max(delta, split.M_sp); + double M_bp = std::max(delta, split.M_bp); + M_sp = log(M_sp); + M_bp = log(M_bp); + double sum_s = (std::accumulate(M_s.begin(), M_s.end(), 0.0) + M_sp) / (M_s.size() + 1); + double sum_b = (std::accumulate(M_b.begin(), M_b.end(), 0.0) + M_bp) / (M_b.size() + 1); + std::vector W_s_mean = calcMean(*split.W, split.I_s); + std::vector W_b_mean = calcMean(*split.W, split.I_b); + std::vector> W = *split.W, W_new = *split.W; + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { W_new[individual][p] = W_new[individual][p] + M_s[p] - sum_s - W_s_mean[p]; } + for 
(auto individual : split.I_b) { W_new[individual][p] = W_new[individual][p] + M_b[p] - sum_b - W_b_mean[p]; } + } + std::vector W_sp, W_bp, W_sp_new, W_bp_new, Y_sp, Y_bp; + for (auto individual : split.I_s) { W_sp.push_back(-accumulate(W[individual].begin(), W[individual].end(), 0.0)); W_sp_new.push_back(-accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0)); Y_sp.push_back(1 - accumulate(Y[individual].begin(), Y[individual].end(), 0.0)); } + for (auto individual : split.I_b) { W_bp.push_back(-accumulate(W[individual].begin(), W[individual].end(), 0.0)); W_bp_new.push_back(-accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0)); Y_bp.push_back(1 - accumulate(Y[individual].begin(), Y[individual].end(), 0.0)); } + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p]); } + for (auto individual : split.I_b) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p]); } + } + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); } + for (auto individual : split.I_b) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (std::accumulate(W[individual].begin(), W[individual].end(), 0.0))); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (std::accumulate(W_new[individual].begin(), W_new[individual].end(), 0.0))); } + } + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + +void ClassificationRPF::logit_loss_4(Split &split) +{ + split.min_sum = 0; + split.M_s = split.sum_s / split.I_s.size(); + split.M_b = split.sum_b / split.I_b.size(); + 
std::vector M_s = split.M_s, M_b = split.M_b; + std::vector M_s2 = split.M_s, M_b2 = split.M_b; + std::for_each(M_s.begin(), M_s.end(), [this](double &M){ M = std::max(delta, M); }); + std::for_each(M_b.begin(), M_b.end(), [this](double &M){ M = std::max(delta, M); }); + std::for_each(M_s2.begin(), M_s2.end(), [this](double &M){ M = std::max(delta, 1 - M); }); + std::for_each(M_b2.begin(), M_b2.end(), [this](double &M){ M = std::max(delta, 1 - M); }); + std::vector W_s_mean = calcMean(*split.W, split.I_s); + std::vector W_b_mean = calcMean(*split.W, split.I_b); + std::vector> W = *split.W, W_new = *split.W; + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p] + log(M_s[p] / M_s2[p]) - W_s_mean[p]); } + for (auto individual : split.I_b) { W[individual][p] = exp(W[individual][p]); W_new[individual][p] = exp(W_new[individual][p] + log(M_b[p] / M_b2[p]) - W_b_mean[p]); } + } + for (size_t p = 0; p < value_size; ++p) + { + for (auto individual : split.I_s) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); } + for (auto individual : split.I_b) { split.min_sum += (*split.Y)[individual][p] * log(W[individual][p] / (1 + W[individual][p])); split.min_sum -= (*split.Y)[individual][p] * log(W_new[individual][p] / (1 + W_new[individual][p])); } + } + if (std::isnan(split.min_sum)) split.min_sum = INF; +} + + diff --git a/src/lib/predict.cpp b/src/lib/predict.cpp new file mode 100644 index 0000000..a158fd1 --- /dev/null +++ b/src/lib/predict.cpp @@ -0,0 +1,232 @@ +// Prediction entry points split out from rpf.cpp for readability and reuse. 
+#include "rpf.hpp" +#include +#include + +// predict single feature vector +std::vector RandomPlantedForest::predict_single(const std::vector &X, std::set component_index) +{ + std::vector total_res = std::vector(value_size, 0); + + if (!purified) + { + // consider all components + if (component_index == std::set{0}) + { + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) + { + for (auto &leaf : tree.second->leaves) + { + bool valid = true; + for (auto &dim : tree.first) + { + if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) && (leaf.intervals[std::max(0, dim - 1)].second > X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) + { + valid = false; + break; + } + } + if (valid) + { + for (size_t p = 0; p < value_size && p < leaf.value.size(); ++p) + { + total_res[p] += leaf.value[p]; + } + } + } + } + } + } + else + { // choose components for prediction + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) + { + // only consider trees with same dimensions as component_index + if (tree.first != component_index) + continue; + + std::vector dims; + for (auto dim : tree.first) + { + dims.push_back(dim); + } + + for (auto &leaf : tree.second->leaves) + { + bool valid = true; + for (unsigned int i = 0; i < dims.size(); ++i) + { + int dim = dims[i]; + if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[i] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) && (leaf.intervals[std::max(0, dim - 1)].second > X[i] || leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) + { + valid = false; + break; + } + } + if (valid) + { + for (size_t p = 0; p < value_size && p < leaf.value.size(); ++p) + { + total_res[p] += leaf.value[p]; + } + } + } + } + } + } + } + else + { + if 
(component_index == std::set{-1}) + { + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) + { + std::vector leaf_index(tree.first.size(), -1); + if (tree.first == std::set{0}) + { + leaf_index = std::vector(tree.first.size(), 0); + + const auto &vals = tree.second->GridLeaves.values[leaf_index]; + for (size_t p = 0; p < value_size && p < vals.size(); ++p) + { + total_res[p] += vals[p]; + } + } + } + } + } + else if (component_index == std::set{0}) + { + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) + { + std::vector leaf_index(tree.first.size(), -1); + if (tree.first == std::set{0}) + { + leaf_index = std::vector(tree.first.size(), 0); + } + else + { + for (size_t dim_index = 0; dim_index < tree.first.size(); ++dim_index) + { + int dim = 0; + { + auto dim_pnt = tree.first.begin(); + std::advance(dim_pnt, dim_index); + dim = *dim_pnt; + --dim; // convert to 0-based original feature index + } + auto &bounds = tree.second->GridLeaves.lim_list[dim]; + if (bounds.size() < 2) + { + leaf_index[dim_index] = 0; + continue; + } + // Use the original feature index into X, not the position within the tree's dim set + auto it = std::upper_bound(bounds.begin(), bounds.end(), X[dim]); + int c = static_cast(std::distance(bounds.begin(), it)); + leaf_index[dim_index] = std::min(std::max(0, c - 1), (int)bounds.size() - 2); + } + } + for (int &index : leaf_index) index = std::max(0, index); + { + const auto &vals = tree.second->GridLeaves.values[leaf_index]; + for (size_t p = 0; p < value_size && p < vals.size(); ++p) + { + total_res[p] += vals[p]; + } + } + } + } + } + else + { + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) + { + if (tree.first != component_index) + continue; + std::vector leaf_index(tree.first.size(), -1); + if (tree.first == std::set{0}) + { + leaf_index = std::vector(tree.first.size(), 0); + } + else + { + for (size_t dim_index = 0; dim_index < 
tree.first.size(); ++dim_index) + { + int dim = 0; + { + auto dim_pnt = tree.first.begin(); + std::advance(dim_pnt, dim_index); + dim = *dim_pnt; + --dim; // 0-based original feature index for bounds lookup only + } + auto &bounds = tree.second->GridLeaves.lim_list[dim]; + if (bounds.size() < 2) + { + leaf_index[dim_index] = 0; + continue; + } + // For component-specific prediction, X contains only the selected dims in ascending order. + // Use the position within the selected dims (dim_index) to read the value. + auto it = std::upper_bound(bounds.begin(), bounds.end(), X[dim_index]); + int c = static_cast(std::distance(bounds.begin(), it)); + leaf_index[dim_index] = std::min(std::max(0, c - 1), (int)bounds.size() - 2); + } + } + for (int &index : leaf_index) index = std::max(0, index); + { + const auto &vals = tree.second->GridLeaves.values[leaf_index]; + for (size_t p = 0; p < value_size && p < vals.size(); ++p) + { + total_res[p] += vals[p]; + } + } + } + } + } + } + + return total_res / n_trees; +} + +// predict multiple feature vectors +Rcpp::NumericMatrix RandomPlantedForest::predict_matrix(const NumericMatrix &X, const NumericVector components) +{ + std::vector> feature_vec = to_std_vec(X); + std::set component_index = to_std_set(components); + std::vector> predictions; + if (feature_vec.empty()) + throw std::invalid_argument("Feature vector is empty."); + if (component_index == std::set{0} && this->feature_size >= 0 && feature_vec[0].size() != (size_t)this->feature_size) + throw std::invalid_argument("Feature vector has wrong dimension."); + if (component_index != std::set{0} && component_index != std::set{-1} && component_index.size() != feature_vec[0].size()) + throw std::invalid_argument("The input X has the wrong dimension in order to calculate f_i(x)"); + for (auto &vec : feature_vec) + { + predictions.push_back(predict_single(vec, component_index)); + } + return from_std_vec(predictions); +} + +Rcpp::NumericMatrix 
RandomPlantedForest::predict_vector(const NumericVector &X, const NumericVector components) +{ + std::vector feature_vec = to_std_vec(X); + std::set component_index = to_std_set(components); + std::vector> predictions; Rcpp::NumericMatrix res; + if (feature_vec.empty()) { Rcout << "Feature vector is empty." << std::endl; return res; } + if (component_index == std::set{0} && this->feature_size >= 0 && feature_vec.size() != (size_t)this->feature_size) { Rcout << "Feature vector has wrong dimension." << std::endl; return res; } + if (component_index == std::set{0}) { predictions.push_back(predict_single(feature_vec, component_index)); } + else { for (auto vec : feature_vec) predictions.push_back(predict_single(std::vector{vec}, component_index)); } + res = from_std_vec(predictions); return res; +} + diff --git a/src/lib/purify.cpp b/src/lib/purify.cpp new file mode 100644 index 0000000..db0a4da --- /dev/null +++ b/src/lib/purify.cpp @@ -0,0 +1,1297 @@ +#include "rpf.hpp" +#include "kdtree.hpp" +#include "diffbuf.hpp" +#include +#include +#include + +// Generates the next combination of k indices from a set of n elements. 
+static inline bool next_combination(std::vector &p, int n) +{ + int k = (int)p.size(); + for (int i = k - 1; i >= 0; --i) + { + if (p[i] < n - k + i) + { + p[i]++; + for (int j = i + 1; j < k; ++j) + { + p[j] = p[j - 1] + 1; + } + return true; + } + } + return false; +} + +void RandomPlantedForest::purify_1() +{ + + // go through all n_trees families + for (auto &curr_family : this->tree_families) + { + + // recap maximum number of dimensions of current family + unsigned int curr_max = 0; + for (auto tree : curr_family) + { + if (tree.first.size() > curr_max) + curr_max = tree.first.size(); + } + + while (curr_max >= 1) + { + + // go through split dimensions of all trees + auto keys = getKeys(curr_family); + std::vector>::reverse_iterator key = keys.rbegin(); + while (key != keys.rend()) + { + + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + + // check if number of dims same as current max_interaction + if (curr_dims.size() == curr_max) + { + + // go through feature dims + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + + // continue only if dim in current tree + if (curr_tree->split_dims.count(feature_dim) != 0) + { + + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree + + // check if tree with dimensions exists, if not create + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (curr_max == 1) + { + tree = curr_family[std::set{0}]; + } + else + { + if (!tree) + { + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + tree = curr_family[tree_dims]; + } + } + + // go through leaves of current tree + int n_leaves = curr_tree->leaves.size(); + for (int l = 0; l < n_leaves; ++l) + { + auto &curr_leaf = curr_tree->leaves[l]; + + double multiplier = (curr_leaf.intervals[feature_dim - 1].second - curr_leaf.intervals[feature_dim - 1].first) / 
(upper_bounds[feature_dim - 1] - lower_bounds[feature_dim - 1]); + + // new leaf including intervals and value + Leaf new_leaf = curr_leaf; // initialize intervals with first leaf + new_leaf.intervals[feature_dim - 1].first = lower_bounds[feature_dim - 1]; + new_leaf.intervals[feature_dim - 1].second = upper_bounds[feature_dim - 1]; + for (size_t i = 0; i < value_size; ++i) + new_leaf.value[i] = -curr_leaf.value[i] * multiplier; // update value of new leaf + + // append new leaf + if (!leafExists(new_leaf.intervals, curr_tree)) + curr_tree->leaves.push_back(new_leaf); + for (size_t i = 0; i < value_size; ++i) + new_leaf.value[i] = curr_leaf.value[i] * multiplier; // update value of new leaf + if (!leafExists(new_leaf.intervals, tree)) + tree->leaves.push_back(new_leaf); + } + } + } + } + key++; + } + + // update currently considered dimension size + --curr_max; + } + } + + purified = true; +} + + +void RandomPlantedForest::purify_fast_exact_family(TreeFamily &curr_family, int maxp_interaction) +{ + // Normalize cap: treat 0 (or out-of-range) as full order p = feature_size + if (maxp_interaction <= 0 || maxp_interaction > feature_size) maxp_interaction = feature_size; + + // Portable 32-bit popcount to avoid compiler-specific builtins + auto popcount32 = [](unsigned int x) -> int { + x = x - ((x >> 1) & 0x55555555u); + x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u); + return (int)((((x + (x >> 4)) & 0x0F0F0F0Fu) * 0x01010101u) >> 24); + }; + auto nextDown = [](double x) { return std::nextafter(x, -std::numeric_limits::infinity()); }; + + // 0) Ensure all subset components exist in the family (sources and targets) + { + auto base_keys = getKeys(curr_family); + for (const auto &T : base_keys) { + if (T == std::set{0}) continue; + std::vector dims; dims.reserve(T.size()); + for (int d : T) dims.push_back(d); + int k = (int)dims.size(); + for (int mask = 1; mask < (1 << k); ++mask) { + if (maxp_interaction > 0) { + int bits = popcount32((unsigned)mask); + if (bits > 
maxp_interaction) continue; + } + std::set S; + for (int b = 0; b < k; ++b) if (mask & (1 << b)) S.insert(dims[b]); + if (curr_family.find(S) == curr_family.end()) { + curr_family.insert({S, std::make_shared(DecisionTree(S))}); + } + } + } + if (curr_family.find(std::set{0}) == curr_family.end()) { + curr_family.insert({std::set{0}, std::make_shared(DecisionTree(std::set{0}))}); + } + } + + // 1) Build lim_list (unique cut endpoints per feature) + std::vector> lim_list(feature_size); + for (int d = 1; d <= feature_size; ++d) { + std::vector bounds; + for (const auto &kv : curr_family) { + if (!kv.first.count(d)) continue; + for (const auto &leaf : kv.second->leaves) { + bounds.push_back(leaf.intervals[d - 1].first); + bounds.push_back(leaf.intervals[d - 1].second); + } + } + std::sort(bounds.begin(), bounds.end()); + bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); + lim_list[d - 1] = bounds; + } + + // Precompute number of cells per feature (endpoints - 1), clamped at 0 + std::vector cells_by_dim(feature_size + 1, 0); + for (int d = 1; d <= feature_size; ++d) cells_by_dim[d] = std::max(0, (int)lim_list[d - 1].size() - 1); + + // 2) Prepare per-S diff buffers (emit only S with |S|<=maxp; keep intercept) + auto keys = getKeys(curr_family); + std::vector> S_vars; S_vars.reserve(keys.size()); + std::vector>> diff_S; diff_S.reserve(keys.size()); + std::vector intercept(value_size, 0.0); + for (const auto &S : keys) { + if (S != std::set{0} && maxp_interaction > 0 && (int)S.size() > maxp_interaction) continue; + S_vars.push_back(S); + if (S == std::set{0}) diff_S.emplace_back(rpf_diff::NDArray>(std::vector{1}, std::vector(value_size, 0))); + else { + std::vector diff_dims; diff_dims.reserve(S.size()); + for (int d : S) { int K = (int)lim_list[d - 1].size(); int cells = std::max(0, K - 1); diff_dims.push_back(cells + 1); } + diff_S.emplace_back(rpf_diff::NDArray>(diff_dims, std::vector(value_size, 0))); + } + } + + std::map, int, utils::setComp> 
s_index_map; for (size_t i = 0; i < S_vars.size(); ++i) s_index_map[S_vars[i]] = (int)i; + auto set_to_vec = [](const std::set &S){ std::vector v; v.reserve(S.size()); for (int x : S) v.push_back(x); return v; }; + + // 3) KD-tree over all samples + std::vector all_idx(sample_size); for (int i = 0; i < sample_size; ++i) all_idx[i] = i; + rpf_kd::KDTree kdt(&X, all_idx, feature_size); + + // Precompute tot(U) with half-open domain [front, back) + std::map, double, utils::setComp> tot_cache; + auto get_tot_for_U = [&](const std::set& U)->double { + auto it = tot_cache.find(U); if (it != tot_cache.end()) return it->second; + for (int u : U) if ((int)lim_list[u - 1].size() < 2) { tot_cache.insert({U, 0.0}); return 0.0; } + std::vector consU; consU.reserve(U.size()); + for (int u : U) { const auto &lims = lim_list[u - 1]; double lo = lims.front(); double hi = nextDown(lims.back()); consU.push_back({u - 1, lo, hi}); } + size_t cnt = consU.empty() ? (size_t)sample_size : kdt.range_count(consU); + double tot = (double)cnt; tot_cache.insert({U, tot}); return tot; + }; + + // Exact cache for KD range_count queries keyed by (dim, lo_idx, hi_idx) triples per constrained dim + // Key construction: 64-bit hash mixed from ordered triples to avoid building strings/sets + auto mix64 = [](unsigned long long x){ + x ^= x >> 33; x *= 0xff51afd7ed558ccdULL; x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL; x ^= x >> 33; return x; + }; + auto pack3 = [&](unsigned long long acc, int d, int lo, int hi){ + unsigned long long k = ((unsigned long long)(unsigned int)d << 32) ^ ((unsigned long long)(unsigned int)lo << 16) ^ (unsigned long long)(unsigned int)hi; + acc ^= mix64(k + 0x9e3779b97f4a7c15ULL + (acc<<6) + (acc>>2)); + return acc; + }; + std::unordered_map kd_cache; kd_cache.reserve(1u << 15); + + // 4) Accumulate leaf contributions from ALL trees T (any order), enumerating only S up to maxp + for (const auto &kv : curr_family) { + const std::set &T = kv.first; if (T == std::set{0}) continue; 
+ const auto &leaves = kv.second->get_leaves(); + std::vector Tvec = set_to_vec(T); const int tdim = (int)Tvec.size(); + // map from dimension id -> position index in Tvec + std::vector pos_in_T(feature_size + 1, -1); + for (int i = 0; i < tdim; ++i) pos_in_T[Tvec[i]] = i; + + for (const auto &leaf : leaves) { + // Pre-cache per-dim grid cell ranges for this leaf + std::vector lo_cached(feature_size + 1, 0), hi_cached(feature_size + 1, 0); + for (int d : T) { + const auto &lims = lim_list[d - 1]; int cells = std::max(0, (int)lims.size() - 1); + int k_low = (int)(std::lower_bound(lims.begin(), lims.end(), leaf.intervals[d - 1].first) - lims.begin()); + int ub = (int)(std::upper_bound(lims.begin(), lims.end(), leaf.intervals[d - 1].second) - lims.begin()); + int k_high_cell = std::max(0, ub - 2); + lo_cached[d] = std::max(0, k_low); + hi_cached[d] = std::min(cells, k_high_cell + 1); + } + + // Precompute per-dimension KD constraints for this leaf + std::vector rc_by_dim(feature_size + 1); + std::vector rc_ok(feature_size + 1, 0); + // Also store lim_list boundary indices for exact caching + std::vector rc_lo_idx(feature_size + 1, -1); + std::vector rc_hi_idx(feature_size + 1, -1); + for (int d : T) { + const auto &lims = lim_list[d - 1]; + if ((int)lims.size() < 2) { rc_ok[d] = 0; continue; } + double l = std::max(leaf.intervals[d - 1].first, lims.front()); + double r = std::min(leaf.intervals[d - 1].second, lims.back()); + double hi = nextDown(r); + if (!(hi >= l)) { rc_ok[d] = 0; } + else { + rc_by_dim[d] = {d - 1, l, hi}; rc_ok[d] = 1; + int lidx = (int)(std::lower_bound(lims.begin(), lims.end(), l) - lims.begin()); + int ridx = (int)(std::lower_bound(lims.begin(), lims.end(), r) - lims.begin()); + rc_lo_idx[d] = lidx; rc_hi_idx[d] = ridx; + } + } + + // Precompute E[f_T | X_j] only for j with |j| <= maxp_interaction by enumerating combinations + // Store by mask over positions in T (0..tdim-1) to avoid building a full 2^tdim table + std::unordered_map> 
contrib_by_mask; + contrib_by_mask.reserve(32u); + std::vector cons; cons.reserve((size_t)tdim); + + auto compute_for_j = [&](const std::vector &j_pos){ + // Build complement U = T \ j and corresponding KD constraints + std::set U; + U.clear(); + int jmask = 0; + std::vector is_in_j((size_t)tdim, 0); + for (int pos : j_pos) { if (pos >= 0 && pos < tdim) { is_in_j[(size_t)pos] = 1; jmask |= (1 << pos); } } + for (int b = 0; b < tdim; ++b) if (!is_in_j[(size_t)b]) U.insert(Tvec[b]); + cons.clear(); cons.reserve(U.size()); + bool empty_range = false; + // Build exact cache key from ordered (dim, lo_idx, hi_idx) + unsigned long long key = 1469598103934665603ULL; // FNV offset basis-ish seed + for (int u : U) { + if (!rc_ok[u]) { empty_range = true; break; } + cons.push_back(rc_by_dim[u]); + key = pack3(key, u - 1, rc_lo_idx[u], rc_hi_idx[u]); + } + size_t cnt = 0; + if (empty_range) cnt = 0; + else if (cons.empty()) cnt = (size_t)sample_size; + else { + auto kIt = kd_cache.find(key); + if (kIt != kd_cache.end()) cnt = kIt->second; + else { cnt = kdt.range_count(cons); kd_cache.emplace(key, cnt); } + } + double totU = get_tot_for_U(U); if (totU <= 0.0) return; + contrib_by_mask[jmask] = ((double)cnt / totU) * leaf.value; + }; + + // j size = 0 + compute_for_j(std::vector{}); + // j sizes 1..min(tdim, maxp_interaction) + int maxk = std::min(tdim, maxp_interaction); + for (int k = 1; k <= maxk; ++k) { + std::vector p(k); for (int i = 0; i < k; ++i) p[i] = i; + do { compute_for_j(p); } while (next_combination(p, tdim)); + } + + // Efficiently iterate directly over target subsets S up to size maxp_interaction + for (int k = 0; k <= std::min(tdim, maxp_interaction); ++k) { + std::vector p(k); for (int i = 0; i < k; ++i) p[i] = i; + if (k == 0) { + // j = {} corresponds to mask 0 + auto it0 = contrib_by_mask.find(0); + if (it0 != contrib_by_mask.end()) intercept += it0->second; + } else { + do { + std::set S; for (int idx : p) S.insert(Tvec[idx]); + // Inclusion-exclusion 
over all j subset S, writing per-term rectangles + auto itS = s_index_map.find(S); if (itS == s_index_map.end()) { /* nothing to write */ } + else { + int s_idx = itS->second; + std::vector Svec = set_to_vec(S); const int s_dim = (int)Svec.size(); + for (int sm = 0; sm < (1 << s_dim); ++sm) { + int jmask_on_T = 0; int jcount = 0; + for (int b = 0; b < s_dim; ++b) { + if (sm & (1 << b)) { ++jcount; int d = Svec[b]; int pos = pos_in_T[d]; if (pos >= 0) jmask_on_T |= (1 << pos); } + } + auto jit = contrib_by_mask.find(jmask_on_T); + if (jit == contrib_by_mask.end()) continue; + const std::vector &contrib_j = jit->second; + int sign_flip = ((int)S.size() - jcount) % 2; + std::vector signed_contrib = sign_flip ? (contrib_j * (-1)) : contrib_j; + // Build rectangle: restrict dims in j to leaf's range; others span entire domain + std::vector lo; lo.reserve(S.size()); std::vector hi; hi.reserve(S.size()); + for (int di = 0; di < s_dim; ++di) { + int d = Svec[di]; + if (sm & (1 << di)) { lo.push_back(lo_cached[d]); hi.push_back(hi_cached[d]); } + else { lo.push_back(0); hi.push_back(cells_by_dim[d]); } + } + rpf_diff::add_rect(diff_S[s_idx], lo, hi, signed_contrib); + } + } + } while (next_combination(p, tdim)); + } + } + } + } + + // 5) Finalize per S + for (size_t i = 0; i < S_vars.size(); ++i) { + const auto &S = S_vars[i]; LeafGrid gl; gl.lim_list = lim_list; + if (S == std::set{0}) { + gl.grid = grid::NDGrid(); gl.values = utils::Matrix>(std::vector{1}, std::vector(value_size, 0)); gl.individuals = utils::Matrix(std::vector{1}, 0); + std::vector idx0{0}; gl.values[idx0] = intercept; + } else { + std::vector dims_end; std::vector cells_dims; for (int d : S) { int K = (int)lim_list[d - 1].size(); dims_end.push_back(std::max(1, K)); cells_dims.push_back(std::max(0, K - 1)); } + rpf_diff::inclusive_scan_inplace(diff_S[i]); gl.grid = grid::NDGrid(dims_end); + gl.values = utils::Matrix>(dims_end, std::vector(value_size, 0)); gl.individuals = utils::Matrix(dims_end, 0); + 
auto g = grid::NDGrid(cells_dims); while (!g.nextPoint()) { auto point = g.getPoint(); gl.values[point] = diff_S[i].at(point); } + } + curr_family[S]->GridLeaves = gl; + } + + // 6) Overwrite high orders with zeros if capped + if (maxp_interaction > 0) { + for (const auto &S : keys) { + if (S == std::set{0} || (int)S.size() <= maxp_interaction) continue; + LeafGrid gl; gl.lim_list = lim_list; std::vector dims_end; for (int d : S) { int K = (int)lim_list[d - 1].size(); dims_end.push_back(std::max(1, K)); } + gl.grid = grid::NDGrid(dims_end); gl.values = utils::Matrix>(dims_end, std::vector(value_size, 0)); gl.individuals = utils::Matrix(dims_end, 0); + curr_family[S]->GridLeaves = gl; + } + } +} + + + + + +// Unified purifier entry: mode 1 = grid path, mode 2 = fast exact path +void RandomPlantedForest::purify(int maxp_interaction, int nthreads_param, int mode) +{ + // Determine threads: if user provided >0, use it; otherwise default to + // min(object-configured nthreads, hardware concurrency) + unsigned int threads_to_use = 0; + if (nthreads_param > 0) { + threads_to_use = static_cast(nthreads_param); + } else { + unsigned int avail = std::thread::hardware_concurrency(); + unsigned int obj = static_cast(std::max(1, nthreads)); + unsigned int eff_avail = (avail > 0 ? 
avail : 1u); + threads_to_use = std::min(obj, eff_avail); + } + + auto worker = [this, maxp_interaction, mode](TreeFamily &fam){ + if (mode == 2) this->purify_fast_exact_family(fam, maxp_interaction); + else this->purify_3_family(fam, maxp_interaction); + }; + + if (threads_to_use > 1) + { + unsigned int avail = std::thread::hardware_concurrency(); + if (avail > 0 && threads_to_use > avail) + { + Rcout << "Requested " << threads_to_use << " threads but only " << avail << " available" << std::endl; + } + for (size_t start = 0; start < this->tree_families.size(); start += (size_t)threads_to_use) + { + size_t batch = std::min((size_t)threads_to_use, this->tree_families.size() - start); + if (batch == 0) break; + std::vector threads(batch); + for (size_t i = 0; i < batch; ++i) + { + size_t fam_index = start + i; + threads[i] = std::thread([&worker](TreeFamily *fam_ptr){ worker(*fam_ptr); }, &this->tree_families[fam_index]); + } + for (auto &th : threads) + { + if (th.joinable()) th.join(); + } + } + purified = true; + return; + } + + for (auto &fam : this->tree_families) worker(fam); + purified = true; +} + + + +// Purify a single family, but only materialize outputs up to maxp_interaction. +// Higher-order trees (|dims| > maxp_interaction) are left with zero-valued grids, +// but are still used as sources during purification so that lower-order components +// are computed correctly. 
+void RandomPlantedForest::purify_3_family(TreeFamily &curr_family, int maxp_interaction) +{ + // Normalize cap: treat 0 (or out-of-range) as full order p = feature_size + if (maxp_interaction <= 0 || maxp_interaction > feature_size) maxp_interaction = feature_size; + + // lim_list is a list giving for each variable all interval end-points + std::vector> lim_list(feature_size); + + // go through all variables of the component + for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) + { + std::vector bounds; + + // go through trees of family + for (const auto &curr_tree : curr_family) + { + // consider only relevant trees that have current dimension as variable + if (!curr_tree.first.count(curr_dim)) + continue; + // go through leaves of tree + for (const auto &curr_leaf : curr_tree.second->leaves) + { + // get interval ends of variable + bounds.push_back(curr_leaf.intervals[curr_dim - 1].first); + bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); + } + } + std::sort(bounds.begin(), bounds.end()); + bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); + lim_list[curr_dim - 1] = bounds; + } + + // Precompute per-sample bin indices for each feature based on lim_list + // -1 means the sample falls outside the covered bounds for that feature + std::vector> sample_bins; + if (sample_size > 0 && feature_size > 0) + { + sample_bins.assign(sample_size, std::vector(feature_size, -1)); + for (int s = 0; s < sample_size; ++s) + { + const auto &xrow = X[s]; + for (int d = 1; d <= feature_size; ++d) + { + const auto &lims = lim_list[d - 1]; + if (lims.empty()) continue; + const double val = xrow[d - 1]; + auto it = std::upper_bound(lims.begin(), lims.end(), val); + int pos = static_cast(it - lims.begin()); + if (pos == 0 || pos >= static_cast(lims.size())) + { + sample_bins[s][d - 1] = -1; // outside + } + else + { + sample_bins[s][d - 1] = pos - 1; // interval index in [0, lims.size()-2] + } + } + } + } + + // initialize values and individuals 
for each tree in family + std::vector grids(curr_family.size() - 1); + std::vector> individuals(curr_family.size() - 1); + std::vector>> values(curr_family.size() - 1); + std::vector>> values_old(curr_family.size() - 1); + std::vector> variables(curr_family.size() - 1); + + // ------------- setup finer grid ------------- + int tree_index = 0; + for (const auto &curr_tree : curr_family) + { + if (curr_tree.first == std::set{0}) + { + continue; // ignore null tree + } + + // fill space with dimensions + std::vector dimensions; + dimensions.reserve(curr_tree.first.size()); + for (const auto &dim : curr_tree.first) + { + dimensions.push_back(lim_list[dim - 1].size()); + } + + // setup grid for leaf indices + auto grid = grid::NDGrid(dimensions); + + // initialize data for current tree + grids[tree_index] = grid; + individuals[tree_index] = utils::Matrix(dimensions, 0); + values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); + values_old[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); + variables[tree_index] = curr_tree.first; + + // 1) Fill individuals using precomputed sample bins + if (!curr_tree.first.empty()) + { + std::vector point; point.reserve(curr_tree.first.size()); + for (int s = 0; s < sample_size; ++s) + { + point.clear(); bool outside = false; + for (const auto &dim : curr_tree.first) + { + int b = sample_bins.empty() ? 
-1 : sample_bins[s][dim - 1]; + if (b < 0) { outside = true; break; } + point.push_back(b); + } + if (!outside) { individuals[tree_index][point] += 1; } + } + } + + // 2) Values accumulation: leaf-centric rectangular updates over the grid + if (!curr_tree.first.empty()) + { + const size_t nd = curr_tree.first.size(); + // For each leaf, determine covered index ranges along each dim, then add leaf.value to all covered grid cells + for (const auto &leaf : curr_tree.second->get_leaves()) + { + std::vector start(nd, 0), stop(nd, -1); + size_t idx_dim = 0; + bool empty = false; + for (const auto &dim : curr_tree.first) + { + const auto &lims = lim_list[dim - 1]; + const int dim_len = static_cast(grids[tree_index].dimensions[idx_dim]); + const int cell_max = (dim_len >= 2) ? (dim_len - 2) : -1; + const double left = leaf.intervals[dim - 1].first; + const double right = leaf.intervals[dim - 1].second; + int k_low = static_cast(std::lower_bound(lims.begin(), lims.end(), left) - lims.begin()); + int ub = static_cast(std::upper_bound(lims.begin(), lims.end(), right) - lims.begin()); + int k_high = ub - 2; // we need lims[k+1] <= right + if (k_low < 0) k_low = 0; + if (k_high > cell_max) k_high = cell_max; + if (k_low > k_high) { empty = true; break; } + start[idx_dim] = k_low; + stop[idx_dim] = k_high; + ++idx_dim; + } + if (empty) continue; + + // Iterate over cartesian product of [start[d], stop[d]] for all dims d + std::vector gridPoint = start; + while (true) + { + values[tree_index][gridPoint] += leaf.value; + values_old[tree_index][gridPoint] += leaf.value; + // increment like odometer + if (nd == 0) break; + size_t pos = nd; + while (pos > 0) + { + --pos; + if (gridPoint[pos] < stop[pos]) { ++gridPoint[pos]; break; } + gridPoint[pos] = start[pos]; + } + if (pos == 0 && gridPoint[pos] == start[pos]) break; // finished full cycle + } + } + } + + ++tree_index; + } + + // ------------- create new trees ------------- + grids.insert(grids.begin(), grid::NDGrid()); + 
values.insert(values.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); + values_old.insert(values_old.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); + individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); + variables.insert(variables.begin(), std::set{0}); + + unsigned int curr_max = curr_family.rbegin()->first.size(); + while (curr_max > 1) + { + auto keys = getKeys(curr_family); + for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) + { + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + if (curr_dims.size() == curr_max) + { + int dim_index2 = 0; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + if (curr_tree->split_dims.count(feature_dim) != 0) + { + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (!tree) + { + auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + auto tree_index2 = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); + std::vector matrix_dimensions = values[old_tree_index].dims; + matrix_dimensions.erase(matrix_dimensions.begin() + dim_index2); + auto grid = grid::NDGrid(matrix_dimensions); + grids.insert(grids.begin() + tree_index2, grid); + values.insert(values.begin() + tree_index2, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + values_old.insert(values_old.begin() + tree_index2, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + individuals.insert(individuals.begin() + tree_index2, utils::Matrix(matrix_dimensions)); + variables.insert(variables.begin() + tree_index2, tree_dims); + // fill individuals of new trees using precomputed sample bins + if (!tree_dims.empty()) + { + 
std::vector point2; point2.reserve(tree_dims.size()); + for (int s = 0; s < sample_size; ++s) + { + point2.clear(); bool outside2 = false; + for (const auto &dim2 : tree_dims) + { + int b2 = sample_bins.empty() ? -1 : sample_bins[s][dim2 - 1]; + if (b2 < 0) { outside2 = true; break; } + point2.push_back(b2); + } + if (!outside2) { individuals[tree_index2][point2] += 1; } + } + } + } + dim_index2++; + } + } + } + } + --curr_max; + } + + // ------------- purify ------------- + std::vector> dim_to_pos(variables.size(), std::vector(feature_size + 1, -1)); + for (size_t idx = 0; idx < variables.size(); ++idx) + { + int pos = 0; + for (const auto dim : variables[idx]) + { + if (dim >= 0 && dim <= feature_size) dim_to_pos[idx][dim] = pos++; + } + } + + std::vector total_individuals(variables.size(), 0.0); + for (size_t idx = 0; idx < variables.size(); ++idx) + { + double tot = 0.0; + if (variables[idx] == std::set{0}) + { + std::vector only{0}; + tot += individuals[idx][only]; + } + else + { + auto grid_sum = grids[idx]; + while (!grid_sum.nextPoint()) + { + auto gp = grid_sum.getPoint(); + tot += individuals[idx][gp]; + } + } + total_individuals[idx] = tot; + } + + int tree_index_t = curr_family.size() - 1; + for (auto tree_t = variables.rbegin(); tree_t != variables.rend(); ++tree_t) + { + std::set curr_dims = *tree_t; + if (curr_dims == std::set{0}) + continue; + + auto grid = grids[tree_index_t]; + int tree_index_u = variables.size(); + for (auto tree_u = variables.rbegin(); tree_u != variables.rend(); ++tree_u) + { + --tree_index_u; + std::set j_dims = curr_dims; + if (tree_u->size() > curr_dims.size()) + continue; + bool subset = true; + for (const auto dim : *tree_u) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + j_dims.erase(dim); + } + if (!subset) + continue; + + double tot_sum = total_individuals[tree_index_u]; + if (tot_sum == 0.0) + continue; + const double inv_tot_sum = 1.0 / tot_sum; + + grid = grids[tree_index_u]; + std::vector 
update(value_size, 0); + + if (j_dims.size() == 0) + { + while (!grid.nextPoint()) + { + auto gridPoint_i = grid.getPoint(); + double curr_sum = individuals[tree_index_u][gridPoint_i]; + update += (curr_sum * inv_tot_sum) * values_old[tree_index_t][gridPoint_i]; + } + + int tree_index_s = variables.size(); + for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) + { + --tree_index_s; + if (*tree_s == std::set{0}) + { + auto gridPoint_0 = std::vector{0}; + values[tree_index_s][gridPoint_0] += update; + } + else + { + bool subset2 = true; + for (const auto dim : *tree_s) + { + if (tree_t->count(dim) == 0) + { + subset2 = false; + break; + } + } + if (!subset2) + continue; + if (maxp_interaction > 0 && tree_s->size() > (size_t)maxp_interaction) continue; // skip materializing > cap + auto grid_k = grids[tree_index_s]; + while (!grid_k.nextPoint()) + { + auto gridPoint_k = grid_k.getPoint(); + int sign0 = ((*tree_s).size() % 2 == 0) ? 1 : -1; + values[tree_index_s][gridPoint_k] += sign0 * update; + } + } + } + } + else + { + std::vector j_sizes(j_dims.size(), 0); + for (size_t j = 0; j < j_dims.size(); ++j) + { + auto tmp = j_dims.begin(); + std::advance(tmp, j); + int j_index = dim_to_pos[tree_index_t][*tmp]; + j_sizes[j] = grids[tree_index_t].dimensions[j_index]; + } + grid::NDGrid grid_j = grid::NDGrid(j_sizes); + while (!grid_j.nextPoint()) + { + std::vector update(value_size, 0); + auto gridPoint_j = grid_j.getPoint(); + grid = grids[tree_index_u]; + while (!grid.nextPoint()) + { + auto gridPoint_i = grid.getPoint(); + double curr_sum = individuals[tree_index_u][gridPoint_i]; + std::vector gridPoint_ij(tree_t->size(), 0); + for (size_t j = 0; j < gridPoint_j.size(); ++j) + { + auto j_dim = j_dims.begin(); + std::advance(j_dim, j); + int j_index = dim_to_pos[tree_index_t][*j_dim]; + gridPoint_ij[j_index] = gridPoint_j[j]; + } + for (size_t i = 0; i < gridPoint_i.size(); ++i) + { + auto i_dim = tree_u->begin(); + std::advance(i_dim, i); + int 
i_index = dim_to_pos[tree_index_t][*i_dim]; + gridPoint_ij[i_index] = gridPoint_i[i]; + } + update += (curr_sum * inv_tot_sum) * values_old[tree_index_t][gridPoint_ij]; + } + + int tree_index_s = variables.size(); + for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) + { + --tree_index_s; + bool subset2 = true; + for (const auto dim : j_dims) + { + if (tree_s->count(dim) == 0) + { + subset2 = false; + break; + } + } + for (const auto dim : *tree_s) + { + if (tree_t->count(dim) == 0) + { + subset2 = false; + break; + } + } + if (!subset2) + continue; + // Skip writing for components above the cap + if (maxp_interaction > 0 && tree_s->size() > (size_t)maxp_interaction) + continue; + + std::set k_dims = *tree_s; + std::set k_dims_h1 = *tree_s; + std::set k_dims_h2 = *tree_u; + for (const auto dim : *tree_u) + k_dims.insert(dim); + for (const auto dim : *tree_s) + k_dims_h2.erase(dim); + for (const auto dim : *tree_u) + k_dims_h1.erase(dim); + for (const auto dim : k_dims_h1) + k_dims.erase(dim); + for (const auto dim : k_dims_h2) + k_dims.erase(dim); + + if (k_dims.size() == 0) + { + size_t diff = (*tree_s).size() - j_dims.size(); + int sign = (diff % 2 == 0) ? 
1 : -1; + values[tree_index_s][gridPoint_j] += sign * update; + } + else + { + std::vector k_sizes(k_dims.size(), 0); + for (size_t k = 0; k < k_dims.size(); ++k) + { + auto tmp = k_dims.begin(); + std::advance(tmp, k); + int k_index = dim_to_pos[tree_index_t][*tmp]; + k_sizes[k] = grids[tree_index_t].dimensions[k_index]; + } + grid::NDGrid grid_k = grid::NDGrid(k_sizes); + while (!grid_k.nextPoint()) + { + auto gridPoint_k = grid_k.getPoint(); + std::vector gridPoint_jk(tree_s->size(), 0); + for (size_t j = 0; j < gridPoint_j.size(); ++j) + { + auto j_dim = j_dims.begin(); + std::advance(j_dim, j); + int j_index = dim_to_pos[tree_index_s][*j_dim]; + gridPoint_jk[j_index] = gridPoint_j[j]; + } + for (size_t k = 0; k < gridPoint_k.size(); ++k) + { + auto k_dim = k_dims.begin(); + std::advance(k_dim, k); + int k_index = dim_to_pos[tree_index_s][*k_dim]; + gridPoint_jk[k_index] = gridPoint_k[k]; + } + size_t diff = (*tree_s).size() - j_dims.size(); + int sign2 = (diff % 2 == 0) ? 1 : -1; + values[tree_index_s][gridPoint_jk] += sign2 * update; + } + } + } + } + } + } + --tree_index_t; + } + + // ------------- attach to rpf class ------------- + for (size_t tree_index3 = 0; tree_index3 < variables.size(); ++tree_index3) + { + LeafGrid curr_gridLeaf; + curr_gridLeaf.grid = grids[tree_index3]; + curr_gridLeaf.individuals = individuals[tree_index3]; + curr_gridLeaf.lim_list = lim_list; + // If this tree exceeds the cap, attach a zero-valued matrix of the correct shape + if (maxp_interaction > 0 && variables[tree_index3] != std::set{0} && variables[tree_index3].size() > (size_t)maxp_interaction) + { + curr_gridLeaf.values = utils::Matrix>(grids[tree_index3].dimensions, std::vector(value_size, 0)); + } + else + { + curr_gridLeaf.values = values[tree_index3]; + } + curr_family[variables[tree_index3]]->GridLeaves = curr_gridLeaf; + } +} + + + + +void RandomPlantedForest::purify_2() +{ + + // go through all n_trees families + for (auto &curr_family : this->tree_families) + { + 
+ // lim_list is a list giving for each variable all interval end-points + std::vector> lim_list(feature_size); + + // go through all variables of the component + for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) + { + std::vector bounds; + + // go through trees of family + for (const auto &curr_tree : curr_family) + { + + // consider only relevant trees that have current dimension as variable + if (!curr_tree.first.count(curr_dim)) + continue; + + // go through leaves of tree + for (const auto &curr_leaf : curr_tree.second->leaves) + { + // get interval ends of variable + bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); + } + } + std::sort(bounds.begin(), bounds.end()); + bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); + lim_list[curr_dim - 1] = bounds; + } + + // initialize values and individuals for each tree in family + std::vector grids(curr_family.size() - 1); + std::vector> individuals(curr_family.size() - 1); + std::vector>> values(curr_family.size() - 1); + std::vector> variables(curr_family.size() - 1); + + // ------------- setup finer grid ------------- + + int tree_index = 0; + for (const auto &curr_tree : curr_family) + { + + if (curr_tree.first == std::set{0}) + continue; // ignore null tree + + // fill space with dimensions + std::vector dimensions; + for (const auto &dim : curr_tree.first) + { + dimensions.push_back(lim_list[dim - 1].size() - 1); // size - 1 ? 
+ } + + // setup grid for leaf indices + auto grid = grid::NDGrid(dimensions); + + // initialize data for current tree + grids[tree_index] = grid; + individuals[tree_index] = utils::Matrix(dimensions, 0); + values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + variables[tree_index] = curr_tree.first; + + // fill grid points with individuals and values + while (!grid.nextPoint()) + { + + std::vector gridPoint = grid.getPoint(); + + bool in_leaf = true; + + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index = 0; + in_leaf = true; + for (const auto &dim : curr_tree.first) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) + { + in_leaf = false; + break; + } + ++dim_index; + } + + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + + // go through leaves of tree to sum up values + for (const auto &leaf : curr_tree.second->get_leaves()) + { + + in_leaf = true; + int dim_index = 0; + for (const auto &dim : curr_tree.first) + { + // consider values only if all in + if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) + { + in_leaf = false; + break; + } + ++dim_index; + } + + // sum up values + if (in_leaf) + values[tree_index][gridPoint] += leaf.value; // todo: multiclass + } + } + + ++tree_index; + } + + // ------------- create new trees ------------- + + // insert null tree + grids.insert(grids.begin(), grid::NDGrid()); + values.insert(values.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); + individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); + variables.insert(variables.begin(), std::set{0}); + + // recap maximum number of dimensions of current family + unsigned int curr_max 
= 0; + for (const auto &tree : curr_family) + { + if (tree.first.size() > curr_max) + curr_max = tree.first.size(); + } + + auto keys = getKeys(curr_family); + while (curr_max > 1) + { + + // go through split dimensions of all trees + for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) + { + + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + + // check if number of dims same as current max_interaction + if (curr_dims.size() == curr_max) + { + + // go through feature dims + int dim_index = 0; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + + // continue only if dim in current tree + if (curr_tree->split_dims.count(feature_dim) != 0) + { + + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree + + // check if tree with dimensions exists, if not create + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (!tree) + { + + // get index of old and new tree + auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); + + // remove matrix dimension of respective variable + std::vector matrix_dimensions = values[old_tree_index].dims; + matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); + + // initialize data for new tree + auto grid = grid::NDGrid(matrix_dimensions); + grids.insert(grids.begin() + tree_index, grid); + values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(0, value_size))); + individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); + variables.insert(variables.begin() + tree_index, tree_dims); + + // fill individuals of new trees + while (!grid.nextPoint()) + 
{ + + std::vector gridPoint = grid.getPoint(); + bool in_leaf = true; + + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index = 0; + in_leaf = true; + for (const auto &dim : tree_dims) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; + ++dim_index; + } + + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + } + } + + dim_index++; + } + } + } + } + + // update currently considered dimension size + --curr_max; + } + + // ------------- purify ------------- + + // measure tolerance and number of iterations + std::vector tol(curr_family.size(), 1); + int iter; + + // iterate backwards through tree family + int curr_tree_index = curr_family.size() - 1; + for (TreeFamily::reverse_iterator curr_tree = curr_family.rbegin(); curr_tree != curr_family.rend(); ++curr_tree) + { + iter = 0; + std::set curr_dims = curr_tree->second->get_split_dims(); + + // do not purify null + if (curr_dims == std::set{0}) + continue; + + // repeat until tolerance small enough and (?) 
maximum number of iterations reached + while ((tol[curr_tree_index] > 0.00000000001) && (iter < 100)) + { + + // go through feature dims + int curr_dim_index = 0; + for (const auto &feature_dim : curr_dims) + { + + // get tree that has same variables as curr_tree minus j-variable + std::set tree_dims = curr_dims; + tree_dims.erase(tree_dims.find(feature_dim)); + int tree_index = 0; // if tree not exist, set to null tree + if (curr_family.find(tree_dims) != curr_family.end()) + tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)) - 1; + + // update values + if (grids[curr_tree_index].dimensions.size() == 1) + { // one dimensional case + + int sum_ind = 0; + std::vector avg(value_size, 0); + + // get sum of individuals + for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) + { + std::vector tmp{i}; + sum_ind += individuals[curr_tree_index][tmp]; + } + if (sum_ind == 0) + continue; + + // calc avg + for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) + { + std::vector tmp{i}; + avg += (individuals[curr_tree_index][tmp] * values[curr_tree_index][tmp]) / sum_ind; + } + + // update values of one dimensional and null tree + for (int i = 0; i < values[curr_tree_index].n_entries; ++i) + { + std::vector tmp{i}; + values[curr_tree_index][tmp] -= avg; + } + std::vector tmp{0}; + values[tree_index][tmp] += avg; + } + else + { // higher dimensional case + + // setup new grid without dimension j + std::vector new_dimensions = grids[curr_tree_index].dimensions; + int j_dim = new_dimensions[curr_dim_index]; + new_dimensions.erase(new_dimensions.begin() + curr_dim_index); + grid::NDGrid grid = grid::NDGrid(new_dimensions); + + // go through values without dimension j + while (!grid.nextPoint()) + { + auto gridPoint = grid.getPoint(); + gridPoint.push_back(0); + + int sum_ind = 0; + std::vector avg(value_size, 0); + + // go through slice to sum up individuals + for (int j = 0; j < j_dim; ++j) + { + gridPoint.back() = j; + + // get 
sum of individuals + sum_ind += individuals[curr_tree_index][gridPoint]; + } + + // go through slice to calc avg + for (int j = 0; j < j_dim; ++j) + { + gridPoint.back() = j; + + // calc avg + avg += (individuals[curr_tree_index][gridPoint] * values[curr_tree_index][gridPoint]) / sum_ind; + } + + // go through slice to update values + for (int j = 0; j < j_dim; ++j) + { + gridPoint.back() = j; + + // update values of current slice + values[curr_tree_index][gridPoint] -= avg; + } + + // update lower dimensional tree + gridPoint.pop_back(); + values[tree_index][gridPoint] += avg; + } + } + + ++curr_dim_index; + } + + // update tolerance + if (variables[curr_tree_index].size() == 1) + { + tol[curr_tree_index] = 1; // todo + } + else + { + tol[curr_tree_index] = 1; + } + + ++iter; + } + + --curr_tree_index; + } + + // ------------- attach to rpf class ------------- + + // fill with new trees + for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) + { + LeafGrid curr_gridLeaf; + curr_gridLeaf.grid = grids[tree_index]; + curr_gridLeaf.individuals = individuals[tree_index]; + curr_gridLeaf.lim_list = lim_list; + curr_gridLeaf.values = values[tree_index]; + curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; + } + } + + purified = true; +} + diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index 085df8c..b9ce125 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -1,5 +1,48 @@ #include "rpf.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal_utils.hpp" + +// Use utilities via namespace alias +using namespace rpf_utils; + +// Thread-local cache for histogram mode per working set (per tree-family build) +// Avoids races on the class member when building families in parallel +thread_local std::vector> tls_working_bin_id; + +// Utilities shared across modes +bool RandomPlantedForest::possibleExists( + int dim, + const std::vector& possible_splits, + const std::set& 
resulting_dims) +{ + for (const auto& c : possible_splits) { + if (c.dim == dim && c.tree && c.tree->split_dims == resulting_dims) + return true; + } + return false; +} +bool RandomPlantedForest::leafCandidateExists( + const std::vector& possible_splits, + const std::shared_ptr& tree, + size_t leaf_idx, + int dim) +{ + for (const auto& c : possible_splits) { + if (c.dim == dim && c.tree.get() == tree.get() && c.leaf_idx == leaf_idx) + return true; + } + return false; +} bool RandomPlantedForest::is_purified() { @@ -8,41 +51,27 @@ bool RandomPlantedForest::is_purified() void RandomPlantedForest::L2_loss(Split &split) { - - // new meanq split.M_s = split.sum_s / split.I_s.size(); split.M_b = split.sum_b / split.I_b.size(); - split.min_sum = 0; for (size_t p = 0; p < value_size; ++p) { - split.min_sum += -2 * split.M_s[p] * split.sum_s[p] + split.I_s.size() * pow(split.M_s[p], 2); - split.min_sum += -2 * split.M_b[p] * split.sum_b[p] + split.I_b.size() * pow(split.M_b[p], 2); + const double Ms = split.M_s[p]; + const double Mb = split.M_b[p]; + split.min_sum += -2 * Ms * split.sum_s[p] + split.I_s.size() * (Ms * Ms); + split.min_sum += -2 * Mb * split.sum_b[p] + split.I_b.size() * (Mb * Mb); } } -// constructor +// constructor (parsing includes split_structure) RandomPlantedForest::RandomPlantedForest(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, const NumericVector parameters) { - - // Ensure correct Rcpp RNG state Rcpp::RNGScope scope; - - // initialize class members std::vector pars = to_std_vec(parameters); - if (pars.size() != 9) + if (pars.size() != 12 && pars.size() != 13) { - Rcout << "Wrong number of parameters - set to default." 
<< std::endl; - this->max_interaction = 1; - this->n_trees = 50; - this->n_splits = 30; - this->split_try = 10; - this->t_try = 0.4; - this->purify_forest = 0; - this->deterministic = 0; - this->nthreads = 1; - this->cross_validate = 0; + Rcpp::stop("RandomPlantedForest requires 12 or 13 parameters, got %d", pars.size()); } else { @@ -55,475 +84,574 @@ RandomPlantedForest::RandomPlantedForest(const NumericMatrix &samples_Y, const N this->deterministic = pars[6]; this->nthreads = pars[7]; this->cross_validate = pars[8]; + this->split_decay_rate_ = pars[9]; + this->max_candidates_ = static_cast(pars[10]); + this->delete_leaves = (pars[11] != 0); + // map: 0=res_trees, 1=cur_trees_2, 2=cur_trees_1, 3=leaves, 4=hist + this->split_structure_mode_ = (pars.size() >= 13) ? static_cast(pars[12]) : 3; } - - // set data and data related members this->set_data(samples_Y, samples_X); } -// determine optimal split -Split RandomPlantedForest::calcOptimalSplit(const std::vector> &Y, const std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family) -{ - - Split curr_split, min_split; - curr_split.Y = &Y; - std::set tree_dims; - std::vector unique_samples; - int k; - unsigned int n = 0; - double leaf_size, sample_point; - - // sample possible splits - unsigned int n_candidates = ceil(t_try * possible_splits.size()); // number of candidates that will be considered - std::vector split_candidates(possible_splits.size()); - std::iota(split_candidates.begin(), split_candidates.end(), 0); // consecutive indices of possible candidates - - if (!deterministic) - { - shuffle_vector(split_candidates.begin(), split_candidates.end()); // shuffle for random order - } - - // consider a fraction of possible splits - while (n < n_candidates) - { - - if (possible_splits.empty()) - break; - if (split_candidates[n] >= 0 && (size_t)split_candidates[n] >= possible_splits.size()) - continue; - - auto candidate = possible_splits.begin(); - std::advance(candidate, split_candidates[n]); // 
get random split candidate without replacement - k = candidate->first - 1; // split dim of current candidate, converted to index starting at 0 - leaf_size = n_leaves[k]; - - // Test if splitting in the current tree w.r.t. the coordinate "k" is an element of candidate tree - tree_dims = candidate->second->split_dims; - tree_dims.erase(k + 1); - tree_dims.erase(0); - - std::vector> curr_trees; - if (tree_dims.size() == 0) - curr_trees.push_back(curr_family[std::set{0}]); - if (curr_family.find(tree_dims) != curr_family.end()) - curr_trees.push_back(curr_family[tree_dims]); - if (curr_family.find(candidate->second->split_dims) != curr_family.end()) - curr_trees.push_back(curr_family[candidate->second->split_dims]); - - // go through all trees in current family - for (auto &curr_tree : curr_trees) - { - - // skip if tree has no leaves - if (curr_tree->leaves.size() == 0) - continue; - - // go through all leaves of current tree - for (auto &leaf : curr_tree->leaves) - { - - std::vector tot_sum(value_size, 0); - - // extract sample points according to individuals from X and Y - unique_samples = std::vector(leaf.individuals.size()); - for (unsigned int i = 0; i < leaf.individuals.size(); ++i) - { - unique_samples[i] = X[leaf.individuals[i]][k]; - } - std::sort(unique_samples.begin(), unique_samples.end()); - unique_samples.erase(std::unique(unique_samples.begin(), unique_samples.end()), unique_samples.end()); - - // check if number of sample points is within limit - if (unique_samples.size() < 2 * leaf_size) - continue; - - // consider split_try-number of samples - std::vector samples; - if (deterministic) - { // sequential samples if deterministic - samples = std::vector(std::min((int)unique_samples.size() - 1, 9)); - std::iota(samples.begin(), samples.end(), 1); - } - else - { // randomly picked samples otherwise - samples = std::vector(split_try); - for (size_t i = 0; i < samples.size(); ++i) - samples[i] = R::runif(leaf_size, unique_samples.size() - leaf_size); - 
std::sort(samples.begin(), samples.end()); - } - - // go through samples - for (size_t sample_pos = 0; sample_pos < samples.size(); ++sample_pos) - { - - // get samplepoint - sample_point = unique_samples[samples[sample_pos]]; - - // clear current split - { - curr_split.I_s.clear(); - curr_split.I_b.clear(); - curr_split.I_s.reserve(leaf.individuals.size()); - curr_split.I_b.reserve(leaf.individuals.size()); - curr_split.M_s = std::vector(value_size, 0); - curr_split.M_b = std::vector(value_size, 0); - } - - // get samples greater/smaller than samplepoint - if (sample_pos == 0) - { - curr_split.sum_s = std::vector(value_size, 0); - curr_split.sum_b = std::vector(value_size, 0); +// --------------- calcOptimalSplit per mode --------------- - for (int individual : leaf.individuals) - { - if (X[individual][k] < sample_point) - { - curr_split.I_s.push_back(individual); - curr_split.sum_s += Y[individual]; - } - else - { - curr_split.I_b.push_back(individual); - curr_split.sum_b += Y[individual]; - } - } +// Mode 3: leaves implementation moved to lib/splits_leaves.cpp - tot_sum = curr_split.sum_s + curr_split.sum_b; - } - else - { +// Mode 1: cur_trees_2 moved to lib/splits_cur_trees_2.cpp - for (int individual : leaf.individuals) - { - if (X[individual][k] < sample_point) - { - if (X[individual][k] >= unique_samples[samples[sample_pos - 1]]) - { - curr_split.sum_s += Y[individual]; - } - curr_split.I_s.push_back(individual); - } - else - { - curr_split.I_b.push_back(individual); - } - } +// Mode 2: cur_trees_1 (pair-sampling within predecessor/current trees) +// Mode 2: cur_trees_1 moved to lib/splits_cur_trees_1.cpp - curr_split.sum_b = tot_sum - curr_split.sum_s; - } +// Mode 0: res_trees (operate on resulting trees pool) +bool RandomPlantedForest::resultingTreeExists(const std::vector& pool, const std::set& dims) { + for (const auto &c : pool) if (c.tree->get_split_dims() == dims) return true; return false; +} - // accumulate squared mean and get mean - 
L2_loss(curr_split); +// Mode 0: res_trees moved to lib/splits_res_trees.cpp - // update split if squared sum is smaller - if (curr_split.min_sum < min_split.min_sum) - { - min_split = curr_split; - min_split.tree_index = curr_tree; - min_split.leaf_index = &leaf; - min_split.split_coordinate = k + 1; - min_split.split_point = sample_point; - } - } - } - } +// moved to lib/splits_hist.cpp - ++n; +// Dispatcher used by create_tree_family +Split RandomPlantedForest::calcOptimalSplit(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family) +{ + if (split_structure_mode_ == 3) { + return this->calcOptimalSplit_leaves(Y, X, possible_splits, curr_family); + } else if (split_structure_mode_ == 2) { + return this->calcOptimalSplit_curTrees1(Y, X, possible_splits, curr_family); + } else if (split_structure_mode_ == 1) { + return this->calcOptimalSplit_curTrees2(Y, X, possible_splits, curr_family); + } else if (split_structure_mode_ == 4) { + return this->calcOptimalSplit_hist(Y, X, possible_splits, curr_family); + } else { + // Not used for res_trees; a separate path below uses its own pool type + return Split{}; } - - return min_split; } void RandomPlantedForest::set_data(const NumericMatrix &samples_Y, const NumericMatrix &samples_X) { - this->Y = to_std_vec(samples_Y); this->X = to_std_vec(samples_X); - - // Check for correct input - if (Y.size() == 0) - throw std::invalid_argument("Y empty - no data provided."); - if (X.size() == 0) - throw std::invalid_argument("X empty - no data provided."); + if (Y.empty()) throw std::invalid_argument("Y empty - no data provided."); + if (X.empty()) throw std::invalid_argument("X empty - no data provided."); this->feature_size = X[0].size(); - this->value_size = Y[0].size(); // multiclass - for (const auto &vec : X) - { - if (vec.size() != (size_t)feature_size) - throw std::invalid_argument("Feature dimensions of X not uniform."); - } - if (Y.size() != X.size()) - throw 
std::invalid_argument("X and Y are not of the same length!"); - + this->value_size = Y[0].size(); + for (const auto &vec : X) if (vec.size() != (size_t)feature_size) throw std::invalid_argument("Feature dimensions of X not uniform."); + if (Y.size() != X.size()) throw std::invalid_argument("X and Y are not of the same length!"); this->n_leaves = std::vector(feature_size, 1); this->sample_size = X.size(); this->upper_bounds = std::vector(feature_size); this->lower_bounds = std::vector(feature_size); - - // get upper/lower bounds - double minVal, maxVal, currVal; - for (int i = 0; i < feature_size; ++i) - { - minVal = maxVal = X[0][i]; - for (size_t j = 0; j < sample_size; ++j) - { - currVal = X[j][i]; - if (currVal < minVal) - minVal = currVal; - if (currVal > maxVal) - maxVal = currVal; + for (int i = 0; i < feature_size; ++i) { + double minVal = X[0][i], maxVal = X[0][i]; + for (size_t j = 0; j < sample_size; ++j) { double currVal = X[j][i]; if (currVal < minVal) minVal = currVal; if (currVal > maxVal) maxVal = currVal; } + this->upper_bounds[i] = maxVal + 2 * eps; this->lower_bounds[i] = minVal; + } + // Prepare histogram bins if histogram mode is requested + if (this->split_structure_mode_ == 4) { + const size_t K = std::max(2, std::min(num_bins_, static_cast(std::max(2, (int)std::sqrt((double)sample_size))))); + this->num_bins_ = K; + feature_cut_points_.assign((size_t)feature_size, std::vector()); + sample_bin_id_.assign((size_t)feature_size, std::vector(sample_size, 0)); + // For each feature, compute quantile cuts using sorted sample values + for (int k = 0; k < feature_size; ++k) { + std::vector vals(sample_size); + for (size_t i = 0; i < sample_size; ++i) vals[i] = X[i][k]; + std::sort(vals.begin(), vals.end()); + vals.erase(std::unique(vals.begin(), vals.end()), vals.end()); + size_t unique_n = vals.size(); + size_t cuts = (K >= 2) ? 
(K - 1) : 1; + if (unique_n <= 1 || cuts == 0) { feature_cut_points_[k].clear(); feature_cut_points_[k].shrink_to_fit(); continue; } + feature_cut_points_[k].resize(cuts); + for (size_t c = 1; c <= cuts; ++c) { + double q = (double)c / (double)K; size_t idx = static_cast(std::floor(q * (double)(unique_n - 1))); + if (idx >= unique_n) idx = unique_n - 1; feature_cut_points_[k][c - 1] = vals[idx]; + } + // Assign bin ids for all samples in original X for this feature + for (size_t i = 0; i < sample_size; ++i) { + double v = X[i][k]; + auto &cuts_k = feature_cut_points_[k]; + int bin = 0; + if (!cuts_k.empty()) { + auto itb = std::upper_bound(cuts_k.begin(), cuts_k.end(), v); + bin = (int)std::distance(cuts_k.begin(), itb); + } + sample_bin_id_[k][i] = bin; + } } - this->upper_bounds[i] = maxVal + 2 * eps; // to consider samples at max value - this->lower_bounds[i] = minVal; } - this->fit(); - - if (cross_validate) - { - this->cross_validation(); - } + if (cross_validate) { this->cross_validation(); } } void RandomPlantedForest::create_tree_family(std::vector initial_leaves, size_t n) { - + TreeFamily curr_family; - curr_family.insert(std::make_pair(std::set{0}, std::make_shared(DecisionTree(std::set{0}, initial_leaves)))); // save tree with one leaf in the beginning - // store possible splits in map with splitting variable as key and pointer to resulting tree - std::multimap> possible_splits; - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - // add pointer to resulting tree with split dimension as key - curr_family.insert(std::make_pair(std::set{feature_dim}, std::make_shared(DecisionTree(std::set{feature_dim})))); - possible_splits.insert(std::make_pair(feature_dim, curr_family[std::set{feature_dim}])); - } - - // sample data points with replacement - int sample_index; - std::vector> samples_X; - std::vector> samples_Y; - - // deterministic - if (deterministic) - { - samples_X = X; - samples_Y = Y; - this->t_try = 1; - } - else - { - 
samples_X = std::vector>(sample_size); - samples_Y = std::vector>(sample_size); - - for (size_t i = 0; i < sample_size; ++i) - { - - sample_index = R::runif(0, sample_size - 1); - samples_Y[i] = Y[sample_index]; - samples_X[i] = X[sample_index]; - } - } - - // modify existing or add new trees through splitting - Split curr_split; - for (int split_count = 0; split_count < n_splits; ++split_count) - { - - // find optimal split - curr_split = calcOptimalSplit(samples_Y, samples_X, possible_splits, curr_family); - - // continue only if we get a significant result - if (!std::isinf(curr_split.min_sum)) - { - - // update possible splits - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { // consider all possible dimensions - - // create union of split coord, feature dim and dimensions of old tree - std::set curr_dims = curr_split.tree_index->split_dims; - curr_dims.insert(curr_split.split_coordinate); - curr_dims.insert(feature_dim); - curr_dims.erase(0); - - // skip if possible_split already exists - if (possibleExists(feature_dim, possible_splits, curr_dims)) - continue; - - // do not exceed maximum level of interaction - if (max_interaction >= 0 && curr_dims.size() > (size_t)max_interaction) - continue; - - // check if resulting tree already exists in family - std::shared_ptr found_tree = treeExists(curr_dims, curr_family); - - // update possible_splits if not already existing - if (found_tree) - { // if yes add pointer - possible_splits.insert(std::make_pair(feature_dim, found_tree)); + curr_family.insert({std::set{0}, std::make_shared(DecisionTree(std::set{0}, initial_leaves))}); + + // res_trees uses a separate pool + if (split_structure_mode_ == 0) { + std::vector possible_trees; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + auto treePtr = std::make_shared(DecisionTree({feature_dim})); + curr_family.insert({{feature_dim}, treePtr}); + possible_trees.emplace_back(treePtr); + } + + // Bootstrap samples + int 
sample_index; std::vector> samples_X, samples_Y; + if (deterministic) { samples_X = X; samples_Y = Y; this->t_try = 1; } + else { + samples_X = std::vector>(sample_size); samples_Y = std::vector>(sample_size); + for (size_t i = 0; i < sample_size; ++i) { sample_index = rng_randint(0, (int)sample_size); samples_Y[i] = Y[sample_index]; samples_X[i] = X[sample_index]; } + } + + Split curr_split; + for (int split_count = 0; split_count < n_splits; ++split_count) { + curr_split = this->calcOptimalSplit_resTrees(samples_Y, samples_X, possible_trees, curr_family); + if (!std::isinf(curr_split.min_sum)) { + // ensure D' and its one-step supersets are in pool + std::set Dprime = curr_split.tree_index->split_dims; Dprime.insert(curr_split.split_coordinate); Dprime.erase(0); + if (!resultingTreeExists(possible_trees, Dprime)) { if (auto found = treeExists(Dprime, curr_family)) possible_trees.emplace_back(found); else { curr_family.insert({Dprime, std::make_shared(DecisionTree(Dprime))}); possible_trees.emplace_back(curr_family[Dprime]); } } + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set U = Dprime; U.insert(feature_dim); if (U.size() == Dprime.size()) continue; if (max_interaction >= 0 && U.size() > (size_t)max_interaction) continue; if (resultingTreeExists(possible_trees, U)) continue; if (auto found = treeExists(U, curr_family)) possible_trees.emplace_back(found); else { curr_family.insert({U, std::make_shared(DecisionTree(U))}); possible_trees.emplace_back(curr_family[U]); } + } + + // Mutate residuals (restore old behavior) + for (int individual : curr_split.leaf_index->individuals) { + if (samples_X[individual][curr_split.split_coordinate - 1] < curr_split.split_point) + samples_Y[individual] -= curr_split.M_s; + else + samples_Y[individual] -= curr_split.M_b; } - else - { // if not create new tree - curr_family.insert(std::make_pair(curr_dims, std::make_shared(DecisionTree(curr_dims)))); - 
possible_splits.insert(std::make_pair(feature_dim, curr_family[curr_dims])); + Leaf leaf_s, leaf_b; leaf_s.individuals = curr_split.I_s; leaf_b.individuals = curr_split.I_b; leaf_s.value = curr_split.M_s; leaf_b.value = curr_split.M_b; leaf_s.intervals = curr_split.leaf_index->intervals; leaf_b.intervals = curr_split.leaf_index->intervals; leaf_s.intervals[curr_split.split_coordinate - 1].second = curr_split.split_point; leaf_b.intervals[curr_split.split_coordinate - 1].first = curr_split.split_point; + std::set resulting_dims = curr_split.tree_index->split_dims; resulting_dims.insert(curr_split.split_coordinate); resulting_dims.erase(0); + std::shared_ptr found_tree = treeExists(resulting_dims, curr_family); + if (!found_tree) { + curr_family.insert({resulting_dims, std::make_shared(DecisionTree(resulting_dims))}); + found_tree = curr_family[resulting_dims]; } + if ((curr_split.tree_index->split_dims.count(curr_split.split_coordinate)) && delete_leaves) { leaf_s.value += curr_split.leaf_index->value; leaf_b.value += curr_split.leaf_index->value; *curr_split.leaf_index = leaf_b; curr_split.tree_index->leaves.push_back(leaf_s); } + else { found_tree->leaves.push_back(leaf_s); found_tree->leaves.push_back(leaf_b); } } + } - // update values of individuals of split interval with mean - for (int individual : curr_split.leaf_index->individuals) - { // todo: loop directly over I_s I_b - if (samples_X[individual][curr_split.split_coordinate - 1] < curr_split.split_point) - { - samples_Y[individual] -= curr_split.M_s; - } - else + // Final memory cleanup: drop training-only buffers and shrink containers + auto keys = getKeys(curr_family); + for (auto &key : keys) { + auto itTree = curr_family.find(key); + if (itTree == curr_family.end()) continue; + auto &treePtr = itTree->second; + if (treePtr->leaves.size() == 0) { curr_family.erase(itTree); continue; } + for (auto &leaf : treePtr->leaves) { + // Individuals are not used after training; caches are training-only + 
leaf.individuals.clear(); + leaf.individuals.shrink_to_fit(); + // Free per-leaf caches decisively { - samples_Y[individual] -= curr_split.M_b; + std::unordered_map>().swap(leaf.order_cache); + std::unordered_map>().swap(leaf.sorted_vals_cache); + std::unordered_map>().swap(leaf.unique_vals_cache); + std::unordered_map() .swap(leaf.unique_count_cache); + } + // Keep intervals and value but release spare capacity + leaf.intervals.shrink_to_fit(); + leaf.value.shrink_to_fit(); + } + // Clear per-dimension sampling caches (used only during training) + for (auto &v : treePtr->fenwick_by_dim_v) { v.clear(); v.shrink_to_fit(); } + treePtr->fenwick_by_dim_v.clear(); + treePtr->fenwick_by_dim_v.shrink_to_fit(); + for (auto &v : treePtr->leaf_weights_by_dim_v) { v.clear(); v.shrink_to_fit(); } + treePtr->leaf_weights_by_dim_v.clear(); + treePtr->leaf_weights_by_dim_v.shrink_to_fit(); + treePtr->weights_total_by_dim_v.clear(); + treePtr->weights_total_by_dim_v.shrink_to_fit(); + treePtr->weights_epoch_by_dim_v.clear(); + treePtr->weights_epoch_by_dim_v.shrink_to_fit(); + treePtr->leaves.shrink_to_fit(); + } + tree_families[n] = curr_family; return; + } + + // Non-res_trees modes use SplitCandidate pool + std::vector possible_splits; + if (split_structure_mode_ == 3 || split_structure_mode_ == 4) { + // leaves: seed with leaf-level candidates from null tree (single leaf at index 0) + auto add_leaf_candidates = [&](const std::shared_ptr& T, size_t li) { + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set res_dims = T->split_dims; res_dims.insert(feature_dim); res_dims.erase(0); + if (max_interaction >= 0 && res_dims.size() > (size_t)max_interaction) continue; + if (!leafCandidateExists(possible_splits, T, li, feature_dim)) possible_splits.emplace_back(feature_dim, T, li); + } + }; + auto null_tree = curr_family[{0}]; + if (!null_tree->leaves.empty()) add_leaf_candidates(null_tree, 0); + + + // bootstrap + int sample_index; std::vector> 
samples_X, samples_Y; std::vector boot_idx(sample_size); + if (deterministic) { samples_X = X; samples_Y = Y; this->t_try = 1; for (size_t i=0;i(i); } + else { + samples_X = std::vector>(sample_size); samples_Y = std::vector>(sample_size); + for (size_t i=0;i(sample_size, 0)); + for (int k = 0; k < feature_size; ++k) { + // Reuse global precomputed bin ids via bootstrap index mapping + if (!feature_cut_points_.empty() && (size_t)k < sample_bin_id_.size()) { + for (size_t i = 0; i < sample_size; ++i) tls_working_bin_id[k][i] = sample_bin_id_[k][(size_t)boot_idx[i]]; + } else { + // Fallback: compute on-the-fly (should be rare if cuts are available) + const auto &cuts_k = (k >= 0 && k < (int)feature_cut_points_.size()) ? feature_cut_points_[k] : std::vector{}; + for (size_t i = 0; i < sample_size; ++i) { + int bin = 0; if (!cuts_k.empty()) { auto itb = std::upper_bound(cuts_k.begin(), cuts_k.end(), samples_X[i][k]); bin = (int)std::distance(cuts_k.begin(), itb); } + tls_working_bin_id[k][i] = bin; + } } } + } - // construct new leaves - Leaf leaf_s, leaf_b; - { - leaf_s.individuals = curr_split.I_s; - leaf_b.individuals = curr_split.I_b; - - leaf_s.value = curr_split.M_s; - leaf_b.value = curr_split.M_b; - - // initialize interval with split interval - leaf_s.intervals = curr_split.leaf_index->intervals; - leaf_b.intervals = curr_split.leaf_index->intervals; + Split curr_split; + for (int split_count = 0; split_count < n_splits; ++split_count) { + + if (split_structure_mode_ == 4) curr_split = this->calcOptimalSplit_hist(samples_Y, samples_X, possible_splits, curr_family); + else curr_split = this->calcOptimalSplit_leaves(samples_Y, samples_X, possible_splits, curr_family); + + if (!std::isinf(curr_split.min_sum)) { + + // Mutate residuals (restore old behavior) + for (int individual : curr_split.leaf_index->individuals) { + if (samples_X[individual][curr_split.split_coordinate - 1] < curr_split.split_point) samples_Y[individual] -= curr_split.M_s; else 
samples_Y[individual] -= curr_split.M_b; + } + + Leaf leaf_s, leaf_b; leaf_s.individuals = curr_split.I_s; leaf_b.individuals = curr_split.I_b; leaf_s.value = curr_split.M_s; leaf_b.value = curr_split.M_b; leaf_s.intervals = curr_split.leaf_index->intervals; leaf_b.intervals = curr_split.leaf_index->intervals; leaf_s.intervals[curr_split.split_coordinate - 1].second = curr_split.split_point; leaf_b.intervals[curr_split.split_coordinate - 1].first = curr_split.split_point; + + std::set resulting_dims = curr_split.tree_index->split_dims; resulting_dims.insert(curr_split.split_coordinate); resulting_dims.erase(0); + + std::shared_ptr found_tree = treeExists(resulting_dims, curr_family); + if (!found_tree) { + curr_family.insert({resulting_dims, std::make_shared(DecisionTree(resulting_dims))}); + found_tree = curr_family[resulting_dims]; + } + + auto add_leaf_candidates = [&](const std::shared_ptr& T, size_t li) { + if (!T) return; + // Re-add per-leaf candidate entries for all dimensions, respecting max_interaction and dedup + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + std::set res_dims = T->split_dims; + res_dims.insert(feature_dim); + res_dims.erase(0); + if (max_interaction >= 0 && res_dims.size() > (size_t)max_interaction) continue; + if (!leafCandidateExists(possible_splits, T, li, feature_dim)) { + possible_splits.emplace_back(feature_dim, T, li); + } + } + }; + if ((curr_split.tree_index->split_dims.count(curr_split.split_coordinate)) && delete_leaves) { + + leaf_s.value += curr_split.leaf_index->value; leaf_b.value += curr_split.leaf_index->value; + // Compute index BEFORE any push_back that may reallocate + size_t idx_b = static_cast(curr_split.leaf_index - &curr_split.tree_index->leaves[0]); + // Assign by value to avoid aliasing issues if vector reallocates later + *curr_split.leaf_index = leaf_b; + curr_split.tree_index->leaves.push_back(leaf_s); + size_t idx_s = curr_split.tree_index->leaves.size() - 1; + 
add_leaf_candidates(curr_split.tree_index, idx_b); + add_leaf_candidates(curr_split.tree_index, idx_s); + // invalidate per-leaf unique caches for new structure (affects cur_trees_2 only for this tree) + if (!curr_split.tree_index->leaves.empty()) { + for (auto &lf : curr_split.tree_index->leaves) { lf.unique_count_cache.clear(); lf.unique_vals_cache.clear(); } + } + } else { + + // Append by value; avoid referencing invalidated addresses + found_tree->leaves.push_back(leaf_s); + found_tree->leaves.push_back(leaf_b); + // Add candidates for both new leaves + size_t idx_s = found_tree->leaves.size() - 2; + size_t idx_b = found_tree->leaves.size() - 1; + add_leaf_candidates(found_tree, idx_s); + add_leaf_candidates(found_tree, idx_b); + // invalidate unique caches on the receiving tree (cur_trees_2) + if (!found_tree->leaves.empty()) { + for (auto &lf : found_tree->leaves) { lf.unique_count_cache.clear(); lf.unique_vals_cache.clear(); } + } + } + } + } + // Release histogram working buffers (thread-local) if used + tls_working_bin_id.clear(); + tls_working_bin_id.shrink_to_fit(); - // interval of leaf with smaller individuals has new upper bound in splitting dimension - leaf_s.intervals[curr_split.split_coordinate - 1].second = curr_split.split_point; - // interval of leaf with bigger individuals has new lower bound in splitting dimension - leaf_b.intervals[curr_split.split_coordinate - 1].first = curr_split.split_point; + // Final memory cleanup: drop training-only buffers and shrink containers + auto keys = getKeys(curr_family); + for (auto &key : keys) { + auto itTree = curr_family.find(key); + if (itTree == curr_family.end()) continue; + auto &treePtr = itTree->second; + if (treePtr->leaves.size() == 0) { curr_family.erase(itTree); continue; } + for (auto &leaf : treePtr->leaves) { + leaf.individuals.clear(); + leaf.individuals.shrink_to_fit(); + std::unordered_map>().swap(leaf.order_cache); + std::unordered_map>().swap(leaf.sorted_vals_cache); + 
std::unordered_map>().swap(leaf.unique_vals_cache); + std::unordered_map() .swap(leaf.unique_count_cache); + leaf.intervals.shrink_to_fit(); + leaf.value.shrink_to_fit(); } + for (auto &v : treePtr->fenwick_by_dim_v) { v.clear(); v.shrink_to_fit(); } + treePtr->fenwick_by_dim_v.clear(); + treePtr->fenwick_by_dim_v.shrink_to_fit(); + for (auto &v : treePtr->leaf_weights_by_dim_v) { v.clear(); v.shrink_to_fit(); } + treePtr->leaf_weights_by_dim_v.clear(); + treePtr->leaf_weights_by_dim_v.shrink_to_fit(); + treePtr->weights_total_by_dim_v.clear(); + treePtr->weights_total_by_dim_v.shrink_to_fit(); + treePtr->weights_epoch_by_dim_v.clear(); + treePtr->weights_epoch_by_dim_v.shrink_to_fit(); + treePtr->leaves.shrink_to_fit(); + } + tree_families[n] = curr_family; return; + } + + // cur_trees_1 and cur_trees_2: initialize with {j} trees + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + auto treePtr = std::make_shared(DecisionTree({feature_dim})); + curr_family.insert({{feature_dim}, treePtr}); + // leaf_idx unused for these modes + possible_splits.emplace_back(feature_dim, treePtr, static_cast(0)); + } + + // bootstrap + int sample_index; std::vector> samples_X, samples_Y; + if (deterministic) { samples_X = X; samples_Y = Y; this->t_try = 1; } + else { samples_X = std::vector>(sample_size); samples_Y = std::vector>(sample_size); for (size_t i=0;i resulting_dims = curr_split.tree_index->split_dims; - resulting_dims.insert(curr_split.split_coordinate); - resulting_dims.erase(0); + Split curr_split; + for (int split_count = 0; split_count < n_splits; ++split_count) { + if (split_structure_mode_ == 2) curr_split = this->calcOptimalSplit_curTrees1(samples_Y, samples_X, possible_splits, curr_family); + else curr_split = this->calcOptimalSplit_curTrees2(samples_Y, samples_X, possible_splits, curr_family); + if (!std::isinf(curr_split.min_sum)) { + // Update possible_splits like tryeveryleaf/splittrynew + for (int feature_dim = 1; feature_dim <= 
feature_size; ++feature_dim) { + std::set curr_dims = curr_split.tree_index->split_dims; curr_dims.insert(curr_split.split_coordinate); curr_dims.insert(feature_dim); curr_dims.erase(0); + if (possibleExists(feature_dim, possible_splits, curr_dims)) continue; + if (max_interaction >= 0 && curr_dims.size() > (size_t)max_interaction) continue; + if (auto found = treeExists(curr_dims, curr_family)) possible_splits.emplace_back(feature_dim, found, static_cast(0)); + else { curr_family.insert({curr_dims, std::make_shared(DecisionTree(curr_dims))}); possible_splits.emplace_back(feature_dim, curr_family[curr_dims], static_cast(0)); } + } - // check if resulting tree already exists in family + for (int individual : curr_split.leaf_index->individuals) { + if (samples_X[individual][curr_split.split_coordinate - 1] < curr_split.split_point) samples_Y[individual] -= curr_split.M_s; else samples_Y[individual] -= curr_split.M_b; + } + Leaf leaf_s, leaf_b; leaf_s.individuals = curr_split.I_s; leaf_b.individuals = curr_split.I_b; leaf_s.value = curr_split.M_s; leaf_b.value = curr_split.M_b; leaf_s.intervals = curr_split.leaf_index->intervals; leaf_b.intervals = curr_split.leaf_index->intervals; leaf_s.intervals[curr_split.split_coordinate - 1].second = curr_split.split_point; leaf_b.intervals[curr_split.split_coordinate - 1].first = curr_split.split_point; + std::set resulting_dims = curr_split.tree_index->split_dims; resulting_dims.insert(curr_split.split_coordinate); resulting_dims.erase(0); std::shared_ptr found_tree = treeExists(resulting_dims, curr_family); - - // determine which tree is modified - if (curr_split.tree_index->split_dims.count(curr_split.split_coordinate)) - { // if split variable is already in tree to be split - // change values - { - leaf_s.value += curr_split.leaf_index->value; - leaf_b.value += curr_split.leaf_index->value; + if (!found_tree) { curr_family.insert({resulting_dims, std::make_shared(DecisionTree(resulting_dims))}); found_tree = 
curr_family[resulting_dims]; } + if ((curr_split.tree_index->split_dims.count(curr_split.split_coordinate)) && delete_leaves) { + leaf_s.value += curr_split.leaf_index->value; leaf_b.value += curr_split.leaf_index->value; + // index of the replaced leaf BEFORE push_back + size_t idx_b = static_cast(curr_split.leaf_index - &curr_split.tree_index->leaves[0]); + *curr_split.leaf_index = leaf_b; + curr_split.tree_index->leaves.push_back(leaf_s); + size_t idx_s = curr_split.tree_index->leaves.size() - 1; + // Incrementally update sampling caches if initialized + if ((int)curr_split.tree_index->fenwick_by_dim_v.size() >= this->feature_size) { + for (int kdim = 0; kdim < this->feature_size; ++kdim) { + auto &bit = curr_split.tree_index->fenwick_by_dim_v[(size_t)kdim]; + auto &wts = curr_split.tree_index->leaf_weights_by_dim_v[(size_t)kdim]; + if (!bit.empty() && wts.size() == bit.size()) { + // update replaced leaf + double m_b = (double)curr_split.tree_index->leaves[idx_b].individuals.size(); + int leaf_min = this->n_leaves[kdim]; + double w_new_b = std::max(0.0, m_b - 2.0 * (double)leaf_min); + double delta_b = w_new_b - (idx_b < wts.size() ? 
wts[idx_b] : 0.0); + if ((size_t)idx_b < wts.size()) wts[idx_b] = w_new_b; + if (delta_b != 0.0) { rpf_utils::fenwick_add(bit, idx_b + 1, delta_b); curr_split.tree_index->weights_total_by_dim_v[(size_t)kdim] += delta_b; } + // append new leaf + double m_s = (double)curr_split.tree_index->leaves[idx_s].individuals.size(); + double w_new_s = std::max(0.0, m_s - 2.0 * (double)leaf_min); + bit.push_back(0.0); + wts.push_back(0.0); + if (w_new_s != 0.0) rpf_utils::fenwick_add(bit, bit.size(), w_new_s); + wts[wts.size() - 1] = w_new_s; + curr_split.tree_index->weights_total_by_dim_v[(size_t)kdim] += w_new_s; + } + } } - *curr_split.leaf_index = leaf_b; // replace old interval - curr_split.tree_index->leaves.push_back(leaf_s); // add new leaf } - else - { // otherwise - found_tree->leaves.push_back(leaf_s); // append new leaves - found_tree->leaves.push_back(leaf_b); + else { + found_tree->leaves.push_back(leaf_s); found_tree->leaves.push_back(leaf_b); + size_t idx_s = found_tree->leaves.size() - 2; size_t idx_b = found_tree->leaves.size() - 1; + // Incrementally update sampling caches if initialized + if ((int)found_tree->fenwick_by_dim_v.size() >= this->feature_size) { + for (int kdim = 0; kdim < this->feature_size; ++kdim) { + auto &bit = found_tree->fenwick_by_dim_v[(size_t)kdim]; + auto &wts = found_tree->leaf_weights_by_dim_v[(size_t)kdim]; + if (!bit.empty() && wts.size() == bit.size()) { + int leaf_min = this->n_leaves[kdim]; + // append s + double m_s = (double)found_tree->leaves[idx_s].individuals.size(); + double w_new_s = std::max(0.0, m_s - 2.0 * (double)leaf_min); + bit.push_back(0.0); wts.push_back(0.0); + if (w_new_s != 0.0) rpf_utils::fenwick_add(bit, bit.size(), w_new_s); + wts[wts.size() - 1] = w_new_s; + found_tree->weights_total_by_dim_v[(size_t)kdim] += w_new_s; + // append b + double m_b = (double)found_tree->leaves[idx_b].individuals.size(); + double w_new_b = std::max(0.0, m_b - 2.0 * (double)leaf_min); + bit.push_back(0.0); wts.push_back(0.0); + 
if (w_new_b != 0.0) rpf_utils::fenwick_add(bit, bit.size(), w_new_b); + wts[wts.size() - 1] = w_new_b; + found_tree->weights_total_by_dim_v[(size_t)kdim] += w_new_b; + } + } + } } } } - // remove empty trees & clear individuals of each tree + // Final memory cleanup: drop training-only buffers and shrink containers auto keys = getKeys(curr_family); - for (auto &key : keys) - { - if (curr_family[key]->leaves.size() == 0) - { - curr_family.erase(key); - continue; - } - for (auto &leaf : curr_family[key]->leaves) - { + for (auto &key : keys) { + auto itTree = curr_family.find(key); + if (itTree == curr_family.end()) continue; + auto &treePtr = itTree->second; + if (treePtr->leaves.size() == 0) { curr_family.erase(itTree); continue; } + for (auto &leaf : treePtr->leaves) { leaf.individuals.clear(); - } + leaf.individuals.shrink_to_fit(); + std::unordered_map>().swap(leaf.order_cache); + std::unordered_map>().swap(leaf.sorted_vals_cache); + std::unordered_map>().swap(leaf.unique_vals_cache); + std::unordered_map() .swap(leaf.unique_count_cache); + leaf.intervals.shrink_to_fit(); + leaf.value.shrink_to_fit(); + } + for (auto &v : treePtr->fenwick_by_dim_v) { v.clear(); v.shrink_to_fit(); } + treePtr->fenwick_by_dim_v.clear(); + treePtr->fenwick_by_dim_v.shrink_to_fit(); + for (auto &v : treePtr->leaf_weights_by_dim_v) { v.clear(); v.shrink_to_fit(); } + treePtr->leaf_weights_by_dim_v.clear(); + treePtr->leaf_weights_by_dim_v.shrink_to_fit(); + treePtr->weights_total_by_dim_v.clear(); + treePtr->weights_total_by_dim_v.shrink_to_fit(); + treePtr->weights_epoch_by_dim_v.clear(); + treePtr->weights_epoch_by_dim_v.shrink_to_fit(); + treePtr->leaves.shrink_to_fit(); } - tree_families[n] = curr_family; } // fit forest to new data -void RandomPlantedForest::fit() -{ +// fit() moved to lib/training.cpp - // setup initial set of individuals - std::vector initial_individuals(sample_size); - std::iota(initial_individuals.begin(), initial_individuals.end(), 0); +// predict single 
feature vector (from leaves variant) +// predict_single moved to lib/predict.cpp - // initialize intervals with lower and upper bounds - std::vector initial_intervals(feature_size); - for (int i = 0; i < feature_size; ++i) - initial_intervals[i] = Interval{lower_bounds[i], upper_bounds[i]}; +// predict_matrix moved to lib/predict.cpp +// predict_vector moved to lib/predict.cpp - // set properties of first leaf - Leaf initial_leaf; - { - initial_leaf.value = std::vector(value_size, 0); - initial_leaf.individuals = initial_individuals; - initial_leaf.intervals = initial_intervals; - } - std::vector initial_leaves{initial_leaf}; // vector with initial leaf +double RandomPlantedForest::MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true) +{ return sum(Rcpp::pow(Y_true - Y_predicted, 2)) / Y_true.size(); } - // initialize tree families - this->tree_families = std::vector(n_trees); +double RandomPlantedForest::MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) +{ + double sumv = 0; int Y_size = Y_predicted.size(); + for (int i = 0; i < Y_size; ++i) sumv += MSE_vec(Y_predicted(i, _), Y_true(i, _)); + return sumv / Y_size; +} - // Loop over number of tree families and dispatch threads in batches - // of nhreads at once - if (nthreads > 1) +void RandomPlantedForest::print() +{ + for (int n = 0; n < n_trees; ++n) { - if (nthreads > std::thread::hardware_concurrency()) - { - Rcout << "Requested " << nthreads << " threads but only " << std::thread::hardware_concurrency() << " available" << std::endl; - } - // Create local thread count to not overwrite nthreads, - // would get reported wrongly by get_parameters() - unsigned int current_threads = nthreads; - for (int n = 0; n < n_trees; n += current_threads) + TreeFamily family = tree_families[n]; auto keys = getKeys(family); + for (size_t m = 0; m < keys.size(); ++m) { - if (n >= (n_trees - current_threads + 1)) - { - current_threads = n_trees % current_threads; - } - - std::vector 
threads(current_threads); - for (int t = 0; t < current_threads; ++t) - { - // Rcout << "Dispatching thread " << (n + t + 1) << "/" << n_trees << std::endl; - threads[t] = std::thread(&RandomPlantedForest::create_tree_family, this, std::ref(initial_leaves), n + t); - } - for (auto &t : threads) + DecisionTree tree = *(family[keys[m]]); + Rcout << m + 1 << " Tree: "; Rcout << "Dims="; for (const auto &dim : tree.split_dims) Rcout << dim << ","; + Rcout << std::endl << "Leaves: (" << tree.leaves.size() << ")" << std::endl; + for (const auto &leaf : tree.leaves) { - if (t.joinable()) - t.join(); + Rcout << "Intervals="; for (const auto &interval : leaf.intervals) { Rcout << interval.first << "," << interval.second << "/"; } + Rcout << " Value="; for (const auto &val : leaf.value) Rcout << val << ", "; Rcout << std::endl; } + Rcout << std::endl; } + Rcout << std::endl << std::endl; } - else - { - for (int n = 0; n < n_trees; ++n) - { - create_tree_family(initial_leaves, n); - } - } +} + +void RandomPlantedForest::get_parameters() +{ + Rcout << "Parameters: n_trees=" << n_trees << ", n_splits=" << n_splits << ", max_interaction=" << max_interaction << ", t_try=" << t_try + << ", split_decay_rate=" << split_decay_rate_<< ", max_candidates=" << max_candidates_ + << ", split_try=" << split_try << ", purified=" << purified << ", deterministic=" << deterministic << ", nthreads=" << nthreads + << ", feature_size=" << feature_size << ", sample_size=" << sample_size + << ", split_structure_mode=" << split_structure_mode_ << std::endl; +} - // optionally purify tree - if (purify_forest) +void RandomPlantedForest::set_parameters(StringVector keys, NumericVector values) +{ + if (keys.size() != values.size()) { Rcout << "Size of input vectors is not the same. 
" << std::endl; return; } + for (unsigned int i = 0; i < keys.size(); ++i) { - this->purify_3(); + if (keys[i] == "deterministic") this->deterministic = values[i]; + else if (keys[i] == "nthreads") this->nthreads = values[i]; + else if (keys[i] == "purify") this->purify_forest = values[i]; + else if (keys[i] == "n_trees") this->n_trees = values[i]; + else if (keys[i] == "n_splits") this->n_splits = values[i]; + else if (keys[i] == "t_try") this->t_try = values[i]; + else if (keys[i] == "split_try") this->split_try = values[i]; + else if (keys[i] == "max_interaction") this->max_interaction = values[i]; + else if (keys[i] == "cv") this->cross_validate = values[i]; + else if (keys[i] == "split_decay_rate") this->split_decay_rate_ = values[i]; + else if (keys[i] == "max_candidates") this->max_candidates_ = static_cast(values[i]); + else if (keys[i] == "delete_leaves") this->delete_leaves = static_cast(values[i]); + else if (keys[i] == "leaf_feature_cache_cap") this->leaf_feature_cache_cap_ = static_cast(values[i]); + + else if (keys[i] == "split_structure_mode") this->split_structure_mode_ = static_cast(values[i]); + else Rcout << "Unkown parameter key '" << keys[i] << "' ." 
<< std::endl; } - else + this->fit(); +} + +List RandomPlantedForest::get_model() +{ + List model; + for (const auto &family : tree_families) { - purified = false; + List variables, family_values, family_intervals; + for (const auto &tree : family) + { + List tree_values; List tree_intervals; variables.push_back(from_std_set(tree.first)); + for (const auto &leaf : tree.second->leaves) + { + NumericMatrix leaf_values; for (const auto &val : leaf.value) leaf_values.push_back(val); + tree_values.push_back(leaf_values); + NumericVector intervals; for (const auto &interval : leaf.intervals) { intervals.push_back(interval.first); intervals.push_back(interval.second); } + NumericMatrix leaf_intervals(2, feature_size, intervals.begin()); tree_intervals.push_back(leaf_intervals); + } + family_intervals.push_back(tree_intervals); family_values.push_back(tree_values); + } + model.push_back(List::create(Named("variables") = variables, _["values"] = family_values, _["intervals"] = family_intervals)); } + return (model); } + + void RandomPlantedForest::cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) { @@ -616,1471 +744,10 @@ void RandomPlantedForest::cross_validation(int n_sets, IntegerVector splits, Num */ } -// predict single feature vector -std::vector RandomPlantedForest::predict_single(const std::vector &X, std::set component_index) -{ - - std::vector total_res = std::vector(value_size, 0); - - if (!purified) - { - // consider all components - if (component_index == std::set{0}) - { - for (auto &tree_family : this->tree_families) - { - for (auto &tree : tree_family) - { - for (auto &leaf : tree.second->leaves) - { - bool valid = true; - for (auto &dim : tree.first) - { - if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) && (leaf.intervals[std::max(0, dim - 1)].second > X[std::max(0, dim - 1)] || 
leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) - { - valid = false; - } - } - if (valid) - { - - // Rcout << leaf.value[0] << "\n"; - total_res += leaf.value; - } - } - } - } - } - else - { // choose components for prediction - for (auto &tree_family : this->tree_families) - { - for (auto &tree : tree_family) - { - - // only consider trees with same dimensions as component_index - if (tree.first != component_index) - continue; - - std::vector dims; - for (auto dim : tree.first) - { - dims.push_back(dim); - } - - for (auto &leaf : tree.second->leaves) - { - bool valid = true; - for (unsigned int i = 0; i < dims.size(); ++i) - { - - int dim = dims[i]; - - if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[i] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) && (leaf.intervals[std::max(0, dim - 1)].second > X[i] || leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) - { - valid = false; - } - } - if (valid) - total_res += leaf.value; - } - } - } - } - } - else - { - if (component_index == std::set{-1}) - { - for (auto &tree_family : this->tree_families) - { - for (auto &tree : tree_family) - { - std::vector leaf_index(tree.first.size(), -1); - // add value of null tree - if (tree.first == std::set{0}) - { - - // Rcout << tree.first.size() ; - leaf_index = std::vector(tree.first.size(), 0); - total_res += tree.second->GridLeaves.values[leaf_index]; - } - } - } - } - else if (component_index == std::set{0}) - { - for (auto &tree_family : this->tree_families) - { - for (auto &tree : tree_family) - { - std::vector leaf_index(tree.first.size(), -1); - - // add value of null tree - if (tree.first == std::set{0}) - { - - // Rcout << tree.first.size() ; - leaf_index = std::vector(tree.first.size(), 0); - } - else - { - - // go through limits of grid - for (size_t dim_index = 0; dim_index < tree.first.size(); ++dim_index) - { - // get dim at dim_index - int dim = 0; - 
{ - auto dim_pnt = tree.first.begin(); - std::advance(dim_pnt, dim_index); - dim = *dim_pnt; - --dim; // transform into index - } - - auto bounds = tree.second->GridLeaves.lim_list[dim]; - for (double bound : bounds) - { - - // check if sample in leaf at dimension - if (X[dim] < bound) - break; // changed - - // if no interval smaller, set to end of bounds, otherwise set to leaf index - leaf_index[dim_index] = std::min(leaf_index[dim_index] + 1, (int)bounds.size() - 2); - } - } - } - - // if interval of first leaf smaller smaller - for (int &index : leaf_index) - index = std::max(0, index); - - total_res += tree.second->GridLeaves.values[leaf_index]; - } - } - } - else - { - - for (auto &tree_family : this->tree_families) - { - for (auto &tree : tree_family) - { - - // only consider trees with same dimensions as component_index - if (tree.first != component_index) - continue; - - std::vector leaf_index(tree.first.size(), -1); - // add value of null tree - if (tree.first == std::set{0}) - { - leaf_index = std::vector(tree.first.size(), 0); - } - else - { - - // go through limits of grid - for (size_t dim_index = 0; dim_index < tree.first.size(); ++dim_index) - { - // get dim at dim_index - int dim = 0; - { - auto dim_pnt = tree.first.begin(); - std::advance(dim_pnt, dim_index); - dim = *dim_pnt; - --dim; // transform into index - } - - auto bounds = tree.second->GridLeaves.lim_list[dim]; - for (double bound : bounds) - { - // check if sample in leaf at dimension - if (X[dim_index] < bound) - break; // changed +// purify_1 moved to lib/purify.cpp - // if no interval smaller, set to end of bounds, otherwise set to leaf index - leaf_index[dim_index] = std::min(leaf_index[dim_index] + 1, (int)bounds.size() - 2); - } - } - } +// purify_2 moved to lib/purify.cpp - // if interval of first leaf smaller smaller - for (int &index : leaf_index) - index = std::max(0, index); - - total_res += tree.second->GridLeaves.values[leaf_index]; - } - } - } - } - - return total_res / 
n_trees; -} - -// predict multiple feature vectors -Rcpp::NumericMatrix RandomPlantedForest::predict_matrix(const NumericMatrix &X, const NumericVector components) -{ - std::vector> feature_vec = to_std_vec(X); - std::set component_index = to_std_set(components); - std::vector> predictions; - - // todo: sanity check for X - if (feature_vec.empty()) - throw std::invalid_argument("Feature vector is empty."); - if (component_index == std::set{0} && this->feature_size >= 0 && feature_vec[0].size() != (size_t)this->feature_size) - throw std::invalid_argument("Feature vector has wrong dimension."); - if (component_index != std::set{0} && component_index != std::set{-1} && component_index.size() != feature_vec[0].size()) - throw std::invalid_argument("The input X has the wrong dimension in order to calculate f_i(x)"); - - for (auto &vec : feature_vec) - { - predictions.push_back(predict_single(vec, component_index)); - } - - return from_std_vec(predictions); -} - -Rcpp::NumericMatrix RandomPlantedForest::predict_vector(const NumericVector &X, const NumericVector components) -{ - std::vector feature_vec = to_std_vec(X); - std::set component_index = to_std_set(components); - std::vector> predictions; - Rcpp::NumericMatrix res; - - // todo: sanity check for X - if (feature_vec.empty()) - { - Rcout << "Feature vector is empty." << std::endl; - return res; - } - - if (component_index == std::set{0} && this->feature_size >= 0 && feature_vec.size() != (size_t)this->feature_size) - { - Rcout << "Feature vector has wrong dimension." 
<< std::endl; - return res; - } - - if (component_index == std::set{0}) - { - predictions.push_back(predict_single(feature_vec, component_index)); - } - else - { - for (auto vec : feature_vec) - { - predictions.push_back(predict_single(std::vector{vec}, component_index)); - } - } - - res = from_std_vec(predictions); - return res; -} - -double RandomPlantedForest::MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true) -{ - return sum(Rcpp::pow(Y_true - Y_predicted, 2)) / Y_true.size(); -} - -double RandomPlantedForest::MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) -{ - // todo: multiclass - double sum = 0; - int Y_size = Y_predicted.size(); - - for (int i = 0; i < Y_size; ++i) - { - sum += MSE_vec(Y_predicted(i, _), Y_true(i, _)); - } - - return sum / Y_size; -} - -void RandomPlantedForest::purify_1() -{ - - // go through all n_trees families - for (auto &curr_family : this->tree_families) - { - - // recap maximum number of dimensions of current family - unsigned int curr_max = 0; - for (auto tree : curr_family) - { - if (tree.first.size() > curr_max) - curr_max = tree.first.size(); - } - - while (curr_max >= 1) - { - - // go through split dimensions of all trees - auto keys = getKeys(curr_family); - std::vector>::reverse_iterator key = keys.rbegin(); - while (key != keys.rend()) - { - - auto &curr_tree = curr_family[(*key)]; - std::set curr_dims = curr_tree->split_dims; - - // check if number of dims same as current max_interaction - if (curr_dims.size() == curr_max) - { - - // go through feature dims - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - - // continue only if dim in current tree - if (curr_tree->split_dims.count(feature_dim) != 0) - { - - std::set tree_dims = curr_tree->split_dims; - tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree - - // check if tree with dimensions exists, if not create - std::shared_ptr tree = treeExists(tree_dims, 
curr_family); - if (curr_max == 1) - { - tree = curr_family[std::set{0}]; - } - else - { - if (!tree) - { - curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); - tree = curr_family[tree_dims]; - } - } - - // go through leaves of current tree - int n_leaves = curr_tree->leaves.size(); - for (int l = 0; l < n_leaves; ++l) - { - auto &curr_leaf = curr_tree->leaves[l]; - - double multiplier = (curr_leaf.intervals[feature_dim - 1].second - curr_leaf.intervals[feature_dim - 1].first) / (upper_bounds[feature_dim - 1] - lower_bounds[feature_dim - 1]); - - // new leaf including intervals and value - Leaf new_leaf = curr_leaf; // initialize intervals with first leaf - new_leaf.intervals[feature_dim - 1].first = lower_bounds[feature_dim - 1]; - new_leaf.intervals[feature_dim - 1].second = upper_bounds[feature_dim - 1]; - for (size_t i = 0; i < value_size; ++i) - new_leaf.value[i] = -curr_leaf.value[i] * multiplier; // update value of new leaf - - // append new leaf - if (!leafExists(new_leaf.intervals, curr_tree)) - curr_tree->leaves.push_back(new_leaf); - for (size_t i = 0; i < value_size; ++i) - new_leaf.value[i] = curr_leaf.value[i] * multiplier; // update value of new leaf - if (!leafExists(new_leaf.intervals, tree)) - tree->leaves.push_back(new_leaf); - } - } - } - } - key++; - } - - // update currently considered dimension size - --curr_max; - } - } - - purified = true; -} - -void RandomPlantedForest::purify_2() -{ - - // go through all n_trees families - for (auto &curr_family : this->tree_families) - { - - // lim_list is a list giving for each variable all interval end-points - std::vector> lim_list(feature_size); - - // go through all variables of the component - for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) - { - std::vector bounds; - - // go through trees of family - for (const auto &curr_tree : curr_family) - { - - // consider only relevant trees that have current dimension as variable - if 
(!curr_tree.first.count(curr_dim)) - continue; - - // go through leaves of tree - for (const auto &curr_leaf : curr_tree.second->leaves) - { - // get interval ends of variable - bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); - } - } - std::sort(bounds.begin(), bounds.end()); - bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); - lim_list[curr_dim - 1] = bounds; - } - - // initialize values and individuals for each tree in family - std::vector grids(curr_family.size() - 1); - std::vector> individuals(curr_family.size() - 1); - std::vector>> values(curr_family.size() - 1); - std::vector> variables(curr_family.size() - 1); - - // ------------- setup finer grid ------------- - - int tree_index = 0; - for (const auto &curr_tree : curr_family) - { - - if (curr_tree.first == std::set{0}) - continue; // ignore null tree - - // fill space with dimensions - std::vector dimensions; - for (const auto &dim : curr_tree.first) - { - dimensions.push_back(lim_list[dim - 1].size() - 1); // size - 1 ? 
- } - - // setup grid for leaf indices - auto grid = grid::NDGrid(dimensions); - - // initialize data for current tree - grids[tree_index] = grid; - individuals[tree_index] = utils::Matrix(dimensions, 0); - values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed - variables[tree_index] = curr_tree.first; - - // fill grid points with individuals and values - while (!grid.nextPoint()) - { - - std::vector gridPoint = grid.getPoint(); - - bool in_leaf = true; - - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index = 0; - in_leaf = true; - for (const auto &dim : curr_tree.first) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - - // go through leaves of tree to sum up values - for (const auto &leaf : curr_tree.second->get_leaves()) - { - - in_leaf = true; - int dim_index = 0; - for (const auto &dim : curr_tree.first) - { - // consider values only if all in - if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // sum up values - if (in_leaf) - values[tree_index][gridPoint] += leaf.value; // todo: multiclass - } - } - - ++tree_index; - } - - // ------------- create new trees ------------- - - // insert null tree - grids.insert(grids.begin(), grid::NDGrid()); - values.insert(values.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); - individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); - variables.insert(variables.begin(), std::set{0}); - - // recap maximum number of dimensions of current family - unsigned int curr_max = 0; - for (const auto &tree : 
curr_family) - { - if (tree.first.size() > curr_max) - curr_max = tree.first.size(); - } - - auto keys = getKeys(curr_family); - while (curr_max > 1) - { - - // go through split dimensions of all trees - for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) - { - - auto &curr_tree = curr_family[(*key)]; - std::set curr_dims = curr_tree->split_dims; - - // check if number of dims same as current max_interaction - if (curr_dims.size() == curr_max) - { - - // go through feature dims - int dim_index = 0; - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - - // continue only if dim in current tree - if (curr_tree->split_dims.count(feature_dim) != 0) - { - - std::set tree_dims = curr_tree->split_dims; - tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree - - // check if tree with dimensions exists, if not create - std::shared_ptr tree = treeExists(tree_dims, curr_family); - if (!tree) - { - - // get index of old and new tree - auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); - curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); - auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); - - // remove matrix dimension of respective variable - std::vector matrix_dimensions = values[old_tree_index].dims; - matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); - - // initialize data for new tree - auto grid = grid::NDGrid(matrix_dimensions); - grids.insert(grids.begin() + tree_index, grid); - values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(0, value_size))); - individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); - variables.insert(variables.begin() + tree_index, tree_dims); - - // fill individuals of new trees - while (!grid.nextPoint()) - { - - std::vector gridPoint = 
grid.getPoint(); - bool in_leaf = true; - - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index = 0; - in_leaf = true; - for (const auto &dim : tree_dims) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - } - } - - dim_index++; - } - } - } - } - - // update currently considered dimension size - --curr_max; - } - - // ------------- purify ------------- - - // measure tolerance and number of iterations - std::vector tol(curr_family.size(), 1); - int iter; - - // iterate backwards through tree family - int curr_tree_index = curr_family.size() - 1; - for (TreeFamily::reverse_iterator curr_tree = curr_family.rbegin(); curr_tree != curr_family.rend(); ++curr_tree) - { - iter = 0; - std::set curr_dims = curr_tree->second->get_split_dims(); - - // do not purify null - if (curr_dims == std::set{0}) - continue; - - // repeat until tolerance small enough and (?) 
maximum number of iterations reached - while ((tol[curr_tree_index] > 0.00000000001) && (iter < 100)) - { - - // go through feature dims - int curr_dim_index = 0; - for (const auto &feature_dim : curr_dims) - { - - // get tree that has same variables as curr_tree minus j-variable - std::set tree_dims = curr_dims; - tree_dims.erase(tree_dims.find(feature_dim)); - int tree_index = 0; // if tree not exist, set to null tree - if (curr_family.find(tree_dims) != curr_family.end()) - tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)) - 1; - - // update values - if (grids[curr_tree_index].dimensions.size() == 1) - { // one dimensional case - - int sum_ind = 0; - std::vector avg(value_size, 0); - - // get sum of individuals - for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) - { - std::vector tmp{i}; - sum_ind += individuals[curr_tree_index][tmp]; - } - if (sum_ind == 0) - continue; - - // calc avg - for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) - { - std::vector tmp{i}; - avg += (individuals[curr_tree_index][tmp] * values[curr_tree_index][tmp]) / sum_ind; - } - - // update values of one dimensional and null tree - for (int i = 0; i < values[curr_tree_index].n_entries; ++i) - { - std::vector tmp{i}; - values[curr_tree_index][tmp] -= avg; - } - std::vector tmp{0}; - values[tree_index][tmp] += avg; - } - else - { // higher dimensional case - - // setup new grid without dimension j - std::vector new_dimensions = grids[curr_tree_index].dimensions; - int j_dim = new_dimensions[curr_dim_index]; - new_dimensions.erase(new_dimensions.begin() + curr_dim_index); - grid::NDGrid grid = grid::NDGrid(new_dimensions); - - // go through values without dimension j - while (!grid.nextPoint()) - { - auto gridPoint = grid.getPoint(); - gridPoint.push_back(0); - - int sum_ind = 0; - std::vector avg(value_size, 0); - - // go through slice to sum up individuals - for (int j = 0; j < j_dim; ++j) - { - gridPoint.back() = j; - - // get 
sum of individuals - sum_ind += individuals[curr_tree_index][gridPoint]; - } - - // go through slice to calc avg - for (int j = 0; j < j_dim; ++j) - { - gridPoint.back() = j; - - // calc avg - avg += (individuals[curr_tree_index][gridPoint] * values[curr_tree_index][gridPoint]) / sum_ind; - } - - // go through slice to update values - for (int j = 0; j < j_dim; ++j) - { - gridPoint.back() = j; - - // update values of current slice - values[curr_tree_index][gridPoint] -= avg; - } - - // update lower dimensional tree - gridPoint.pop_back(); - values[tree_index][gridPoint] += avg; - } - } - - ++curr_dim_index; - } - - // update tolerance - if (variables[curr_tree_index].size() == 1) - { - tol[curr_tree_index] = 1; // todo - } - else - { - tol[curr_tree_index] = 1; - } - - ++iter; - } - - --curr_tree_index; - } - - // ------------- attach to rpf class ------------- - - // fill with new trees - for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) - { - LeafGrid curr_gridLeaf; - curr_gridLeaf.grid = grids[tree_index]; - curr_gridLeaf.individuals = individuals[tree_index]; - curr_gridLeaf.lim_list = lim_list; - curr_gridLeaf.values = values[tree_index]; - curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; - } - } - - purified = true; -} - -void RandomPlantedForest::purify_3() -{ - - // go through all n_trees families - for (auto &curr_family : this->tree_families) - { - - // lim_list is a list giving for each variable all interval end-points - std::vector> lim_list(feature_size); - - // go through all variables of the component - for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) - { - std::vector bounds; - - // go through trees of family - for (const auto &curr_tree : curr_family) - { - - // consider only relevant trees that have current dimension as variable - if (!curr_tree.first.count(curr_dim)) - continue; - - // go through leaves of tree - for (const auto &curr_leaf : curr_tree.second->leaves) - { - // get interval ends of 
variable - bounds.push_back(curr_leaf.intervals[curr_dim - 1].first); - bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); - } - } - std::sort(bounds.begin(), bounds.end()); - bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); - // int i_last = bounds.size()-1; - // double bibi = bounds[i_last] + 0.0001; - // bounds[i_last] = bounds[i_last] + 0.0001; - lim_list[curr_dim - 1] = bounds; - } - - // initialize values and individuals for each tree in family - std::vector grids(curr_family.size() - 1); - std::vector> individuals(curr_family.size() - 1); - std::vector>> values(curr_family.size() - 1); - std::vector>> values_old(curr_family.size() - 1); - std::vector> variables(curr_family.size() - 1); - - // ------------- setup finer grid ------------- - - int tree_index = 0; - for (const auto &curr_tree : curr_family) - { - - if (curr_tree.first == std::set{0}) - { - - // values[tree_index] = rpf::Matrix>(dimensions, std::vector(value_size, 0)); // changed - continue; // ignore null tree - } - - // fill space with dimensions - std::vector dimensions; - for (const auto &dim : curr_tree.first) - { - dimensions.push_back(lim_list[dim - 1].size()); // size - 1 ? 
WICHTIG - } - - // setup grid for leaf indices - auto grid = grid::NDGrid(dimensions); - - // initialize data for current tree - grids[tree_index] = grid; - individuals[tree_index] = utils::Matrix(dimensions, 0); - values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed - values_old[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed - variables[tree_index] = curr_tree.first; - - // fill grid points with individuals and values - while (!grid.nextPoint()) - { - - std::vector gridPoint = grid.getPoint(); - - bool in_leaf = true; - - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index = 0; - in_leaf = true; - for (const auto &dim : curr_tree.first) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - - // go through leaves of tree to sum up values - for (const auto &leaf : curr_tree.second->get_leaves()) - { - - in_leaf = true; - int dim_index = 0; - for (const auto &dim : curr_tree.first) - { - // consider values only if all in - if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // sum up values - if (in_leaf) - { - - values[tree_index][gridPoint] += leaf.value; // todo: multiclass - values_old[tree_index][gridPoint] += leaf.value; // todo: multiclass - } - } - } - - ++tree_index; - } - - // Rcout << variables.size(); - // for(int i = 0; i>(std::vector{1}, std::vector(value_size, 0))); - values_old.insert(values_old.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); - individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); 
- variables.insert(variables.begin(), std::set{0}); - - // recap maximum number of dimensions of current family - unsigned int curr_max = curr_family.rbegin()->first.size(); - - while (curr_max > 1) - { - - auto keys = getKeys(curr_family); - // go through split dimensions of all trees - for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) - { - auto &curr_tree = curr_family[(*key)]; - std::set curr_dims = curr_tree->split_dims; - // check if number of dims same as current max_interaction - if (curr_dims.size() == curr_max) - { - // go through feature dims - int dim_index = 0; - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - // continue only if dim in current tree - if (curr_tree->split_dims.count(feature_dim) != 0) - { - std::set tree_dims = curr_tree->split_dims; - tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree - // check if tree with dimensions exists, if not create - std::shared_ptr tree = treeExists(tree_dims, curr_family); - if (!tree) - { - // get index of old and new tree - auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); - curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); - auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); - // remove matrix dimension of respective variable - std::vector matrix_dimensions = values[old_tree_index].dims; - // std::vector matrix_dimensions = values_old[old_tree_index].dims; - - // Rcout << typeof(matrix_dimensions.begin()) << std::endl; - - matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); - // initialize data for new tree - auto grid = grid::NDGrid(matrix_dimensions); - grids.insert(grids.begin() + tree_index, grid); - values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); - values_old.insert(values_old.begin() + 
tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); - individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); - variables.insert(variables.begin() + tree_index, tree_dims); - // fill individuals of new trees - while (!grid.nextPoint()) - { - std::vector gridPoint = grid.getPoint(); - bool in_leaf = true; - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index2 = 0; - in_leaf = true; - for (const auto &dim : tree_dims) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index2]]) && (val < lim_list[dim - 1][gridPoint[dim_index2] + 1]))) - in_leaf = false; - ++dim_index2; - } - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - } - } - dim_index++; - } - } - } - } - // update currently considered dimension size - --curr_max; - } - - // Rcout << std::endl; - // Rcout << std::endl; - // Rcout << std::endl; - // - // for(int i = 0; i curr_dims = *tree_t; - // do not purify null - if (curr_dims == std::set{0}) - continue; - // Rcout << std::endl << tree_index_t << " - T: "; - // Rcout << "tree_t:"; - // for(auto dim: curr_dims) Rcout << dim << ", "; - // Rcout << std::endl; - - auto grid = grids[tree_index_t]; - // Rcout << "Grid dimensions of T: "; - // for(auto dim: grid.dimensions) Rcout << dim << ", "; - // Rcout << std::endl; - // go through subtrees of t - int tree_index_u = variables.size(); - for (auto tree_u = variables.rbegin(); tree_u != variables.rend(); ++tree_u) - { - --tree_index_u; - // j_dims = dims of t without u - std::set j_dims = curr_dims; - if (tree_u->size() > curr_dims.size()) - continue; - // check if subset - bool subset = true; - for (const auto dim : *tree_u) - { - if (tree_t->count(dim) == 0) - { - subset = false; - break; - } - j_dims.erase(dim); - } - if (!subset) - continue; - - // Rcout << "Hello"; - // Rcout << " " << tree_index_u << " - U: "; 
- // for(auto dim: *tree_u) Rcout << dim << ", "; - // Rcout << std::endl; - // Rcout << " Individuals: "; - - double tot_sum = 0; - grid = grids[tree_index_u]; - while (!grid.nextPoint()) - { - auto gridPoint = grid.getPoint(); - // Rcout << individuals[tree_index_u][gridPoint] << ", "; - tot_sum += individuals[tree_index_u][gridPoint]; - } - // Rcout << "Total sum: " << tot_sum << std::endl; - // Rcout << std::endl; - - grid = grids[tree_index_u]; - // Rcout << " Grid dimensions of U: "; - // for(auto dim: grid.dimensions) Rcout << dim << ", "; - // Rcout << std::endl; - - // Rcout<< "j_dims: "< update(value_size, 0); - - if (j_dims.size() == 0) - { - - // grid = grids[tree_index_u]; - while (!grid.nextPoint()) - { - auto gridPoint_i = grid.getPoint(); - // Rcout << " " << "i: "; - // for(auto p: gridPoint_i) Rcout << p << ", "; - // Rcout << std::endl << " "; - double curr_sum = individuals[tree_index_u][gridPoint_i]; - // Rcout << ", Current Sum: " << curr_sum << std::endl; - // Rcout << std::endl << " " << "i, j: "; - update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_i]; - // Rcout << std::endl; - } - - int tree_index_s = variables.size(); - for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) - { - - // Rcout << "tree_s:"; - // for(auto dim: *tree_s) Rcout << dim << ", "; - // Rcout << std::endl; - - --tree_index_s; - if (*tree_s == std::set{0}) - { - - auto gridPoint_0 = std::vector{0}; - values[tree_index_s][gridPoint_0] += update; - // Rcout << std::endl; - //} - - /* - for(auto tree_0: curr_family){ - - if(tree_0.first == std::set{0}){ - - Rcout << tree_0.first.size(); - std::vector leaf_index(tree_0.first.size(), 0); - std::vector leaf_index(tree_0.second->GridLeaves.values.size(), 0); - - int Test = tree_0.second->GridLeaves.values.size(); - Rcout << Test; - tree_0.second->GridLeaves.values[leaf_index] += update; - } - } - */ - } - else - { - - // check if S subset of T - - bool subset = true; - for (const auto 
dim : *tree_s) - { - if (tree_t->count(dim) == 0) - { - subset = false; - break; - } - } - if (!subset) - continue; - - // Rcout << pow(-1, (*tree_s).size()) << std::endl; - - auto grid_k = grids[tree_index_s]; - while (!grid_k.nextPoint()) - { - auto gridPoint_k = grid_k.getPoint(); - // - // if((*tree_s).size()>2){ - // Rcout << std::endl << " " << "j, k: "; - // for(auto p: gridPoint_k) Rcout << p << ", "; - // Rcout << std::endl; - // } - // - // Rcout << pow(-1, (*tree_s).size()) * update << std::endl; - values[tree_index_s][gridPoint_k] += pow(-1, (*tree_s).size()) * update; - } - } - } - // Rcout << std::endl; - } - else - { - - std::vector j_sizes(j_dims.size(), 0); - for (size_t j = 0; j < j_dims.size(); ++j) - { - auto tmp = j_dims.begin(); - std::advance(tmp, j); - int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); - j_sizes[j] = grids[tree_index_t].dimensions[j_index]; - } - - // Rcout<<"Hello 1"; - - grid::NDGrid grid_j = grid::NDGrid(j_sizes); - while (!grid_j.nextPoint()) - { - - std::vector update(value_size, 0); - auto gridPoint_j = grid_j.getPoint(); - // Rcout << " " << "j: "; - // for(auto p: gridPoint_j) Rcout << p << ", "; - // Rcout << std::endl; - // calc update - grid = grids[tree_index_u]; - while (!grid.nextPoint()) - { - auto gridPoint_i = grid.getPoint(); - // Rcout << " " << "i: "; - // for(auto p: gridPoint_i) Rcout << p << ", "; - // Rcout << std::endl << " "; - double curr_sum = individuals[tree_index_u][gridPoint_i]; - // Rcout << ", Current Sum: " << curr_sum << std::endl; - std::vector gridPoint_ij(tree_t->size(), 0); - for (size_t j = 0; j < gridPoint_j.size(); ++j) - { - auto j_dim = j_dims.begin(); - std::advance(j_dim, j); - int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*j_dim)); - // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; - gridPoint_ij[j_index] = gridPoint_j[j]; - } - for (size_t i = 0; i < gridPoint_i.size(); ++i) - 
{ - auto i_dim = tree_u->begin(); - std::advance(i_dim, i); - int i_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*i_dim)); - // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; - gridPoint_ij[i_index] = gridPoint_i[i]; - } - // Rcout << std::endl << " " << "i, j: "; - // for(auto p: gridPoint_ij) Rcout << p << ", "; - // Rcout << std::endl; - update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_ij]; - // Rcout << std::endl; - } - - // Rcout << "Hello_2"; - // update trees - int tree_index_s = variables.size(); - for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) - { - --tree_index_s; - // check if T\U=j_dims subset of S and S subset of T - bool subset = true; - for (const auto dim : j_dims) - { - if (tree_s->count(dim) == 0) - { - subset = false; - break; - } - } - for (const auto dim : *tree_s) - { - if (tree_t->count(dim) == 0) - { - subset = false; - break; - } - } - if (!subset) - continue; - // Rcout << " " << "S: "; - // for(auto dim: *tree_s) Rcout << dim << ", "; - // Rcout << std::endl; - // S cap U - std::set k_dims = *tree_s; - std::set k_dims_h1 = *tree_s; - std::set k_dims_h2 = *tree_u; - for (const auto dim : *tree_u) - k_dims.insert(dim); - for (const auto dim : *tree_s) - k_dims_h2.erase(dim); - for (const auto dim : *tree_u) - k_dims_h1.erase(dim); - for (const auto dim : k_dims_h1) - k_dims.erase(dim); - for (const auto dim : k_dims_h2) - k_dims.erase(dim); - - // std::set k_dims = *tree_s; - // for(const auto dim: *tree_t) k_dims.erase(dim); - // for(const auto dim: *tree_u) k_dims.insert(dim); - - // Rcout << " " << "k_dims: "; - // for(auto dim: k_dims) Rcout << dim << ", "; - // Rcout << std::endl; - - if (k_dims.size() == 0) - { - - values[tree_index_s][gridPoint_j] += pow(-1, (*tree_s).size() - j_dims.size()) * update; - } - else - { - - // Rcout <<"k_dims :"; - // for(auto dim: k_dims) Rcout << dim << ", "; - // Rcout << std::endl; - - std::vector 
k_sizes(k_dims.size(), 0); - for (size_t k = 0; k < k_dims.size(); ++k) - { - auto tmp = k_dims.begin(); - std::advance(tmp, k); - int k_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); - k_sizes[k] = grids[tree_index_t].dimensions[k_index]; - } - // Rcout << " " << "k_sizes: "; - // for(auto dim: k_sizes) Rcout << dim << ", "; - // Rcout << std::endl; - grid::NDGrid grid_k = grid::NDGrid(k_sizes); - while (!grid_k.nextPoint()) - { - auto gridPoint_k = grid_k.getPoint(); - // Rcout << " " << "k: "; - // for(auto p: gridPoint_k) Rcout << p << ", "; - // Rcout << std::endl << " "; - std::vector gridPoint_jk(tree_s->size(), 0); - for (size_t j = 0; j < gridPoint_j.size(); ++j) - { - auto j_dim = j_dims.begin(); - std::advance(j_dim, j); - int j_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*j_dim)); - // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; - gridPoint_jk[j_index] = gridPoint_j[j]; - } - for (size_t k = 0; k < gridPoint_k.size(); ++k) - { - auto k_dim = k_dims.begin(); - std::advance(k_dim, k); - int k_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*k_dim)); - // Rcout << " k_dim=" << *k_dim << ", k_index=" << k_index; - gridPoint_jk[k_index] = gridPoint_k[k]; - } - // Rcout << std::endl << " " << "j, k: "; - // for(auto p: gridPoint_jk) Rcout << p << ", "; - // Rcout << std::endl; - - // Rcout << pow(-1, (*tree_s).size() - j_dims.size()) * update[0]; - values[tree_index_s][gridPoint_jk] += pow(-1, (*tree_s).size() - j_dims.size()) * update; - } - } - } - } - } - } - --tree_index_t; - } - - // ------------- attach to rpf class ------------- - - // fill with new trees - for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) - { - LeafGrid curr_gridLeaf; - curr_gridLeaf.grid = grids[tree_index]; - curr_gridLeaf.individuals = individuals[tree_index]; - curr_gridLeaf.lim_list = lim_list; - curr_gridLeaf.values = 
values[tree_index]; - curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; - } - } - - purified = true; -} - -void RandomPlantedForest::print() -{ - for (int n = 0; n < n_trees; ++n) - { - TreeFamily family = tree_families[n]; - auto keys = getKeys(family); - for (size_t m = 0; m < keys.size(); ++m) - { - DecisionTree tree = *(family[keys[m]]); - Rcout << m + 1 << " Tree: "; - Rcout << "Dims="; - for (const auto &dim : tree.split_dims) - Rcout << dim << ","; - Rcout << std::endl - << "Leaves: (" << tree.leaves.size() << ")" << std::endl; - for (const auto &leaf : tree.leaves) - { - Rcout << "Intervals="; - for (const auto &interval : leaf.intervals) - { - Rcout << interval.first << "," << interval.second << "/"; - } - Rcout << " Value="; - for (const auto &val : leaf.value) - Rcout << val << ", "; - Rcout << std::endl; - } - Rcout << std::endl; - } - Rcout << std::endl - << std::endl; - } -} - -// print parameters of the model to the console -void RandomPlantedForest::get_parameters() -{ - Rcout << "Parameters: n_trees=" << n_trees << ", n_splits=" << n_splits << ", max_interaction=" << max_interaction << ", t_try=" << t_try - << ", split_try=" << split_try << ", purified=" << purified << ", deterministic=" << deterministic << ", nthreads=" << nthreads - << ", feature_size=" << feature_size << ", sample_size=" << sample_size << std::endl; -} - -/* retrospectively change parameters of existing class object, - updates the model, so far only single valued parameters supported, - for replacing training data use 'set_data', - note that changing cv does not trigger cross validation */ -void RandomPlantedForest::set_parameters(StringVector keys, NumericVector values) -{ - if (keys.size() != values.size()) - { - Rcout << "Size of input vectors is not the same. 
" << std::endl; - return; - } - - for (unsigned int i = 0; i < keys.size(); ++i) - { - if (keys[i] == "deterministic") - { - this->deterministic = values[i]; - } - else if (keys[i] == "nthreads") - { - this->nthreads = values[i]; - } - else if (keys[i] == "purify") - { - this->purify_forest = values[i]; - } - else if (keys[i] == "n_trees") - { - this->n_trees = values[i]; - } - else if (keys[i] == "n_splits") - { - this->n_splits = values[i]; - } - else if (keys[i] == "t_try") - { - this->t_try = values[i]; - } - else if (keys[i] == "split_try") - { - this->split_try = values[i]; - } - else if (keys[i] == "max_interaction") - { - this->max_interaction = values[i]; - } - else if (keys[i] == "cv") - { - this->cross_validate = values[i]; - } - else - { - Rcout << "Unkown parameter key '" << keys[i] << "' ." << std::endl; - } - } - this->fit(); -} - -List RandomPlantedForest::get_model() -{ - List model; - for (const auto &family : tree_families) - { - List variables, family_values, family_intervals; - for (const auto &tree : family) - { - List tree_values; - List tree_intervals; - variables.push_back(from_std_set(tree.first)); - for (const auto &leaf : tree.second->leaves) - { - NumericMatrix leaf_values; - for (const auto &val : leaf.value) - { - leaf_values.push_back(val); - } - tree_values.push_back(leaf_values); - - NumericVector intervals; - for (const auto &interval : leaf.intervals) - { - intervals.push_back(interval.first); - intervals.push_back(interval.second); - } - NumericMatrix leaf_intervals(2, feature_size, intervals.begin()); - tree_intervals.push_back(leaf_intervals); - } - family_intervals.push_back(tree_intervals); - family_values.push_back(tree_values); - } - model.push_back(List::create(Named("variables") = variables, _["values"] = family_values, _["intervals"] = family_intervals)); - } - return (model); -} +// purify_3 moved to lib/purify.cpp diff --git a/src/lib/splits_cur_trees_1.cpp b/src/lib/splits_cur_trees_1.cpp new file mode 100644 index 
0000000..5658a44 --- /dev/null +++ b/src/lib/splits_cur_trees_1.cpp @@ -0,0 +1,203 @@ +// Split-mode: cur_trees_1. Samples feasible leaves proportionally to their +// number of candidate thresholds, then evaluates a single threshold. +#include "rpf.hpp" +#include "internal_utils.hpp" + +using namespace rpf_utils; + +Split RandomPlantedForest::calcOptimalSplit_curTrees1(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family) +{ + Split curr_split, min_split; min_split.min_sum = std::numeric_limits::infinity(); curr_split.Y = &Y; + + unsigned int raw = (unsigned int)std::ceil(this->t_try * possible_splits.size()); + unsigned int upper = std::min((unsigned int)this->max_candidates_, (unsigned int)possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw, upper)); + std::vector weights(possible_splits.size()); + + for (size_t i = 0; i < possible_splits.size(); ++i) weights[i] = std::exp(-this->split_decay_rate_ * possible_splits[i].age); + + size_t positive_count_ = 0; for (double w : weights) if (w > 0.0) ++positive_count_; + + if (positive_count_ == 0) { n_candidates = 1; } + else { if (n_candidates > positive_count_) n_candidates = static_cast(positive_count_); } + + std::vector sample_idxs; sample_idxs.reserve(n_candidates); + if (!this->deterministic) { + std::vector pos_idx; pos_idx.reserve(possible_splits.size()); + std::vector pos_w; pos_w.reserve(possible_splits.size()); + for (size_t i = 0; i < weights.size(); ++i) if (weights[i] > 0.0) { pos_idx.push_back(i); pos_w.push_back(weights[i]); } + const size_t P = pos_idx.size(); + if (P == 0) { + std::vector all(possible_splits.size()); std::iota(all.begin(), all.end(), 0); + size_t k = std::min(n_candidates, all.size()); + for (size_t i = 0; i < k; ++i) { size_t j = i + static_cast(rng_runif01() * (double)(all.size() - i)); if (j >= all.size()) j = all.size() - 1; std::swap(all[i], all[j]); } + for (size_t i = 0; i < k; ++i) 
sample_idxs.push_back(all[i]); + } else if (n_candidates * 8 < P) { + size_t k2 = std::min(n_candidates, P); + std::vector> keys; keys.reserve(P); + for (size_t i = 0; i < P; ++i) { double u = rng_runif01(); if (u <= 0.0) u = std::numeric_limits::min(); double key = -std::log(u) / pos_w[i]; keys.emplace_back(key, pos_idx[i]); } + if (k2 < keys.size()) { std::nth_element(keys.begin(), keys.begin() + k2, keys.end(), [](const auto& a, const auto& b){ return a.first < b.first; }); keys.resize(k2); } + for (auto &kv : keys) sample_idxs.push_back(kv.second); + } else { + size_t k = std::min(n_candidates, P); + std::vector> keys; keys.reserve(P); + for (size_t i = 0; i < P; ++i) { double u = rng_runif01(); if (u <= 0.0) u = std::numeric_limits::min(); double key = -std::log(u) / pos_w[i]; keys.emplace_back(key, pos_idx[i]); } + if (k < keys.size()) { std::nth_element(keys.begin(), keys.begin() + k, keys.end(), [](const auto& a, const auto& b){ return a.first < b.first; }); keys.resize(k); } + for (auto &kv : keys) sample_idxs.push_back(kv.second); + } + } else { for (size_t i=0;i+n_candidates<=possible_splits.size() && i< n_candidates;++i) sample_idxs.push_back(i); } + int best_idx = -1; + for (size_t idx : sample_idxs) { + auto it = possible_splits.begin(); std::advance(it, idx); int k = it->dim - 1; int leaf_size = this->n_leaves[k]; + std::set Dprime_minus_k = it->tree->split_dims; Dprime_minus_k.erase(k + 1); Dprime_minus_k.erase(0); + std::vector> sources; sources.reserve(2); + if (Dprime_minus_k.empty()) { if (auto itZero = curr_family.find(std::set{0}); itZero != curr_family.end()) sources.push_back(itZero->second); } + else { if (auto itS = curr_family.find(Dprime_minus_k); itS != curr_family.end()) sources.push_back(itS->second); } + if (auto itD = curr_family.find(it->tree->split_dims); itD != curr_family.end()) if (sources.empty() || sources.back().get() != itD->second.get()) sources.push_back(itD->second); + + if (!this->deterministic) { + auto 
ensure_weights_cache = [&](const std::shared_ptr& tree, int kdim){ + // Lazy-size vectors to feature_size once + if ((int)tree->weights_epoch_by_dim_v.size() < this->feature_size) { + tree->weights_epoch_by_dim_v.assign((size_t)this->feature_size, -1); + tree->fenwick_by_dim_v.assign((size_t)this->feature_size, std::vector()); + tree->leaf_weights_by_dim_v.assign((size_t)this->feature_size, std::vector()); + tree->weights_total_by_dim_v.assign((size_t)this->feature_size, 0.0); + } + // Recompute BIT if epoch mismatches or size changed + bool need = true; + if (tree->weights_epoch_by_dim_v[(size_t)kdim] == tree->weights_epoch) { + if (tree->fenwick_by_dim_v[(size_t)kdim].size() == tree->leaves.size()) need = false; + } + if (!need) return; // cache fresh + const size_t L = tree->leaves.size(); + std::vector bit(L, 0.0), wts(L, 0.0); + double total = 0.0; + for (size_t li = 0; li < L; ++li) { + auto &leaf = tree->leaves[li]; + // Determine number of unique thresholds available in this leaf for kdim + size_t unique_count = 0; + auto it_uc = leaf.unique_count_cache.find(kdim); + if (it_uc != leaf.unique_count_cache.end()) { + unique_count = it_uc->second; + } else { + // Build or reuse sorted values, then count uniques + std::vector order_cf; std::vector sorted_vals_cf; + ensure_order_and_sorted_vals_for_leaf(X, leaf, kdim, order_cf, sorted_vals_cf); + if (!sorted_vals_cf.empty()) { + unique_count = 1; + for (size_t i = 1; i < sorted_vals_cf.size(); ++i) + if (sorted_vals_cf[i] != sorted_vals_cf[i - 1]) ++unique_count; + } + leaf.unique_count_cache[kdim] = unique_count; + } + // Weight = number of unique thresholds that respect min leaf size + const long width_unique = (long)unique_count - 2L * (long)leaf_size; + const double w = (width_unique > 0L) ? 
static_cast(width_unique) : 0.0; + wts[li] = w; total += w; if (w != 0.0) rpf_utils::fenwick_add(bit, li + 1, w); + } + tree->fenwick_by_dim_v[(size_t)kdim] = std::move(bit); + tree->leaf_weights_by_dim_v[(size_t)kdim] = std::move(wts); + tree->weights_total_by_dim_v[(size_t)kdim] = total; + tree->weights_epoch_by_dim_v[(size_t)kdim] = tree->weights_epoch; + }; + + struct SourceInfo { std::shared_ptr tree; double total; }; + std::vector src_info; src_info.reserve(sources.size()); + double total_all = 0.0; + for (const auto &src_tree : sources) { + if (!src_tree || src_tree->leaves.empty()) continue; + ensure_weights_cache(src_tree, k); + double tot = src_tree->weights_total_by_dim_v[(size_t)k]; + if (tot <= 0.0) continue; + src_info.push_back({src_tree, tot}); + total_all += tot; + } + if (src_info.empty() || total_all <= 0.0) continue; + + for (size_t t = 0; t < (size_t)this->split_try; ++t) { + // Sample a source tree proportionally to its total weight + double r_src = rng_runif(0.0, total_all); + size_t si = 0; + while (si + 1 < src_info.size() && r_src > src_info[si].total) { r_src -= src_info[si].total; ++si; } + auto &sel_tree = src_info[si].tree; + + // Sample a leaf within the selected tree using prefix sums + const auto &bit = sel_tree->fenwick_by_dim_v[(size_t)k]; + double tot_leaf = src_info[si].total; + if (bit.empty() || tot_leaf <= 0.0) continue; + double r_leaf = rng_runif(0.0, tot_leaf); + size_t leaf_idx_sel = rpf_utils::fenwick_find_by_prefix(bit, r_leaf); + if (leaf_idx_sel == 0) continue; // safety + leaf_idx_sel -= 1; // to 0-based + if (leaf_idx_sel >= sel_tree->leaves.size()) continue; + Leaf *leaf_ptr = &sel_tree->leaves[leaf_idx_sel]; + // Sample by unique thresholds: build/reuse unique values for this leaf and dim + std::vector order_cf; std::vector sorted_vals_cf; + ensure_order_and_sorted_vals_for_leaf(X, *leaf_ptr, k, order_cf, sorted_vals_cf); + size_t unique_count = 0; std::vector* unique_ptr = nullptr; + if 
(leaf_ptr->unique_vals_cache.count(k)) { + unique_ptr = &leaf_ptr->unique_vals_cache[k]; + unique_count = unique_ptr->size(); + leaf_ptr->unique_count_cache[k] = unique_count; + } else { + auto uniques = compute_unique_sorted_values(sorted_vals_cf); + unique_count = uniques.size(); + leaf_ptr->unique_count_cache[k] = unique_count; + leaf_ptr->unique_vals_cache[k] = std::move(uniques); + unique_ptr = &leaf_ptr->unique_vals_cache[k]; + } + const int left = leaf_size; const int right_exclusive = (int)unique_count - leaf_size; + if (right_exclusive - left <= 1) continue; + int s_idx = rng_randint(left, right_exclusive); + double sp = (*unique_ptr)[(size_t)s_idx]; + + size_t ns = 0, nb = 0; std::vector sum_s_adj(this->value_size, 0.0), sum_b_adj(this->value_size, 0.0); + for (int ind : leaf_ptr->individuals) { + if (X[ind][k] < sp) { ++ns; for (size_t p = 0; p < this->value_size; ++p) sum_s_adj[p] += Y[ind][p]; } + else { ++nb; for (size_t p = 0; p < this->value_size; ++p) sum_b_adj[p] += Y[ind][p]; } + } + if (ns == 0 || nb == 0) continue; double loss = 0.0; + for (size_t p = 0; p < this->value_size; ++p) { loss -= (sum_s_adj[p] * sum_s_adj[p]) / (double)ns; loss -= (sum_b_adj[p] * sum_b_adj[p]) / (double)nb; } + if (loss < min_split.min_sum) { min_split.min_sum = loss; min_split.tree_index = sel_tree; min_split.leaf_index = leaf_ptr; min_split.split_coordinate = k + 1; min_split.split_point = sp; best_idx = (int)idx; min_split.sum_s = sum_s_adj; min_split.sum_b = sum_b_adj; } + } + } else { + for (const auto &src_tree : sources) { + if (src_tree->leaves.empty()) continue; + for (auto &leaf : src_tree->leaves) { + std::vector order_cf; std::vector sorted_vals_cf; ensure_order_and_sorted_vals_for_leaf(X, leaf, k, order_cf, sorted_vals_cf); + // Build/reuse unique values for deterministic sampling across unique thresholds + size_t unique_count = 0; std::vector* unique_ptr = nullptr; + if (leaf.unique_vals_cache.count(k)) { + unique_ptr = &leaf.unique_vals_cache[k]; + 
unique_count = unique_ptr->size(); + leaf.unique_count_cache[k] = unique_count; + } else { + auto uniques = compute_unique_sorted_values(sorted_vals_cf); + unique_count = uniques.size(); + leaf.unique_count_cache[k] = unique_count; + leaf.unique_vals_cache[k] = std::move(uniques); + unique_ptr = &leaf.unique_vals_cache[k]; + } + if ((int)unique_count <= 2 * leaf_size) continue; int left = leaf_size; int right = (int)unique_count - leaf_size; + std::vector samples = compute_even_spread_indices(left, right, (size_t)this->split_try); + for (int s_idx : samples) { + const double sp = (*unique_ptr)[(size_t)s_idx]; size_t ns = 0, nb = 0; std::vector sum_s_adj(this->value_size, 0.0), sum_b_adj(this->value_size, 0.0); + for (int ind : leaf.individuals) { if (X[ind][k] < sp) { ++ns; for (size_t p = 0; p < this->value_size; ++p) sum_s_adj[p] += Y[ind][p]; } else { ++nb; for (size_t p = 0; p < this->value_size; ++p) sum_b_adj[p] += Y[ind][p]; } } + if (ns == 0 || nb == 0) continue; double loss = 0.0; for (size_t p = 0; p < this->value_size; ++p) { loss -= (sum_s_adj[p] * sum_s_adj[p]) / (double)ns; loss -= (sum_b_adj[p] * sum_b_adj[p]) / (double)nb; } + if (loss < min_split.min_sum) { min_split.min_sum = loss; min_split.tree_index = src_tree; min_split.leaf_index = &leaf; min_split.split_coordinate = k + 1; min_split.split_point = sp; best_idx = (int)idx; min_split.sum_s = sum_s_adj; min_split.sum_b = sum_b_adj; } + } + } + } + } + } + + rpf_utils::age_pool_by_sample(sample_idxs, best_idx, possible_splits); + finalize_split_from_sums(min_split, X, this->value_size); + return min_split; +} + + diff --git a/src/lib/splits_cur_trees_2.cpp b/src/lib/splits_cur_trees_2.cpp new file mode 100644 index 0000000..8bfdfe1 --- /dev/null +++ b/src/lib/splits_cur_trees_2.cpp @@ -0,0 +1,165 @@ +// Split-mode: cur_trees_2. Tries random thresholds across all leaves of +// predecessor/current trees, using age decay for candidate sampling. 
+#include "rpf.hpp" +#include "internal_utils.hpp" + +using namespace rpf_utils; + +Split RandomPlantedForest::calcOptimalSplit_curTrees2(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family) +{ + Split curr_split, min_split; + min_split.min_sum = std::numeric_limits::infinity(); + curr_split.Y = &Y; + + unsigned int raw_candidates = static_cast(std::ceil(this->t_try * possible_splits.size())); + unsigned int upper = std::min(this->max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + + std::vector weights(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) weights[i] = std::exp(-this->split_decay_rate_ * possible_splits[i].age); + std::vector sample_idxs; sample_idxs.reserve(n_candidates); + if (!this->deterministic) { + std::vector pos_idx; pos_idx.reserve(possible_splits.size()); + std::vector pos_w; pos_w.reserve(possible_splits.size()); + for (size_t i = 0; i < weights.size(); ++i) if (weights[i] > 0.0) { pos_idx.push_back(i); pos_w.push_back(weights[i]); } + const size_t P = pos_idx.size(); + if (P == 0) { + std::vector all(possible_splits.size()); std::iota(all.begin(), all.end(), 0); + size_t k = std::min(n_candidates, all.size()); + for (size_t i = 0; i < k; ++i) { size_t j = i + static_cast(rng_runif01() * (double)(all.size() - i)); if (j >= all.size()) j = all.size() - 1; std::swap(all[i], all[j]); } + for (size_t i = 0; i < k; ++i) sample_idxs.push_back(all[i]); + } else if (n_candidates * 8 < P) { + size_t k2 = std::min(n_candidates, P); + std::vector> keys; keys.reserve(P); + for (size_t i = 0; i < P; ++i) { double u = rng_runif01(); if (u <= 0.0) u = std::numeric_limits::min(); double key = -std::log(u) / pos_w[i]; keys.emplace_back(key, pos_idx[i]); } + if (k2 < keys.size()) { std::nth_element(keys.begin(), keys.begin() + k2, keys.end(), [](const auto& a, const auto& b){ return a.first < b.first; }); 
keys.resize(k2); } + for (auto &kv : keys) sample_idxs.push_back(kv.second); + } else { + size_t k = std::min(n_candidates, P); + std::vector> keys; keys.reserve(P); + for (size_t i = 0; i < P; ++i) { double u = rng_runif01(); if (u <= 0.0) u = std::numeric_limits::min(); double key = -std::log(u) / pos_w[i]; keys.emplace_back(key, pos_idx[i]); } + if (k < keys.size()) { std::nth_element(keys.begin(), keys.begin() + k, keys.end(), [](const auto& a, const auto& b){ return a.first < b.first; }); keys.resize(k); } + for (auto &kv : keys) sample_idxs.push_back(kv.second); + } + } else { for (size_t i=0;idim - 1; + int leaf_size = this->n_leaves[k]; + + std::set tree_dims = it->tree->split_dims; + tree_dims.erase(k + 1); tree_dims.erase(0); + + std::vector> curr_trees; + if (tree_dims.empty()) { + auto itZero = curr_family.find(std::set{0}); + if (itZero != curr_family.end() && itZero->second) curr_trees.push_back(itZero->second); + } + if (auto itS = curr_family.find(tree_dims); itS != curr_family.end() && itS->second) curr_trees.push_back(itS->second); + if (auto itD = curr_family.find(it->tree->split_dims); itD != curr_family.end() && itD->second) { + if (curr_trees.empty() || curr_trees.back().get() != itD->second.get()) curr_trees.push_back(itD->second); + } + + for (auto &curr_tree : curr_trees) { + if (curr_tree->leaves.empty()) continue; + for (auto &leaf : curr_tree->leaves) { + // Reuse cached order and sorted values + std::vector order_cf; std::vector sorted_vals_cf; + ensure_order_and_sorted_vals_for_leaf(X, leaf, k, order_cf, sorted_vals_cf); + // Unique count & values caching + std::vector *unique_ptr = nullptr; + size_t unique_count = 0; + if (leaf.unique_count_cache.count(k)) { + unique_count = leaf.unique_count_cache[k]; + if (unique_count != 0 && leaf.unique_vals_cache.count(k)) unique_ptr = &leaf.unique_vals_cache[k]; + } + if (!unique_ptr) { + auto uniques = compute_unique_sorted_values(sorted_vals_cf); + unique_count = uniques.size(); + 
leaf.unique_count_cache[k] = unique_count; + leaf.unique_vals_cache[k] = std::move(uniques); + unique_ptr = &leaf.unique_vals_cache[k]; + } + if (unique_count < 2 * static_cast(leaf_size)) continue; + + const size_t m = leaf.individuals.size(); + std::vector samples; + if (this->deterministic) { + int maxp = std::min((int)unique_count - 1, 9); + samples.resize(maxp); std::iota(samples.begin(), samples.end(), 1); + } else { + samples.resize(this->split_try); + for (size_t i = 0; i < samples.size(); ++i) samples[i] = rng_randint(leaf_size, (int)unique_count - leaf_size); + std::sort(samples.begin(), samples.end()); + } + const bool single_eval = (samples.size() == 1); + std::vector> prefix_cf; // [value_size][m] + std::vector total_cf; // [value_size] + if (!single_eval) build_prefix_and_total_given_order(Y, leaf, order_cf, this->value_size, prefix_cf, total_cf); + + for (size_t si = 0; si < samples.size(); ++si) { + const double sp = (*unique_ptr)[samples[si]]; + size_t pos = static_cast(std::lower_bound(sorted_vals_cf.begin(), sorted_vals_cf.end(), sp) - sorted_vals_cf.begin()); + if (pos == 0 || pos >= m) continue; + if (pos < static_cast(leaf_size) || (m - pos) < static_cast(leaf_size)) continue; + double loss = 0.0; + if (!single_eval) { + for (size_t p = 0; p < this->value_size; ++p) { + const double sum_s_base = prefix_cf[p][pos - 1]; + const double sum_b_base = total_cf[p] - sum_s_base; + loss -= (sum_s_base * sum_s_base) / static_cast(pos); + loss -= (sum_b_base * sum_b_base) / static_cast(m - pos); + } + } else { + size_t ns = 0, nb = 0; + std::vector sum_s_adj(this->value_size, 0.0), sum_b_adj(this->value_size, 0.0); + for (int ind : leaf.individuals) { + const bool left_side = (X[ind][k] < sp); + if (left_side) { ++ns; for (size_t p = 0; p < this->value_size; ++p) { double v = Y[ind][p]; sum_s_adj[p] += v; } } + else { ++nb; for (size_t p = 0; p < this->value_size; ++p) { double v = Y[ind][p]; sum_b_adj[p] += v; } } + } + if (ns == 0 || nb == 0) { 
continue; } + for (size_t p = 0; p < this->value_size; ++p) { + loss -= (sum_s_adj[p] * sum_s_adj[p]) / static_cast(ns); + loss -= (sum_b_adj[p] * sum_b_adj[p]) / static_cast(nb); + } + } + if (loss < min_split.min_sum) { + min_split.min_sum = loss; + min_split.tree_index = curr_tree; + min_split.leaf_index = &leaf; + min_split.split_coordinate = k + 1; + min_split.split_point = sp; + best_idx = (int)idx; + min_split.sum_s.assign(this->value_size, 0.0); + min_split.sum_b.assign(this->value_size, 0.0); + if (!single_eval) { + for (size_t p = 0; p < this->value_size; ++p) { + const double sum_s_base = prefix_cf[p][pos - 1]; + const double sum_b_base = total_cf[p] - sum_s_base; + min_split.sum_s[p] = sum_s_base; + min_split.sum_b[p] = sum_b_base; + } + } else { + for (int ind : leaf.individuals) { + if (X[ind][k] < sp) { for (size_t p = 0; p < this->value_size; ++p) { double v = Y[ind][p]; min_split.sum_s[p] += v; } } + else { for (size_t p = 0; p < this->value_size; ++p) { double v = Y[ind][p]; min_split.sum_b[p] += v; } } + } + } + } + } + } + } + } + + rpf_utils::age_pool_by_sample(sample_idxs, best_idx, possible_splits); + finalize_split_from_sums(min_split, X, this->value_size); + return min_split; +} + + diff --git a/src/lib/splits_hist.cpp b/src/lib/splits_hist.cpp new file mode 100644 index 0000000..0077ca1 --- /dev/null +++ b/src/lib/splits_hist.cpp @@ -0,0 +1,137 @@ +// Split-mode: histogram-binned evaluation (mode 4). Mirrors leaves mode but +// evaluates candidate thresholds at per-feature global bin boundaries. 
+#include "rpf.hpp" +#include "internal_utils.hpp" + +using namespace rpf_utils; + +Split RandomPlantedForest::calcOptimalSplit_hist(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family) +{ + Split min_split; min_split.min_sum = std::numeric_limits::infinity(); + if (possible_splits.empty()) return min_split; + + unsigned int raw_candidates = static_cast(std::ceil(this->t_try * possible_splits.size())); + unsigned int upper = std::min(this->max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + std::vector weights(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) weights[i] = std::exp(-this->split_decay_rate_ * possible_splits[i].age); + std::vector sample_idxs = this->deterministic ? std::vector() : sample_weighted_indices_filtered(weights, n_candidates); + if (this->deterministic) { for (size_t i = 0; i < n_candidates && i < possible_splits.size(); ++i) sample_idxs.push_back(i); } + + // Use per-feature effective bin count based on actual cut count for stability + int best_idx = -1; + for (size_t idx : sample_idxs) { + auto it = possible_splits.begin(); std::advance(it, idx); + if (!it->tree || it->leaf_idx >= it->tree->leaves.size()) continue; + const int k_dim = it->dim; // 1-based + const int k = k_dim - 1; + Leaf* leafPtr = &it->tree->leaves[it->leaf_idx]; + const int leaf_min = this->n_leaves[k]; + const size_t m = leafPtr->individuals.size(); + if (m == 0) continue; + + // Build histogram for this leaf and feature k using cached working bin ids + const auto &cuts_k = (k >= 0 && k < (int)feature_cut_points_.size()) ? 
feature_cut_points_[k] : std::vector{}; + size_t Kf = cuts_k.size() + 1; if (Kf < 2) continue; // cannot split without at least 2 bins + std::vector cnt(Kf, 0); + std::vector> sum(Kf, std::vector(this->value_size, 0.0)); + const bool have_cached = ((size_t)k < tls_working_bin_id.size()); + if (have_cached) { + const std::vector &bin_k = tls_working_bin_id[(size_t)k]; + for (int ind : leafPtr->individuals) { + int b = bin_k[(size_t)ind]; + cnt[(size_t)b] += 1; + for (size_t p = 0; p < this->value_size; ++p) sum[(size_t)b][p] += Y[ind][p]; + } + } else { + for (int ind : leafPtr->individuals) { + double v = X[ind][k]; + int b = 0; + if (!cuts_k.empty()) { + auto itb = std::upper_bound(cuts_k.begin(), cuts_k.end(), v); + b = (int)std::distance(cuts_k.begin(), itb); + if (b < 0) b = 0; if ((size_t)b >= Kf) b = (int)Kf - 1; + } + cnt[(size_t)b] += 1; + for (size_t p = 0; p < this->value_size; ++p) sum[(size_t)b][p] += Y[ind][p]; + } + } + + // Build prefix across bins then sample only split_try boundaries + const int total_n = (int)m; + std::vector total_sum(this->value_size, 0.0); + for (size_t b = 0; b < Kf; ++b) { + for (size_t p = 0; p < this->value_size; ++p) total_sum[p] += sum[b][p]; + } + std::vector prefix_cnt(Kf, 0); + std::vector> prefix_sum(Kf, std::vector(this->value_size, 0.0)); + for (size_t b = 0; b < Kf; ++b) { + prefix_cnt[b] = cnt[b] + (b > 0 ? prefix_cnt[b - 1] : 0); + for (size_t p = 0; p < this->value_size; ++p) + prefix_sum[b][p] = sum[b][p] + (b > 0 ? 
prefix_sum[b - 1][p] : 0.0); + } + + // Valid boundaries are b_left in [0, Kf-2] such that both sides satisfy leaf_min + int first_valid = -1, last_valid = -1; + for (size_t b_left = 0; b_left + 1 <= Kf - 1; ++b_left) { + int ln = prefix_cnt[b_left]; + int rn = total_n - ln; + if (ln >= leaf_min && rn >= leaf_min) { + if (first_valid < 0) first_valid = (int)b_left; + last_valid = (int)b_left; + } + } + if (first_valid < 0 || last_valid < first_valid) continue; + + // Sample boundary indices within [first_valid, last_valid] + std::vector samples = this->deterministic + ? compute_even_spread_indices(first_valid, last_valid + 1, (size_t)this->split_try) + : sample_unique_ints_uniform_R(first_valid, last_valid + 1, (size_t)this->split_try); + + for (size_t si = 0; si < samples.size(); ++si) { + int b_left = samples[si]; + if (b_left < first_valid || b_left > last_valid) continue; + int left_n = prefix_cnt[(size_t)b_left]; + int right_n = total_n - left_n; + if (left_n < leaf_min || right_n < leaf_min) continue; + + double loss = 0.0; + for (size_t p = 0; p < this->value_size; ++p) { + double ls = prefix_sum[(size_t)b_left][p]; + double rs = total_sum[p] - ls; + loss -= (ls * ls) / (double)left_n; + loss -= (rs * rs) / (double)right_n; + } + if (loss < min_split.min_sum) { + min_split.min_sum = loss; + min_split.tree_index = it->tree; + min_split.leaf_index = leafPtr; + min_split.split_coordinate = k + 1; + // Map boundary index to actual split point using precomputed cuts + double sp = 0.0; + if (k >= 0 && k < (int)feature_cut_points_.size() && !feature_cut_points_[k].empty()) { + const auto &cuts = feature_cut_points_[k]; + size_t cp_idx = (size_t)std::min((size_t)b_left, cuts.size() - 1); + sp = cuts[cp_idx]; + } else { + sp = 0.5 * (leafPtr->intervals[k].first + leafPtr->intervals[k].second); + } + min_split.split_point = sp; + best_idx = (int)idx; + // Store sums for this boundary + min_split.sum_s.assign(this->value_size, 0.0); + 
min_split.sum_b.assign(this->value_size, 0.0); + for (size_t p = 0; p < this->value_size; ++p) { min_split.sum_s[p] = prefix_sum[(size_t)b_left][p]; min_split.sum_b[p] = total_sum[p] - prefix_sum[(size_t)b_left][p]; } + } + } + } + + rpf_utils::age_pool_by_sample(sample_idxs, best_idx, possible_splits); + finalize_split_from_sums(min_split, X, this->value_size); + return min_split; +} + + diff --git a/src/lib/splits_leaves.cpp b/src/lib/splits_leaves.cpp new file mode 100644 index 0000000..75771ac --- /dev/null +++ b/src/lib/splits_leaves.cpp @@ -0,0 +1,105 @@ +// Split-mode: leaves. Evaluates per-leaf candidate splits using cached +// per-leaf orders and prefix sums, with age-weighted candidate sampling. +#include "rpf.hpp" +#include "internal_utils.hpp" + +using namespace rpf_utils; + +Split RandomPlantedForest::calcOptimalSplit_leaves(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family) +{ + Split curr_split, min_split; + min_split.min_sum = std::numeric_limits::infinity(); + curr_split.Y = &Y; + + if (possible_splits.empty()) return min_split; + + unsigned int raw_candidates = static_cast(std::ceil(this->t_try * possible_splits.size())); + unsigned int upper = std::min(this->max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + std::vector weights(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) weights[i] = std::exp(-this->split_decay_rate_ * possible_splits[i].age); + std::vector sample_idxs = this->deterministic ? 
std::vector() : sample_weighted_indices_filtered(weights, n_candidates); + if (this->deterministic) { for (size_t i = 0; i < n_candidates && i < possible_splits.size(); ++i) sample_idxs.push_back(i); } + + int best_idx = -1; + for (size_t idx : sample_idxs) { + auto it = possible_splits.begin(); std::advance(it, idx); + int k = it->dim - 1; + if (!it->tree || it->leaf_idx >= it->tree->leaves.size()) continue; + Leaf* leafPtr = &it->tree->leaves[it->leaf_idx]; + + const int leaf_size = this->n_leaves[k]; + const size_t m = leafPtr->individuals.size(); + if (m == 0) continue; + // Quick infeasibility check: cannot split if fewer than 2*leaf_size individuals + if (m < static_cast(2 * leaf_size)) continue; + + std::vector order; std::vector sorted_vals; + ensure_order_and_sorted_vals_for_leaf(X, *leafPtr, k, order, sorted_vals); + std::vector unique = compute_unique_sorted_values(sorted_vals); + // Build first positions of each unique value (same length/order as `unique`) + std::vector first_pos; + first_pos.reserve(unique.size()); + if (!sorted_vals.empty()) { + first_pos.push_back(0); + for (size_t i = 1; i < sorted_vals.size(); ++i) { + if (sorted_vals[i] != sorted_vals[i - 1]) first_pos.push_back(i); + } + } + + if (unique.size() < 2 * static_cast(leaf_size)) continue; + + std::vector samples; + int left = leaf_size; int right_exclusive = (int)unique.size() - leaf_size + 1; + samples = this->deterministic ? 
compute_even_spread_indices(left, right_exclusive, (size_t)this->split_try) + : sample_unique_ints_uniform_R(left, right_exclusive, (size_t)this->split_try); + + // Build prefix sums once per candidate evaluation + std::vector> prefix; // [p][i] + std::vector total; // [p] + build_prefix_and_total_given_order(Y, *leafPtr, order, this->value_size, prefix, total); + + for (size_t si = 0; si < samples.size(); ++si) { + const size_t uidx = static_cast(samples[si]); + if (uidx >= unique.size() || uidx >= first_pos.size()) continue; + const double sp = unique[uidx]; + const size_t pos = first_pos[uidx]; + if (pos == 0 || pos >= m) continue; + if (pos < static_cast(leaf_size) || (m - pos) < static_cast(leaf_size)) continue; + + double loss = 0.0; + for (size_t p = 0; p < this->value_size; ++p) { + const double sum_s_base = prefix[p][pos - 1]; + const double sum_b_base = total[p] - sum_s_base; + loss -= (sum_s_base * sum_s_base) / static_cast(pos); + loss -= (sum_b_base * sum_b_base) / static_cast(m - pos); + } + + if (loss < min_split.min_sum) { + min_split.min_sum = loss; + min_split.tree_index = it->tree; + min_split.leaf_index = leafPtr; + min_split.split_coordinate = k + 1; + min_split.split_point = sp; + best_idx = (int)idx; + min_split.sum_s.assign(this->value_size, 0.0); + min_split.sum_b.assign(this->value_size, 0.0); + for (size_t p = 0; p < this->value_size; ++p) { + const double sum_s_base = prefix[p][pos - 1]; + const double sum_b_base = total[p] - sum_s_base; + min_split.sum_s[p] = sum_s_base; + min_split.sum_b[p] = sum_b_base; + } + } + } + } + + rpf_utils::age_pool_by_sample(sample_idxs, best_idx, possible_splits); + finalize_split_from_sums(min_split, X, this->value_size); + return min_split; +} + + diff --git a/src/lib/splits_res_trees.cpp b/src/lib/splits_res_trees.cpp new file mode 100644 index 0000000..3c11180 --- /dev/null +++ b/src/lib/splits_res_trees.cpp @@ -0,0 +1,258 @@ +// Split-mode: res_trees. 
Operates on the pool of resulting trees constructed +// by expanding dimension sets, evaluating one threshold per leaf via prefix sums. +#include "rpf.hpp" +#include "internal_utils.hpp" + +using namespace rpf_utils; + +Split RandomPlantedForest::calcOptimalSplit_resTrees(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_trees, + TreeFamily &curr_family) +{ + Split curr_split, min_split; min_split.min_sum = std::numeric_limits::infinity(); curr_split.Y = &Y; + + if (possible_trees.empty()) return min_split; + unsigned int raw_candidates = (unsigned int)std::ceil(this->t_try * possible_trees.size()); + unsigned int upper = std::min((unsigned int)this->max_candidates_, (unsigned int)possible_trees.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); + + std::vector weights(possible_trees.size()); + for (size_t i = 0; i < possible_trees.size(); ++i) weights[i] = std::exp(-this->split_decay_rate_ * possible_trees[i].age); + std::vector sample_idxs; sample_idxs.reserve(n_candidates); + if (!this->deterministic) { + std::vector pos_idx; pos_idx.reserve(possible_trees.size()); + std::vector pos_w; pos_w.reserve(possible_trees.size()); + for (size_t i = 0; i < weights.size(); ++i) if (weights[i] > 0.0) { pos_idx.push_back(i); pos_w.push_back(weights[i]); } + const size_t P = pos_idx.size(); + if (P == 0) { + std::vector all(possible_trees.size()); std::iota(all.begin(), all.end(), 0); + size_t k = std::min(n_candidates, all.size()); + for (size_t i = 0; i < k; ++i) { size_t j = i + static_cast(rng_runif01() * (double)(all.size() - i)); if (j >= all.size()) j = all.size() - 1; std::swap(all[i], all[j]); } + for (size_t i = 0; i < k; ++i) sample_idxs.push_back(all[i]); + } else { + size_t k = std::min(n_candidates, P); + std::vector> keys; keys.reserve(P); + for (size_t i = 0; i < P; ++i) { double u = rng_runif01(); if (u <= 0.0) u = std::numeric_limits::min(); double key = -std::log(u) / pos_w[i]; keys.emplace_back(key, 
pos_idx[i]); } + if (k < keys.size()) { std::nth_element(keys.begin(), keys.begin() + k, keys.end(), [](const auto& a, const auto& b){ return a.first < b.first; }); keys.resize(k); } + for (auto &kv : keys) sample_idxs.push_back(kv.second); + } + } else { for (size_t i=0;i= possible_trees.size()) continue; + auto &cand = possible_trees[idx]; + auto treePtr = cand.tree; if (!treePtr) continue; + + // Ensure per-tree, per-dimension weight caches (Fenwick + totals) like cur_trees_1 + auto ensure_weights_cache = [&](const std::shared_ptr& tree, int kdim){ + if (!tree) return; + // Lazy-size vectors to feature_size once + if ((int)tree->weights_epoch_by_dim_v.size() < this->feature_size) { + tree->weights_epoch_by_dim_v.assign((size_t)this->feature_size, -1); + tree->fenwick_by_dim_v.assign((size_t)this->feature_size, std::vector()); + tree->leaf_weights_by_dim_v.assign((size_t)this->feature_size, std::vector()); + tree->weights_total_by_dim_v.assign((size_t)this->feature_size, 0.0); + } + bool need = true; + if (tree->weights_epoch_by_dim_v[(size_t)kdim] == tree->weights_epoch) { + if (tree->fenwick_by_dim_v[(size_t)kdim].size() == tree->leaves.size()) need = false; + } + if (!need) return; + const int k = kdim; const size_t L = tree->leaves.size(); + std::vector bit(L, 0.0), wts(L, 0.0); + double total = 0.0; + const int leaf_size_local = this->n_leaves[k]; + for (size_t li = 0; li < L; ++li) { + auto &leaf = tree->leaves[li]; + size_t unique_count = 0; + auto it_uc = leaf.unique_count_cache.find(kdim); + if (it_uc != leaf.unique_count_cache.end()) { + unique_count = it_uc->second; + } else { + std::vector order_cf; std::vector sorted_vals_cf; + ensure_order_and_sorted_vals_for_leaf(X, leaf, k, order_cf, sorted_vals_cf); + if (!sorted_vals_cf.empty()) { + unique_count = 1; + for (size_t i = 1; i < sorted_vals_cf.size(); ++i) + if (sorted_vals_cf[i] != sorted_vals_cf[i - 1]) ++unique_count; + } + leaf.unique_count_cache[kdim] = unique_count; + } + const long 
width_unique = (long)unique_count - 2L * (long)leaf_size_local; + const double w = (width_unique > 0L) ? static_cast(width_unique) : 0.0; + wts[li] = w; total += w; if (w != 0.0) rpf_utils::fenwick_add(bit, li + 1, w); + } + tree->fenwick_by_dim_v[(size_t)kdim] = std::move(bit); + tree->leaf_weights_by_dim_v[(size_t)kdim] = std::move(wts); + tree->weights_total_by_dim_v[(size_t)kdim] = total; + tree->weights_epoch_by_dim_v[(size_t)kdim] = tree->weights_epoch; + }; + + // Per-candidate local state for leaves and dimensions used + struct LeafDimState { + bool initialized=false; int left=0; int right=0; size_t used_count=0; std::vector used_flags; + std::vector order_cf; std::vector sorted_vals; std::vector* unique_ptr=nullptr; std::vector> prefix_cf; std::vector total_cf; + }; + // Keyed by Leaf* then by k (dimension index) + std::unordered_map> local_states; + + // Buckets over (kdim, src_tree) with lazy local mutable copies of weights + struct Bucket { int kdim; std::shared_ptr tree; const std::vector* bit_src=nullptr; const std::vector* wts_src=nullptr; std::vector bit; std::vector wts; bool has_local=false; double total=0.0; }; + std::vector buckets; + size_t grand_total_remaining = 0; + + for (int kdim : treePtr->split_dims) { + if (kdim == 0) continue; const int k = kdim - 1; const int leaf_size = this->n_leaves[k]; + + std::vector> sources; sources.reserve(2); + std::set S = treePtr->split_dims; S.erase(kdim); + if (S.empty()) { if (auto itZero = curr_family.find(std::set{0}); itZero != curr_family.end()) sources.push_back(itZero->second); } + else { if (auto itS = curr_family.find(S); itS != curr_family.end()) sources.push_back(itS->second); } + if (auto itD = curr_family.find(treePtr->split_dims); itD != curr_family.end()) if (sources.empty() || sources.back().get() != itD->second.get()) sources.push_back(itD->second); + + for (const auto &src_tree : sources) { + if (!src_tree || src_tree->leaves.empty()) continue; + ensure_weights_cache(src_tree, k); + 
double tot = src_tree->weights_total_by_dim_v[(size_t)k]; + if (tot <= 0.0) continue; + Bucket b; b.kdim = kdim; b.tree = src_tree; b.total = tot; + b.bit_src = &src_tree->fenwick_by_dim_v[(size_t)k]; + b.wts_src = &src_tree->leaf_weights_by_dim_v[(size_t)k]; + buckets.push_back(std::move(b)); + grand_total_remaining += (size_t)std::llround(std::max(0.0, tot)); + (void)leaf_size; // silence unused if compiled with warnings + } + } + if (buckets.empty() || grand_total_remaining == 0) continue; + + // Fenwick over bucket totals for O(log B) selection and updates + std::vector bucket_bit(buckets.size(), 0.0); + for (size_t i=0;i 0.0) rpf_utils::fenwick_add(bucket_bit, i+1, buckets[i].total); + auto fenwick_prefix_sum = [&](const std::vector& bit, size_t idx1)->double { double s=0.0; while (idx1>0) { s += bit[idx1-1]; idx1 -= idx1 & (~idx1 + 1); } return s; }; + + const double total_all0 = std::accumulate(buckets.begin(), buckets.end(), 0.0, [](double s, const Bucket& b){ return s + std::max(0.0, b.total); }); + double bucket_total_all = total_all0; + size_t draws = std::min((size_t)this->split_try, (size_t)std::llround(total_all0)); + + for (size_t t=0; tdeterministic) { + if (total_all0 <= 0.0) break; + double step = total_all0 / (double)draws; + double target = step * (t + 0.5); if (target >= total_all0) target = std::max(0.0, total_all0 - 1.0); + b_idx = rpf_utils::fenwick_find_by_prefix(bucket_bit, target); + if (b_idx == 0) continue; --b_idx; + } else { + if (bucket_total_all <= 0.0) break; + double r = rng_runif(0.0, bucket_total_all); + b_idx = rpf_utils::fenwick_find_by_prefix(bucket_bit, r); + if (b_idx == 0) continue; --b_idx; + } + + auto &bucket = buckets[b_idx]; + const int kdim = bucket.kdim; const int k = kdim - 1; + if (bucket.total <= 0.0) { continue; } + + // Sample a leaf in the bucket via local Fenwick + double r_leaf; + if (this->deterministic) { + double step = (total_all0 <= 0.0) ? 
0.0 : (total_all0 / (double)draws); + double target_global = step * (t + 0.5); if (target_global >= total_all0) target_global = std::max(0.0, total_all0 - 1.0); + double before = fenwick_prefix_sum(bucket_bit, b_idx); + double inside = target_global - before; if (inside < 0.0) inside = 0.0; if (inside >= bucket.total) inside = std::max(0.0, bucket.total - 1.0); + r_leaf = inside; + } else { + r_leaf = rng_runif(0.0, std::max(0.0, bucket.total)); + } + const std::vector& bit_view = bucket.has_local ? bucket.bit : *(bucket.bit_src); + size_t leaf_idx_sel = rpf_utils::fenwick_find_by_prefix(bit_view, r_leaf); + if (leaf_idx_sel == 0) continue; leaf_idx_sel -= 1; size_t wts_size = bucket.has_local ? bucket.wts.size() : (bucket.wts_src ? bucket.wts_src->size() : 0); if (leaf_idx_sel >= wts_size || leaf_idx_sel >= bucket.tree->leaves.size()) continue; + Leaf *leaf_ptr = &bucket.tree->leaves[leaf_idx_sel]; + + // Prepare local per-leaf-per-dim state lazily + auto &state = local_states[leaf_ptr][k]; + if (!state.initialized) { + // Build order and sorted values + ensure_order_and_sorted_vals_for_leaf(X, *leaf_ptr, k, state.order_cf, state.sorted_vals); + // Compute or reuse unique values and left/right bounds + size_t unique_count = 0; + if (leaf_ptr->unique_vals_cache.count(k)) { + state.unique_ptr = &leaf_ptr->unique_vals_cache[k]; + unique_count = state.unique_ptr->size(); + leaf_ptr->unique_count_cache[k] = unique_count; + } else { + auto uniques = compute_unique_sorted_values(state.sorted_vals); + unique_count = uniques.size(); + leaf_ptr->unique_count_cache[k] = unique_count; + leaf_ptr->unique_vals_cache[k] = std::move(uniques); + state.unique_ptr = &leaf_ptr->unique_vals_cache[k]; + } + const int leaf_size_here = this->n_leaves[k]; + state.left = leaf_size_here; state.right = (int)unique_count - leaf_size_here; + if (state.right < state.left) { state.left = 0; state.right = 0; } + if (state.right > state.left) state.used_flags.assign((size_t)(state.right - 
state.left), 0); + // Build prefix sums for fast evaluation + build_prefix_and_total_given_order(Y, *leaf_ptr, state.order_cf, this->value_size, state.prefix_cf, state.total_cf); + state.initialized = true; + } + const std::vector& wts_view = bucket.has_local ? bucket.wts : *(bucket.wts_src); + if (state.right <= state.left || wts_view[leaf_idx_sel] <= 0.0) { continue; } + + // Select threshold index within [left, right) avoiding repeats + int s_idx; + if (this->deterministic) { + int range = state.right - state.left; + int remaining_here = (int)wts_view[leaf_idx_sel]; + int guess = state.left + (int)(((double)state.used_count + 0.5) / ((double)remaining_here + 0.5) * range); + if (guess >= state.right) guess = state.right - 1; + int lo = guess, hi = guess; bool found = false; + while (lo >= state.left || hi < state.right) { + if (lo >= state.left && (state.used_flags.empty() || !state.used_flags[lo - state.left])) { s_idx = lo; found = true; break; } + if (hi < state.right && (state.used_flags.empty() || !state.used_flags[hi - state.left])) { s_idx = hi; found = true; break; } + --lo; ++hi; + } + if (!found) { + for (int p = state.left; p < state.right; ++p) { + if (state.used_flags.empty() || !state.used_flags[p - state.left]) { s_idx = p; break; } + } + } + } else { + do { s_idx = rng_randint(state.left, state.right); } while (!state.used_flags.empty() && state.used_flags[s_idx - state.left]); + } + if (!state.used_flags.empty()) state.used_flags[(size_t)(s_idx - state.left)] = 1; + state.used_count += 1; + + if (!bucket.has_local) { bucket.bit = *(bucket.bit_src); bucket.wts = *(bucket.wts_src); bucket.has_local = true; } + // Evaluate loss at chosen threshold + double sp = (*state.unique_ptr)[(size_t)s_idx]; + const size_t m_eval = leaf_ptr->individuals.size(); + size_t pos_in_sorted = static_cast(std::lower_bound(state.sorted_vals.begin(), state.sorted_vals.end(), sp) - state.sorted_vals.begin()); + if (pos_in_sorted == 0 || pos_in_sorted >= m_eval) { 
continue; } + + double loss = 0.0; std::vector sum_s_adj(this->value_size, 0.0), sum_b_adj(this->value_size, 0.0); + for (size_t p = 0; p < this->value_size; ++p) { + const double sum_s_base = state.prefix_cf[p][pos_in_sorted - 1]; const double sum_b_base = state.total_cf[p] - sum_s_base; + sum_s_adj[p] = sum_s_base; sum_b_adj[p] = sum_b_base; + loss -= (sum_s_adj[p] * sum_s_adj[p]) / static_cast(pos_in_sorted); + loss -= (sum_b_adj[p] * sum_b_adj[p]) / static_cast(m_eval - pos_in_sorted); + } + if (loss < min_split.min_sum) { + min_split.min_sum = loss; min_split.tree_index = bucket.tree; min_split.leaf_index = leaf_ptr; min_split.split_coordinate = kdim; min_split.split_point = sp; best_idx = (int)idx; min_split.sum_s = sum_s_adj; min_split.sum_b = sum_b_adj; + } + + // Consume one threshold from this leaf locally: update BIT, wts, totals + bucket.wts[leaf_idx_sel] -= 1.0; if (bucket.wts[leaf_idx_sel] < 0.0) bucket.wts[leaf_idx_sel] = 0.0; + rpf_utils::fenwick_add(bucket.bit, leaf_idx_sel + 1, -1.0); + bucket.total -= 1.0; if (bucket.total < 0.0) bucket.total = 0.0; + rpf_utils::fenwick_add(bucket_bit, b_idx + 1, -1.0); + bucket_total_all -= 1.0; if (bucket_total_all < 0.0) bucket_total_all = 0.0; + } + } + + rpf_utils::age_pool_by_sample(sample_idxs, best_idx, possible_trees); + finalize_split_from_sums(min_split, X, this->value_size); + return min_split; +} diff --git a/src/lib/training.cpp b/src/lib/training.cpp new file mode 100644 index 0000000..fa2898a --- /dev/null +++ b/src/lib/training.cpp @@ -0,0 +1,102 @@ +// Training orchestration split out from rpf.cpp. Builds tree families, +// manages bootstrapping and threading, and handles optional purification. 
+#include "rpf.hpp" +#include "internal_utils.hpp" + +using namespace rpf_utils; + +void RandomPlantedForest::fit() +{ + std::vector initial_individuals(sample_size); + std::iota(initial_individuals.begin(), initial_individuals.end(), 0); + + std::vector initial_intervals(feature_size); + for (int i = 0; i < feature_size; ++i) + initial_intervals[i] = Interval{lower_bounds[i], upper_bounds[i]}; + + Leaf initial_leaf; + { + initial_leaf.value = std::vector(value_size, 0); + initial_leaf.individuals = initial_individuals; + initial_leaf.intervals = initial_intervals; + } + std::vector initial_leaves{initial_leaf}; + + this->tree_families = std::vector(n_trees); + + // Generate per-tree seeds from R's RNG to ensure reproducibility across runs + // when the user sets the R seed. These seeds will be used regardless of + // threading mode. + tree_seeds_.assign((size_t)std::max(0, n_trees), 0ULL); + for (int i = 0; i < n_trees; ++i) { + // Two 32-bit chunks composed into a 64-bit seed using R's RNG + unsigned long long hi = static_cast(R::runif(0.0, 4294967296.0)); + unsigned long long lo = static_cast(R::runif(0.0, 4294967296.0)); + tree_seeds_[(size_t)i] = (hi << 32) ^ lo ^ static_cast(i); + } + + unsigned int threads_to_use = static_cast(nthreads); + if (threads_to_use == 0) threads_to_use = 1; + if (threads_to_use > 1) + { + if (threads_to_use > std::thread::hardware_concurrency()) + { + Rcout << "Requested " << threads_to_use << " threads but only " << std::thread::hardware_concurrency() << " available" << std::endl; + } + for (int start = 0; start < n_trees; start += (int)threads_to_use) + { + int batch = std::min((int)threads_to_use, n_trees - start); + if (batch <= 0) break; + std::vector threads((size_t)batch); + for (int i = 0; i < batch; ++i) + { + int tree_index = start + i; + threads[(size_t)i] = std::thread([this, &initial_leaves](int tree_index_inner){ + std::mt19937_64 rng_local; + std::mt19937_64* prev_ptr = rpf_utils::swap_tls_rng(nullptr); + if 
(!tree_seeds_.empty() && (size_t)tree_index_inner < tree_seeds_.size()) { + rng_local.seed(tree_seeds_[(size_t)tree_index_inner]); + } else { + rng_local.seed(88172645463393265ULL ^ (unsigned long long)tree_index_inner); + } + rpf_utils::swap_tls_rng(&rng_local); + this->create_tree_family(initial_leaves, (size_t)tree_index_inner); + rpf_utils::swap_tls_rng(prev_ptr); + }, tree_index); + } + for (auto &th : threads) + { + if (th.joinable()) th.join(); + } + } + } + else + { + // Single-threaded: still drive randomness from per-tree seeds + std::mt19937_64 rng_local; + std::mt19937_64* prev_ptr = rpf_utils::swap_tls_rng(nullptr); + for (int n = 0; n < n_trees; ++n) + { + if (!tree_seeds_.empty() && (size_t)n < tree_seeds_.size()) { + rng_local.seed(tree_seeds_[(size_t)n]); + } else { + rng_local.seed(88172645463393265ULL ^ (unsigned long long)n); + } + rpf_utils::swap_tls_rng(&rng_local); + create_tree_family(initial_leaves, n); + } + rpf_utils::swap_tls_rng(prev_ptr); + } + + if (purify_forest) + { + // Default: cap=0 (uncapped), nthreads=0 (auto; min(object nthreads, available)), mode=2 (fast exact) + this->purify(0, 0, 2); + } + else + { + purified = false; + } +} + + diff --git a/src/randomPlantedForest.cpp b/src/randomPlantedForest.cpp index 9fd3630..4bb7b5a 100644 --- a/src/randomPlantedForest.cpp +++ b/src/randomPlantedForest.cpp @@ -10,13 +10,13 @@ RCPP_MODULE(mod_rpf) class_("RandomPlantedForest") .constructor() .method("set_data", &RandomPlantedForest::set_data) + .method("get_parameters", &RandomPlantedForest::get_parameters) .method("cross_validation", &RandomPlantedForest::cross_validation) .method("predict_matrix", &RandomPlantedForest::predict_matrix) .method("predict_vector", &RandomPlantedForest::predict_vector) .method("MSE", &RandomPlantedForest::MSE) - .method("purify", &RandomPlantedForest::purify_3) + .method("purify_threads", static_cast(&RandomPlantedForest::purify)) .method("print", &RandomPlantedForest::print) - .method("get_parameters", 
&RandomPlantedForest::get_parameters) .method("set_parameters", &RandomPlantedForest::set_parameters) .method("get_model", &RandomPlantedForest::get_model) .method("is_purified", &RandomPlantedForest::is_purified); diff --git a/tests/testthat/test-predict-components.R b/tests/testthat/test-predict-components.R index a775f09..d0aa2a6 100644 --- a/tests/testthat/test-predict-components.R +++ b/tests/testthat/test-predict-components.R @@ -81,8 +81,8 @@ test_that(".predict_single_component is consistent with predictor order", { # Internal data preprocessing only done in predict_components to save time processed <- hardhat::forge(mtcars, rp$blueprint) - new_data <- preprocess_predictors_predict(rp, processed$predictors) - + new_data <- randomPlantedForest::preprocess_predictors_predict(rp, processed$predictors) + expect_equal( .predict_single_component(rp, new_data, c("cyl", "am")), .predict_single_component(rp, new_data, c("am", "cyl")) diff --git a/tests/testthat/test-purify-modes-equivalence.R b/tests/testthat/test-purify-modes-equivalence.R new file mode 100644 index 0000000..33f858c --- /dev/null +++ b/tests/testthat/test-purify-modes-equivalence.R @@ -0,0 +1,49 @@ +set.seed(2025) + +test_that("single component predictions match across purify modes (non-capped)", { + rp1 <- rpf(mpg ~ cyl + disp + hp + wt, data = mtcars, + max_interaction = 3, ntrees = 30, deterministic = TRUE) + rp2 <- rpf(mpg ~ cyl + disp + hp + wt, data = mtcars, + max_interaction = 3, ntrees = 30, deterministic = TRUE) + + expect_false(is_purified(rp1)) + expect_false(is_purified(rp2)) + + purify(rp1, mode = 1L) + purify(rp2, mode = 2L) + + expect_true(is_purified(rp1)) + expect_true(is_purified(rp2)) + + m1 <- predict_components(rp1, mtcars) + m2 <- predict_components(rp2, mtcars) + + expect_equal(colnames(m1$m), colnames(m2$m)) + expect_equal(as.matrix(m1$m), as.matrix(m2$m), tolerance = 1e-8) + expect_equal(m1$intercept, m2$intercept, tolerance = 1e-10) +}) + +test_that("single component 
predictions match across purify modes (capped)", { + rp1 <- rpf(mpg ~ cyl + disp + hp + wt, data = mtcars, + max_interaction = 3, ntrees = 30, deterministic = TRUE) + rp2 <- rpf(mpg ~ cyl + disp + hp + wt, data = mtcars, + max_interaction = 3, ntrees = 30, deterministic = TRUE) + + expect_false(is_purified(rp1)) + expect_false(is_purified(rp2)) + + purify(rp1, maxp_interaction = 2L, mode = 1L) + purify(rp2, maxp_interaction = 2L, mode = 2L) + + expect_true(is_purified(rp1)) + expect_true(is_purified(rp2)) + + m1 <- predict_components(rp1, mtcars, max_interaction = 2L) + m2 <- predict_components(rp2, mtcars, max_interaction = 2L) + + expect_equal(colnames(m1$m), colnames(m2$m)) + expect_equal(as.matrix(m1$m), as.matrix(m2$m), tolerance = 1e-8) + expect_equal(m1$intercept, m2$intercept, tolerance = 1e-10) +}) + + diff --git a/tests/testthat/test-purify.R b/tests/testthat/test-purify.R index f305e16..146ad19 100644 --- a/tests/testthat/test-purify.R +++ b/tests/testthat/test-purify.R @@ -31,7 +31,7 @@ test_that("purification does not alter predictions (null effect)", { pred_post <- predict(bin_fit, new_data = xdat, type = "numeric") - expect_equal(pred_pre, pred_post, tolerance = 1e-14) + expect_equal(pred_pre, pred_post, tolerance = 1e-10) }) test_that("purification does not alter predictions (with effect)", { @@ -52,5 +52,5 @@ test_that("purification does not alter predictions (with effect)", { pred_post <- predict(rpfit, test) - expect_equal(pred_pre, pred_post, tolerance = 1e-15) + expect_equal(pred_pre, pred_post, tolerance = 1e-10) })