From ab8c9d0824235a66a0320562be062dbb3c49e30d Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Thu, 2 Jan 2025 16:32:08 +0100 Subject: [PATCH 01/13] Use standard C++ vectors --- src/include/cpf.hpp | 10 +- src/include/grid.hpp | 3 +- src/include/helper.hpp | 19 ++- src/include/rpf.hpp | 32 ++-- src/include/trees.hpp | 1 + src/lib/cpf.cpp | 33 ++-- src/lib/grid.cpp | 2 + src/lib/helper.cpp | 72 ++++----- src/lib/rpf.cpp | 350 +++++++++++++++++++++-------------------- 9 files changed, 256 insertions(+), 266 deletions(-) diff --git a/src/include/cpf.hpp b/src/include/cpf.hpp index 1c913ce..69c75b4 100644 --- a/src/include/cpf.hpp +++ b/src/include/cpf.hpp @@ -1,6 +1,7 @@ #ifndef CPF_H #define CPF_H +#include #include "rpf.hpp" class ClassificationRPF : public RandomPlantedForest @@ -8,9 +9,9 @@ class ClassificationRPF : public RandomPlantedForest public: using RandomPlantedForest::calcOptimalSplit; - ClassificationRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, - const String loss = "L2", const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0, 0, 0.1}); - void set_parameters(StringVector keys, NumericVector values); + ClassificationRPF(const std::vector> &samples_Y, const std::vector> &samples_X, + const std::string loss = "L2", const std::vector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0, 0, 0.1}); + void set_parameters(std::vector keys, std::vector values); ~ClassificationRPF(){}; private: @@ -47,4 +48,5 @@ class ClassificationRPF : public RandomPlantedForest void exponential_loss_3(Split &split); }; -#endif \ No newline at end of file + +#endif diff --git a/src/include/grid.hpp b/src/include/grid.hpp index 4b21af5..b634e1c 100644 --- a/src/include/grid.hpp +++ b/src/include/grid.hpp @@ -2,6 +2,7 @@ #define GRID_H #include "helper.hpp" +#include using namespace utils; @@ -39,4 +40,4 @@ namespace grid }; }; -#endif \ No newline at end of file +#endif diff --git a/src/include/helper.hpp b/src/include/helper.hpp index c80cbfd..5e064f6 100644 --- a/src/include/helper.hpp +++ b/src/include/helper.hpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -109,18 +108,18 @@ namespace utils } }; - std::vector to_std_vec(Rcpp::IntegerVector rv); - std::vector to_std_vec(Rcpp::NumericVector rv); - std::vector> to_std_vec(Rcpp::NumericMatrix rv); - std::set to_std_set(Rcpp::NumericVector rv); - std::set to_std_set(Rcpp::IntegerVector rv); + std::vector to_std_vec(std::vector rv); + std::vector to_std_vec(std::vector rv); + std::vector> to_std_vec(std::vector> rv); + std::set to_std_set(std::vector rv); + std::set to_std_set(std::vector rv); // functions for converting R and Cpp types - Rcpp::IntegerVector from_std_set(std::set v); - Rcpp::IntegerVector from_std_vec(std::vector v); - Rcpp::NumericVector from_std_vec(std::vector v); - Rcpp::NumericMatrix from_std_vec(std::vector> v); + std::vector from_std_set(std::set v); + std::vector from_std_vec(std::vector v); + std::vector from_std_vec(std::vector v); + std::vector> from_std_vec(std::vector> v); // ----------------- overload of vector operators ----------------- diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index 53e8d13..e0285f9 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -1,34 +1,36 @@ #ifndef RPF_H #define RPF_H +#include +#include #include "trees.hpp" -using namespace Rcpp; - class RandomPlantedForest { public: - RandomPlantedForest(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, - const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0}); - RandomPlantedForest(){}; - void set_data(const NumericMatrix &samples_Y, const NumericMatrix &samples_X); - NumericMatrix predict_matrix(const NumericMatrix &X, const NumericVector components = {0}); - NumericMatrix predict_vector(const NumericVector &X, const NumericVector components = {0}); + RandomPlantedForest( + const std::vector> &samples_Y, + const std::vector> &samples_X, + const std::vector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0}); + RandomPlantedForest() {}; + void set_data(const std::vector> &samples_Y, const std::vector> &samples_X); + std::vector> predict_matrix(const std::vector> &X, const std::vector components = {0}); + std::vector> predict_vector(const std::vector &X, const std::vector components = {0}); void purify_1(); void purify_2(); void purify_3(); void print(); - void cross_validation(int n_sets = 4, IntegerVector splits = {5, 50}, NumericVector t_tries = {0.2, 0.5, 0.7, 0.9}, IntegerVector split_tries = {1, 2, 5, 10}); - double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true); + void cross_validation(int n_sets = 4, std::vector splits = {5, 50}, std::vector t_tries = {0.2, 0.5, 0.7, 0.9}, std::vector split_tries = {1, 2, 5, 10}); + double MSE(const std::vector> &Y_predicted, const std::vector> &Y_true); void get_parameters(); - void set_parameters(StringVector keys, NumericVector values); + void set_parameters(std::vector keys, std::vector values); List get_model(); - virtual ~RandomPlantedForest(){}; + virtual ~RandomPlantedForest() {}; bool is_purified(); protected: - double MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true); + double MSE_vec(const std::vector &Y_predicted, const std::vector &Y_true); std::vector> X; /**< Nested vector feature samples of size (sample_size x feature_size) */ std::vector> Y; /**< Corresponding values for the feature samples */ int max_interaction; /**< Maximum level of interaction determining maximum number of split dimensions for a tree */ @@ -54,7 +56,7 @@ class RandomPlantedForest virtual void fit(); virtual void create_tree_family(std::vector initial_leaves, size_t n); virtual Split calcOptimalSplit(const std::vector> &Y, const std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family); + std::multimap> &possible_splits, TreeFamily &curr_family); }; -#endif // RPF_HPP \ No newline at end of file +#endif // RPF_HPP diff --git a/src/include/trees.hpp b/src/include/trees.hpp index 184c441..affc46b 100644 --- a/src/include/trees.hpp +++ b/src/include/trees.hpp @@ -3,6 +3,7 @@ #include "helper.hpp" #include "grid.hpp" +#include using namespace utils; using namespace grid; diff --git a/src/lib/cpf.cpp b/src/lib/cpf.cpp index f810700..8979874 100644 --- a/src/lib/cpf.cpp +++ b/src/lib/cpf.cpp @@ -1,6 +1,5 @@ - #include "cpf.hpp" - +#include // ----------------- rpf subclass for classification ----------------- @@ -8,7 +7,6 @@ * \brief Create a prediction model based on Random Forests for classification data sets. */ - void ClassificationRPF::L1_loss(Split &split) { split.min_sum = 0; @@ -587,16 +585,13 @@ void ClassificationRPF::exponential_loss_3(Split &split) } // constructor with parameters split_try, t_try, purify_forest, deterministic, nthreads -ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, - const String loss, const NumericVector parameters) +ClassificationRPF::ClassificationRPF(const std::vector> &samples_Y, const std::vector> &samples_X, + const std::string loss, const std::vector parameters) : RandomPlantedForest{} { - // Ensure correct Rcpp RNG state - Rcpp::RNGScope scope; - // initialize class members - std::vector pars = to_std_vec(parameters); + std::vector pars = parameters; if (loss == "L1") { this->loss = LossType::L1; @@ -649,13 +644,13 @@ ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const Numer } else { - Rcout << "Unkown loss function, set to default (L2)." << std::endl; + std::cout << "Unkown loss function, set to default (L2)." << std::endl; this->loss = LossType::L2; this->calcLoss = &ClassificationRPF::L2_loss; } if (pars.size() != 11) { - Rcout << "Wrong number of parameters - set to default." << std::endl; + std::cout << "Wrong number of parameters - set to default." << std::endl; this->max_interaction = 1; this->n_trees = 50; this->n_splits = 30; @@ -777,7 +772,7 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> { // randomly picked samples otherwise samples = std::vector(split_try); for (size_t i = 0; i < samples.size(); ++i) - samples[i] = R::runif(leaf_size, unique_samples.size() - leaf_size); + samples[i] = rand() % (int)(unique_samples.size() - leaf_size) + leaf_size; std::sort(samples.begin(), samples.end()); } @@ -899,7 +894,7 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, siz // bagging/subsampling for (size_t i = 0; i < sample_size; ++i) { - sample_index = R::runif(0, sample_size - 1); + sample_index = rand() % sample_size; samples_Y[i] = Y[sample_index]; samples_X[i] = X[sample_index]; } @@ -1395,7 +1390,7 @@ void ClassificationRPF::fit() { if (nthreads > std::thread::hardware_concurrency()) { - Rcout << "Requested " << nthreads << " threads but only " << std::thread::hardware_concurrency() << " available" << std::endl; + std::cout << "Requested " << nthreads << " threads but only " << std::thread::hardware_concurrency() << " available" << std::endl; } // Create local thread count to not overwrite nthreads, // would get reported wrongly by get_parameters() @@ -1410,7 +1405,7 @@ void ClassificationRPF::fit() std::vector threads(current_threads); for (int t = 0; t < current_threads; ++t) { - // Rcout << "Dispatching thread " << (n + t + 1) << "/" << n_trees << std::endl; + // std::cout << "Dispatching thread " << (n + t + 1) << "/" << n_trees << std::endl; threads[t] = std::thread(&ClassificationRPF::create_tree_family, this, std::ref(initial_leaves), n + t); } for (auto &t : threads) @@ -1443,11 +1438,11 @@ void ClassificationRPF::fit() updates the model, so far only single valued parameters supported, for replacing training data use 'set_data', note that changing cv does not trigger cross validation */ -void ClassificationRPF::set_parameters(StringVector keys, NumericVector values) +void ClassificationRPF::set_parameters(std::vector keys, std::vector values) { if (keys.size() != values.size()) { - Rcout << "Size of input vectors is not the same. " << std::endl; + std::cout << "Size of input vectors is not the same. " << std::endl; return; } @@ -1518,7 +1513,7 @@ void ClassificationRPF::set_parameters(StringVector keys, NumericVector values) } else { - Rcout << "Unkown loss function." << std::endl; + std::cout << "Unkown loss function." << std::endl; } } else if (keys[i] == "delta") @@ -1531,7 +1526,7 @@ void ClassificationRPF::set_parameters(StringVector keys, NumericVector values) } else { - Rcout << "Unkown parameter key '" << keys[i] << "' ." << std::endl; + std::cout << "Unkown parameter key '" << keys[i] << "' ." << std::endl; } } this->fit(); diff --git a/src/lib/grid.cpp b/src/lib/grid.cpp index 6e7b835..885ce34 100644 --- a/src/lib/grid.cpp +++ b/src/lib/grid.cpp @@ -1,4 +1,6 @@ #include "grid.hpp" +#include +#include using namespace grid; diff --git a/src/lib/helper.cpp b/src/lib/helper.cpp index 83ba046..b4f542a 100644 --- a/src/lib/helper.cpp +++ b/src/lib/helper.cpp @@ -1,87 +1,71 @@ #include "helper.hpp" - -using namespace Rcpp; +#include +#include +#include namespace utils { -// Helper function to generate random number using R's RNG -// this replaces the previous randWrapper and later use of std::random_shuffle, -// as the latter is removed in C++17 and I couldn't figure out an easy replacement. -int random_index(const int n) { return static_cast(R::runif(0, 1) * n); } +// Helper function to generate random number using standard C++ libraries +int random_index(const int n) { return static_cast(static_cast(rand()) / RAND_MAX * n); } -// ----------------- functions for converting R and Cpp types ----------------- +// ----------------- functions for converting Cpp types ----------------- /** - * \brief Convert the std container set of type int into an IntegerVector - * from rcpp. + * \brief Convert the std container set of type int into a std::vector. * - * \param v the vector that is converted. + * \param v the set that is converted. */ -Rcpp::IntegerVector from_std_set(std::set v) { - return Rcpp::IntegerVector(v.begin(), v.end()); +std::vector from_std_set(std::set v) { + return std::vector(v.begin(), v.end()); } /** - * \brief Convert the std container vector of type int into an IntegerVector - * from rcpp. + * \brief Convert the std container vector of type int into a std::vector. * * \param v the vector that is converted. */ -Rcpp::IntegerVector from_std_vec(std::vector v) { - return Rcpp::IntegerVector(v.begin(), v.end()); +std::vector from_std_vec(std::vector v) { + return v; } /** - * \brief Convert the std container vector of type double into a NumericVector - * from rcpp. + * \brief Convert the std container vector of type double into a std::vector. * * \param v the vector that is converted. */ -Rcpp::NumericVector from_std_vec(std::vector v) { - return Rcpp::NumericVector(v.begin(), v.end()); +std::vector from_std_vec(std::vector v) { + return v; } /** * \brief Convert the nested std container vector containing a vector itself - * of type double into a NumericMatrix from rcpp. - * - * Predefines a NumericMatrix of respective size. Afterwards iterates over the - * outer vector, then transforms the inner vector using 'from_std_vec'-function - * and inserts row into NumericMatrix at the correct position. + * of type double into a std::vector>. * * \param v the vector of vectors that is converted. */ -Rcpp::NumericMatrix from_std_vec(std::vector> v) { - if(v.empty()) return Rcpp::NumericMatrix(); - Rcpp::NumericMatrix m(v.size(), v[0].size()); - for(unsigned int row=0; row> from_std_vec(std::vector> v) { + return v; } -std::vector to_std_vec(Rcpp::IntegerVector rv) { - return std::vector(rv.begin(), rv.end()); +std::vector to_std_vec(std::vector rv) { + return rv; } -std::vector to_std_vec(Rcpp::NumericVector rv) { - return std::vector(rv.begin(), rv.end()); +std::vector to_std_vec(std::vector rv) { + return rv; } -std::vector> to_std_vec(Rcpp::NumericMatrix rv) { - std::vector> X; - for(int i=0; i> to_std_vec(std::vector> rv) { + return rv; } -std::set to_std_set(Rcpp::NumericVector rv) { +std::set to_std_set(std::vector rv) { return std::set(rv.begin(), rv.end()); } -std::set to_std_set(Rcpp::IntegerVector rv) { +std::set to_std_set(std::vector rv) { return std::set(rv.begin(), rv.end()); } - -} \ No newline at end of file +} diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index 085df8c..53c3610 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -1,5 +1,8 @@ #include "rpf.hpp" +#include +#include +using namespace Rcpp; bool RandomPlantedForest::is_purified() { @@ -22,18 +25,16 @@ void RandomPlantedForest::L2_loss(Split &split) } // constructor -RandomPlantedForest::RandomPlantedForest(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, - const NumericVector parameters) +RandomPlantedForest::RandomPlantedForest( + const std::vector> &samples_Y, + const std::vector> &samples_X, + const std::vector parameters) { - // Ensure correct Rcpp RNG state - Rcpp::RNGScope scope; - // initialize class members - std::vector pars = to_std_vec(parameters); - if (pars.size() != 9) + if (parameters.size() != 9) { - Rcout << "Wrong number of parameters - set to default." << std::endl; + std::cout << "Wrong number of parameters - set to default." << std::endl; this->max_interaction = 1; this->n_trees = 50; this->n_splits = 30; @@ -46,15 +47,15 @@ RandomPlantedForest::RandomPlantedForest(const NumericMatrix &samples_Y, const N } else { - this->max_interaction = pars[0]; - this->n_trees = pars[1]; - this->n_splits = pars[2]; - this->split_try = pars[3]; - this->t_try = pars[4]; - this->purify_forest = pars[5]; - this->deterministic = pars[6]; - this->nthreads = pars[7]; - this->cross_validate = pars[8]; + this->max_interaction = parameters[0]; + this->n_trees = parameters[1]; + this->n_splits = parameters[2]; + this->split_try = parameters[3]; + this->t_try = parameters[4]; + this->purify_forest = parameters[5]; + this->deterministic = parameters[6]; + this->nthreads = parameters[7]; + this->cross_validate = parameters[8]; } // set data and data related members @@ -149,7 +150,7 @@ Split RandomPlantedForest::calcOptimalSplit(const std::vector(split_try); for (size_t i = 0; i < samples.size(); ++i) - samples[i] = R::runif(leaf_size, unique_samples.size() - leaf_size); + samples[i] = rand() % (int)(unique_samples.size() - leaf_size); std::sort(samples.begin(), samples.end()); } @@ -236,29 +237,29 @@ Split RandomPlantedForest::calcOptimalSplit(const std::vector> &samples_Y, const std::vector> &samples_X) { - this->Y = to_std_vec(samples_Y); - this->X = to_std_vec(samples_X); + this->Y = samples_Y; + this->X = samples_X; // Check for correct input - if (Y.size() == 0) + if (samples_Y.empty()) throw std::invalid_argument("Y empty - no data provided."); - if (X.size() == 0) + if (samples_X.empty()) throw std::invalid_argument("X empty - no data provided."); - this->feature_size = X[0].size(); - this->value_size = Y[0].size(); // multiclass - for (const auto &vec : X) + this->feature_size = samples_X[0].size(); + this->value_size = samples_Y[0].size(); // multiclass + for (const auto &vec : samples_X) { if (vec.size() != (size_t)feature_size) throw std::invalid_argument("Feature dimensions of X not uniform."); } - if (Y.size() != X.size()) + if (samples_Y.size() != samples_X.size()) throw std::invalid_argument("X and Y are not of the same length!"); this->n_leaves = std::vector(feature_size, 1); - this->sample_size = X.size(); + this->sample_size = samples_X.size(); this->upper_bounds = std::vector(feature_size); this->lower_bounds = std::vector(feature_size); @@ -266,10 +267,10 @@ void RandomPlantedForest::set_data(const NumericMatrix &samples_Y, const Numeric double minVal, maxVal, currVal; for (int i = 0; i < feature_size; ++i) { - minVal = maxVal = X[0][i]; + minVal = maxVal = samples_X[0][i]; for (size_t j = 0; j < sample_size; ++j) { - currVal = X[j][i]; + currVal = samples_X[j][i]; if (currVal < minVal) minVal = currVal; if (currVal > maxVal) @@ -321,7 +322,7 @@ void RandomPlantedForest::create_tree_family(std::vector initial_leaves, s for (size_t i = 0; i < sample_size; ++i) { - sample_index = R::runif(0, sample_size - 1); + sample_index = rand() % sample_size; samples_Y[i] = Y[sample_index]; samples_X[i] = X[sample_index]; } @@ -480,7 +481,7 @@ void RandomPlantedForest::fit() { if (nthreads > std::thread::hardware_concurrency()) { - Rcout << "Requested " << nthreads << " threads but only " << std::thread::hardware_concurrency() << " available" << std::endl; + std::cout << "Requested " << nthreads << " threads but only " << std::thread::hardware_concurrency() << " available" << std::endl; } // Create local thread count to not overwrite nthreads, // would get reported wrongly by get_parameters() @@ -495,7 +496,7 @@ void RandomPlantedForest::fit() std::vector threads(current_threads); for (int t = 0; t < current_threads; ++t) { - // Rcout << "Dispatching thread " << (n + t + 1) << "/" << n_trees << std::endl; + // std::cout << "Dispatching thread " << (n + t + 1) << "/" << n_trees << std::endl; threads[t] = std::thread(&RandomPlantedForest::create_tree_family, this, std::ref(initial_leaves), n + t); } for (auto &t : threads) @@ -524,20 +525,20 @@ void RandomPlantedForest::fit() } } -void RandomPlantedForest::cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) +/* +void RandomPlantedForest::cross_validation(int n_sets, const std::vector &splits, const std::vector &t_tries, const std::vector &split_tries) { - /* bool cv_tmp = this->cross_validate; this->cross_validate = false; if(deterministic) { - Rcout << "Note: Set model to non-deterministic. " << std::endl; + std::cout << "Note: Set model to non-deterministic. " << std::endl; deterministic = false; } std::set splits_vec = to_std_set(splits); std::vector split_tries_vec = to_std_vec(split_tries); std::vector t_tries_vec = to_std_vec(t_tries); - if(splits_vec.size()!=2) {Rcout << "Min and max needed for number of splits." << std::endl; return;} + if(splits_vec.size()!=2) {std::cout << "Min and max needed for number of splits." << std::endl; return;} // remember optimal parameter set and MSE double MSE_sum = 0, curr_MSE = 0, MSE_min = INF, optimal_split = INF, optimal_t_try = INF, optimal_split_try = INF; int optimal_inter = 1; @@ -547,8 +548,8 @@ void RandomPlantedForest::cross_validation(int n_sets, IntegerVector splits, Num double tmp = double(sample_size)/double(n_sets); int set_size = round(tmp); // remember original data samples - NumericMatrix X_original = from_std_vec(X); - NumericVector Y_original = from_std_vec(Y); + std::vector> X_original = from_std_vec(X); + std::vector Y_original = from_std_vec(Y); // set level of interactions std::set interactions{1}; if(feature_size >= 2){ @@ -571,8 +572,8 @@ void RandomPlantedForest::cross_validation(int n_sets, IntegerVector splits, Num int test_size = set_size; if(n_set == n_sets-1) test_size = order.size() - (n_sets-1) * set_size; int train_size = order.size() - test_size, i = 0, j = 0; - NumericVector Y_train(train_size), Y_test_true(test_size), Y_test_predicted; - NumericMatrix X_train(train_size, feature_size), X_test(test_size, feature_size); + std::vector Y_train(train_size), Y_test_true(test_size), Y_test_predicted; + std::vector> X_train(train_size, feature_size), X_test(test_size, feature_size); for(int index=0; index= (n_set * set_size)) && (index < ((n_set + 1) * set_size))){ Y_test_true[i] = Y_original[order[index]]; @@ -592,7 +593,7 @@ void RandomPlantedForest::cross_validation(int n_sets, IntegerVector splits, Num } // average curr_MSE = MSE_sum / n_sets; - Rcout << inter << ", " << splits << ", " << t << ", " << s << ": MSE=" << curr_MSE << std::endl; + std::cout << inter << ", " << splits << ", " << t << ", " << s << ": MSE=" << curr_MSE << std::endl; // update optimal if(curr_MSE < MSE_min){ MSE_min = curr_MSE; @@ -612,9 +613,9 @@ void RandomPlantedForest::cross_validation(int n_sets, IntegerVector splits, Num this->max_interaction = optimal_inter; this->set_data(Y_original, X_original); this->cross_validate = cv_tmp; - Rcout << "Optimal parameters: " << optimal_inter << ", " << optimal_split << ", " << optimal_t_try << ", " << optimal_split_try << ": MSE=" << MSE_min << std::endl; - */ + std::cout << "Optimal parameters: " << optimal_inter << ", " << optimal_split << ", " << optimal_t_try << ", " << optimal_split_try << ": MSE=" << MSE_min << std::endl; } +*/ // predict single feature vector std::vector RandomPlantedForest::predict_single(const std::vector &X, std::set component_index) @@ -644,7 +645,7 @@ std::vector RandomPlantedForest::predict_single(const std::vector RandomPlantedForest::predict_single(const std::vector{0}) { - // Rcout << tree.first.size() ; + // std::cout << tree.first.size() ; leaf_index = std::vector(tree.first.size(), 0); total_res += tree.second->GridLeaves.values[leaf_index]; } @@ -720,7 +721,7 @@ std::vector RandomPlantedForest::predict_single(const std::vector{0}) { - // Rcout << tree.first.size() ; + // std::cout << tree.first.size() ; leaf_index = std::vector(tree.first.size(), 0); } else @@ -822,10 +823,10 @@ std::vector RandomPlantedForest::predict_single(const std::vector> RandomPlantedForest::predict_matrix(const std::vector> &X, const std::vector components) { - std::vector> feature_vec = to_std_vec(X); - std::set component_index = to_std_set(components); + std::vector> feature_vec = X; + std::set component_index = std::set(components.begin(), components.end()); std::vector> predictions; // todo: sanity check for X @@ -841,27 +842,26 @@ Rcpp::NumericMatrix RandomPlantedForest::predict_matrix(const NumericMatrix &X, predictions.push_back(predict_single(vec, component_index)); } - return from_std_vec(predictions); + return predictions; } -Rcpp::NumericMatrix RandomPlantedForest::predict_vector(const NumericVector &X, const NumericVector components) +std::vector> RandomPlantedForest::predict_vector(const std::vector &X, const std::vector components) { - std::vector feature_vec = to_std_vec(X); - std::set component_index = to_std_set(components); + std::vector feature_vec = X; + std::set component_index = std::set(components.begin(), components.end()); std::vector> predictions; - Rcpp::NumericMatrix res; // todo: sanity check for X if (feature_vec.empty()) { - Rcout << "Feature vector is empty." << std::endl; - return res; + std::cout << "Feature vector is empty." << std::endl; + return predictions; } if (component_index == std::set{0} && this->feature_size >= 0 && feature_vec.size() != (size_t)this->feature_size) { - Rcout << "Feature vector has wrong dimension." << std::endl; - return res; + std::cout << "Feature vector has wrong dimension." << std::endl; + return predictions; } if (component_index == std::set{0}) @@ -876,16 +876,20 @@ Rcpp::NumericMatrix RandomPlantedForest::predict_vector(const NumericVector &X, } } - res = from_std_vec(predictions); - return res; + return predictions; } -double RandomPlantedForest::MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true) +double RandomPlantedForest::MSE_vec(const std::vector &Y_predicted, const std::vector &Y_true) { - return sum(Rcpp::pow(Y_true - Y_predicted, 2)) / Y_true.size(); + double sum = 0; + for (size_t i = 0; i < Y_predicted.size(); ++i) + { + sum += pow(Y_true[i] - Y_predicted[i], 2); + } + return sum / Y_true.size(); } -double RandomPlantedForest::MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) +double RandomPlantedForest::MSE(const std::vector> &Y_predicted, const std::vector> &Y_true) { // todo: multiclass double sum = 0; @@ -893,7 +897,7 @@ double RandomPlantedForest::MSE(const NumericMatrix &Y_predicted, const NumericM for (int i = 0; i < Y_size; ++i) { - sum += MSE_vec(Y_predicted(i, _), Y_true(i, _)); + sum += MSE_vec(Y_predicted[i], Y_true[i]); } return sum / Y_size; @@ -1182,14 +1186,14 @@ void RandomPlantedForest::purify_2() // go through sample points to sum up individuals for (const auto &feature_vec : X) { - int dim_index = 0; + int dim_index2 = 0; in_leaf = true; for (const auto &dim : tree_dims) { double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) + if (!((val >= lim_list[dim - 1][gridPoint[dim_index2]]) && (val < lim_list[dim - 1][gridPoint[dim_index2] + 1]))) in_leaf = false; - ++dim_index; + ++dim_index2; } // consider individuals only if all in @@ -1490,17 +1494,17 @@ void RandomPlantedForest::purify_3() ++tree_index; } - // Rcout << variables.size(); + // std::cout << variables.size(); // for(int i = 0; i matrix_dimensions = values[old_tree_index].dims; // std::vector matrix_dimensions = values_old[old_tree_index].dims; - // Rcout << typeof(matrix_dimensions.begin()) << std::endl; + // std::cout << typeof(matrix_dimensions.begin()) << std::endl; matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); // initialize data for new tree @@ -1590,20 +1594,20 @@ void RandomPlantedForest::purify_3() --curr_max; } - // Rcout << std::endl; - // Rcout << std::endl; - // Rcout << std::endl; + // std::cout << std::endl; + // std::cout << std::endl; + // std::cout << std::endl; // // for(int i = 0; i{0}) continue; - // Rcout << std::endl << tree_index_t << " - T: "; - // Rcout << "tree_t:"; - // for(auto dim: curr_dims) Rcout << dim << ", "; - // Rcout << std::endl; + // std::cout << std::endl << tree_index_t << " - T: "; + // std::cout << "tree_t:"; + // for(auto dim: curr_dims) std::cout << dim << ", "; + // std::cout << std::endl; auto grid = grids[tree_index_t]; - // Rcout << "Grid dimensions of T: "; - // for(auto dim: grid.dimensions) Rcout << dim << ", "; - // Rcout << std::endl; + // std::cout << "Grid dimensions of T: "; + // for(auto dim: grid.dimensions) std::cout << dim << ", "; + // std::cout << std::endl; // go through subtrees of t int tree_index_u = variables.size(); for (auto tree_u = variables.rbegin(); tree_u != variables.rend(); ++tree_u) @@ -1647,29 +1651,29 @@ void RandomPlantedForest::purify_3() if (!subset) continue; - // Rcout << "Hello"; - // Rcout << " " << tree_index_u << " - U: "; - // for(auto dim: *tree_u) Rcout << dim << ", "; - // Rcout << std::endl; - // Rcout << " Individuals: "; + // std::cout << "Hello"; + // std::cout << " " << tree_index_u << " - U: "; + // for(auto dim: *tree_u) std::cout << dim << ", "; + // std::cout << std::endl; + // std::cout << " Individuals: "; double tot_sum = 0; grid = grids[tree_index_u]; while (!grid.nextPoint()) { auto gridPoint = grid.getPoint(); - // Rcout << individuals[tree_index_u][gridPoint] << ", "; + // std::cout << individuals[tree_index_u][gridPoint] << ", "; tot_sum += individuals[tree_index_u][gridPoint]; } - // Rcout << "Total sum: " << tot_sum << std::endl; - // Rcout << std::endl; + // std::cout << "Total sum: " << tot_sum << std::endl; + // std::cout << std::endl; grid = grids[tree_index_u]; - // Rcout << " Grid dimensions of U: "; - // for(auto dim: grid.dimensions) Rcout << dim << ", "; - // Rcout << std::endl; + // std::cout << " Grid dimensions of U: "; + // for(auto dim: grid.dimensions) std::cout << dim << ", "; + // std::cout << std::endl; - // Rcout<< "j_dims: "< update(value_size, 0); @@ -1680,23 +1684,23 @@ void RandomPlantedForest::purify_3() while (!grid.nextPoint()) { auto gridPoint_i = grid.getPoint(); - // Rcout << " " << "i: "; - // for(auto p: gridPoint_i) Rcout << p << ", "; - // Rcout << std::endl << " "; + // std::cout << " " << "i: "; + // for(auto p: gridPoint_i) std::cout << p << ", "; + // std::cout << std::endl << " "; double curr_sum = individuals[tree_index_u][gridPoint_i]; - // Rcout << ", Current Sum: " << curr_sum << std::endl; - // Rcout << std::endl << " " << "i, j: "; + // std::cout << ", Current Sum: " << curr_sum << std::endl; + // std::cout << std::endl << " " << "i, j: "; update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_i]; - // Rcout << std::endl; + // std::cout << std::endl; } int tree_index_s = variables.size(); for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) { - // Rcout << "tree_s:"; - // for(auto dim: *tree_s) Rcout << dim << ", "; - // Rcout << std::endl; + // std::cout << "tree_s:"; + // for(auto dim: *tree_s) std::cout << dim << ", "; + // std::cout << std::endl; --tree_index_s; if (*tree_s == std::set{0}) @@ -1704,7 +1708,7 @@ void RandomPlantedForest::purify_3() auto gridPoint_0 = std::vector{0}; values[tree_index_s][gridPoint_0] += update; - // Rcout << std::endl; + // std::cout << std::endl; //} /* @@ -1712,12 +1716,12 @@ void RandomPlantedForest::purify_3() if(tree_0.first == std::set{0}){ - Rcout << tree_0.first.size(); + std::cout << tree_0.first.size(); std::vector leaf_index(tree_0.first.size(), 0); std::vector leaf_index(tree_0.second->GridLeaves.values.size(), 0); int Test = tree_0.second->GridLeaves.values.size(); - Rcout << Test; + std::cout << Test; tree_0.second->GridLeaves.values[leaf_index] += update; } } @@ -1740,7 +1744,7 @@ void RandomPlantedForest::purify_3() if (!subset) continue; - // Rcout << pow(-1, (*tree_s).size()) << std::endl; + // std::cout << pow(-1, (*tree_s).size()) << std::endl; auto grid_k = grids[tree_index_s]; while (!grid_k.nextPoint()) @@ -1748,17 +1752,17 @@ void RandomPlantedForest::purify_3() auto gridPoint_k = grid_k.getPoint(); // // if((*tree_s).size()>2){ - // Rcout << std::endl << " " << "j, k: "; - // for(auto p: gridPoint_k) Rcout << p << ", "; - // Rcout << std::endl; + // std::cout << std::endl << " " << "j, k: "; + // for(auto p: gridPoint_k) std::cout << p << ", "; + // std::cout << std::endl; // } // - // Rcout << pow(-1, (*tree_s).size()) * update << std::endl; + // std::cout << pow(-1, (*tree_s).size()) * update << std::endl; values[tree_index_s][gridPoint_k] += pow(-1, (*tree_s).size()) * update; } } } - // Rcout << std::endl; + // std::cout << std::endl; } else { @@ -1772,7 +1776,7 @@ void RandomPlantedForest::purify_3() j_sizes[j] = grids[tree_index_t].dimensions[j_index]; } - // Rcout<<"Hello 1"; + // std::cout<<"Hello 1"; grid::NDGrid grid_j = grid::NDGrid(j_sizes); while (!grid_j.nextPoint()) @@ -1780,26 +1784,26 @@ void RandomPlantedForest::purify_3() std::vector update(value_size, 0); auto gridPoint_j = grid_j.getPoint(); - // Rcout << " " << "j: "; - // for(auto p: gridPoint_j) Rcout << p << ", "; - // Rcout << std::endl; + // std::cout << " " << "j: "; + // for(auto p: gridPoint_j) std::cout << p << ", "; + // std::cout << std::endl; // calc update grid = grids[tree_index_u]; while (!grid.nextPoint()) { auto gridPoint_i = grid.getPoint(); - // Rcout << " " << "i: "; - // for(auto p: gridPoint_i) Rcout << p << ", "; - // Rcout << std::endl << " "; + // std::cout << " " << "i: "; + // for(auto p: gridPoint_i) std::cout << p << ", "; + // std::cout << std::endl << " "; double curr_sum = individuals[tree_index_u][gridPoint_i]; - // Rcout << ", Current Sum: " << curr_sum << std::endl; + // std::cout << ", Current Sum: " << curr_sum << std::endl; std::vector gridPoint_ij(tree_t->size(), 0); for (size_t j = 0; j < gridPoint_j.size(); ++j) { auto j_dim = j_dims.begin(); std::advance(j_dim, j); int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*j_dim)); - // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; + // std::cout << " j_dim=" << *j_dim << ", j_index=" << j_index; gridPoint_ij[j_index] = gridPoint_j[j]; } for (size_t i = 0; i < gridPoint_i.size(); ++i) @@ -1807,17 +1811,17 @@ void RandomPlantedForest::purify_3() auto i_dim = tree_u->begin(); std::advance(i_dim, i); int i_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*i_dim)); - // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; + // std::cout << " i_dim=" << *i_dim << ", i_index=" << i_index; gridPoint_ij[i_index] = gridPoint_i[i]; } - // Rcout << std::endl << " " << "i, j: "; - // for(auto p: gridPoint_ij) Rcout << p << ", "; - // Rcout << std::endl; + // std::cout << std::endl << " " << "i, j: "; + // for(auto p: gridPoint_ij) std::cout << p << ", "; + // std::cout << std::endl; update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_ij]; - // Rcout << std::endl; + // std::cout << std::endl; } - // Rcout << "Hello_2"; + // std::cout << "Hello_2"; // update trees int tree_index_s = variables.size(); for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) @@ -1843,9 +1847,9 @@ void RandomPlantedForest::purify_3() } if (!subset) continue; - // Rcout << " " << "S: "; - // for(auto dim: *tree_s) Rcout << dim << ", "; - // Rcout << std::endl; + // std::cout << " " << "S: "; + // for(auto dim: *tree_s) std::cout << dim << ", "; + // std::cout << std::endl; // S cap U std::set k_dims = *tree_s; std::set k_dims_h1 = *tree_s; @@ -1865,9 +1869,9 @@ void RandomPlantedForest::purify_3() // for(const auto dim: *tree_t) k_dims.erase(dim); // for(const auto dim: *tree_u) k_dims.insert(dim); - // Rcout << " " << "k_dims: "; - // for(auto dim: k_dims) Rcout << dim << ", "; - // Rcout << std::endl; + // std::cout << " " << "k_dims: "; + // for(auto dim: k_dims) std::cout << dim << ", "; + // std::cout << std::endl; if (k_dims.size() == 0) { @@ -1877,9 +1881,9 @@ void RandomPlantedForest::purify_3() else { - // Rcout <<"k_dims :"; - // for(auto dim: k_dims) Rcout << dim << ", "; - // Rcout << std::endl; + // std::cout <<"k_dims :"; + // for(auto dim: k_dims) std::cout << dim << ", "; + // std::cout << std::endl; std::vector k_sizes(k_dims.size(), 0); for (size_t k = 0; k < k_dims.size(); ++k) @@ -1889,23 +1893,23 @@ void RandomPlantedForest::purify_3() int k_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); k_sizes[k] = grids[tree_index_t].dimensions[k_index]; } - // Rcout << " " << "k_sizes: "; - // for(auto dim: k_sizes) Rcout << dim << ", "; - // Rcout << std::endl; + // std::cout << " " << "k_sizes: "; + // for(auto dim: k_sizes) std::cout << dim << ", "; + // std::cout << std::endl; grid::NDGrid grid_k = grid::NDGrid(k_sizes); while (!grid_k.nextPoint()) { auto gridPoint_k = grid_k.getPoint(); - // Rcout << " " << "k: "; - // for(auto p: gridPoint_k) Rcout << p << ", "; - // Rcout << std::endl << " "; + // std::cout << " " << "k: "; + // for(auto p: gridPoint_k) std::cout << p << ", "; + // std::cout << std::endl << " "; std::vector gridPoint_jk(tree_s->size(), 0); for (size_t j = 0; j < gridPoint_j.size(); ++j) { auto j_dim = j_dims.begin(); std::advance(j_dim, j); int j_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*j_dim)); - // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; + // std::cout << " j_dim=" << *j_dim << ", j_index=" << j_index; gridPoint_jk[j_index] = gridPoint_j[j]; } for (size_t k = 0; k < gridPoint_k.size(); ++k) @@ -1913,14 +1917,14 @@ void RandomPlantedForest::purify_3() auto k_dim = k_dims.begin(); std::advance(k_dim, k); int k_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*k_dim)); - // Rcout << " k_dim=" << *k_dim << ", k_index=" << k_index; + // std::cout << " k_dim=" << *k_dim << ", k_index=" << k_index; gridPoint_jk[k_index] = gridPoint_k[k]; } - // Rcout << std::endl << " " << "j, k: "; - // for(auto p: gridPoint_jk) Rcout << p << ", "; - // Rcout << std::endl; + // std::cout << std::endl << " " << "j, k: "; + // for(auto p: gridPoint_jk) std::cout << p << ", "; + // std::cout << std::endl; - // Rcout << pow(-1, (*tree_s).size() - j_dims.size()) * update[0]; + // std::cout << pow(-1, (*tree_s).size() - j_dims.size()) * update[0]; values[tree_index_s][gridPoint_jk] += pow(-1, (*tree_s).size() - j_dims.size()) * update; } } @@ -1957,48 +1961,48 @@ void RandomPlantedForest::print() for (size_t m = 0; m < keys.size(); ++m) { DecisionTree tree = *(family[keys[m]]); - Rcout << m + 1 << " Tree: "; - Rcout << "Dims="; + std::cout << m + 1 << " Tree: "; + std::cout << "Dims="; for (const auto &dim : tree.split_dims) - Rcout << dim << ","; - Rcout << std::endl - << "Leaves: (" << tree.leaves.size() << ")" << std::endl; + std::cout << dim << ","; + std::cout << std::endl + << "Leaves: (" << tree.leaves.size() << ")" << std::endl; for (const auto &leaf : tree.leaves) { - Rcout << "Intervals="; + std::cout << "Intervals="; for (const auto &interval : leaf.intervals) { - Rcout << interval.first << "," << interval.second << "/"; + std::cout << interval.first << "," << interval.second << "/"; } - Rcout << " Value="; + std::cout << " Value="; for (const auto &val : leaf.value) - Rcout << val << ", "; - Rcout << std::endl; + std::cout << val << ", "; + std::cout << std::endl; } - Rcout << std::endl; + std::cout << std::endl; } - Rcout << std::endl - << std::endl; + std::cout << std::endl + << std::endl; } } // print parameters of the model to the console void RandomPlantedForest::get_parameters() { - Rcout << "Parameters: n_trees=" << n_trees << ", n_splits=" << n_splits << ", max_interaction=" << max_interaction << ", t_try=" << t_try - << ", split_try=" << split_try << ", purified=" << purified << ", deterministic=" << deterministic << ", nthreads=" << nthreads - << ", feature_size=" << feature_size << ", sample_size=" << sample_size << std::endl; + std::cout << "Parameters: n_trees=" << n_trees << ", n_splits=" << n_splits << ", max_interaction=" << max_interaction << ", t_try=" << t_try + << ", split_try=" << split_try << ", purified=" << purified << ", deterministic=" << deterministic << ", nthreads=" << nthreads + << ", feature_size=" << feature_size << ", sample_size=" << sample_size << std::endl; } /* retrospectively change parameters of existing class object, updates the model, so far only single valued parameters supported, for replacing training data use 'set_data', note that changing cv does not trigger cross validation */ -void RandomPlantedForest::set_parameters(StringVector keys, NumericVector values) +void RandomPlantedForest::set_parameters(std::vector keys, std::vector values) { if (keys.size() != values.size()) { - Rcout << "Size of input vectors is not the same. " << std::endl; + std::cout << "Size of input vectors is not the same. " << std::endl; return; } @@ -2042,7 +2046,7 @@ void RandomPlantedForest::set_parameters(StringVector keys, NumericVector values } else { - Rcout << "Unkown parameter key '" << keys[i] << "' ." << std::endl; + std::cout << "Unkown parameter key '" << keys[i] << "' ." << std::endl; } } this->fit(); From 66c1ef28047a4e8690f3c86522b63c4c4f6dfddf Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 3 Jan 2025 10:42:03 +0100 Subject: [PATCH 02/13] Include Rcpp - for now --- src/include/rpf.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index e0285f9..aeb1b7c 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -3,8 +3,9 @@ #include #include +#include #include "trees.hpp" - +using namespace Rcpp; class RandomPlantedForest { From 796f994e055a551abdd7fd4c0db7a9ccd487da73 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 3 Jan 2025 10:42:43 +0100 Subject: [PATCH 03/13] Add empty CV method to RPF --- src/lib/rpf.cpp | 177 ++++++++++++++++++++++++------------------------ 1 file changed, 88 insertions(+), 89 deletions(-) diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index 53c3610..25ac56f 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -525,97 +525,96 @@ void RandomPlantedForest::fit() } } -/* -void RandomPlantedForest::cross_validation(int n_sets, const std::vector &splits, const std::vector &t_tries, const std::vector &split_tries) +void RandomPlantedForest::cross_validation(int n_sets, std::vector splits, std::vector t_tries, std::vector split_tries) { - - bool cv_tmp = this->cross_validate; - this->cross_validate = false; - if(deterministic) { - std::cout << "Note: Set model to non-deterministic. " << std::endl; - deterministic = false; - } - std::set splits_vec = to_std_set(splits); - std::vector split_tries_vec = to_std_vec(split_tries); - std::vector t_tries_vec = to_std_vec(t_tries); - if(splits_vec.size()!=2) {std::cout << "Min and max needed for number of splits." << std::endl; return;} - // remember optimal parameter set and MSE - double MSE_sum = 0, curr_MSE = 0, MSE_min = INF, optimal_split = INF, optimal_t_try = INF, optimal_split_try = INF; - int optimal_inter = 1; - std::vector order(sample_size); - std::iota(order.begin(), order.end(), 0); - std::random_shuffle(order.begin(), order.end(), randWrapper); - double tmp = double(sample_size)/double(n_sets); - int set_size = round(tmp); - // remember original data samples - std::vector> X_original = from_std_vec(X); - std::vector Y_original = from_std_vec(Y); - // set level of interactions - std::set interactions{1}; - if(feature_size >= 2){ - interactions.insert(2); - interactions.insert(feature_size); - } - // go through all parameter combinations - for(int inter: interactions){ - this->max_interaction = inter; - for(int splits=*splits_vec.begin(); splits<=*--splits_vec.end(); splits=ceil(splits*1.2)){ - this->n_splits = splits; - for(auto t: t_tries){ - this->t_try = t; - for(auto s: split_tries){ - this->split_try = s; - // k-fold cross-validation: go over all possible combinations as test set - MSE_sum = 0; - for(int n_set=0; n_set Y_train(train_size), Y_test_true(test_size), Y_test_predicted; - std::vector> X_train(train_size, feature_size), X_test(test_size, feature_size); - for(int index=0; index= (n_set * set_size)) && (index < ((n_set + 1) * set_size))){ - Y_test_true[i] = Y_original[order[index]]; - X_test(i, _ ) = X_original(order[index], _ ); - ++i; - }else{ - Y_train[j] = Y_original[order[index]]; - X_train(j, _ ) = X_original(order[index], _ ); - ++j; - } - } - // fit to training data - this->set_data(Y_train, X_train); - // predict with test set and determine mse - Y_test_predicted = this->predict_matrix(X_test); - MSE_sum += this->MSE(Y_test_predicted, Y_test_true); - } - // average - curr_MSE = MSE_sum / n_sets; - std::cout << inter << ", " << splits << ", " << t << ", " << s << ": MSE=" << curr_MSE << std::endl; - // update optimal - if(curr_MSE < MSE_min){ - MSE_min = curr_MSE; - optimal_split = splits; - optimal_t_try = t; - optimal_split_try = s; - optimal_inter = inter; - } - } - } - } - } - // reset X&Y to original and fit with optimal pars - this->n_splits = optimal_split; - this->t_try = optimal_t_try; - this->split_try = optimal_split_try; - this->max_interaction = optimal_inter; - this->set_data(Y_original, X_original); - this->cross_validate = cv_tmp; - std::cout << "Optimal parameters: " << optimal_inter << ", " << optimal_split << ", " << optimal_t_try << ", " << optimal_split_try << ": MSE=" << MSE_min << std::endl; + /* + bool cv_tmp = this->cross_validate; + this->cross_validate = false; + if(deterministic) { + std::cout << "Note: Set model to non-deterministic. " << std::endl; + deterministic = false; + } + std::set splits_vec = to_std_set(splits); + std::vector split_tries_vec = to_std_vec(split_tries); + std::vector t_tries_vec = to_std_vec(t_tries); + if(splits_vec.size()!=2) {std::cout << "Min and max needed for number of splits." << std::endl; return;} + // remember optimal parameter set and MSE + double MSE_sum = 0, curr_MSE = 0, MSE_min = INF, optimal_split = INF, optimal_t_try = INF, optimal_split_try = INF; + int optimal_inter = 1; + std::vector order(sample_size); + std::iota(order.begin(), order.end(), 0); + std::random_shuffle(order.begin(), order.end(), randWrapper); + double tmp = double(sample_size)/double(n_sets); + int set_size = round(tmp); + // remember original data samples + std::vector> X_original = from_std_vec(X); + std::vector Y_original = from_std_vec(Y); + // set level of interactions + std::set interactions{1}; + if(feature_size >= 2){ + interactions.insert(2); + interactions.insert(feature_size); + } + // go through all parameter combinations + for(int inter: interactions){ + this->max_interaction = inter; + for(int splits=*splits_vec.begin(); splits<=*--splits_vec.end(); splits=ceil(splits*1.2)){ + this->n_splits = splits; + for(auto t: t_tries){ + this->t_try = t; + for(auto s: split_tries){ + this->split_try = s; + // k-fold cross-validation: go over all possible combinations as test set + MSE_sum = 0; + for(int n_set=0; n_set Y_train(train_size), Y_test_true(test_size), Y_test_predicted; + std::vector> X_train(train_size, feature_size), X_test(test_size, feature_size); + for(int index=0; index= (n_set * set_size)) && (index < ((n_set + 1) * set_size))){ + Y_test_true[i] = Y_original[order[index]]; + X_test(i, _ ) = X_original(order[index], _ ); + ++i; + }else{ + Y_train[j] = Y_original[order[index]]; + X_train(j, _ ) = X_original(order[index], _ ); + ++j; + } + } + // fit to training data + this->set_data(Y_train, X_train); + // predict with test set and determine mse + Y_test_predicted = this->predict_matrix(X_test); + MSE_sum += this->MSE(Y_test_predicted, Y_test_true); + } + // average + curr_MSE = MSE_sum / n_sets; + std::cout << inter << ", " << splits << ", " << t << ", " << s << ": MSE=" << curr_MSE << std::endl; + // update optimal + if(curr_MSE < MSE_min){ + MSE_min = curr_MSE; + optimal_split = splits; + optimal_t_try = t; + optimal_split_try = s; + optimal_inter = inter; + } + } + } + } + } + // reset X&Y to original and fit with optimal pars + this->n_splits = optimal_split; + this->t_try = optimal_t_try; + this->split_try = optimal_split_try; + this->max_interaction = optimal_inter; + this->set_data(Y_original, X_original); + this->cross_validate = cv_tmp; + std::cout << "Optimal parameters: " << optimal_inter << ", " << optimal_split << ", " << optimal_t_try << ", " << optimal_split_try << ": MSE=" << MSE_min << std::endl; + */ } -*/ // predict single feature vector std::vector RandomPlantedForest::predict_single(const std::vector &X, std::set component_index) From 77f3b967ab0b11591c6ce3a7c5b11dfa13aff97c Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 3 Jan 2025 10:43:10 +0100 Subject: [PATCH 04/13] Add Rcpp interface for RPF --- src/include/rcpp_interface.hpp | 32 ++++++++++ src/lib/rcpp_interface.cpp | 109 +++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+) create mode 100644 src/include/rcpp_interface.hpp create mode 100644 src/lib/rcpp_interface.cpp diff --git a/src/include/rcpp_interface.hpp b/src/include/rcpp_interface.hpp new file mode 100644 index 0000000..71688d9 --- /dev/null +++ b/src/include/rcpp_interface.hpp @@ -0,0 +1,32 @@ +#ifndef RCPP_INTERFACE_H +#define RCPP_INTERFACE_H + +#include "cpf.hpp" +#include "rpf.hpp" +#include + +using namespace Rcpp; + +class RcppRPF : public RandomPlantedForest { + +public: + RcppRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, + const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0}); + RcppRPF() {}; + void set_data(const NumericMatrix &samples_Y, const NumericMatrix &samples_X); + NumericMatrix predict_matrix(const NumericMatrix &X, const NumericVector components = {0}); + NumericMatrix predict_vector(const NumericVector &X, const NumericVector components = {0}); + void cross_validation(int n_sets = 4, IntegerVector splits = {5, 50}, NumericVector t_tries = {0.2, 0.5, 0.7, 0.9}, IntegerVector split_tries = {1, 2, 5, 10}); + double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true); + void set_parameters(StringVector keys, NumericVector values); + void purify_3(); + void print(); + void get_parameters(); + List get_model(); + bool is_purified(); + +protected: + double MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true); +}; + +#endif // RCPP_INTERFACE_H diff --git a/src/lib/rcpp_interface.cpp b/src/lib/rcpp_interface.cpp new file mode 100644 index 0000000..50d6df1 --- /dev/null +++ b/src/lib/rcpp_interface.cpp @@ -0,0 +1,109 @@ +#include "rcpp_interface.hpp" + +// Helper to convert NumericMatrix to std::vector> +static std::vector> toStd2D(const Rcpp::NumericMatrix &mat) +{ + std::vector> vec(mat.nrow(), std::vector(mat.ncol())); + for (int i = 0; i < mat.nrow(); ++i) + { + for (int j = 0; j < mat.ncol(); ++j) + { + vec[i][j] = mat(i, j); + } + } + return vec; +} + +// Helper to convert NumericVector to std::vector +static std::vector toStd1D(const Rcpp::NumericVector &vec) +{ + return std::vector(vec.begin(), vec.end()); +} + +RcppRPF::RcppRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, + const NumericVector parameters) + : RandomPlantedForest(toStd2D(samples_Y), toStd2D(samples_X), + toStd1D(parameters)) +{ +} + +void RcppRPF::set_data(const NumericMatrix &samples_Y, const NumericMatrix &samples_X) +{ + RandomPlantedForest::set_data(toStd2D(samples_Y), toStd2D(samples_X)); +} + +NumericMatrix RcppRPF::predict_matrix(const NumericMatrix &X, const NumericVector components) +{ + auto result = RandomPlantedForest::predict_matrix(toStd2D(X), toStd1D(components)); + NumericMatrix rResult(result.size(), result[0].size()); + for (size_t i = 0; i < result.size(); ++i) + { + for (size_t j = 0; j < result[i].size(); ++j) + { + rResult(i, j) = result[i][j]; + } + } + return rResult; +} + +NumericMatrix RcppRPF::predict_vector(const NumericVector &X, const NumericVector components) +{ + auto result = RandomPlantedForest::predict_vector(toStd1D(X), toStd1D(components)); + NumericMatrix rResult(result.size(), result[0].size()); + for (size_t i = 0; i < result.size(); ++i) + { + for (size_t j = 0; j < result[i].size(); ++j) + { + rResult(i, j) = result[i][j]; + } + } + return rResult; +} + +void RcppRPF::cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) +{ + RandomPlantedForest::cross_validation(n_sets, std::vector(splits.begin(), splits.end()), + std::vector(t_tries.begin(), t_tries.end()), + std::vector(split_tries.begin(), split_tries.end())); +} + +double RcppRPF::MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) +{ + return RandomPlantedForest::MSE(toStd2D(Y_predicted), toStd2D(Y_true)); +} + +void RcppRPF::set_parameters(StringVector keys, NumericVector values) +{ + RandomPlantedForest::set_parameters(std::vector(keys.begin(), keys.end()), + std::vector(values.begin(), values.end())); +} + +double RcppRPF::MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true) +{ + return RandomPlantedForest::MSE_vec(toStd1D(Y_predicted), toStd1D(Y_true)); +} + +void RcppRPF::purify_3() +{ + RandomPlantedForest::purify_3(); +} + +void RcppRPF::print() +{ + RandomPlantedForest::print(); +} + +void RcppRPF::get_parameters() +{ + RandomPlantedForest::get_parameters(); +} + +List RcppRPF::get_model() +{ + return RandomPlantedForest::get_model(); +} + +bool RcppRPF::is_purified() +{ + return RandomPlantedForest::is_purified(); +} From 035a094885774e284126a3ec690b931cd8d3f0e7 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 3 Jan 2025 11:56:17 +0100 Subject: [PATCH 05/13] Add classificationRPF to the interface --- src/Makevars | 2 +- src/include/rcpp_interface.hpp | 11 ++++++++++- src/lib/rcpp_interface.cpp | 13 ++++++++++++ src/randomPlantedForest.cpp | 36 +++++++++++++++++----------------- 4 files changed, 42 insertions(+), 20 deletions(-) diff --git a/src/Makevars b/src/Makevars index 71ae982..39c5daa 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,4 +1,4 @@ -SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp randomPlantedForest.cpp RcppExports.cpp +SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp lib/rcpp_interface.cpp randomPlantedForest.cpp RcppExports.cpp OBJECTS = $(SOURCES:.cpp=.o) diff --git a/src/include/rcpp_interface.hpp b/src/include/rcpp_interface.hpp index 71688d9..b8aa450 100644 --- a/src/include/rcpp_interface.hpp +++ b/src/include/rcpp_interface.hpp @@ -7,7 +7,8 @@ using namespace Rcpp; -class RcppRPF : public RandomPlantedForest { +class RcppRPF : public RandomPlantedForest +{ public: RcppRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, @@ -29,4 +30,12 @@ class RcppRPF : public RandomPlantedForest { double MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true); }; +class RcppCPF : public ClassificationRPF, public RcppRPF +{ +public: + RcppCPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, + const std::string loss = "L2", const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0}); + void set_parameters(StringVector keys, NumericVector values); +}; + #endif // RCPP_INTERFACE_H diff --git a/src/lib/rcpp_interface.cpp b/src/lib/rcpp_interface.cpp index 50d6df1..098adec 100644 --- a/src/lib/rcpp_interface.cpp +++ b/src/lib/rcpp_interface.cpp @@ -107,3 +107,16 @@ bool RcppRPF::is_purified() { return RandomPlantedForest::is_purified(); } + +RcppCPF::RcppCPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, + const std::string loss, const NumericVector parameters) + : ClassificationRPF(toStd2D(samples_Y), toStd2D(samples_X), loss, + toStd1D(parameters)) +{ +} + +void RcppCPF::set_parameters(StringVector keys, NumericVector values) +{ + ClassificationRPF::set_parameters(std::vector(keys.begin(), keys.end()), + std::vector(values.begin(), values.end())); +} diff --git a/src/randomPlantedForest.cpp b/src/randomPlantedForest.cpp index 9fd3630..4dd7ebf 100644 --- a/src/randomPlantedForest.cpp +++ b/src/randomPlantedForest.cpp @@ -1,28 +1,28 @@ #include "rpf.hpp" #include "cpf.hpp" +#include "rcpp_interface.hpp" // ----------------- Rcpp include ----------------- - RCPP_MODULE(mod_rpf) { - class_("RandomPlantedForest") - .constructor() - .method("set_data", &RandomPlantedForest::set_data) - .method("cross_validation", &RandomPlantedForest::cross_validation) - .method("predict_matrix", &RandomPlantedForest::predict_matrix) - .method("predict_vector", &RandomPlantedForest::predict_vector) - .method("MSE", &RandomPlantedForest::MSE) - .method("purify", &RandomPlantedForest::purify_3) - .method("print", &RandomPlantedForest::print) - .method("get_parameters", &RandomPlantedForest::get_parameters) - .method("set_parameters", &RandomPlantedForest::set_parameters) - .method("get_model", &RandomPlantedForest::get_model) - .method("is_purified", &RandomPlantedForest::is_purified); + class_("RandomPlantedForest") + .constructor() + .method("set_data", &RcppRPF::set_data) + .method("cross_validation", &RcppRPF::cross_validation) + .method("predict_matrix", &RcppRPF::predict_matrix) + .method("predict_vector", &RcppRPF::predict_vector) + .method("MSE", &RcppRPF::MSE) + .method("purify", &RcppRPF::purify_3) + .method("print", &RcppRPF::print) + .method("get_parameters", &RcppRPF::get_parameters) + .method("set_parameters", &RcppRPF::set_parameters) + .method("get_model", &RcppRPF::get_model) + .method("is_purified", &RcppRPF::is_purified); - class_("ClassificationRPF") - .derives("RandomPlantedForest") - .constructor() - .method("set_parameters", &ClassificationRPF::set_parameters); + class_("ClassificationRPF") + .derives("RandomPlantedForest") + .constructor() + .method("set_parameters", &RcppCPF::set_parameters); } From 3e6c957fab9a534864925aaab79a251393f18275 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 3 Jan 2025 14:21:06 +0100 Subject: [PATCH 06/13] Remove the `set_data` method from `RcppRPF`. --- src/lib/rcpp_interface.cpp | 5 ----- src/randomPlantedForest.cpp | 1 - 2 files changed, 6 deletions(-) diff --git a/src/lib/rcpp_interface.cpp b/src/lib/rcpp_interface.cpp index 098adec..6881cda 100644 --- a/src/lib/rcpp_interface.cpp +++ b/src/lib/rcpp_interface.cpp @@ -27,11 +27,6 @@ RcppRPF::RcppRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, { } -void RcppRPF::set_data(const NumericMatrix &samples_Y, const NumericMatrix &samples_X) -{ - RandomPlantedForest::set_data(toStd2D(samples_Y), toStd2D(samples_X)); -} - NumericMatrix RcppRPF::predict_matrix(const NumericMatrix &X, const NumericVector components) { auto result = RandomPlantedForest::predict_matrix(toStd2D(X), toStd1D(components)); diff --git a/src/randomPlantedForest.cpp b/src/randomPlantedForest.cpp index 4dd7ebf..c0c38ac 100644 --- a/src/randomPlantedForest.cpp +++ b/src/randomPlantedForest.cpp @@ -9,7 +9,6 @@ RCPP_MODULE(mod_rpf) class_("RandomPlantedForest") .constructor() - .method("set_data", &RcppRPF::set_data) .method("cross_validation", &RcppRPF::cross_validation) .method("predict_matrix", &RcppRPF::predict_matrix) .method("predict_vector", &RcppRPF::predict_vector) From 30196b7a77be1ddef8e16325dd42fa5470d9df4e Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 3 Jan 2025 14:22:22 +0100 Subject: [PATCH 07/13] Add additional methods for classification --- src/include/rcpp_interface.hpp | 37 ++++++++++++++----- src/lib/rcpp_interface.cpp | 65 ++++++++++++++++++++++++++++++++++ src/randomPlantedForest.cpp | 13 +++++-- 3 files changed, 104 insertions(+), 11 deletions(-) diff --git a/src/include/rcpp_interface.hpp b/src/include/rcpp_interface.hpp index b8aa450..61d128c 100644 --- a/src/include/rcpp_interface.hpp +++ b/src/include/rcpp_interface.hpp @@ -7,19 +7,28 @@ using namespace Rcpp; -class RcppRPF : public RandomPlantedForest +class RcppInterface +{ +public: + virtual void set_parameters(StringVector keys, NumericVector values) = 0; + virtual NumericMatrix predict_matrix(const NumericMatrix &X, const NumericVector components) = 0; + virtual NumericMatrix predict_vector(const NumericVector &X, const NumericVector components) = 0; + virtual void cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) = 0; + virtual double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) = 0; +}; + +class RcppRPF : public RandomPlantedForest, public RcppInterface { public: RcppRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0}); RcppRPF() {}; - void set_data(const NumericMatrix &samples_Y, const NumericMatrix &samples_X); - NumericMatrix predict_matrix(const NumericMatrix &X, const NumericVector components = {0}); - NumericMatrix predict_vector(const NumericVector &X, const NumericVector components = {0}); - void cross_validation(int n_sets = 4, IntegerVector splits = {5, 50}, NumericVector t_tries = {0.2, 0.5, 0.7, 0.9}, IntegerVector split_tries = {1, 2, 5, 10}); - double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true); - void set_parameters(StringVector keys, NumericVector values); + NumericMatrix predict_matrix(const NumericMatrix &X, const NumericVector components = {0}) override; + NumericMatrix predict_vector(const NumericVector &X, const NumericVector components = {0}) override; + void cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) override; + double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) override; + void set_parameters(StringVector keys, NumericVector values) override; void purify_3(); void print(); void get_parameters(); @@ -30,12 +39,22 @@ class RcppRPF : public RandomPlantedForest double MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true); }; -class RcppCPF : public ClassificationRPF, public RcppRPF +class RcppCPF : public ClassificationRPF, public RcppInterface { public: RcppCPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, const std::string loss = "L2", const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0}); - void set_parameters(StringVector keys, NumericVector values); + void set_parameters(StringVector keys, NumericVector values) override; + NumericMatrix predict_matrix(const NumericMatrix &X, const NumericVector components = {0}) override; + NumericMatrix predict_vector(const NumericVector &X, const NumericVector components = {0}) override; + void cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) override; + double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) override; + + void purify_3(); + void print(); + void get_parameters(); + List get_model(); + bool is_purified(); }; #endif // RCPP_INTERFACE_H diff --git a/src/lib/rcpp_interface.cpp b/src/lib/rcpp_interface.cpp index 6881cda..37c65f5 100644 --- a/src/lib/rcpp_interface.cpp +++ b/src/lib/rcpp_interface.cpp @@ -115,3 +115,68 @@ void RcppCPF::set_parameters(StringVector keys, NumericVector values) ClassificationRPF::set_parameters(std::vector(keys.begin(), keys.end()), std::vector(values.begin(), values.end())); } + +NumericMatrix RcppCPF::predict_matrix(const NumericMatrix &X, const NumericVector components) +{ + auto result = ClassificationRPF::predict_matrix(toStd2D(X), toStd1D(components)); + NumericMatrix rResult(result.size(), result[0].size()); + for (size_t i = 0; i < result.size(); ++i) + { + for (size_t j = 0; j < result[i].size(); ++j) + { + rResult(i, j) = result[i][j]; + } + } + return rResult; +} + +NumericMatrix RcppCPF::predict_vector(const NumericVector &X, const NumericVector components) +{ + auto result = ClassificationRPF::predict_vector(toStd1D(X), toStd1D(components)); + NumericMatrix rResult(result.size(), result[0].size()); + for (size_t i = 0; i < result.size(); ++i) + { + for (size_t j = 0; j < result[i].size(); ++j) + { + rResult(i, j) = result[i][j]; + } + } + return rResult; +} + +void RcppCPF::cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) +{ + ClassificationRPF::cross_validation(n_sets, std::vector(splits.begin(), splits.end()), + std::vector(t_tries.begin(), t_tries.end()), + std::vector(split_tries.begin(), split_tries.end())); +} + +double RcppCPF::MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) +{ + return ClassificationRPF::MSE(toStd2D(Y_predicted), toStd2D(Y_true)); +} + +void RcppCPF::purify_3() +{ + RandomPlantedForest::purify_3(); +} + +void RcppCPF::print() +{ + RandomPlantedForest::print(); +} + +void RcppCPF::get_parameters() +{ + RandomPlantedForest::get_parameters(); +} + +List RcppCPF::get_model() +{ + return RandomPlantedForest::get_model(); +} + +bool RcppCPF::is_purified() +{ + return RandomPlantedForest::is_purified(); +} diff --git a/src/randomPlantedForest.cpp b/src/randomPlantedForest.cpp index c0c38ac..407a79b 100644 --- a/src/randomPlantedForest.cpp +++ b/src/randomPlantedForest.cpp @@ -21,7 +21,16 @@ RCPP_MODULE(mod_rpf) .method("is_purified", &RcppRPF::is_purified); class_("ClassificationRPF") - .derives("RandomPlantedForest") .constructor() - .method("set_parameters", &RcppCPF::set_parameters); + .method("set_parameters", &RcppCPF::set_parameters) + .method("predict_matrix", &RcppCPF::predict_matrix) + .method("predict_vector", &RcppCPF::predict_vector) + .method("cross_validation", &RcppCPF::cross_validation) + .method("MSE", &RcppCPF::MSE) + .method("purify", &RcppCPF::purify_3) + .method("print", &RcppCPF::print) + .method("get_parameters", &RcppCPF::get_parameters) + .method("set_parameters", &RcppCPF::set_parameters) + .method("get_model", &RcppCPF::get_model) + .method("is_purified", &RcppCPF::is_purified); } From 3c26adf9b3f2bb7f3b4c784513d4e9e51b336fa0 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 3 Jan 2025 16:16:28 +0100 Subject: [PATCH 08/13] Add random number generation utilities and refactor sampling methods --- src/Makevars | 2 +- src/include/helper.hpp | 8 +- src/include/random_utils.hpp | 184 +++++++++++++++++++++++++++++++++++ src/lib/cpf.cpp | 4 +- src/lib/helper.cpp | 116 ++++++++++++---------- src/lib/random_utils.cpp | 10 ++ src/lib/rcpp_interface.cpp | 6 +- src/lib/rpf.cpp | 14 +-- tests/testthat/test-rpf.R | 12 +-- 9 files changed, 280 insertions(+), 76 deletions(-) create mode 100644 src/include/random_utils.hpp create mode 100644 src/lib/random_utils.cpp diff --git a/src/Makevars b/src/Makevars index 39c5daa..fd55185 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,4 +1,4 @@ -SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp lib/rcpp_interface.cpp randomPlantedForest.cpp RcppExports.cpp +SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp lib/rcpp_interface.cpp lib/random_utils.cpp randomPlantedForest.cpp RcppExports.cpp OBJECTS = $(SOURCES:.cpp=.o) diff --git a/src/include/helper.hpp b/src/include/helper.hpp index 5e064f6..93aa06f 100644 --- a/src/include/helper.hpp +++ b/src/include/helper.hpp @@ -11,6 +11,7 @@ #include #include #include +#include "random_utils.hpp" #ifndef UTILS_H #define UTILS_H @@ -100,12 +101,7 @@ namespace utils template void shuffle_vector(Iter first, Iter last) { - int n = std::distance(first, last); - while (n > 1) - { - int k = random_index(n--); - std::swap(*(first + n), *(first + k)); - } + RandomGenerator::shuffle(first, last); }; std::vector to_std_vec(std::vector rv); diff --git a/src/include/random_utils.hpp b/src/include/random_utils.hpp new file mode 100644 index 0000000..9d465d3 --- /dev/null +++ b/src/include/random_utils.hpp @@ -0,0 +1,184 @@ +#ifndef RANDOM_UTILS_H +#define RANDOM_UTILS_H + +#include +#include +#include +#include +#include +#include + +namespace utils +{ + + // Abstract base class for random number generators + class RNGBackend + { + public: + virtual ~RNGBackend() = default; + virtual void begin_rng() {} // Called before a sequence of random numbers + virtual void end_rng() {} // Called after a sequence of random numbers + virtual int random_int(int n) = 0; + virtual double random_double() = 0; + }; + + // Standard C++ random number generator backend + class StdRNGBackend : public RNGBackend + { + private: + static thread_local std::mt19937 generator; + static bool seeded; + + public: + void seed(uint32_t seed) + { + generator.seed(seed); + seeded = true; + } + + void initialize() + { + if (!seeded) + { + auto now = std::chrono::high_resolution_clock::now(); + auto nanos = std::chrono::duration_cast( + now.time_since_epoch()) + .count(); + generator.seed(static_cast(nanos)); + seeded = true; + } + } + + int random_int(int n) override + { + initialize(); + std::uniform_int_distribution dist(0, n - 1); + return dist(generator); + } + + double random_double() override + { + initialize(); + std::uniform_real_distribution dist(0.0, 1.0); + return dist(generator); + } + }; + + // R random number generator backend + class RcppRNGBackend : public RNGBackend + { + private: + std::unique_ptr rng_scope; + + public: + void begin_rng() override + { + if (!rng_scope) + { + rng_scope = std::make_unique(); + } + } + + void end_rng() override + { + rng_scope.reset(); + } + + int random_int(int n) override + { + return static_cast(unif_rand() * n); + } + + double random_double() override + { + return unif_rand(); + } + }; + + class RandomGenerator + { + private: + static RNGBackend *backend; + static StdRNGBackend std_backend; + static RcppRNGBackend rcpp_backend; + + public: + // RAII class to handle RNG state + class RNGScope + { + private: + RNGBackend *backend; + + public: + RNGScope() : backend(RandomGenerator::backend) + { + backend->begin_rng(); + } + ~RNGScope() + { + backend->end_rng(); + } + }; + + // Switch to using R's RNG + static void use_r_random() + { + backend = &rcpp_backend; + } + + // Switch to using C++ standard RNG + static void use_std_random() + { + backend = &std_backend; + } + + // Initialize the standard generator with a seed + static void seed(uint32_t seed) + { + std_backend.seed(seed); + } + + // Generate random integer in range [0, n) + static int random_index(int n) + { + RNGScope scope; + return backend->random_int(n); + } + + // Generate random double in range [0, 1) + static double random_double() + { + RNGScope scope; + return backend->random_double(); + } + + // Shuffle a range of elements + template + static void shuffle(Iter first, Iter last) + { + RNGScope scope; + auto n = std::distance(first, last); + for (auto i = n - 1; i > 0; --i) + { + std::swap(*(first + i), *(first + backend->random_int(i + 1))); + } + } + + // Sample n elements with replacement + template + static std::vector sample_with_replacement(const std::vector &population, size_t n) + { + RNGScope scope; + std::vector result; + result.reserve(n); + for (size_t i = 0; i < n; ++i) + { + result.push_back(population[backend->random_int(population.size())]); + } + return result; + } + }; + +} // namespace utils + +#endif // RANDOM_UTILS_H diff --git a/src/lib/cpf.cpp b/src/lib/cpf.cpp index 8979874..b69e5c8 100644 --- a/src/lib/cpf.cpp +++ b/src/lib/cpf.cpp @@ -769,10 +769,10 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> std::iota(samples.begin(), samples.end(), 1); } else - { // randomly picked samples otherwise + { // randomly picked samples using RandomGenerator samples = std::vector(split_try); for (size_t i = 0; i < samples.size(); ++i) - samples[i] = rand() % (int)(unique_samples.size() - leaf_size) + leaf_size; + samples[i] = utils::RandomGenerator::random_index((int)(unique_samples.size() - leaf_size)) + leaf_size; std::sort(samples.begin(), samples.end()); } diff --git a/src/lib/helper.cpp b/src/lib/helper.cpp index b4f542a..1280096 100644 --- a/src/lib/helper.cpp +++ b/src/lib/helper.cpp @@ -1,71 +1,81 @@ #include "helper.hpp" +#include "random_utils.hpp" #include #include #include -namespace utils { +namespace utils +{ -// Helper function to generate random number using standard C++ libraries -int random_index(const int n) { return static_cast(static_cast(rand()) / RAND_MAX * n); } + // Helper function to generate random number using our RandomGenerator + int random_index(const int n) { return RandomGenerator::random_index(n); } -// ----------------- functions for converting Cpp types ----------------- + // ----------------- functions for converting Cpp types ----------------- + /** + * \brief Convert the std container set of type int into a std::vector. + * + * \param v the set that is converted. + */ + std::vector from_std_set(std::set v) + { + return std::vector(v.begin(), v.end()); + } -/** - * \brief Convert the std container set of type int into a std::vector. - * - * \param v the set that is converted. - */ -std::vector from_std_set(std::set v) { - return std::vector(v.begin(), v.end()); -} + /** + * \brief Convert the std container vector of type int into a std::vector. + * + * \param v the vector that is converted. + */ + std::vector from_std_vec(std::vector v) + { + return v; + } -/** - * \brief Convert the std container vector of type int into a std::vector. - * - * \param v the vector that is converted. - */ -std::vector from_std_vec(std::vector v) { - return v; -} + /** + * \brief Convert the std container vector of type double into a std::vector. + * + * \param v the vector that is converted. + */ + std::vector from_std_vec(std::vector v) + { + return v; + } -/** - * \brief Convert the std container vector of type double into a std::vector. - * - * \param v the vector that is converted. - */ -std::vector from_std_vec(std::vector v) { - return v; -} - -/** - * \brief Convert the nested std container vector containing a vector itself - * of type double into a std::vector>. - * - * \param v the vector of vectors that is converted. - */ -std::vector> from_std_vec(std::vector> v) { - return v; -} + /** + * \brief Convert the nested std container vector containing a vector itself + * of type double into a std::vector>. + * + * \param v the vector of vectors that is converted. + */ + std::vector> from_std_vec(std::vector> v) + { + return v; + } -std::vector to_std_vec(std::vector rv) { - return rv; -} + std::vector to_std_vec(std::vector rv) + { + return rv; + } -std::vector to_std_vec(std::vector rv) { - return rv; -} + std::vector to_std_vec(std::vector rv) + { + return rv; + } -std::vector> to_std_vec(std::vector> rv) { - return rv; -} + std::vector> to_std_vec(std::vector> rv) + { + return rv; + } -std::set to_std_set(std::vector rv) { - return std::set(rv.begin(), rv.end()); -} + std::set to_std_set(std::vector rv) + { + return std::set(rv.begin(), rv.end()); + } -std::set to_std_set(std::vector rv) { - return std::set(rv.begin(), rv.end()); -} + std::set to_std_set(std::vector rv) + { + return std::set(rv.begin(), rv.end()); + } } diff --git a/src/lib/random_utils.cpp b/src/lib/random_utils.cpp new file mode 100644 index 0000000..5756fde --- /dev/null +++ b/src/lib/random_utils.cpp @@ -0,0 +1,10 @@ +#include "random_utils.hpp" + +namespace utils +{ + thread_local std::mt19937 StdRNGBackend::generator; + bool StdRNGBackend::seeded = false; + RNGBackend *RandomGenerator::backend = &RandomGenerator::std_backend; + StdRNGBackend RandomGenerator::std_backend; + RcppRNGBackend RandomGenerator::rcpp_backend; +} diff --git a/src/lib/rcpp_interface.cpp b/src/lib/rcpp_interface.cpp index 37c65f5..0796a35 100644 --- a/src/lib/rcpp_interface.cpp +++ b/src/lib/rcpp_interface.cpp @@ -1,4 +1,5 @@ #include "rcpp_interface.hpp" +#include "random_utils.hpp" // Helper to convert NumericMatrix to std::vector> static std::vector> toStd2D(const Rcpp::NumericMatrix &mat) @@ -25,6 +26,7 @@ RcppRPF::RcppRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, : RandomPlantedForest(toStd2D(samples_Y), toStd2D(samples_X), toStd1D(parameters)) { + utils::RandomGenerator::use_r_random(); } NumericMatrix RcppRPF::predict_matrix(const NumericMatrix &X, const NumericVector components) @@ -105,9 +107,9 @@ bool RcppRPF::is_purified() RcppCPF::RcppCPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, const std::string loss, const NumericVector parameters) - : ClassificationRPF(toStd2D(samples_Y), toStd2D(samples_X), loss, - toStd1D(parameters)) + : ClassificationRPF(toStd2D(samples_Y), toStd2D(samples_X), loss, toStd1D(parameters)) { + utils::RandomGenerator::use_r_random(); } void RcppCPF::set_parameters(StringVector keys, NumericVector values) diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index 25ac56f..f3a6f03 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -147,10 +147,10 @@ Split RandomPlantedForest::calcOptimalSplit(const std::vector(split_try); for (size_t i = 0; i < samples.size(); ++i) - samples[i] = rand() % (int)(unique_samples.size() - leaf_size); + samples[i] = utils::RandomGenerator::random_index((int)(unique_samples.size() - leaf_size)); std::sort(samples.begin(), samples.end()); } @@ -319,12 +319,14 @@ void RandomPlantedForest::create_tree_family(std::vector initial_leaves, s samples_X = std::vector>(sample_size); samples_Y = std::vector>(sample_size); + // Use our RandomGenerator for sampling with replacement + auto indices = utils::RandomGenerator::sample_with_replacement( + std::vector(sample_size), sample_size); + for (size_t i = 0; i < sample_size; ++i) { - - sample_index = rand() % sample_size; - samples_Y[i] = Y[sample_index]; - samples_X[i] = X[sample_index]; + samples_Y[i] = Y[indices[i]]; + samples_X[i] = X[indices[i]]; } } diff --git a/tests/testthat/test-rpf.R b/tests/testthat/test-rpf.R index ed26bc6..96b8f19 100644 --- a/tests/testthat/test-rpf.R +++ b/tests/testthat/test-rpf.R @@ -77,16 +77,16 @@ test_that("Rcpp RNG/R RNG interference", { test_that("Setting max_interaction = 0 works", { set.seed(100) - rpf_fit0 <- rpf(mpg ~ ., data = mtcars[1:20,], max_interaction = 0) - pred0 <- predict(rpf_fit0, new_data = mtcars[21:32,]) + rpf_fit0 <- rpf(mpg ~ ., data = mtcars[1:20, ], max_interaction = 0) + pred0 <- predict(rpf_fit0, new_data = mtcars[21:32, ]) set.seed(100) - rpf_fit10 <- rpf(mpg ~ ., data = mtcars[1:20,], max_interaction = 10) - pred10 <- predict(rpf_fit10, new_data = mtcars[21:32,]) + rpf_fit10 <- rpf(mpg ~ ., data = mtcars[1:20, ], max_interaction = 10) + pred10 <- predict(rpf_fit10, new_data = mtcars[21:32, ]) set.seed(100) - rpf_fit_default <- rpf(mpg ~ ., data = mtcars[1:20,]) - pred_default <- predict(rpf_fit_default, new_data = mtcars[21:32,]) + rpf_fit_default <- rpf(mpg ~ ., data = mtcars[1:20, ]) + pred_default <- predict(rpf_fit_default, new_data = mtcars[21:32, ]) # Sanity: pred of default (max_interaction = 1) should be different expect_failure(expect_equal(pred_default, pred0)) From 426e1bf748917d86252f4e7ccdda5833d3d8706e Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 3 Jan 2025 16:16:59 +0100 Subject: [PATCH 09/13] Refactor cpp test --- tests/testthat/test-rpf-cpp.R | 37 +++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/tests/testthat/test-rpf-cpp.R b/tests/testthat/test-rpf-cpp.R index 6378765..c4e7df1 100644 --- a/tests/testthat/test-rpf-cpp.R +++ b/tests/testthat/test-rpf-cpp.R @@ -1,11 +1,12 @@ # Calling miscellaneous C++ functions -test_that("C-level functionality works", { - train <- mtcars[1:20, ] - test <- mtcars[21:32, ] - ntrees <- 35 +# Test MSE calculation +test_that("MSE calculation works", { + train <- mtcars[1:20, ] + test <- mtcars[21:32, ] + set.seed(23) - rpfit <- rpf(mpg ~., data = train, max_interaction = 3, ntrees = ntrees) + rpfit <- rpf(mpg ~ ., data = train, max_interaction = 3, ntrees = 35) pred <- predict(rpfit, test) mse <- rpfit$fit$MSE(as.matrix(pred$.pred), as.matrix(test$mpg)) @@ -14,24 +15,36 @@ test_that("C-level functionality works", { # MSE here is ~14 on non-macOS platforms. # Difference probably compiler related, so can't reasonably expect fixed values I think. # if (Sys.info()[["sysname"]] == "Darwin") expect_equal(mse, 17.1905463) +}) + +# Test model structure +test_that("Model structure is correct", { + train <- mtcars[1:20, ] + ntrees <- 35 + + set.seed(23) + rpfit <- rpf(mpg ~ ., data = train, max_interaction = 3, ntrees = ntrees) mod <- rpfit$fit$get_model() expect_true(inherits(mod, "list")) expect_equal(length(mod), ntrees) # length is number of trees +}) - # FIXME: Cosmetic issue but should be addressed at some point - # This should be assignable instead of printing to stdout - # An R-vector or list would probably be ideal - params_captured <- capture.output(rpfit$fit$get_parameters()) - # params_return <- rpfit$fit$get_parameters() - # expect_equal(params_captured, params_return) +# Test parameter getting and setting +test_that("Parameter getting and setting works", { + train <- mtcars[1:20, ] + + set.seed(23) + rpfit <- rpf(mpg ~ ., data = train) # could test if setting params work rpfit$fit$set_parameters("n_trees", 60) + # FIXME: Cosmetic issue but should be addressed at some point + # This should be assignable instead of printing to stdout + # An R-vector or list would probably be ideal params_captured <- capture.output(rpfit$fit$get_parameters()) expect_equal( strsplit(params_captured, "(, )|(: )")[[1]][[2]], "n_trees=60" ) }) - From 6304f8411149f23b1c45a81bd9f366da09724d4d Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 3 Jan 2025 16:59:11 +0100 Subject: [PATCH 10/13] Separate get_model into interface --- src/include/rcpp_interface.hpp | 6 +++-- src/include/rpf.hpp | 4 +--- src/lib/rcpp_interface.cpp | 42 ++++++++++++++++++++++++++++++++-- src/lib/rpf.cpp | 37 ------------------------------ 4 files changed, 45 insertions(+), 44 deletions(-) diff --git a/src/include/rcpp_interface.hpp b/src/include/rcpp_interface.hpp index 61d128c..ceaf22a 100644 --- a/src/include/rcpp_interface.hpp +++ b/src/include/rcpp_interface.hpp @@ -15,6 +15,7 @@ class RcppInterface virtual NumericMatrix predict_vector(const NumericVector &X, const NumericVector components) = 0; virtual void cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) = 0; virtual double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) = 0; + virtual List get_model() = 0; }; class RcppRPF : public RandomPlantedForest, public RcppInterface @@ -29,10 +30,11 @@ class RcppRPF : public RandomPlantedForest, public RcppInterface void cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) override; double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) override; void set_parameters(StringVector keys, NumericVector values) override; + List get_model() override; + void purify_3(); void print(); void get_parameters(); - List get_model(); bool is_purified(); protected: @@ -49,11 +51,11 @@ class RcppCPF : public ClassificationRPF, public RcppInterface NumericMatrix predict_vector(const NumericVector &X, const NumericVector components = {0}) override; void cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) override; double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) override; + List get_model() override; void purify_3(); void print(); void get_parameters(); - List get_model(); bool is_purified(); }; diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index aeb1b7c..6787d4a 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -3,9 +3,8 @@ #include #include -#include #include "trees.hpp" -using namespace Rcpp; + class RandomPlantedForest { @@ -26,7 +25,6 @@ class RandomPlantedForest double MSE(const std::vector> &Y_predicted, const std::vector> &Y_true); void get_parameters(); void set_parameters(std::vector keys, std::vector values); - List get_model(); virtual ~RandomPlantedForest() {}; bool is_purified(); diff --git a/src/lib/rcpp_interface.cpp b/src/lib/rcpp_interface.cpp index 0796a35..28cc6b5 100644 --- a/src/lib/rcpp_interface.cpp +++ b/src/lib/rcpp_interface.cpp @@ -21,6 +21,44 @@ static std::vector toStd1D(const Rcpp::NumericVector &vec) return std::vector(vec.begin(), vec.end()); } +// New helper function to convert tree families to R List +static List treeFamiliesToRList(const std::vector &families, int feature_size) +{ + List model; + for (const auto &family : families) + { + List variables, family_values, family_intervals; + for (const auto &tree : family) + { + List tree_values; + List tree_intervals; + variables.push_back(from_std_set(tree.first)); + for (const auto &leaf : tree.second->get_leaves()) + { + NumericMatrix leaf_values; + for (const auto &val : leaf.value) + { + leaf_values.push_back(val); + } + tree_values.push_back(leaf_values); + + NumericVector intervals; + for (const auto &interval : leaf.intervals) + { + intervals.push_back(interval.first); + intervals.push_back(interval.second); + } + NumericMatrix leaf_intervals(2, feature_size, intervals.begin()); + tree_intervals.push_back(leaf_intervals); + } + family_intervals.push_back(tree_intervals); + family_values.push_back(tree_values); + } + model.push_back(List::create(Named("variables") = variables, _["values"] = family_values, _["intervals"] = family_intervals)); + } + return model; +} + RcppRPF::RcppRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, const NumericVector parameters) : RandomPlantedForest(toStd2D(samples_Y), toStd2D(samples_X), @@ -97,7 +135,7 @@ void RcppRPF::get_parameters() List RcppRPF::get_model() { - return RandomPlantedForest::get_model(); + return treeFamiliesToRList(tree_families, feature_size); } bool RcppRPF::is_purified() @@ -175,7 +213,7 @@ void RcppCPF::get_parameters() List RcppCPF::get_model() { - return RandomPlantedForest::get_model(); + return treeFamiliesToRList(tree_families, feature_size); } bool RcppCPF::is_purified() diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index f3a6f03..85ef07d 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -2052,40 +2052,3 @@ void RandomPlantedForest::set_parameters(std::vector keys, std::vec } this->fit(); } - -List RandomPlantedForest::get_model() -{ - List model; - for (const auto &family : tree_families) - { - List variables, family_values, family_intervals; - for (const auto &tree : family) - { - List tree_values; - List tree_intervals; - variables.push_back(from_std_set(tree.first)); - for (const auto &leaf : tree.second->leaves) - { - NumericMatrix leaf_values; - for (const auto &val : leaf.value) - { - leaf_values.push_back(val); - } - tree_values.push_back(leaf_values); - - NumericVector intervals; - for (const auto &interval : leaf.intervals) - { - intervals.push_back(interval.first); - intervals.push_back(interval.second); - } - NumericMatrix leaf_intervals(2, feature_size, intervals.begin()); - tree_intervals.push_back(leaf_intervals); - } - family_intervals.push_back(tree_intervals); - family_values.push_back(tree_values); - } - model.push_back(List::create(Named("variables") = variables, _["values"] = family_values, _["intervals"] = family_intervals)); - } - return (model); -} From f86881afef578a15696b487120fb62368d210e9b Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Sat, 4 Jan 2025 14:08:43 +0100 Subject: [PATCH 11/13] Simplify RNG implementation --- src/include/random_utils.hpp | 80 +++++++++--------------------------- src/lib/rpf.cpp | 7 +++- 2 files changed, 25 insertions(+), 62 deletions(-) diff --git a/src/include/random_utils.hpp b/src/include/random_utils.hpp index 9d465d3..bbd0f20 100644 --- a/src/include/random_utils.hpp +++ b/src/include/random_utils.hpp @@ -8,6 +8,8 @@ #include #include +using namespace Rcpp; + namespace utils { @@ -16,8 +18,6 @@ namespace utils { public: virtual ~RNGBackend() = default; - virtual void begin_rng() {} // Called before a sequence of random numbers - virtual void end_rng() {} // Called after a sequence of random numbers virtual int random_int(int n) = 0; virtual double random_double() = 0; }; @@ -67,31 +67,17 @@ namespace utils // R random number generator backend class RcppRNGBackend : public RNGBackend { - private: - std::unique_ptr rng_scope; - public: - void begin_rng() override - { - if (!rng_scope) - { - rng_scope = std::make_unique(); - } - } - - void end_rng() override - { - rng_scope.reset(); - } - int random_int(int n) override { - return static_cast(unif_rand() * n); + RNGScope scope; + return static_cast(R::runif(0, 1) * n); } double random_double() override { - return unif_rand(); + RNGScope scope; + return R::runif(0, 1); } }; @@ -103,52 +89,15 @@ namespace utils static RcppRNGBackend rcpp_backend; public: - // RAII class to handle RNG state - class RNGScope - { - private: - RNGBackend *backend; - - public: - RNGScope() : backend(RandomGenerator::backend) - { - backend->begin_rng(); - } - ~RNGScope() - { - backend->end_rng(); - } - }; - - // Switch to using R's RNG - static void use_r_random() - { - backend = &rcpp_backend; - } - - // Switch to using C++ standard RNG - static void use_std_random() - { - backend = &std_backend; - } - - // Initialize the standard generator with a seed - static void seed(uint32_t seed) - { - std_backend.seed(seed); - } - // Generate random integer in range [0, n) static int random_index(int n) { - RNGScope scope; return backend->random_int(n); } // Generate random double in range [0, 1) static double random_double() { - RNGScope scope; return backend->random_double(); } @@ -156,7 +105,6 @@ namespace utils template static void shuffle(Iter first, Iter last) { - RNGScope scope; auto n = std::distance(first, last); for (auto i = n - 1; i > 0; --i) { @@ -168,15 +116,27 @@ namespace utils template static std::vector sample_with_replacement(const std::vector &population, size_t n) { - RNGScope scope; std::vector result; result.reserve(n); for (size_t i = 0; i < n; ++i) { - result.push_back(population[backend->random_int(population.size())]); + result.push_back(population[random_index(population.size())]); } return result; } + + // Switch to using R's RNG + static void use_r_random() + { + backend = &rcpp_backend; + } + + // Switch to using C++ standard RNG + static void use_std_random(uint32_t seed = 42) + { + std_backend.seed(seed); + backend = &std_backend; + } }; } // namespace utils diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index 85ef07d..a08db6d 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -319,9 +319,12 @@ void RandomPlantedForest::create_tree_family(std::vector initial_leaves, s samples_X = std::vector>(sample_size); samples_Y = std::vector>(sample_size); + // Create a vector with values [0, 1, ..., sample_size-1] + std::vector population(sample_size); + std::iota(population.begin(), population.end(), 0); + // Use our RandomGenerator for sampling with replacement - auto indices = utils::RandomGenerator::sample_with_replacement( - std::vector(sample_size), sample_size); + auto indices = utils::RandomGenerator::sample_with_replacement(population, sample_size); for (size_t i = 0; i < sample_size; ++i) { From 4e5b8bca87d3f476bb4f71a2befaa45c515e0fd3 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Sat, 4 Jan 2025 14:08:58 +0100 Subject: [PATCH 12/13] Move fitting away from set_data --- src/lib/rcpp_interface.cpp | 13 +++++++++++++ src/lib/rpf.cpp | 7 ------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/lib/rcpp_interface.cpp b/src/lib/rcpp_interface.cpp index 28cc6b5..6b466d1 100644 --- a/src/lib/rcpp_interface.cpp +++ b/src/lib/rcpp_interface.cpp @@ -65,6 +65,13 @@ RcppRPF::RcppRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, toStd1D(parameters)) { utils::RandomGenerator::use_r_random(); + + this->fit(); + + if (cross_validate) + { + RandomPlantedForest::cross_validation(); + } } NumericMatrix RcppRPF::predict_matrix(const NumericMatrix &X, const NumericVector components) @@ -148,6 +155,12 @@ RcppCPF::RcppCPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, : ClassificationRPF(toStd2D(samples_Y), toStd2D(samples_X), loss, toStd1D(parameters)) { utils::RandomGenerator::use_r_random(); + RandomPlantedForest::fit(); + + if (cross_validate) + { + ClassificationRPF::cross_validation(); + } } void RcppCPF::set_parameters(StringVector keys, NumericVector values) diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index a08db6d..d36c3c1 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -279,13 +279,6 @@ void RandomPlantedForest::set_data(const std::vector> &sampl this->upper_bounds[i] = maxVal + 2 * eps; // to consider samples at max value this->lower_bounds[i] = minVal; } - - this->fit(); - - if (cross_validate) - { - this->cross_validation(); - } } void RandomPlantedForest::create_tree_family(std::vector initial_leaves, size_t n) From 79afe8ef5ea2026c7be15373b3ecb280c72d1560 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Sat, 4 Jan 2025 14:37:42 +0100 Subject: [PATCH 13/13] Refactor parameter handling in RcppRPF and ClassificationRPF - Removed `set_parameters` method from both RcppRPF and ClassificationRPF classes. - Updated `get_parameters` to return a structured RPFParams object instead of printing to stdout. - Removed obsolete test related to parameter setting from test suite. --- src/include/cpf.hpp | 44 +++++++++------ src/include/rcpp_interface.hpp | 7 +-- src/include/rpf.hpp | 16 +++++- src/lib/cpf.cpp | 97 +--------------------------------- src/lib/rcpp_interface.cpp | 38 +++++++------ src/lib/rpf.cpp | 65 +---------------------- src/randomPlantedForest.cpp | 3 -- tests/testthat/test-rpf-cpp.R | 19 ------- 8 files changed, 70 insertions(+), 219 deletions(-) diff --git a/src/include/cpf.hpp b/src/include/cpf.hpp index 69c75b4..8c9eb69 100644 --- a/src/include/cpf.hpp +++ b/src/include/cpf.hpp @@ -3,6 +3,32 @@ #include #include "rpf.hpp" +enum LossType +{ + L1, + L2, + median, + logit, + logit_2, + logit_3, + logit_4, + exponential, + exponential_2, + exponential_3 +}; +struct CPFParams +{ + int max_interaction; + int n_trees; + int n_splits; + int split_try; + double t_try; + bool purify_forest; + bool deterministic; + int nthreads; + bool cross_validate; + LossType loss; +}; class ClassificationRPF : public RandomPlantedForest { @@ -11,25 +37,12 @@ class ClassificationRPF : public RandomPlantedForest using RandomPlantedForest::calcOptimalSplit; ClassificationRPF(const std::vector> &samples_Y, const std::vector> &samples_X, const std::string loss = "L2", const std::vector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0, 0, 0.1}); - void set_parameters(std::vector keys, std::vector values); - ~ClassificationRPF(){}; + ~ClassificationRPF() {}; + CPFParams get_parameters(); private: double delta; double epsilon; - enum LossType - { - L1, - L2, - median, - logit, - logit_2, - logit_3, - logit_4, - exponential, - exponential_2, - exponential_3 - }; LossType loss; void (ClassificationRPF::*calcLoss)(Split &); void create_tree_family(std::vector initial_leaves, size_t n) override; @@ -48,5 +61,4 @@ class ClassificationRPF : public RandomPlantedForest void exponential_loss_3(Split &split); }; - #endif diff --git a/src/include/rcpp_interface.hpp b/src/include/rcpp_interface.hpp index ceaf22a..f24486a 100644 --- a/src/include/rcpp_interface.hpp +++ b/src/include/rcpp_interface.hpp @@ -10,7 +10,6 @@ using namespace Rcpp; class RcppInterface { public: - virtual void set_parameters(StringVector keys, NumericVector values) = 0; virtual NumericMatrix predict_matrix(const NumericMatrix &X, const NumericVector components) = 0; virtual NumericMatrix predict_vector(const NumericVector &X, const NumericVector components) = 0; virtual void cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) = 0; @@ -29,12 +28,11 @@ class RcppRPF : public RandomPlantedForest, public RcppInterface NumericMatrix predict_vector(const NumericVector &X, const NumericVector components = {0}) override; void cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) override; double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true) override; - void set_parameters(StringVector keys, NumericVector values) override; List get_model() override; void purify_3(); void print(); - void get_parameters(); + List get_parameters(); bool is_purified(); protected: @@ -46,7 +44,6 @@ class RcppCPF : public ClassificationRPF, public RcppInterface public: RcppCPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, const std::string loss = "L2", const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0}); - void set_parameters(StringVector keys, NumericVector values) override; NumericMatrix predict_matrix(const NumericMatrix &X, const NumericVector components = {0}) override; NumericMatrix predict_vector(const NumericVector &X, const NumericVector components = {0}) override; void cross_validation(int n_sets, IntegerVector splits, NumericVector t_tries, IntegerVector split_tries) override; @@ -55,7 +52,7 @@ class RcppCPF : public ClassificationRPF, public RcppInterface void purify_3(); void print(); - void get_parameters(); + List get_parameters(); bool is_purified(); }; diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index 6787d4a..7cee967 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -5,6 +5,19 @@ #include #include "trees.hpp" +struct RPFParams +{ + int max_interaction; + int n_trees; + int n_splits; + int split_try; + double t_try; + bool purify_forest; + bool deterministic; + int nthreads; + bool cross_validate; +}; + class RandomPlantedForest { @@ -23,8 +36,7 @@ class RandomPlantedForest void print(); void cross_validation(int n_sets = 4, std::vector splits = {5, 50}, std::vector t_tries = {0.2, 0.5, 0.7, 0.9}, std::vector split_tries = {1, 2, 5, 10}); double MSE(const std::vector> &Y_predicted, const std::vector> &Y_true); - void get_parameters(); - void set_parameters(std::vector keys, std::vector values); + RPFParams get_parameters(); virtual ~RandomPlantedForest() {}; bool is_purified(); diff --git a/src/lib/cpf.cpp b/src/lib/cpf.cpp index b69e5c8..f19246b 100644 --- a/src/lib/cpf.cpp +++ b/src/lib/cpf.cpp @@ -1434,100 +1434,7 @@ void ClassificationRPF::fit() } } -/* retrospectively change parameters of existing class object, - updates the model, so far only single valued parameters supported, - for replacing training data use 'set_data', - note that changing cv does not trigger cross validation */ -void ClassificationRPF::set_parameters(std::vector keys, std::vector values) +CPFParams ClassificationRPF::get_parameters() { - if (keys.size() != values.size()) - { - std::cout << "Size of input vectors is not the same. " << std::endl; - return; - } - - for (unsigned int i = 0; i < keys.size(); ++i) - { - if (keys[i] == "deterministic") - { - this->deterministic = values[i]; - } - else if (keys[i] == "nthreads") - { - this->nthreads = values[i]; - } - else if (keys[i] == "purify") - { - this->purify_forest = values[i]; - } - else if (keys[i] == "n_trees") - { - this->n_trees = values[i]; - } - else if (keys[i] == "n_splits") - { - this->n_splits = values[i]; - } - else if (keys[i] == "t_try") - { - this->t_try = values[i]; - } - else if (keys[i] == "split_try") - { - this->split_try = values[i]; - } - else if (keys[i] == "max_interaction") - { - this->max_interaction = values[i]; - } - else if (keys[i] == "cv") - { - this->cross_validate = values[i]; - } - else if (keys[i] == "loss") - { - if (keys[i] == "L1") - { - this->loss = LossType::L1; - this->calcLoss = &ClassificationRPF::L1_loss; - } - else if (keys[i] == "L2") - { - this->loss = LossType::L2; - this->calcLoss = &ClassificationRPF::L2_loss; - } - else if (keys[i] == "median") - { - this->loss = LossType::median; - this->calcLoss = &ClassificationRPF::median_loss; - } - else if (keys[i] == "logit") - { - this->loss = LossType::logit; - this->calcLoss = &ClassificationRPF::logit_loss; - } - else if (keys[i] == "exponential") - { - this->loss = LossType::exponential; - this->calcLoss = &ClassificationRPF::exponential_loss; - } - else - { - std::cout << "Unkown loss function." << std::endl; - } - } - else if (keys[i] == "delta") - { - this->delta = values[i]; - } - else if (keys[i] == "epsilon") - { - this->epsilon = values[i]; - } - else - { - std::cout << "Unkown parameter key '" << keys[i] << "' ." << std::endl; - } - } - this->fit(); + return {max_interaction, n_trees, n_splits, split_try, t_try, purify_forest, deterministic, nthreads, cross_validate, loss}; } diff --git a/src/lib/rcpp_interface.cpp b/src/lib/rcpp_interface.cpp index 6b466d1..28456a8 100644 --- a/src/lib/rcpp_interface.cpp +++ b/src/lib/rcpp_interface.cpp @@ -114,12 +114,6 @@ double RcppRPF::MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_tru return RandomPlantedForest::MSE(toStd2D(Y_predicted), toStd2D(Y_true)); } -void RcppRPF::set_parameters(StringVector keys, NumericVector values) -{ - RandomPlantedForest::set_parameters(std::vector(keys.begin(), keys.end()), - std::vector(values.begin(), values.end())); -} - double RcppRPF::MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true) { return RandomPlantedForest::MSE_vec(toStd1D(Y_predicted), toStd1D(Y_true)); @@ -135,9 +129,18 @@ void RcppRPF::print() RandomPlantedForest::print(); } -void RcppRPF::get_parameters() +List RcppRPF::get_parameters() { - RandomPlantedForest::get_parameters(); + RPFParams params = RandomPlantedForest::get_parameters(); + return List::create(Named("max_interaction") = params.max_interaction, + Named("n_trees") = params.n_trees, + Named("n_splits") = params.n_splits, + Named("split_try") = params.split_try, + Named("t_try") = params.t_try, + Named("purify_forest") = params.purify_forest, + Named("deterministic") = params.deterministic, + Named("nthreads") = params.nthreads, + Named("cross_validate") = params.cross_validate); } List RcppRPF::get_model() @@ -163,12 +166,6 @@ RcppCPF::RcppCPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, } } -void RcppCPF::set_parameters(StringVector keys, NumericVector values) -{ - ClassificationRPF::set_parameters(std::vector(keys.begin(), keys.end()), - std::vector(values.begin(), values.end())); -} - NumericMatrix RcppCPF::predict_matrix(const NumericMatrix &X, const NumericVector components) { auto result = ClassificationRPF::predict_matrix(toStd2D(X), toStd1D(components)); @@ -219,9 +216,18 @@ void RcppCPF::print() RandomPlantedForest::print(); } -void RcppCPF::get_parameters() +List RcppCPF::get_parameters() { - RandomPlantedForest::get_parameters(); + CPFParams params = ClassificationRPF::get_parameters(); + return List::create(Named("max_interaction") = params.max_interaction, + Named("n_trees") = params.n_trees, + Named("n_splits") = params.n_splits, + Named("split_try") = params.split_try, + Named("t_try") = params.t_try, + Named("purify_forest") = params.purify_forest, + Named("deterministic") = params.deterministic, + Named("nthreads") = params.nthreads, + Named("cross_validate") = params.cross_validate); } List RcppCPF::get_model() diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index d36c3c1..b41d098 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -1983,68 +1983,7 @@ void RandomPlantedForest::print() } } -// print parameters of the model to the console -void RandomPlantedForest::get_parameters() +RPFParams RandomPlantedForest::get_parameters() { - std::cout << "Parameters: n_trees=" << n_trees << ", n_splits=" << n_splits << ", max_interaction=" << max_interaction << ", t_try=" << t_try - << ", split_try=" << split_try << ", purified=" << purified << ", deterministic=" << deterministic << ", nthreads=" << nthreads - << ", feature_size=" << feature_size << ", sample_size=" << sample_size << std::endl; -} - -/* retrospectively change parameters of existing class object, - updates the model, so far only single valued parameters supported, - for replacing training data use 'set_data', - note that changing cv does not trigger cross validation */ -void RandomPlantedForest::set_parameters(std::vector keys, std::vector values) -{ - if (keys.size() != values.size()) - { - std::cout << "Size of input vectors is not the same. " << std::endl; - return; - } - - for (unsigned int i = 0; i < keys.size(); ++i) - { - if (keys[i] == "deterministic") - { - this->deterministic = values[i]; - } - else if (keys[i] == "nthreads") - { - this->nthreads = values[i]; - } - else if (keys[i] == "purify") - { - this->purify_forest = values[i]; - } - else if (keys[i] == "n_trees") - { - this->n_trees = values[i]; - } - else if (keys[i] == "n_splits") - { - this->n_splits = values[i]; - } - else if (keys[i] == "t_try") - { - this->t_try = values[i]; - } - else if (keys[i] == "split_try") - { - this->split_try = values[i]; - } - else if (keys[i] == "max_interaction") - { - this->max_interaction = values[i]; - } - else if (keys[i] == "cv") - { - this->cross_validate = values[i]; - } - else - { - std::cout << "Unkown parameter key '" << keys[i] << "' ." << std::endl; - } - } - this->fit(); + return {max_interaction, n_trees, n_splits, split_try, t_try, purify_forest, deterministic, nthreads, cross_validate}; } diff --git a/src/randomPlantedForest.cpp b/src/randomPlantedForest.cpp index 407a79b..86e4ae3 100644 --- a/src/randomPlantedForest.cpp +++ b/src/randomPlantedForest.cpp @@ -16,13 +16,11 @@ RCPP_MODULE(mod_rpf) .method("purify", &RcppRPF::purify_3) .method("print", &RcppRPF::print) .method("get_parameters", &RcppRPF::get_parameters) - .method("set_parameters", &RcppRPF::set_parameters) .method("get_model", &RcppRPF::get_model) .method("is_purified", &RcppRPF::is_purified); class_("ClassificationRPF") .constructor() - .method("set_parameters", &RcppCPF::set_parameters) .method("predict_matrix", &RcppCPF::predict_matrix) .method("predict_vector", &RcppCPF::predict_vector) .method("cross_validation", &RcppCPF::cross_validation) @@ -30,7 +28,6 @@ RCPP_MODULE(mod_rpf) .method("purify", &RcppCPF::purify_3) .method("print", &RcppCPF::print) .method("get_parameters", &RcppCPF::get_parameters) - .method("set_parameters", &RcppCPF::set_parameters) .method("get_model", &RcppCPF::get_model) .method("is_purified", &RcppCPF::is_purified); } diff --git a/tests/testthat/test-rpf-cpp.R b/tests/testthat/test-rpf-cpp.R index c4e7df1..a7af1fb 100644 --- a/tests/testthat/test-rpf-cpp.R +++ b/tests/testthat/test-rpf-cpp.R @@ -29,22 +29,3 @@ test_that("Model structure is correct", { expect_true(inherits(mod, "list")) expect_equal(length(mod), ntrees) # length is number of trees }) - -# Test parameter getting and setting -test_that("Parameter getting and setting works", { - train <- mtcars[1:20, ] - - set.seed(23) - rpfit <- rpf(mpg ~ ., data = train) - - # could test if setting params work - rpfit$fit$set_parameters("n_trees", 60) - # FIXME: Cosmetic issue but should be addressed at some point - # This should be assignable instead of printing to stdout - # An R-vector or list would probably be ideal - params_captured <- capture.output(rpfit$fit$get_parameters()) - expect_equal( - strsplit(params_captured, "(, )|(: )")[[1]][[2]], - "n_trees=60" - ) -})