From 6f54459322a8b812c8e2946ebca17c06dc63c830 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Thu, 17 Sep 2015 23:26:18 +0800 Subject: [PATCH 01/14] implement two callback converters --- src/vector_converter.h | 88 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/src/vector_converter.h b/src/vector_converter.h index 8d56861..feb410b 100644 --- a/src/vector_converter.h +++ b/src/vector_converter.h @@ -539,4 +539,92 @@ class InteractionConverter : public VectorConverter { }; +template +class CallbackCharacterVectorConverter : public VectorConverter { + + void correct_feature(uint32_t& feature) { + feature = feature % hash_size; + } + +public: + + typedef OutputType (*callback)(const char* input); + +protected: + + callback f; + OutputType cache; + Rcpp::CharacterVector src; + SEXP psrc; + +public: + + explicit CallbackCharacterVectorConverter(SEXP _src, callback _f, const Param& param) + : src(_src), psrc(_src), f(_f), VectorConverter(param) + { } + + virtual ~CallbackCharacterVectorConverter() { } + + virtual void get_feature() = 0; + + virtual const std::vector& get_feature(size_t i) { + SEXP pstr = STRING_ELT(psrc, i); + if (pstr == NA_STRING) { + feature_buffer.clear(); + } else { + const char* str = CHAR(pstr); + cache = f(str); + get_feature(); + if (is_final) { + size_t hash_size = this->hash_size; + std::for_each(feature_buffer.begin(), feature_buffer.end(), [this](uint32_t& feature) { + feature = feature % this->hash_size; + }); + } + } + return feature_buffer; + } + + virtual void get_value() = 0; + + virtual const std::vector& get_value(size_t i) { + SEXP pstr = STRING_ELT(psrc, i); + if (pstr == NA_STRING) { + value_buffer.clear(); + } else { + get_value(); + } + return value_buffer; + } +}; + +class CallbackCharacterVectorStdVectorConverter + : public CallbackCharacterVectorConverter< std::vector > { + +protected: + + virtual void get_feature() { + feature_buffer.resize(cache.size()); + std::transform(cache.begin(), 
cache.end(), feature_buffer.begin(), [this](const std::string& s) { + return get_hashed_feature(h_main, s.c_str()); + }); + } + + virtual void get_value() { + value_buffer.resize(cache.size()); + std::transform(cache.begin(), cache.end(), value_buffer.begin(), [this](const std::string& s) { + return get_hashed_feature(h_binary, s.c_str()); + }); + } + +public: + + explicit CallbackCharacterVectorStdVectorConverter(SEXP _src, callback _f, const Param& param) + : CallbackCharacterVectorConverter(_src, _f, param) + { } + + virtual ~CallbackCharacterVectorStdVectorConverter() { } + +}; + #endif // __VECTOR_CONVERTER_HPP__ \ No newline at end of file From a027423c9f4c587289d0f7779f0c992feecbac0e Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Thu, 17 Sep 2015 23:26:56 +0800 Subject: [PATCH 02/14] implement test_callback --- NAMESPACE | 1 + R/RcppExports.R | 12 ++++++++++ src/RcppExports.cpp | 46 +++++++++++++++++++++++-------------- src/hashed_model_matrix.cpp | 13 +++++++++++ 4 files changed, 55 insertions(+), 17 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index a51b87b..3c4e407 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(hashed.interaction.value) export(hashed.model.matrix) export(hashed.value) export(intToRaw) +export(test_callback) import(digest) importClassesFrom(Matrix,dgCMatrix) importFrom(Matrix,Diagonal) diff --git a/R/RcppExports.R b/R/RcppExports.R index 2cba22a..f43567f 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -36,6 +36,18 @@ hashed.interaction.value <- function(src) { .Call('FeatureHashing_h2', PACKAGE = 'FeatureHashing', src) } +#'@title Test the callback function. +#'@param Rcallback external pointer. The pointer of the callback function. +#'@param input string. The input. +#'@details The Rcallback is an external pointer which points to a functional pointer.. 
+#'The signature of the functional pointer should be: +#'\code{std::vector (*f)(const char* str)} +#'@return character +#'@export +test_callback <- function(Rcallback, input) { + .Call('FeatureHashing_test_callback', PACKAGE = 'FeatureHashing', Rcallback, input) +} + #'@title Convert the integer to raw vector with endian correction #'@param src integer value. #'@return raw vector with length 4 diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 764f041..ec6ec3a 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -50,23 +50,6 @@ BEGIN_RCPP return __result; END_RCPP } -// hashed_model_matrix_dataframe -SEXP hashed_model_matrix_dataframe(RObject tf, DataFrame data, unsigned long hash_size, bool transpose, S4 retval, bool keep_hashing_mapping, bool is_xi); -RcppExport SEXP FeatureHashing_hashed_model_matrix_dataframe(SEXP tfSEXP, SEXP dataSEXP, SEXP hash_sizeSEXP, SEXP transposeSEXP, SEXP retvalSEXP, SEXP keep_hashing_mappingSEXP, SEXP is_xiSEXP) { -BEGIN_RCPP - Rcpp::RObject __result; - Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< RObject >::type tf(tfSEXP); - Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP); - Rcpp::traits::input_parameter< unsigned long >::type hash_size(hash_sizeSEXP); - Rcpp::traits::input_parameter< bool >::type transpose(transposeSEXP); - Rcpp::traits::input_parameter< S4 >::type retval(retvalSEXP); - Rcpp::traits::input_parameter< bool >::type keep_hashing_mapping(keep_hashing_mappingSEXP); - Rcpp::traits::input_parameter< bool >::type is_xi(is_xiSEXP); - __result = Rcpp::wrap(hashed_model_matrix_dataframe(tf, data, hash_size, transpose, retval, keep_hashing_mapping, is_xi)); - return __result; -END_RCPP -} // xi IntegerVector xi(CharacterVector src); RcppExport SEXP FeatureHashing_xi(SEXP srcSEXP) { @@ -100,6 +83,35 @@ BEGIN_RCPP return __result; END_RCPP } +// hashed_model_matrix_dataframe +SEXP hashed_model_matrix_dataframe(RObject tf, DataFrame data, unsigned long hash_size, bool transpose, 
S4 retval, bool keep_hashing_mapping, bool is_xi); +RcppExport SEXP FeatureHashing_hashed_model_matrix_dataframe(SEXP tfSEXP, SEXP dataSEXP, SEXP hash_sizeSEXP, SEXP transposeSEXP, SEXP retvalSEXP, SEXP keep_hashing_mappingSEXP, SEXP is_xiSEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< RObject >::type tf(tfSEXP); + Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP); + Rcpp::traits::input_parameter< unsigned long >::type hash_size(hash_sizeSEXP); + Rcpp::traits::input_parameter< bool >::type transpose(transposeSEXP); + Rcpp::traits::input_parameter< S4 >::type retval(retvalSEXP); + Rcpp::traits::input_parameter< bool >::type keep_hashing_mapping(keep_hashing_mappingSEXP); + Rcpp::traits::input_parameter< bool >::type is_xi(is_xiSEXP); + __result = Rcpp::wrap(hashed_model_matrix_dataframe(tf, data, hash_size, transpose, retval, keep_hashing_mapping, is_xi)); + return __result; +END_RCPP +} +// test_callback +SEXP test_callback(SEXP Rcallback, const std::string& input); +RcppExport SEXP FeatureHashing_test_callback(SEXP RcallbackSEXP, SEXP inputSEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< SEXP >::type Rcallback(RcallbackSEXP); + Rcpp::traits::input_parameter< const std::string& >::type input(inputSEXP); + __result = Rcpp::wrap(test_callback(Rcallback, input)); + return __result; +END_RCPP +} // intToRaw SEXP intToRaw(int src); RcppExport SEXP FeatureHashing_intToRaw(SEXP srcSEXP) { diff --git a/src/hashed_model_matrix.cpp b/src/hashed_model_matrix.cpp index 0b2b7bc..ffaa7f5 100644 --- a/src/hashed_model_matrix.cpp +++ b/src/hashed_model_matrix.cpp @@ -304,3 +304,16 @@ SEXP hashed_model_matrix_dataframe(RObject tf, DataFrame data, unsigned long has return hashed_model_matrix(tf, data, hash_size, transpose, retval, keep_hashing_mapping, is_xi); } +//'@title Test the callback function. +//'@param Rcallback external pointer. 
The pointer of the callback function. +//'@param input string. The input. +//'@details The Rcallback is an external pointer which points to a functional pointer.. +//'The signature of the functional pointer should be: +//'\code{std::vector (*f)(const char* str)} +//'@return character +//'@export +//[[Rcpp::export("test_callback")]] +SEXP test_callback(SEXP Rcallback, const std::string& input) { + XPtr callback(Rcallback); + return wrap((*callback)(input.c_str())); +} From 92ce9c6e1bed3dfda782c4f401a56463a3171491 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Thu, 17 Sep 2015 23:27:14 +0800 Subject: [PATCH 03/14] register function of callback --- NAMESPACE | 1 + R/hashed.model.matrix.R | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 3c4e407..78cc4d5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(hashed.interaction.value) export(hashed.model.matrix) export(hashed.value) export(intToRaw) +export(register_callback) export(test_callback) import(digest) importClassesFrom(Matrix,dgCMatrix) diff --git a/R/hashed.model.matrix.R b/R/hashed.model.matrix.R index 37f60e4..8dc9027 100644 --- a/R/hashed.model.matrix.R +++ b/R/hashed.model.matrix.R @@ -256,6 +256,17 @@ tf.idf.transfo <- function(hash.matrix){ hash.matrix %*% idf.train } +.callback <- new.env() + +#'@export +#'@title Register Special Function for Formula Interface +#'@param special string. The name which will be used in formula interface. +#'@param callback external pointer. The pointer to the callback function. Please see the details. +register_callback <- function(special, callback) { + .callback[[special]] <- callback + invisible(NULL) +} + # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... From 41e8a386fb0e245841e4ccfad43f3e5644494f35 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Sun, 20 Sep 2015 23:57:35 +0800 Subject: [PATCH 04/14] stash development! 
--- R/RcppExports.R | 28 ++++++------ R/callback.R | 16 +++++++ R/hashed.model.matrix.R | 11 ----- inst/include/callback.h | 35 +++++++++++++++ man/register_callback.Rd | 17 ++++++++ man/test_callback.Rd | 25 +++++++++++ src/Makevars | 2 + src/Makevars.win | 2 + src/RcppExports.cpp | 70 +++++++++++++++++------------- src/callback.cpp | 37 ++++++++++++++++ src/hashed_model_matrix.cpp | 14 ------ src/hashed_model_matrix.h | 1 + src/split.cpp | 13 ++++++ src/split.h | 51 ++++++++++++++++++++++ src/vector_converter.h | 85 +++++++++++-------------------------- 15 files changed, 281 insertions(+), 126 deletions(-) create mode 100644 R/callback.R create mode 100644 inst/include/callback.h create mode 100644 man/register_callback.Rd create mode 100644 man/test_callback.Rd create mode 100644 src/callback.cpp diff --git a/R/RcppExports.R b/R/RcppExports.R index f43567f..1046083 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -17,6 +17,18 @@ tomatrix <- function(m) { .Call('FeatureHashing_tomatrix', PACKAGE = 'FeatureHashing', m) } +#'@title Test the callback function. +#'@param Rcallback external pointer. The pointer of the callback function. +#'@param input string. The input. +#'@details The Rcallback is an external pointer which points to a functional pointer.. +#'The signature of the functional pointer should be: +#'\code{std::vector (*f)(const char* str)} +#'@return character +#'@export +test_callback <- function(Rcallback, input) { + .Call('FeatureHashing_test_callback', PACKAGE = 'FeatureHashing', Rcallback, input) +} + .hashed.model.matrix.dataframe <- function(tf, data, hash_size, transpose, retval, keep_hashing_mapping, is_xi) { .Call('FeatureHashing_hashed_model_matrix_dataframe', PACKAGE = 'FeatureHashing', tf, data, hash_size, transpose, retval, keep_hashing_mapping, is_xi) } @@ -36,18 +48,6 @@ hashed.interaction.value <- function(src) { .Call('FeatureHashing_h2', PACKAGE = 'FeatureHashing', src) } -#'@title Test the callback function. 
-#'@param Rcallback external pointer. The pointer of the callback function. -#'@param input string. The input. -#'@details The Rcallback is an external pointer which points to a functional pointer.. -#'The signature of the functional pointer should be: -#'\code{std::vector (*f)(const char* str)} -#'@return character -#'@export -test_callback <- function(Rcallback, input) { - .Call('FeatureHashing_test_callback', PACKAGE = 'FeatureHashing', Rcallback, input) -} - #'@title Convert the integer to raw vector with endian correction #'@param src integer value. #'@return raw vector with length 4 @@ -72,6 +72,10 @@ split_count <- function(src, delim) { .Call('FeatureHashing_split_count', PACKAGE = 'FeatureHashing', src, delim) } +init_split_callback <- function(delim, type) { + .Call('FeatureHashing_init_split_callback', PACKAGE = 'FeatureHashing', delim, type) +} + .selectColumn <- function(m, index, drop = TRUE, Rretval = NULL) { .Call('FeatureHashing_selectColumn', PACKAGE = 'FeatureHashing', m, index, drop, Rretval) } diff --git a/R/callback.R b/R/callback.R new file mode 100644 index 0000000..0d6103b --- /dev/null +++ b/R/callback.R @@ -0,0 +1,16 @@ + +.callback <- new.env() + +#'@export +#'@title Register Special Function for Formula Interface +#'@param special string. The name which will be used in formula interface. +#'@param callback external pointer. The pointer to the callback function. Please see the details. +register_callback <- function(special, callback) { + .callback[[special]] <- callback + invisible(NULL) +} + +#'@export +generate_split_callback <- function(sep, type) { + +} diff --git a/R/hashed.model.matrix.R b/R/hashed.model.matrix.R index 8dc9027..37f60e4 100644 --- a/R/hashed.model.matrix.R +++ b/R/hashed.model.matrix.R @@ -256,17 +256,6 @@ tf.idf.transfo <- function(hash.matrix){ hash.matrix %*% idf.train } -.callback <- new.env() - -#'@export -#'@title Register Special Function for Formula Interface -#'@param special string. 
The name which will be used in formula interface. -#'@param callback external pointer. The pointer to the callback function. Please see the details. -register_callback <- function(special, callback) { - .callback[[special]] <- callback - invisible(NULL) -} - # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... diff --git a/inst/include/callback.h b/inst/include/callback.h new file mode 100644 index 0000000..36a3ff9 --- /dev/null +++ b/inst/include/callback.h @@ -0,0 +1,35 @@ +/* + * This file is part of FeatureHashing + * Copyright (C) 2015 Wush Wu + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef __CALLBACK_H__ +#define __CALLBACK_H__ + +#include +#include + +class CallbackFunctor { + +public: + CallbackFunctor() { } + virtual ~CallbackFunctor() { } + + virtual const std::vector operator()(const char* input) const = 0; + +}; + +#endif //__CALLBACK_H__ \ No newline at end of file diff --git a/man/register_callback.Rd b/man/register_callback.Rd new file mode 100644 index 0000000..b1eaa00 --- /dev/null +++ b/man/register_callback.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/hashed.model.matrix.R +\name{register_callback} +\alias{register_callback} +\title{Register Special Function for Formula Interface} +\usage{ +register_callback(special, callback) +} +\arguments{ +\item{special}{string. The name which will be used in formula interface.} + +\item{callback}{external pointer. The pointer to the callback function. Please see the details.} +} +\description{ +Register Special Function for Formula Interface +} + diff --git a/man/test_callback.Rd b/man/test_callback.Rd new file mode 100644 index 0000000..93e2513 --- /dev/null +++ b/man/test_callback.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{test_callback} +\alias{test_callback} +\title{Test the callback function.} +\usage{ +test_callback(Rcallback, input) +} +\arguments{ +\item{Rcallback}{external pointer. The pointer of the callback function.} + +\item{input}{string. The input.} +} +\value{ +character +} +\description{ +Test the callback function. +} +\details{ +The Rcallback is an external pointer which points to a functional pointer.. 
+The signature of the functional pointer should be: +\code{std::vector (*f)(const char* str)} +} + diff --git a/src/Makevars b/src/Makevars index a7f3510..c78b353 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1 +1,3 @@ CXX_STD = CXX11 + +PKG_CPPFLAGS = -I../inst/include/ \ No newline at end of file diff --git a/src/Makevars.win b/src/Makevars.win index a7f3510..c78b353 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1 +1,3 @@ CXX_STD = CXX11 + +PKG_CPPFLAGS = -I../inst/include/ \ No newline at end of file diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index ec6ec3a..d1feac2 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -50,6 +50,35 @@ BEGIN_RCPP return __result; END_RCPP } +// test_callback +SEXP test_callback(SEXP Rcallback, const std::string& input); +RcppExport SEXP FeatureHashing_test_callback(SEXP RcallbackSEXP, SEXP inputSEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< SEXP >::type Rcallback(RcallbackSEXP); + Rcpp::traits::input_parameter< const std::string& >::type input(inputSEXP); + __result = Rcpp::wrap(test_callback(Rcallback, input)); + return __result; +END_RCPP +} +// hashed_model_matrix_dataframe +SEXP hashed_model_matrix_dataframe(RObject tf, DataFrame data, unsigned long hash_size, bool transpose, S4 retval, bool keep_hashing_mapping, bool is_xi); +RcppExport SEXP FeatureHashing_hashed_model_matrix_dataframe(SEXP tfSEXP, SEXP dataSEXP, SEXP hash_sizeSEXP, SEXP transposeSEXP, SEXP retvalSEXP, SEXP keep_hashing_mappingSEXP, SEXP is_xiSEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< RObject >::type tf(tfSEXP); + Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP); + Rcpp::traits::input_parameter< unsigned long >::type hash_size(hash_sizeSEXP); + Rcpp::traits::input_parameter< bool >::type transpose(transposeSEXP); + Rcpp::traits::input_parameter< S4 >::type retval(retvalSEXP); + 
Rcpp::traits::input_parameter< bool >::type keep_hashing_mapping(keep_hashing_mappingSEXP); + Rcpp::traits::input_parameter< bool >::type is_xi(is_xiSEXP); + __result = Rcpp::wrap(hashed_model_matrix_dataframe(tf, data, hash_size, transpose, retval, keep_hashing_mapping, is_xi)); + return __result; +END_RCPP +} // xi IntegerVector xi(CharacterVector src); RcppExport SEXP FeatureHashing_xi(SEXP srcSEXP) { @@ -83,35 +112,6 @@ BEGIN_RCPP return __result; END_RCPP } -// hashed_model_matrix_dataframe -SEXP hashed_model_matrix_dataframe(RObject tf, DataFrame data, unsigned long hash_size, bool transpose, S4 retval, bool keep_hashing_mapping, bool is_xi); -RcppExport SEXP FeatureHashing_hashed_model_matrix_dataframe(SEXP tfSEXP, SEXP dataSEXP, SEXP hash_sizeSEXP, SEXP transposeSEXP, SEXP retvalSEXP, SEXP keep_hashing_mappingSEXP, SEXP is_xiSEXP) { -BEGIN_RCPP - Rcpp::RObject __result; - Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< RObject >::type tf(tfSEXP); - Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP); - Rcpp::traits::input_parameter< unsigned long >::type hash_size(hash_sizeSEXP); - Rcpp::traits::input_parameter< bool >::type transpose(transposeSEXP); - Rcpp::traits::input_parameter< S4 >::type retval(retvalSEXP); - Rcpp::traits::input_parameter< bool >::type keep_hashing_mapping(keep_hashing_mappingSEXP); - Rcpp::traits::input_parameter< bool >::type is_xi(is_xiSEXP); - __result = Rcpp::wrap(hashed_model_matrix_dataframe(tf, data, hash_size, transpose, retval, keep_hashing_mapping, is_xi)); - return __result; -END_RCPP -} -// test_callback -SEXP test_callback(SEXP Rcallback, const std::string& input); -RcppExport SEXP FeatureHashing_test_callback(SEXP RcallbackSEXP, SEXP inputSEXP) { -BEGIN_RCPP - Rcpp::RObject __result; - Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< SEXP >::type Rcallback(RcallbackSEXP); - Rcpp::traits::input_parameter< const std::string& >::type input(inputSEXP); - __result = 
Rcpp::wrap(test_callback(Rcallback, input)); - return __result; -END_RCPP -} // intToRaw SEXP intToRaw(int src); RcppExport SEXP FeatureHashing_intToRaw(SEXP srcSEXP) { @@ -173,6 +173,18 @@ BEGIN_RCPP return __result; END_RCPP } +// init_split_callback +SEXP init_split_callback(const std::string& delim, const std::string& type); +RcppExport SEXP FeatureHashing_init_split_callback(SEXP delimSEXP, SEXP typeSEXP) { +BEGIN_RCPP + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; + Rcpp::traits::input_parameter< const std::string& >::type delim(delimSEXP); + Rcpp::traits::input_parameter< const std::string& >::type type(typeSEXP); + __result = Rcpp::wrap(init_split_callback(delim, type)); + return __result; +END_RCPP +} // selectColumn SEXP selectColumn(S4 m, IntegerVector index, bool drop, SEXP Rretval); RcppExport SEXP FeatureHashing_selectColumn(SEXP mSEXP, SEXP indexSEXP, SEXP dropSEXP, SEXP RretvalSEXP) { diff --git a/src/callback.cpp b/src/callback.cpp new file mode 100644 index 0000000..70cd6a2 --- /dev/null +++ b/src/callback.cpp @@ -0,0 +1,37 @@ +/* + * This file is part of FeatureHashing + * Copyright (C) 2015 Wush Wu + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "callback.h" +#include "split.h" +#include + +using namespace Rcpp; + +//'@title Test the callback function. +//'@param Rcallback external pointer. The pointer of the callback function. 
+//'@param input string. The input. +//'@details The Rcallback is an external pointer which points to a functional pointer.. +//'The signature of the functional pointer should be: +//'\code{std::vector (*f)(const char* str)} +//'@return character +//'@export +//[[Rcpp::export("test_callback")]] +SEXP test_callback(SEXP Rcallback, const std::string& input) { + XPtr callback(Rcallback); + return wrap((*callback)(input.c_str())); +} diff --git a/src/hashed_model_matrix.cpp b/src/hashed_model_matrix.cpp index ffaa7f5..b9388e7 100644 --- a/src/hashed_model_matrix.cpp +++ b/src/hashed_model_matrix.cpp @@ -303,17 +303,3 @@ SEXP hashed_model_matrix(RObject tf, DataFrameLike data, unsigned long hash_size SEXP hashed_model_matrix_dataframe(RObject tf, DataFrame data, unsigned long hash_size, bool transpose, S4 retval, bool keep_hashing_mapping, bool is_xi) { return hashed_model_matrix(tf, data, hash_size, transpose, retval, keep_hashing_mapping, is_xi); } - -//'@title Test the callback function. -//'@param Rcallback external pointer. The pointer of the callback function. -//'@param input string. The input. -//'@details The Rcallback is an external pointer which points to a functional pointer.. 
-//'The signature of the functional pointer should be: -//'\code{std::vector (*f)(const char* str)} -//'@return character -//'@export -//[[Rcpp::export("test_callback")]] -SEXP test_callback(SEXP Rcallback, const std::string& input) { - XPtr callback(Rcallback); - return wrap((*callback)(input.c_str())); -} diff --git a/src/hashed_model_matrix.h b/src/hashed_model_matrix.h index 752ec1d..1ada8ce 100644 --- a/src/hashed_model_matrix.h +++ b/src/hashed_model_matrix.h @@ -23,6 +23,7 @@ #include #include #include +#include "callback.h" #include "hash_function.h" #include "vector_converter.h" diff --git a/src/split.cpp b/src/split.cpp index b2b170b..d8b6374 100644 --- a/src/split.cpp +++ b/src/split.cpp @@ -85,3 +85,16 @@ SEXP split_count(CharacterVector src, const std::string& delim) { retval.attr("names") = retval_name; return retval; } + +//[[Rcpp::export]] +SEXP init_split_callback(const std::string& delim, const std::string& type) { + if (type.compare("existence") == 0) { + return XPtr(new SplitCallbackFunctor(delim, SplitType::Existence)); + } else if (type.compare("count") == 0) { + return XPtr(new SplitCallbackFunctor(delim, SplitType::Count)); + } else if (type.compare("tf-idf") == 0) { + return XPtr(new SplitCallbackFunctor(delim, SplitType::Count)); + } else { + throw std::invalid_argument("Unknown type"); + } +} \ No newline at end of file diff --git a/src/split.h b/src/split.h index 4104c2c..c1cf01a 100644 --- a/src/split.h +++ b/src/split.h @@ -16,7 +16,58 @@ * this program. If not, see . 
*/ +#ifndef __SPLIT_H__ +#define __SPLIT_H__ + #include #include +#include +#include +#include "callback.h" std::vector split(const std::string& src, const std::string& delim); + +enum class SplitType { + Count, + Existence +}; + +class SplitCallbackFunctor : public CallbackFunctor { + + SplitType type; + const std::string delim; + + const std::vector split_count(const char* input) const { + std::vector temp(split(input, delim)); + temp.erase(std::remove(temp.begin(), temp.end(), ""), temp.end()); + return temp; + } + + const std::vector split_existence(const char* input) const { + std::vector temp(split(input, delim)); + std::set temp2(temp.begin(), temp.end()); + temp2.erase(""); + temp.assign(temp2.begin(), temp2.end()); + return temp; + } + +public: + + explicit SplitCallbackFunctor(const std::string& _delim, SplitType _type) + : type(_type), delim(_delim), CallbackFunctor() + { } + + virtual ~SplitCallbackFunctor() { } + + virtual const std::vector operator()(const char* input) const { + switch (type) { + case SplitType::Count : + return split_count(input); + case SplitType::Existence : + return split_existence(input); + } + } + +}; + +#endif //__SPLIT_H__ diff --git a/src/vector_converter.h b/src/vector_converter.h index feb410b..e488369 100644 --- a/src/vector_converter.h +++ b/src/vector_converter.h @@ -21,6 +21,7 @@ #include "hash_function.h" #include "split.h" +#include "callback.h" #include struct VectorConverterParam; @@ -539,92 +540,56 @@ class InteractionConverter : public VectorConverter { }; -template -class CallbackCharacterVectorConverter : public VectorConverter { +class CallbackConverter : public VectorConverter { - void correct_feature(uint32_t& feature) { - feature = feature % hash_size; - } - -public: - - typedef OutputType (*callback)(const char* input); - -protected: - - callback f; - OutputType cache; + const CallbackFunctor* f; Rcpp::CharacterVector src; SEXP psrc; + std::vector< std::string > cache; public: - explicit 
CallbackCharacterVectorConverter(SEXP _src, callback _f, const Param& param) - : src(_src), psrc(_src), f(_f), VectorConverter(param) - { } + CallbackConverter(const CallbackFunctor* _f, SEXP _src, const Param& param) + : f(_f), src(_src), psrc(_src), VectorConverter(param) + { } - virtual ~CallbackCharacterVectorConverter() { } + virtual ~CallbackConverter() { } - virtual void get_feature() = 0; - virtual const std::vector& get_feature(size_t i) { SEXP pstr = STRING_ELT(psrc, i); if (pstr == NA_STRING) { feature_buffer.clear(); } else { const char* str = CHAR(pstr); - cache = f(str); - get_feature(); - if (is_final) { - size_t hash_size = this->hash_size; - std::for_each(feature_buffer.begin(), feature_buffer.end(), [this](uint32_t& feature) { - feature = feature % this->hash_size; - }); - } + cache = f->operator()(str); + feature_buffer.resize(cache.size()); + std::transform(cache.begin(), cache.end(), feature_buffer.begin(), + [this](const std::string& s) { + return this->get_hashed_feature(this->h_main, s.c_str()); + }); + if (is_final) std::transform(feature_buffer.begin(), feature_buffer.end(), + feature_buffer.begin(), [this](uint32_t feature) { + return feature % this->hash_size; + }); } return feature_buffer; } - virtual void get_value() = 0; - virtual const std::vector& get_value(size_t i) { SEXP pstr = STRING_ELT(psrc, i); if (pstr == NA_STRING) { value_buffer.clear(); } else { - get_value(); + const char* str = CHAR(pstr); + value_buffer.resize(cache.size()); + std::transform(cache.begin(), cache.end(), value_buffer.begin(), + [this](const std::string& s) { + return this->get_hashed_feature(this->h_binary, s.c_str()); + }); } return value_buffer; - } -}; - -class CallbackCharacterVectorStdVectorConverter - : public CallbackCharacterVectorConverter< std::vector > { - -protected: - - virtual void get_feature() { - feature_buffer.resize(cache.size()); - std::transform(cache.begin(), cache.end(), feature_buffer.begin(), [this](const std::string& s) { - 
return get_hashed_feature(h_main, s.c_str()); - }); } - virtual void get_value() { - value_buffer.resize(cache.size()); - std::transform(cache.begin(), cache.end(), value_buffer.begin(), [this](const std::string& s) { - return get_hashed_feature(h_binary, s.c_str()); - }); - } - -public: - - explicit CallbackCharacterVectorStdVectorConverter(SEXP _src, callback _f, const Param& param) - : CallbackCharacterVectorConverter(_src, _f, param) - { } - - virtual ~CallbackCharacterVectorStdVectorConverter() { } - }; - + #endif // __VECTOR_CONVERTER_HPP__ \ No newline at end of file From 9af6e3bb3a587ad4ad7b286072abd700fb455f1b Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Tue, 22 Sep 2015 01:37:58 +0800 Subject: [PATCH 05/14] use RcppModule to implement callback --- DESCRIPTION | 1 + NAMESPACE | 4 ++- R/RcppExports.R | 4 --- R/callback.R | 5 +-- R/zzz.R | 9 +++-- inst/include/callback.h | 4 +++ man/register_callback.Rd | 2 +- src/RcppExports.cpp | 12 ------- src/callback.cpp | 12 +++++-- src/split.cpp | 13 ------- src/split.h | 43 ----------------------- src/split_callback.cpp | 69 +++++++++++++++++++++++++++++++++++++ tests/test-split_callback.R | 15 ++++++++ 13 files changed, 113 insertions(+), 80 deletions(-) create mode 100644 src/split_callback.cpp create mode 100644 tests/test-split_callback.R diff --git a/DESCRIPTION b/DESCRIPTION index d2ce5b5..d104aab 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,6 +25,7 @@ Imports: magrittr (>= 1.5) LinkingTo: Rcpp, digest(>= 0.6.8), BH Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown +RcppModules: callback, split_callback SystemRequirements: C++11 BugReports: https://github.com/wush978/FeatureHashing/issues URL: https://github.com/wush978/FeatureHashing diff --git a/NAMESPACE b/NAMESPACE index 78cc4d5..3ff1d15 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2 (4.1.1): do not edit by hand +export(generate_split_callback) export(hash.mapping) export(hash.sign) export(hash.size) @@ -13,7 +14,8 @@ 
import(digest) importClassesFrom(Matrix,dgCMatrix) importFrom(Matrix,Diagonal) importFrom(Matrix,colSums) -importFrom(Rcpp,evalCpp) +importFrom(Rcpp,cpp_object_initializer) +importFrom(Rcpp,loadModule) importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(methods,checkAtAssignment) diff --git a/R/RcppExports.R b/R/RcppExports.R index 1046083..2888ded 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -72,10 +72,6 @@ split_count <- function(src, delim) { .Call('FeatureHashing_split_count', PACKAGE = 'FeatureHashing', src, delim) } -init_split_callback <- function(delim, type) { - .Call('FeatureHashing_init_split_callback', PACKAGE = 'FeatureHashing', delim, type) -} - .selectColumn <- function(m, index, drop = TRUE, Rretval = NULL) { .Call('FeatureHashing_selectColumn', PACKAGE = 'FeatureHashing', m, index, drop, Rretval) } diff --git a/R/callback.R b/R/callback.R index 0d6103b..352f207 100644 --- a/R/callback.R +++ b/R/callback.R @@ -11,6 +11,7 @@ register_callback <- function(special, callback) { } #'@export -generate_split_callback <- function(sep, type) { - +generate_split_callback <- function(delim, type = c("existence", "count")) { + callback <- new(split_callback, delim, type[1]) + callback } diff --git a/R/zzz.R b/R/zzz.R index 596be64..19df95b 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,7 +1,12 @@ #'@useDynLib FeatureHashing -#'@importFrom Rcpp evalCpp +#'@importFrom Rcpp loadModule cpp_object_initializer #'@import digest -.onLoad <- function(libname, pkgname) { } +.onLoad <- function(libname, pkgname) { + # loadRcppModules() +} + +loadModule("callback", TRUE) +loadModule("split_callback", TRUE) .onAttach <- function(libname, pkgname) { if (interactive()) { diff --git a/inst/include/callback.h b/inst/include/callback.h index 36a3ff9..2ee6989 100644 --- a/inst/include/callback.h +++ b/inst/include/callback.h @@ -32,4 +32,8 @@ class CallbackFunctor { }; +#include + +RCPP_EXPOSED_CLASS(CallbackFunctor) + #endif //__CALLBACK_H__ \ No newline at end 
of file diff --git a/man/register_callback.Rd b/man/register_callback.Rd index b1eaa00..b4212a2 100644 --- a/man/register_callback.Rd +++ b/man/register_callback.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/hashed.model.matrix.R +% Please edit documentation in R/callback.R \name{register_callback} \alias{register_callback} \title{Register Special Function for Formula Interface} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index d1feac2..428cb65 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -173,18 +173,6 @@ BEGIN_RCPP return __result; END_RCPP } -// init_split_callback -SEXP init_split_callback(const std::string& delim, const std::string& type); -RcppExport SEXP FeatureHashing_init_split_callback(SEXP delimSEXP, SEXP typeSEXP) { -BEGIN_RCPP - Rcpp::RObject __result; - Rcpp::RNGScope __rngScope; - Rcpp::traits::input_parameter< const std::string& >::type delim(delimSEXP); - Rcpp::traits::input_parameter< const std::string& >::type type(typeSEXP); - __result = Rcpp::wrap(init_split_callback(delim, type)); - return __result; -END_RCPP -} // selectColumn SEXP selectColumn(S4 m, IntegerVector index, bool drop, SEXP Rretval); RcppExport SEXP FeatureHashing_selectColumn(SEXP mSEXP, SEXP indexSEXP, SEXP dropSEXP, SEXP RretvalSEXP) { diff --git a/src/callback.cpp b/src/callback.cpp index 70cd6a2..0083ec5 100644 --- a/src/callback.cpp +++ b/src/callback.cpp @@ -30,8 +30,16 @@ using namespace Rcpp; //'\code{std::vector (*f)(const char* str)} //'@return character //'@export -//[[Rcpp::export("test_callback")]] +//[[Rcpp::export]] SEXP test_callback(SEXP Rcallback, const std::string& input) { - XPtr callback(Rcallback); + CallbackFunctor* callback(as(Rcallback)); return wrap((*callback)(input.c_str())); + // return R_NilValue; +} + +RCPP_MODULE(callback) { + + class_("callback") + ; + } diff --git a/src/split.cpp b/src/split.cpp index d8b6374..b2b170b 100644 --- a/src/split.cpp +++ 
b/src/split.cpp @@ -85,16 +85,3 @@ SEXP split_count(CharacterVector src, const std::string& delim) { retval.attr("names") = retval_name; return retval; } - -//[[Rcpp::export]] -SEXP init_split_callback(const std::string& delim, const std::string& type) { - if (type.compare("existence") == 0) { - return XPtr(new SplitCallbackFunctor(delim, SplitType::Existence)); - } else if (type.compare("count") == 0) { - return XPtr(new SplitCallbackFunctor(delim, SplitType::Count)); - } else if (type.compare("tf-idf") == 0) { - return XPtr(new SplitCallbackFunctor(delim, SplitType::Count)); - } else { - throw std::invalid_argument("Unknown type"); - } -} \ No newline at end of file diff --git a/src/split.h b/src/split.h index c1cf01a..8723a26 100644 --- a/src/split.h +++ b/src/split.h @@ -27,47 +27,4 @@ std::vector split(const std::string& src, const std::string& delim); -enum class SplitType { - Count, - Existence -}; - -class SplitCallbackFunctor : public CallbackFunctor { - - SplitType type; - const std::string delim; - - const std::vector split_count(const char* input) const { - std::vector temp(split(input, delim)); - temp.erase(std::remove(temp.begin(), temp.end(), ""), temp.end()); - return temp; - } - - const std::vector split_existence(const char* input) const { - std::vector temp(split(input, delim)); - std::set temp2(temp.begin(), temp.end()); - temp2.erase(""); - temp.assign(temp2.begin(), temp2.end()); - return temp; - } - -public: - - explicit SplitCallbackFunctor(const std::string& _delim, SplitType _type) - : type(_type), delim(_delim), CallbackFunctor() - { } - - virtual ~SplitCallbackFunctor() { } - - virtual const std::vector operator()(const char* input) const { - switch (type) { - case SplitType::Count : - return split_count(input); - case SplitType::Existence : - return split_existence(input); - } - } - -}; - #endif //__SPLIT_H__ diff --git a/src/split_callback.cpp b/src/split_callback.cpp new file mode 100644 index 0000000..bdeb3f8 --- /dev/null +++ 
b/src/split_callback.cpp @@ -0,0 +1,69 @@ +#include "callback.h" +#include "split.h" +#include + +struct SplitCallbackFunctor : public CallbackFunctor { + + enum SplitType { + Count, + Existence + }; + + std::string delim; + SplitType type; + + SplitCallbackFunctor(const std::string& _delim, const std::string& _type) + : delim(_delim) + { + set_type(_type); + } + + virtual ~SplitCallbackFunctor() { } + + void set_type(std::string _type) { + if (_type.compare("count") == 0) { + type = SplitType::Count; + } else if (_type.compare("existence") == 0) { + type = SplitType::Existence; + } else throw std::invalid_argument("Not supported type"); + } + + std::string get_type() { + switch (type) { + case SplitType::Count: + return "count"; + case SplitType::Existence: + return "existence"; + } + } + + virtual const std::vector operator()(const char* input) const { + switch (type) { + case SplitType::Count: + return split(input, delim); + case SplitType::Existence: { + std::vector tmp(split(input, delim)); + std::set tmp2(tmp.begin(), tmp.end()); + tmp.assign(tmp2.begin(), tmp2.end()); + return tmp; + } + } + } + +}; + +using namespace Rcpp; + +RCPP_MODULE(split_callback) { + + class_("callback") + ; + + class_("split_callback") + .derives("callback") + .constructor() + .field("delim", &SplitCallbackFunctor::delim) + .property("type", &SplitCallbackFunctor::get_type, &SplitCallbackFunctor::set_type) + ; + +} \ No newline at end of file diff --git a/tests/test-split_callback.R b/tests/test-split_callback.R new file mode 100644 index 0000000..430fa44 --- /dev/null +++ b/tests/test-split_callback.R @@ -0,0 +1,15 @@ +if (require(RUnit)) { + callback <- generate_split_callback(",", "existence") + checkEquals(callback$delim, ",") + checkEquals(callback$type, "existence") + checkEquals(test_callback(callback, "a,b,a,c,d"), letters[1:4]) + + checkException(callback <- generate_split_callback(",", "ex")) + + callback <- generate_split_callback(",,", "existence") + 
checkEquals(test_callback(callback, "a,b,a,,c,d"), c("a,b,a", "c,d")) + + callback <- generate_split_callback(",", "count") + checkEquals(callback$type, "count") + checkEquals(test_callback(callback, x <- "a,b,a,c,d"), strsplit(x, ",")[[1]]) +} \ No newline at end of file From 1dfe3a577cf87c27ee28ce086a8e11f59c0bfd41 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 23 Sep 2015 00:52:49 +0800 Subject: [PATCH 06/14] refactor: expose header for user defined callback use RcppModule to implement split_callback --- DESCRIPTION | 2 +- R/callback.R | 15 +- R/hashed.model.matrix.R | 54 ++++--- inst/include/callback.h | 60 +++++++- {src => inst/include}/hash_function.h | 44 ++---- inst/include/vector_converter.h | 110 ++++++++++++++ man/register_callback.Rd | 7 +- src/Makevars | 2 +- src/{vector_converter.h => converters.h} | 143 +------------------ src/digest.c | 23 --- src/digestlocal.h | 63 -------- src/{hash_internal.cpp => hash_function.cpp} | 64 ++++++--- src/hashed_model_matrix.cpp | 82 ++++------- src/hashed_model_matrix.h | 13 ++ src/split_callback.cpp | 6 +- 15 files changed, 326 insertions(+), 362 deletions(-) rename {src => inst/include}/hash_function.h (53%) create mode 100644 inst/include/vector_converter.h rename src/{vector_converter.h => converters.h} (78%) delete mode 100644 src/digest.c delete mode 100644 src/digestlocal.h rename src/{hash_internal.cpp => hash_function.cpp} (52%) diff --git a/DESCRIPTION b/DESCRIPTION index d104aab..a6f6e48 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: FeatureHashing Type: Package Title: Creates a Model Matrix via Feature Hashing with a Formula Interface -Version: 0.9.1 +Version: 0.10.0 Date: 2015-03-29 Authors@R: c( person("Wush", "Wu", email = "wush978@gmail.com", role = c("aut", "cre")), diff --git a/R/callback.R b/R/callback.R index 352f207..e478801 100644 --- a/R/callback.R +++ b/R/callback.R @@ -1,17 +1,20 @@ -.callback <- new.env() - #'@export #'@title Register Special Function for Formula 
Interface #'@param special string. The name which will be used in formula interface. -#'@param callback external pointer. The pointer to the callback function. Please see the details. -register_callback <- function(special, callback) { +#'@param callback_generator function which will create a callback. Please see the details. +#'@examples +#'register_callback("split", generate_split_callback) +register_callback <- function(special, callback_generator) { .callback[[special]] <- callback invisible(NULL) } #'@export -generate_split_callback <- function(delim, type = c("existence", "count")) { - callback <- new(split_callback, delim, type[1]) +generate_split_callback <- function(input, delim = ",", type = c("existence", "count")) { + callback <- new(split_callback, input, delim, type[1]) callback } + +.callback <- new.env() +.callback[["split"]] <- generate_split_callback diff --git a/R/hashed.model.matrix.R b/R/hashed.model.matrix.R index 37f60e4..11531e0 100644 --- a/R/hashed.model.matrix.R +++ b/R/hashed.model.matrix.R @@ -211,7 +211,7 @@ hashed.model.matrix <- function(formula, data, hash.size = 2^18, transpose = FAL formula <- as.character(formula) %>% gsub(pattern = tf.idf.string, replacement = "type = \"count\"", x = .) %>% paste0(collapse = " ") %>% as.formula } - tf <- terms.formula(formula, data = data, specials = "split") + tf <- terms.formula(formula, data = data, specials = ls(FeatureHashing:::.callback)) retval <- new(.CSCMatrix) .hashed.model.matrix.dataframe(tf, data, hash.size, transpose, retval, create.mapping, signed.hash) class(retval) <- .CSCMatrix @@ -226,28 +226,50 @@ hashed.model.matrix <- function(formula, data, hash.size = 2^18, transpose = FAL } # This is the function called from C to parse the \code{split} function. 
-parse_split <- function(text) { +parse_special <- function(text, special, df) { origin.keep.source <- options()$keep.source tryCatch({ options(keep.source = TRUE) p <- parse(text = text) tmp <- getParseData(p) reference_name <- tmp$text[which(tmp$token == "SYMBOL")] - if ("delim" %in% tmp$text) { - delim <- tmp$text[which(tmp$text == "delim")[1] + 2] - delim <- gsub(pattern = '"', replacement = '', delim) - } else { - # the default value of delim - delim <- "," + params <- list() + fname <- NULL + first_symbol <- NULL + start <- FALSE + for(i_symbol in seq_len(nrow(tmp))) { + if (tmp$token[i_symbol] != "SYMBOL_FUNCTION_CALL" & !start) next + start <- TRUE + switch(tmp$token[i_symbol], + "SYMBOL_FUNCTION_CALL" = { + fname <- tmp$text[i_symbol] + }, + "SYMBOL" = { + if (tmp$token[i_symbol - 1] == "EQ_SUB") next + value <- eval(parse(text = tmp$text[i_symbol]), envir = df) + params <- append(params, list(value)) + if (is.null(first_symbol)) first_symbol <- tmp$text[i_symbol] + }, + "STR_CONST" = { + if (tmp$token[i_symbol - 1] == "EQ_SUB") next + value <- eval(parse(text = tmp$text[i_symbol]), envir = parent.frame()) + params <- append(params, list(value)) + }, + "SYMBOL_SUB" = { + if (tmp$token[i_symbol + 1] != "EQ_SUB") next + element <- list() + name <- tmp$text[i_symbol] + value <- eval(parse(text = tmp$text[i_symbol + 2]), envir = df) + element[[name]] <- value + params <- append(params, element) + }, + next) } - if ("type" %in% tmp$text) { - type <- tmp$text[which(tmp$text == "type")[1] + 2] - type <- gsub(pattern = '"', replacement = '', type) - } else { - # the default value of type - type <- "existence" - } - list(reference_name = reference_name, delim = delim, type = type) + stopifnot(!is.null(fname)) + stopifnot(start) + retval <- do.call(.callback[[special]], params) + attr(retval, "rname") <- first_symbol + retval }, finally = {options(keep.source = origin.keep.source)}) } diff --git a/inst/include/callback.h b/inst/include/callback.h index 
2ee6989..215478d 100644 --- a/inst/include/callback.h +++ b/inst/include/callback.h @@ -21,18 +21,74 @@ #include #include +#include "vector_converter.h" class CallbackFunctor { public: - CallbackFunctor() { } + + // TODO: let src private + Rcpp::CharacterVector src; + + CallbackFunctor(SEXP _src) : src(_src) { } virtual ~CallbackFunctor() { } virtual const std::vector operator()(const char* input) const = 0; }; -#include +class CallbackConverter : public VectorConverter { + + // TODO: refactor this + Rcpp::CharacterVector src; + const CallbackFunctor* f; + SEXP psrc; + std::vector< std::string > cache; + +public: + + CallbackConverter(const CallbackFunctor* _f, const Param& param) + : f(_f), src(_f->src), psrc(_f->src), VectorConverter(param) + { } + + virtual ~CallbackConverter() { } + + virtual const std::vector& get_feature(size_t i) { + SEXP pstr = STRING_ELT(psrc, i); + if (pstr == NA_STRING) { + feature_buffer.clear(); + } else { + const char* str = CHAR(pstr); + cache = f->operator()(str); + feature_buffer.resize(cache.size()); + std::transform(cache.begin(), cache.end(), feature_buffer.begin(), + [this](const std::string& s) { + return this->get_hashed_feature(this->h_main, s.c_str()); + }); + if (is_final) std::transform(feature_buffer.begin(), feature_buffer.end(), + feature_buffer.begin(), [this](uint32_t feature) { + return feature % this->hash_size; + }); + } + return feature_buffer; + } + + virtual const std::vector& get_value(size_t i) { + SEXP pstr = STRING_ELT(psrc, i); + if (pstr == NA_STRING) { + value_buffer.clear(); + } else { + const char* str = CHAR(pstr); + value_buffer.resize(cache.size()); + std::transform(cache.begin(), cache.end(), value_buffer.begin(), + [this](const std::string& s) { + return this->get_hashed_feature(this->h_binary, s.c_str()); + }); + } + return value_buffer; + } + +}; RCPP_EXPOSED_CLASS(CallbackFunctor) diff --git a/src/hash_function.h b/inst/include/hash_function.h similarity index 53% rename from 
src/hash_function.h rename to inst/include/hash_function.h index 67e6fcf..84c5523 100644 --- a/src/hash_function.h +++ b/inst/include/hash_function.h @@ -19,8 +19,10 @@ #ifndef __HASH_FUNCTION_HPP__ #define __HASH_FUNCTION_HPP__ -#include "digestlocal.h" -#include "bswap_32.h" +#include +#include +#include +#include class HashFunction { @@ -34,10 +36,8 @@ class NullHashFunction : public HashFunction { public: - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false) { - return 1; - } - + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); + }; class MurmurHash3HashFunction : public HashFunction { @@ -48,9 +48,8 @@ public : MurmurHash3HashFunction(uint32_t _seed) : seed(_seed) { } - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false) { - return ::PMurHash32(seed, buf, size); - } + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); + }; class MurmurHash3LogHashFunction : public HashFunction { @@ -65,32 +64,7 @@ class MurmurHash3LogHashFunction : public HashFunction { : HashFunction(), seed(_seed), e(_e) { } - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false) { - uint32_t retval = PMurHash32(seed, buf, size); - if (is_interaction) { - const uint32_t* src = reinterpret_cast(buf); - #ifdef BOOST_BIG_ENDIAN - if (inverse_mapping.find(bswap_32(src[0])) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); - if (inverse_mapping.find(bswap_32(src[1])) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); - std::string key(inverse_mapping[bswap_32(src[0])]); - key.append(":"); - key.append(inverse_mapping[bswap_32(src[1])]); - #else - if (inverse_mapping.find(src[0]) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); - if (inverse_mapping.find(src[1]) == inverse_mapping.end()) throw 
std::logic_error("interaction is hashed before main effect!"); - std::string key(inverse_mapping[src[0]]); - key.append(":"); - key.append(inverse_mapping[src[1]]); - #endif - e[key.c_str()] = Rcpp::wrap((int) retval); - inverse_mapping[retval] = key; - } - else { - e[buf] = Rcpp::wrap((int) retval); - inverse_mapping[retval] = buf; - } - return retval; - } + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); }; diff --git a/inst/include/vector_converter.h b/inst/include/vector_converter.h new file mode 100644 index 0000000..33bd28c --- /dev/null +++ b/inst/include/vector_converter.h @@ -0,0 +1,110 @@ +/* + * This file is part of FeatureHashing + * Copyright (C) 2015 Wush Wu + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef __VECTOR_CONVERTER_HPP__ +#define __VECTOR_CONVERTER_HPP__ + +#include "callback.h" +#include "hash_function.h" +#include + +struct VectorConverterParam; +class VectorConverter; + +typedef VectorConverterParam Param; + +/** + * Paramter of initializing VectorConverter + */ +struct VectorConverterParam { + + std::string name; + HashFunction* h_main; + HashFunction* h_binary; + size_t hash_size; + + VectorConverterParam(const std::string& _name, HashFunction* _h_main, HashFunction* _h_binary, size_t _hash_size) + : name(_name), h_main(_h_main), h_binary(_h_binary), hash_size(_hash_size) + { } + +}; + +class VectorConverter { + +protected: + std::vector feature_buffer; + std::vector value_buffer; + std::string name; + size_t name_len; + HashFunction *h_main, *h_binary; + size_t hash_size; + +public: + + bool is_final; + + explicit VectorConverter(const Param& param) + : name(param.name), name_len(param.name.size()), h_main(param.h_main), + h_binary(param.h_binary), hash_size(param.hash_size), is_final(true) + { } + + virtual ~VectorConverter() { } + + /** + * Evaluate the hashed feature from the raw data. + * The \code{get_feature} should be called before \code{get_value} + */ + virtual const std::vector& get_feature(size_t i) = 0; + + /** + * Evaluate the value from the raw data + * The \code{get_feature} should be called before \code{get_value} + */ + virtual const std::vector& get_value(size_t i) = 0; + + const std::string& get_name() const { + return name; + } + +protected: + + /** + * @return -1 if (int) v < 0, or +1 otherwise + */ + static inline int get_sign(uint32_t v) { + if ((int) v < 0) return -1; + else return 1; + } + + uint32_t get_hashed_feature(HashFunction* h, const char* str) { + name.append(str); + #ifdef NOISY_DEBUG + Rprintf("hashing %s ... 
", name.c_str()); + #endif + uint32_t retval = (*h)(name.c_str(), name.size()); + #ifdef NOISY_DEBUG + Rprintf(" got %zu \n", retval); + #endif + name.resize(name_len); + return retval; + } + +}; + + +#endif // __VECTOR_CONVERTER_HPP__ \ No newline at end of file diff --git a/man/register_callback.Rd b/man/register_callback.Rd index b4212a2..179c425 100644 --- a/man/register_callback.Rd +++ b/man/register_callback.Rd @@ -4,14 +4,17 @@ \alias{register_callback} \title{Register Special Function for Formula Interface} \usage{ -register_callback(special, callback) +register_callback(special, callback_generator) } \arguments{ \item{special}{string. The name which will be used in formula interface.} -\item{callback}{external pointer. The pointer to the callback function. Please see the details.} +\item{callback_generator}{function which will create a callback. Please see the details.} } \description{ Register Special Function for Formula Interface } +\examples{ +register_callback("split", generate_split_callback) +} diff --git a/src/Makevars b/src/Makevars index c78b353..7f3a663 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,3 +1,3 @@ CXX_STD = CXX11 -PKG_CPPFLAGS = -I../inst/include/ \ No newline at end of file +PKG_CPPFLAGS = -I../inst/include/ -DNOISY_DEBUG \ No newline at end of file diff --git a/src/vector_converter.h b/src/converters.h similarity index 78% rename from src/vector_converter.h rename to src/converters.h index e488369..0b33fac 100644 --- a/src/vector_converter.h +++ b/src/converters.h @@ -16,16 +16,12 @@ * this program. If not, see . 
*/ -#ifndef __VECTOR_CONVERTER_HPP__ -#define __VECTOR_CONVERTER_HPP__ +#ifndef __CONVERTERS_HPP__ +#define __CONVERTERS_HPP__ -#include "hash_function.h" +#include "vector_converter.h" #include "split.h" -#include "callback.h" -#include -struct VectorConverterParam; -class VectorConverter; class CharacterConverter; class FactorConverter; template @@ -36,7 +32,6 @@ class TagCountFactorConverter; class TagCountCharacterConverter; class InteractionConverter; -typedef VectorConverterParam Param; typedef std::shared_ptr pVectorConverter; typedef std::shared_ptr pCharacterConverter; typedef std::shared_ptr pFactorConverter; @@ -53,84 +48,6 @@ typedef std::shared_ptr pTagCountCharacterConverter; typedef std::vector< pVectorConverter > ConvertersVec; typedef std::shared_ptr pInteractionConverter; -/** - * Paramter of initializing VectorConverter - */ -struct VectorConverterParam { - - std::string name; - HashFunction* h_main; - HashFunction* h_binary; - size_t hash_size; - - VectorConverterParam(const std::string& _name, HashFunction* _h_main, HashFunction* _h_binary, size_t _hash_size) - : name(_name), h_main(_h_main), h_binary(_h_binary), hash_size(_hash_size) - { } - -}; - -class VectorConverter { - -protected: - std::vector feature_buffer; - std::vector value_buffer; - std::string name; - size_t name_len; - HashFunction *h_main, *h_binary; - size_t hash_size; - -public: - - bool is_final; - - explicit VectorConverter(const Param& param) - : name(param.name), name_len(param.name.size()), h_main(param.h_main), - h_binary(param.h_binary), hash_size(param.hash_size), is_final(true) - { } - - virtual ~VectorConverter() { } - - /** - * Evaluate the hashed feature from the raw data. 
- * The \code{get_feature} should be called before \code{get_value} - */ - virtual const std::vector& get_feature(size_t i) = 0; - - /** - * Evaluate the value from the raw data - * The \code{get_feature} should be called before \code{get_value} - */ - virtual const std::vector& get_value(size_t i) = 0; - - const std::string& get_name() const { - return name; - } - -protected: - - /** - * @return -1 if (int) v < 0, or +1 otherwise - */ - static inline int get_sign(uint32_t v) { - if ((int) v < 0) return -1; - else return 1; - } - - uint32_t get_hashed_feature(HashFunction* h, const char* str) { - name.append(str); - #ifdef NOISY_DEBUG - Rprintf("hashing %s ... ", name.c_str()); - #endif - uint32_t retval = (*h)(name.c_str(), name.size()); - #ifdef NOISY_DEBUG - Rprintf(" got %zu \n", retval); - #endif - name.resize(name_len); - return retval; - } - -}; - class CharacterConverter : public VectorConverter { Rcpp::CharacterVector src; @@ -540,56 +457,4 @@ class InteractionConverter : public VectorConverter { }; -class CallbackConverter : public VectorConverter { - - const CallbackFunctor* f; - Rcpp::CharacterVector src; - SEXP psrc; - std::vector< std::string > cache; - -public: - - CallbackConverter(const CallbackFunctor* _f, SEXP _src, const Param& param) - : f(_f), src(_src), psrc(_src), VectorConverter(param) - { } - - virtual ~CallbackConverter() { } - - virtual const std::vector& get_feature(size_t i) { - SEXP pstr = STRING_ELT(psrc, i); - if (pstr == NA_STRING) { - feature_buffer.clear(); - } else { - const char* str = CHAR(pstr); - cache = f->operator()(str); - feature_buffer.resize(cache.size()); - std::transform(cache.begin(), cache.end(), feature_buffer.begin(), - [this](const std::string& s) { - return this->get_hashed_feature(this->h_main, s.c_str()); - }); - if (is_final) std::transform(feature_buffer.begin(), feature_buffer.end(), - feature_buffer.begin(), [this](uint32_t feature) { - return feature % this->hash_size; - }); - } - return feature_buffer; 
- } - - virtual const std::vector& get_value(size_t i) { - SEXP pstr = STRING_ELT(psrc, i); - if (pstr == NA_STRING) { - value_buffer.clear(); - } else { - const char* str = CHAR(pstr); - value_buffer.resize(cache.size()); - std::transform(cache.begin(), cache.end(), value_buffer.begin(), - [this](const std::string& s) { - return this->get_hashed_feature(this->h_binary, s.c_str()); - }); - } - return value_buffer; - } - -}; - -#endif // __VECTOR_CONVERTER_HPP__ \ No newline at end of file +#endif // __CONVERTERS_HPP__ \ No newline at end of file diff --git a/src/digest.c b/src/digest.c deleted file mode 100644 index 40e539e..0000000 --- a/src/digest.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * This file is part of FeatureHashing - * Copyright (C) 2014-2015 Wush Wu - * - * This program is free software: you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation, either version 3 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#include "pmurhashAPI.h" - -const uint32_t - MURMURHASH3_H_SEED = 3120602769LL, - MURMURHASH3_XI_SEED = 79193439LL; diff --git a/src/digestlocal.h b/src/digestlocal.h deleted file mode 100644 index 86fe0df..0000000 --- a/src/digestlocal.h +++ /dev/null @@ -1,63 +0,0 @@ -/** - * - * MurmurHash3 was written by Austin Appleby, and is placed in the public. - * - * This header links the implementation of murmurhash3 in digest3 to FeatureHashing. - * This was writting by Wush Wu, and also public domain. 
- * - */ - -#include -#include -#include -#include -#include - -#ifdef HAVE_VISIBILITY_ATTRIBUTE - # define attribute_hidden __attribute__ ((visibility ("hidden"))) -#else - # define attribute_hidden -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* First look for special cases */ -#if defined(_MSC_VER) - #define MH_UINT32 unsigned long -#endif - -/* If the compiler says it's C99 then take its word for it */ -#if !defined(MH_UINT32) && ( \ - defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L ) - #include - #define MH_UINT32 uint32_t -#endif - -/* Otherwise try testing against max value macros from limit.h */ -#if !defined(MH_UINT32) - #include - #if (USHRT_MAX == 0xffffffffUL) - #define MH_UINT32 unsigned short - #elif (UINT_MAX == 0xffffffffUL) - #define MH_UINT32 unsigned int - #elif (ULONG_MAX == 0xffffffffUL) - #define MH_UINT32 unsigned long - #endif -#endif - -#if !defined(MH_UINT32) - #error Unable to determine type name for unsigned 32-bit int -#endif - -/* I'm yet to work on a platform where 'unsigned char' is not 8 bits */ -#define MH_UINT8 unsigned char - -MH_UINT32 PMurHash32(MH_UINT32, const void*, int); - -extern const MH_UINT32 MURMURHASH3_H_SEED, MURMURHASH3_XI_SEED; - -#ifdef __cplusplus -} -#endif diff --git a/src/hash_internal.cpp b/src/hash_function.cpp similarity index 52% rename from src/hash_internal.cpp rename to src/hash_function.cpp index b69d3c6..d7881e5 100644 --- a/src/hash_internal.cpp +++ b/src/hash_function.cpp @@ -1,26 +1,51 @@ -/* - * This file is part of FeatureHashing - * Copyright (C) 2014-2015 Wush Wu - * - * This program is free software: you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation, either version 3 of the License, or (at your option) - * any later version. 
- * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - #include #include #include +#include "hash_function.h" +#include "pmurhashAPI.h" +#include "bswap_32.h" #include -#include "digestlocal.h" +#include "hashed_model_matrix.h" + +const uint32_t + MURMURHASH3_H_SEED = 3120602769LL, + MURMURHASH3_XI_SEED = 79193439LL; + +uint32_t NullHashFunction::operator()(const char* buf, int size, bool is_interaction) { + return 1; +} + +uint32_t MurmurHash3HashFunction::operator()(const char* buf, int size, bool is_interaction) { + return ::PMurHash32(seed, buf, size); +} + +uint32_t MurmurHash3LogHashFunction::operator()(const char* buf, int size, bool is_interaction) { + uint32_t retval = PMurHash32(seed, buf, size); + if (is_interaction) { + const uint32_t* src = reinterpret_cast(buf); + #ifdef BOOST_BIG_ENDIAN + if (inverse_mapping.find(bswap_32(src[0])) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); + if (inverse_mapping.find(bswap_32(src[1])) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); + std::string key(inverse_mapping[bswap_32(src[0])]); + key.append(":"); + key.append(inverse_mapping[bswap_32(src[1])]); + #else + if (inverse_mapping.find(src[0]) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); + if (inverse_mapping.find(src[1]) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); + std::string key(inverse_mapping[src[0]]); + key.append(":"); + key.append(inverse_mapping[src[1]]); + #endif + e[key.c_str()] = Rcpp::wrap((int) retval); + inverse_mapping[retval] = key; + } + 
else { + e[buf] = Rcpp::wrap((int) retval); + inverse_mapping[retval] = buf; + } + return retval; +} + using namespace Rcpp; //'@export hash.sign @@ -84,4 +109,3 @@ IntegerVector h2(CharacterVector src) { } return retval; } - diff --git a/src/hashed_model_matrix.cpp b/src/hashed_model_matrix.cpp index b9388e7..3b7790e 100644 --- a/src/hashed_model_matrix.cpp +++ b/src/hashed_model_matrix.cpp @@ -47,15 +47,29 @@ const ConvertersVec get_converters( NumericMatrix tfactors(wrap(tf.attr("factors"))); CharacterVector reference_name, feature_name; Environment feature_hashing(Environment::namespace_env("FeatureHashing")); - Function parse_split(feature_hashing["parse_split"]); - std::set specials; + Function parse_special(feature_hashing["parse_special"]); + std::map specials; { List tmp(tf.attr("specials")); - SEXP ptag = tmp["split"]; - if (!Rf_isNull(ptag)) { - IntegerVector tmpvec(ptag); - specials.insert(tmpvec.begin(), tmpvec.end()); + CharacterVector tmp_name(tmp.attr("names")); + for(int i = 0;i < tmp.size();i++) { + SEXP ptag = tmp[i]; + if (!Rf_isNull(ptag)) { + IntegerVector tmp_index(tmp[i]); + const char* callback_generator_name = CHAR(wrap(tmp_name[i])); +#ifdef NOISY_DEBUG + Rprintf("Extract generator: %s from .callback\n", CHAR(wrap(tmp_name[i]))); +#endif + std::for_each(tmp_index.begin(), tmp_index.end(), [&specials, &callback_generator_name](const int index) { + specials.insert(std::make_pair(index, callback_generator_name)); + }); + } + } +#ifdef NOISY_DEBUG + for(auto i = specials.begin();i != specials.end();i++) { + Rprintf("special %s at index: %d\n", i->second.c_str(), i->first); } +#endif } { List tmp(tfactors.attr("dimnames")); @@ -73,11 +87,12 @@ const ConvertersVec get_converters( #endif pVectorConverter p(NULL); try{ - if (specials.find(j + 1) == specials.end()) { + const auto j_special = specials.find(j + 1); + if (j_special == specials.end()) { if (reference_class.find(rname) == reference_class.end()) throw std::invalid_argument("Failed to 
find the column:"); const std::string& rclass(reference_class.find(rname)->second); #ifdef NOISY_DEBUG - Rprintf("%s\n", rclass.c_str()); + Rprintf("rclass: %s\n", rclass.c_str()); #endif Param param(rname, _h_main, _h_binary, hash_size); if (rclass.compare("factor") == 0) { @@ -111,56 +126,21 @@ const ConvertersVec get_converters( } else { #ifdef NOISY_DEBUG - Rprintf(" (parsing tag..) "); + Rprintf(" (parsing spetial..) text: %s special: %s\n", rname.c_str(), j_special->second.c_str()); #endif - List expression(parse_split(wrap(rname))); - rname.assign(as(expression["reference_name"])); + RObject callback_functor(parse_special(wrap(rname), wrap(j_special->second.c_str()), data)); + rname.assign(as(callback_functor.attr("rname"))); Param param(rname, _h_main, _h_binary, hash_size); #ifdef NOISY_DEBUG Rprintf(" (rname ==> %s) ", rname.c_str()); #endif - if (reference_class.find(rname) == reference_class.end()) throw std::invalid_argument("Failed to find the column: "); - const std::string& rclass(reference_class.find(rname)->second); - #ifdef NOISY_DEBUG - Rprintf("%s\n", rclass.c_str()); - #endif - std::string - delim(as(expression["delim"])), - type(as(expression["type"])); + if (reference_class.find(rname) == reference_class.end()) { + throw std::invalid_argument("The first argument of the callback should be one of the column name of the data"); + } #ifdef NOISY_DEBUG - Rprintf("delim: %s type: %s\n", delim.c_str(), type.c_str()); + Rprintf("Initialize CallbackConverter\n"); #endif - if (rclass.compare("factor") == 0) { - if (type.compare("existence") == 0) { - #ifdef NOISY_DEBUG - Rprintf("Initialize TagExistenceFactorConverter\n"); - #endif - p.reset(new TagExistenceFactorConverter(wrap(data[rname.c_str()]), param, delim)); - } else if (type.compare("count") == 0) { - #ifdef NOISY_DEBUG - Rprintf("Initialize TagCountFactorConverter\n"); - #endif - p.reset(new TagCountFactorConverter(wrap(data[rname.c_str()]), param, delim)); - } else { - throw 
std::invalid_argument("Non supported type at name: "); - } - } else if (rclass.compare("character") == 0) { - if (type.compare("existence") == 0) { - #ifdef NOISY_DEBUG - Rprintf("Initialize TagExistenceCharacterConverter\n"); - #endif - p.reset(new TagExistenceCharacterConverter(wrap(data[rname.c_str()]), param, delim)); - } else if (type.compare("count") == 0) { - #ifdef NOISY_DEBUG - Rprintf("Initialize TagCountCharacterConverter\n"); - #endif - p.reset(new TagCountCharacterConverter(wrap(data[rname.c_str()]), param, delim)); - } else { - throw std::invalid_argument("Non supported type at name: "); - } - } else { - throw std::invalid_argument("Non supported type at name: "); - } + p.reset(new CallbackConverter(as(callback_functor), param)); } } catch(std::invalid_argument& e) { std::string message(e.what()); diff --git a/src/hashed_model_matrix.h b/src/hashed_model_matrix.h index 1ada8ce..28a4d2a 100644 --- a/src/hashed_model_matrix.h +++ b/src/hashed_model_matrix.h @@ -26,6 +26,19 @@ #include "callback.h" #include "hash_function.h" #include "vector_converter.h" +#include "converters.h" + +#ifdef __cplusplus +extern "C" { +#endif + +uint32_t PMurHash32(uint32_t, const void*, int); + +extern const uint32_t MURMURHASH3_H_SEED, MURMURHASH3_XI_SEED; + +#ifdef __cplusplus +} +#endif typedef std::map< std::string, std::string > NameClassMapping; typedef std::vector< std::string > StrVec; diff --git a/src/split_callback.cpp b/src/split_callback.cpp index bdeb3f8..d02b1c4 100644 --- a/src/split_callback.cpp +++ b/src/split_callback.cpp @@ -12,8 +12,8 @@ struct SplitCallbackFunctor : public CallbackFunctor { std::string delim; SplitType type; - SplitCallbackFunctor(const std::string& _delim, const std::string& _type) - : delim(_delim) + SplitCallbackFunctor(SEXP input, const std::string& _delim, const std::string& _type) + : delim(_delim), CallbackFunctor(input) { set_type(_type); } @@ -61,7 +61,7 @@ RCPP_MODULE(split_callback) { class_("split_callback") 
.derives("callback") - .constructor() + .constructor() .field("delim", &SplitCallbackFunctor::delim) .property("type", &SplitCallbackFunctor::get_type, &SplitCallbackFunctor::set_type) ; From 1e59025fe826e7a350ca1f022101df6e8a09edeb Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 23 Sep 2015 02:02:11 +0800 Subject: [PATCH 07/14] consistent to 0.9.1 --- inst/include/callback.h | 59 ++----------------------------------- src/Makevars | 2 +- src/callback.cpp | 1 + src/converters.cpp | 53 +++++++++++++++++++++++++++++++++ src/converters.h | 25 ++++++++++++++++ src/hashed_model_matrix.cpp | 2 ++ src/split.cpp | 2 +- src/split_callback.cpp | 1 + 8 files changed, 87 insertions(+), 58 deletions(-) create mode 100644 src/converters.cpp diff --git a/inst/include/callback.h b/inst/include/callback.h index 215478d..5c24b2b 100644 --- a/inst/include/callback.h +++ b/inst/include/callback.h @@ -22,6 +22,7 @@ #include #include #include "vector_converter.h" +#include class CallbackFunctor { @@ -29,67 +30,13 @@ class CallbackFunctor { // TODO: let src private Rcpp::CharacterVector src; + bool decollision; - CallbackFunctor(SEXP _src) : src(_src) { } + CallbackFunctor(SEXP _src) : src(_src), decollision(false) { } virtual ~CallbackFunctor() { } virtual const std::vector operator()(const char* input) const = 0; }; -class CallbackConverter : public VectorConverter { - - // TODO: refactor this - Rcpp::CharacterVector src; - const CallbackFunctor* f; - SEXP psrc; - std::vector< std::string > cache; - -public: - - CallbackConverter(const CallbackFunctor* _f, const Param& param) - : f(_f), src(_f->src), psrc(_f->src), VectorConverter(param) - { } - - virtual ~CallbackConverter() { } - - virtual const std::vector& get_feature(size_t i) { - SEXP pstr = STRING_ELT(psrc, i); - if (pstr == NA_STRING) { - feature_buffer.clear(); - } else { - const char* str = CHAR(pstr); - cache = f->operator()(str); - feature_buffer.resize(cache.size()); - std::transform(cache.begin(), cache.end(), 
feature_buffer.begin(), - [this](const std::string& s) { - return this->get_hashed_feature(this->h_main, s.c_str()); - }); - if (is_final) std::transform(feature_buffer.begin(), feature_buffer.end(), - feature_buffer.begin(), [this](uint32_t feature) { - return feature % this->hash_size; - }); - } - return feature_buffer; - } - - virtual const std::vector& get_value(size_t i) { - SEXP pstr = STRING_ELT(psrc, i); - if (pstr == NA_STRING) { - value_buffer.clear(); - } else { - const char* str = CHAR(pstr); - value_buffer.resize(cache.size()); - std::transform(cache.begin(), cache.end(), value_buffer.begin(), - [this](const std::string& s) { - return this->get_hashed_feature(this->h_binary, s.c_str()); - }); - } - return value_buffer; - } - -}; - -RCPP_EXPOSED_CLASS(CallbackFunctor) - #endif //__CALLBACK_H__ \ No newline at end of file diff --git a/src/Makevars b/src/Makevars index 7f3a663..c78b353 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,3 +1,3 @@ CXX_STD = CXX11 -PKG_CPPFLAGS = -I../inst/include/ -DNOISY_DEBUG \ No newline at end of file +PKG_CPPFLAGS = -I../inst/include/ \ No newline at end of file diff --git a/src/callback.cpp b/src/callback.cpp index 0083ec5..3833a84 100644 --- a/src/callback.cpp +++ b/src/callback.cpp @@ -17,6 +17,7 @@ */ #include "callback.h" +#include "converters.h" #include "split.h" #include diff --git a/src/converters.cpp b/src/converters.cpp new file mode 100644 index 0000000..4540035 --- /dev/null +++ b/src/converters.cpp @@ -0,0 +1,53 @@ +#include "converters.h" + +const std::vector& CallbackConverter::get_feature(size_t i) { + SEXP pstr = STRING_ELT(psrc, i); + if (pstr == NA_STRING) { + feature_buffer.clear(); + } else { + const char* str = CHAR(pstr); + cache = f->operator()(str); + feature_buffer.resize(cache.size()); + std::transform(cache.begin(), cache.end(), feature_buffer.begin(), + [this](const std::string& s) { + return this->get_hashed_feature(this->h_main, s.c_str()); + }); + if (is_final) { + 
std::transform(feature_buffer.begin(), feature_buffer.end(), + feature_buffer.begin(), [this](uint32_t feature) { + return feature % this->hash_size; + }); + if (f->decollision) { + std::set tmp(feature_buffer.begin(), feature_buffer.end()); + feature_buffer.clear(); + feature_buffer.assign(tmp.begin(), tmp.end()); + } + } + } + return feature_buffer; +} + +const std::vector& CallbackConverter::get_value(size_t i) { + SEXP pstr = STRING_ELT(psrc, i); + if (pstr == NA_STRING) { + value_buffer.clear(); + } else { + const char* str = CHAR(pstr); + value_buffer.resize(cache.size()); + std::transform(cache.begin(), cache.end(), value_buffer.begin(), + [this](const std::string& s) { + return get_sign(get_hashed_feature(this->h_binary, s.c_str())); + }); +#ifdef NOISY_DEBUG + for(int j = 0;j < cache.size();j++) { + Rprintf("signed hash: %s ... got %zu\n", cache[j].c_str(), value_buffer[j]); + } +#endif + if (is_final & f->decollision) { + if (value_buffer.size() < feature_buffer.size()) throw std::logic_error("The length of value_buffer and feature_buffer go wrong!"); + value_buffer.resize(feature_buffer.size()); + } + } + return value_buffer; +} + diff --git a/src/converters.h b/src/converters.h index 0b33fac..3ff2159 100644 --- a/src/converters.h +++ b/src/converters.h @@ -19,6 +19,7 @@ #ifndef __CONVERTERS_HPP__ #define __CONVERTERS_HPP__ +#include #include "vector_converter.h" #include "split.h" @@ -457,4 +458,28 @@ class InteractionConverter : public VectorConverter { }; +class CallbackConverter : public VectorConverter { + + // TODO: refactor this + Rcpp::CharacterVector src; + const CallbackFunctor* f; + SEXP psrc; + std::vector< std::string > cache; + +public: + + CallbackConverter(const CallbackFunctor* _f, const Param& param) + : f(_f), src(_f->src), psrc(_f->src), VectorConverter(param) + { } + + virtual ~CallbackConverter() { } + + virtual const std::vector& get_feature(size_t i); + + virtual const std::vector& get_value(size_t i); + +}; + 
+RCPP_EXPOSED_CLASS(CallbackFunctor) + #endif // __CONVERTERS_HPP__ \ No newline at end of file diff --git a/src/hashed_model_matrix.cpp b/src/hashed_model_matrix.cpp index 3b7790e..ef37427 100644 --- a/src/hashed_model_matrix.cpp +++ b/src/hashed_model_matrix.cpp @@ -139,6 +139,8 @@ const ConvertersVec get_converters( } #ifdef NOISY_DEBUG Rprintf("Initialize CallbackConverter\n"); + Rprintf("Test h_main: %zu\n", (*param.h_main)("test", 4)); + Rprintf("Test h_binary: %zu\n", (*param.h_binary)("test", 4)); #endif p.reset(new CallbackConverter(as(callback_functor), param)); } diff --git a/src/split.cpp b/src/split.cpp index b2b170b..4296028 100644 --- a/src/split.cpp +++ b/src/split.cpp @@ -26,7 +26,7 @@ std::vector split(const std::string& src, const std::string& delim) const char* end = std::strstr(start, delim.c_str()); std::vector retval; while(end != NULL) { - retval.push_back(std::string(start, end)); + if (end - start > 0) retval.push_back(std::string(start, end)); start = end + delim.size(); end = std::strstr(start, delim.c_str()); } diff --git a/src/split_callback.cpp b/src/split_callback.cpp index d02b1c4..0b73be4 100644 --- a/src/split_callback.cpp +++ b/src/split_callback.cpp @@ -16,6 +16,7 @@ struct SplitCallbackFunctor : public CallbackFunctor { : delim(_delim), CallbackFunctor(input) { set_type(_type); + if (type == SplitType::Existence) decollision = true; } virtual ~SplitCallbackFunctor() { } From 1166abedbb2ce84404d9bf838bf34dd93bcf660d Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 23 Sep 2015 02:22:49 +0800 Subject: [PATCH 08/14] debug: test-split_callback --- R/RcppExports.R | 3 --- man/test_callback.Rd | 5 ----- src/callback.cpp | 4 ---- src/split_callback.cpp | 8 ++++++-- tests/test-split_callback.R | 9 +++++---- 5 files changed, 11 insertions(+), 18 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 2888ded..d552097 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -20,9 +20,6 @@ tomatrix <- function(m) { #'@title Test 
the callback function. #'@param Rcallback external pointer. The pointer of the callback function. #'@param input string. The input. -#'@details The Rcallback is an external pointer which points to a functional pointer.. -#'The signature of the functional pointer should be: -#'\code{std::vector (*f)(const char* str)} #'@return character #'@export test_callback <- function(Rcallback, input) { diff --git a/man/test_callback.Rd b/man/test_callback.Rd index 93e2513..5c9e383 100644 --- a/man/test_callback.Rd +++ b/man/test_callback.Rd @@ -17,9 +17,4 @@ character \description{ Test the callback function. } -\details{ -The Rcallback is an external pointer which points to a functional pointer.. -The signature of the functional pointer should be: -\code{std::vector (*f)(const char* str)} -} diff --git a/src/callback.cpp b/src/callback.cpp index 3833a84..24945aa 100644 --- a/src/callback.cpp +++ b/src/callback.cpp @@ -26,16 +26,12 @@ using namespace Rcpp; //'@title Test the callback function. //'@param Rcallback external pointer. The pointer of the callback function. //'@param input string. The input. -//'@details The Rcallback is an external pointer which points to a functional pointer.. 
-//'The signature of the functional pointer should be: -//'\code{std::vector (*f)(const char* str)} //'@return character //'@export //[[Rcpp::export]] SEXP test_callback(SEXP Rcallback, const std::string& input) { CallbackFunctor* callback(as(Rcallback)); return wrap((*callback)(input.c_str())); - // return R_NilValue; } RCPP_MODULE(callback) { diff --git a/src/split_callback.cpp b/src/split_callback.cpp index 0b73be4..b879394 100644 --- a/src/split_callback.cpp +++ b/src/split_callback.cpp @@ -40,11 +40,15 @@ struct SplitCallbackFunctor : public CallbackFunctor { virtual const std::vector operator()(const char* input) const { switch (type) { - case SplitType::Count: - return split(input, delim); + case SplitType::Count: { + auto tmp(split(input, delim)); + tmp.erase(std::remove(tmp.begin(), tmp.end(), ""), tmp.end()); + return tmp; + } case SplitType::Existence: { std::vector tmp(split(input, delim)); std::set tmp2(tmp.begin(), tmp.end()); + tmp2.erase(""); tmp.assign(tmp2.begin(), tmp2.end()); return tmp; } diff --git a/tests/test-split_callback.R b/tests/test-split_callback.R index 430fa44..42ceac2 100644 --- a/tests/test-split_callback.R +++ b/tests/test-split_callback.R @@ -1,15 +1,16 @@ if (require(RUnit)) { - callback <- generate_split_callback(",", "existence") + library(FeatureHashing) + callback <- generate_split_callback(letters, ",", "existence") checkEquals(callback$delim, ",") checkEquals(callback$type, "existence") checkEquals(test_callback(callback, "a,b,a,c,d"), letters[1:4]) - checkException(callback <- generate_split_callback(",", "ex")) + checkException(callback <- generate_split_callback(letters, ",", "ex")) - callback <- generate_split_callback(",,", "existence") + callback <- generate_split_callback(letters, ",,", "existence") checkEquals(test_callback(callback, "a,b,a,,c,d"), c("a,b,a", "c,d")) - callback <- generate_split_callback(",", "count") + callback <- generate_split_callback(letters, ",", "count") checkEquals(callback$type, "count") 
checkEquals(test_callback(callback, x <- "a,b,a,c,d"), strsplit(x, ",")[[1]]) } \ No newline at end of file From 92d80245c5b63fdb6f6709d20d3855caddef2bf4 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 23 Sep 2015 22:14:06 +0800 Subject: [PATCH 09/14] debug: using `:::` add documentation --- R/callback.R | 5 +++++ R/hashed.model.matrix.R | 2 +- man/generate_split_callback.Rd | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 man/generate_split_callback.Rd diff --git a/R/callback.R b/R/callback.R index e478801..498bc07 100644 --- a/R/callback.R +++ b/R/callback.R @@ -10,6 +10,11 @@ register_callback <- function(special, callback_generator) { invisible(NULL) } +#'@title Generate callback of split +#'@param input character vector. The input of split +#'@param delim string. \code{delim} will be used as delimiter for splitting +#'@param type string. One of \code{c("existence", "count")} +#'"count" indicates the number of occurrences of the token. "existence" indicates whether the token exists or not. #'@export generate_split_callback <- function(input, delim = ",", type = c("existence", "count")) { callback <- new(split_callback, input, delim, type[1]) diff --git a/R/hashed.model.matrix.R b/R/hashed.model.matrix.R index 11531e0..684e8f6 100644 --- a/R/hashed.model.matrix.R +++ b/R/hashed.model.matrix.R @@ -211,7 +211,7 @@ hashed.model.matrix <- function(formula, data, hash.size = 2^18, transpose = FAL formula <- as.character(formula) %>% gsub(pattern = tf.idf.string, replacement = "type = \"count\"", x = .)
%>% paste0(collapse = " ") %>% as.formula } - tf <- terms.formula(formula, data = data, specials = ls(FeatureHashing:::.callback)) + tf <- terms.formula(formula, data = data, specials = ls(.callback)) retval <- new(.CSCMatrix) .hashed.model.matrix.dataframe(tf, data, hash.size, transpose, retval, create.mapping, signed.hash) class(retval) <- .CSCMatrix diff --git a/man/generate_split_callback.Rd b/man/generate_split_callback.Rd new file mode 100644 index 0000000..0963855 --- /dev/null +++ b/man/generate_split_callback.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/callback.R +\name{generate_split_callback} +\alias{generate_split_callback} +\title{Generate callback of split} +\usage{ +generate_split_callback(input, delim = ",", type = c("existence", "count")) +} +\arguments{ +\item{input}{character vector. The input of split} + +\item{delim}{string. \code{delim} will be used as delimiter for splitting} + +\item{type}{string. One of \code{c("existence", "count")} +"count" indicates the number of occurrence of the token. "existence" indicates the boolean that whether the token exist or not.} +} +\description{ +Generate callback of split +} + From 9e4d254c400fe5858dd425d97b8cb0a080484843 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 23 Sep 2015 22:38:13 +0800 Subject: [PATCH 10/14] correct documentation --- R/hashed.model.matrix.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/hashed.model.matrix.R b/R/hashed.model.matrix.R index 684e8f6..6de2bc8 100644 --- a/R/hashed.model.matrix.R +++ b/R/hashed.model.matrix.R @@ -225,7 +225,7 @@ hashed.model.matrix <- function(formula, data, hash.size = 2^18, transpose = FAL } else if (tf.idf) tf.idf.transfo(retval) else retval } -# This is the function called from C to parse the \code{split} function. +# This is the function called from C to parse the special function. 
parse_special <- function(text, special, df) { origin.keep.source <- options()$keep.source tryCatch({ From b031208cfb093381fd59d30bd881a7c0033502c9 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Thu, 24 Sep 2015 22:59:29 +0800 Subject: [PATCH 11/14] resolve #95 --- DESCRIPTION | 2 +- NAMESPACE | 3 + R/callback.R | 44 ++++++++- inst/callback/jiebaR_callback.cpp | 112 +++++++++++++++++++++++ inst/include/callback.h | 1 - inst/include/hash_function.h | 37 -------- man/init_jiebaR_callback.Rd | 32 +++++++ man/ls_special.Rd | 16 ++++ man/register_callback.Rd | 6 +- src/hash_function_implementation.h | 43 +++++++++ src/hashed_model_matrix.h | 2 +- src/split_callback.cpp | 4 +- {inst/include => src}/vector_converter.h | 4 +- tests/test-jiebaR.R | 49 ++++++++++ vignettes/Callback.Rmd | 27 ++++++ 15 files changed, 336 insertions(+), 46 deletions(-) create mode 100644 inst/callback/jiebaR_callback.cpp create mode 100644 man/init_jiebaR_callback.Rd create mode 100644 man/ls_special.Rd create mode 100644 src/hash_function_implementation.h rename {inst/include => src}/vector_converter.h (98%) create mode 100644 tests/test-jiebaR.R create mode 100644 vignettes/Callback.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index 6e60c59..ccaa974 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,7 +24,7 @@ Imports: digest(>= 0.6.8), magrittr (>= 1.5) LinkingTo: Rcpp, digest(>= 0.6.8), BH -Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown +Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown, jiebaR(>= 0.5.1) RcppModules: callback, split_callback SystemRequirements: C++11 BugReports: https://github.com/wush978/FeatureHashing/issues diff --git a/NAMESPACE b/NAMESPACE index c323192..5b870ec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,7 +7,9 @@ export(hash.size) export(hashed.interaction.value) export(hashed.model.matrix) export(hashed.value) +export(init_jiebaR_callback) export(intToRaw) +export(ls_special) export(register_callback) export(test_callback) import(digest) @@ -16,6 +18,7 @@ 
importFrom(Matrix,Diagonal) importFrom(Matrix,colSums) importFrom(Rcpp,cpp_object_initializer) importFrom(Rcpp,loadModule) +importFrom(Rcpp,sourceCpp) importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(methods,as) diff --git a/R/callback.R b/R/callback.R index 498bc07..f7c147d 100644 --- a/R/callback.R +++ b/R/callback.R @@ -3,13 +3,24 @@ #'@title Register Special Function for Formula Interface #'@param special string. The name which will be used in formula interface. #'@param callback_generator function which will create a callback. Please see the details. -#'@examples +#'@details The callback_generator is a function whose first argument is the +#'input data and the other arguments could be used to initialize the callback +#'function properly. The result should be a Rcpp module which derives the +#'`CallbackFunctor` class. Please see the vignette for details. #'register_callback("split", generate_split_callback) register_callback <- function(special, callback_generator) { - .callback[[special]] <- callback + .callback[[special]] <- callback_generator invisible(NULL) } +#'@title List the Registered Specials +#'@return character vector. The specials which could be used in the +#'formula interface. +#'@export +ls_special <- function() { + ls(.callback) +} + #'@title Generate callback of split #'@param input character vector. The input of split #'@param delim string. \code{delim} will be used as delimiter for splitting @@ -23,3 +34,32 @@ generate_split_callback <- function(input, delim = ",", type = c("existence", "c .callback <- new.env() .callback[["split"]] <- generate_split_callback + +#'@title Initialize and register jiebaR to the formula interface +#'@details This function will register the callback of word segmentation +#'function provided by jiebaR to the formula interface. +#'For example, `~ jiebaR(...)` will use the feature of word segmentation +#'provided by jiebaR to segment a given column of the data. 
+#'The first argument of the jiebaR is a character which will be segmented. +#'The left arguments are the same as \code{\link[jiebaR]{worker}}. These +#'arguments will be used to initialize a jiebaR worker which will segment +#'the input data. +#' +#'@examples +#'\dontrun{ +#'library(FeatureHashing) +#'init_jiebaR_callback() +#'m <- hashed.model.matrix(~ jiebaR(title, type = "mix", df)) +#'# the column `df$title` will be feed into `worker <- worker(type = "mix")` +#'# the result of `worker <= df$title` will be hashed into the sparse matrix +#'# the result is `m` +#'} +#'@export +#'@importFrom Rcpp sourceCpp +init_jiebaR_callback <- function() { + if (!requireNamespace("jiebaR", character.only = TRUE)) stop("Please install the package jiebaR first") + tryCatch({ + sourceCpp(system.file("callback/jiebaR_callback.cpp", package = "FeatureHashing")) + }, finally = { + }) +} diff --git a/inst/callback/jiebaR_callback.cpp b/inst/callback/jiebaR_callback.cpp new file mode 100644 index 0000000..7d77864 --- /dev/null +++ b/inst/callback/jiebaR_callback.cpp @@ -0,0 +1,112 @@ +// [[Rcpp::depends(jiebaR)]] +// [[Rcpp::depends(FeatureHashing)]] + +#include "jiebaRAPI.h" +#include +#include + +using namespace Rcpp; + +struct jiebaRCallbackFunctor : public CallbackFunctor { + + enum Type { + MIX, + MP, + HMM, + QUERY, + KEY + }; + + Type type; + Environment cutter; + SEXP cutter_pointer; + + typedef SEXP (*Cut)(SEXP, SEXP); + + Cut cut; + + void set_type(std::string _type) { + if (_type.compare("mix") == 0) { + type = MIX; + } else if (_type.compare("mp") == 0) { + type = MP; + } else if (_type.compare("hmm") == 0) { + type = HMM; + } else if (_type.compare("query") == 0) { + type = QUERY; + } else if (_type.compare("key") == 0) { + type = KEY; + } else { + throw std::invalid_argument("Unknown type"); + } + } + + std::string get_type() { + switch (type) { + case MIX: + return "mix"; + case MP: + return "mp"; + case HMM: + return "hmm"; + case QUERY: + return "query"; + case KEY: + 
return "key"; + } + } + + void set_cut() { + std::string fname("jiebaR_"); + fname.append(get_type()); + fname.append("_cut"); + cut = reinterpret_cast(::R_GetCCallable("jiebaR", fname.c_str())); + } + + explicit jiebaRCallbackFunctor( + SEXP _src, + std::string _type, + SEXP _cutter + ) + : type(MIX), + cutter(_cutter), + cutter_pointer(NULL), + cut(NULL), + CallbackFunctor(_src) + { + set_type(_type); + set_cut(); + cutter_pointer = wrap(cutter["worker"]); + } + + virtual ~jiebaRCallbackFunctor() { } + + virtual const std::vector operator()(const char* input) const { + return as >((*cut)(wrap(input), cutter_pointer)); + } + +}; + +RCPP_MODULE(jiebaR_callback) { + + class_("callback") + ; + + class_("jiebaR_callback") + .derives("callback") + .constructor() + .property("type", &jiebaRCallbackFunctor::get_type, &jiebaRCallbackFunctor::set_type) + .field("cutter", &jiebaRCallbackFunctor::cutter) + ; + +} + +/***R +generate_jiebaR_callback <- function(input, type = "mix", ...) { + worker <- jiebaR::worker(type = type, ...) 
+ callback <- new(jiebaR_callback, input, type, worker) + callback +} + +FeatureHashing::register_callback("jiebaR", generate_jiebaR_callback) +*/ diff --git a/inst/include/callback.h b/inst/include/callback.h index 5c24b2b..4e217a2 100644 --- a/inst/include/callback.h +++ b/inst/include/callback.h @@ -21,7 +21,6 @@ #include #include -#include "vector_converter.h" #include class CallbackFunctor { diff --git a/inst/include/hash_function.h b/inst/include/hash_function.h index 84c5523..43c17d3 100644 --- a/inst/include/hash_function.h +++ b/inst/include/hash_function.h @@ -22,7 +22,6 @@ #include #include #include -#include class HashFunction { @@ -32,40 +31,4 @@ class HashFunction { }; -class NullHashFunction : public HashFunction { - - public: - - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); - -}; - -class MurmurHash3HashFunction : public HashFunction { - - uint32_t seed; - -public : - - MurmurHash3HashFunction(uint32_t _seed) : seed(_seed) { } - - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); - -}; - -class MurmurHash3LogHashFunction : public HashFunction { - - uint32_t seed; - Rcpp::Environment e; - std::map inverse_mapping; - -public: - - MurmurHash3LogHashFunction(SEXP _e, uint32_t _seed) - : HashFunction(), seed(_seed), e(_e) - { } - - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); - -}; - #endif \ No newline at end of file diff --git a/man/init_jiebaR_callback.Rd b/man/init_jiebaR_callback.Rd new file mode 100644 index 0000000..9f0d0e4 --- /dev/null +++ b/man/init_jiebaR_callback.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/callback.R +\name{init_jiebaR_callback} +\alias{init_jiebaR_callback} +\title{Initialize and register jiebaR to the formula interface} +\usage{ +init_jiebaR_callback() +} +\description{ +Initialize and register jiebaR to the formula interface +} +\details{ +This 
function will register the callback of word segmentation +function provided by jiebaR to the formula interface. +For example, `~ jiebaR(...)` will use the feature of word segmentation +provided by jiebaR to segment a given column of the data. +The first argument of the jiebaR is a character which will be segmented. +The remaining arguments are the same as \code{\link[jiebaR]{worker}}. These +arguments will be used to initialize a jiebaR worker which will segment +the input data. +} +\examples{ +\dontrun{ +library(FeatureHashing) +init_jiebaR_callback() +m <- hashed.model.matrix(~ jiebaR(title, type = "mix"), df) +# the column `df$title` will be fed into `worker <- worker(type = "mix")` +# the result of `worker <= df$title` will be hashed into the sparse matrix +# the result is `m` +} +} + diff --git a/man/ls_special.Rd b/man/ls_special.Rd new file mode 100644 index 0000000..c9a3d24 --- /dev/null +++ b/man/ls_special.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/callback.R +\name{ls_special} +\alias{ls_special} +\title{List the Registered Specials} +\usage{ +ls_special() +} +\value{ +character vector. The specials which could be used in the +formula interface. +} +\description{ +List the Registered Specials +} + diff --git a/man/register_callback.Rd b/man/register_callback.Rd index 179c425..8660d12 100644 --- a/man/register_callback.Rd +++ b/man/register_callback.Rd @@ -14,7 +14,11 @@ register_callback(special, callback_generator) \description{ Register Special Function for Formula Interface } -\examples{ +\details{ +The callback_generator is a function whose first argument is the +input data and the other arguments could be used to initialize the callback +function properly. The result should be a Rcpp module which derives the +`CallbackFunctor` class. Please see the vignette for details.
register_callback("split", generate_split_callback) } diff --git a/src/hash_function_implementation.h b/src/hash_function_implementation.h new file mode 100644 index 0000000..8c71698 --- /dev/null +++ b/src/hash_function_implementation.h @@ -0,0 +1,43 @@ +#ifndef __HASH_FUNCTION_IMPLEMENTATION_HPP__ +#define __HASH_FUNCTION_IMPLEMENTATION_HPP__ + +#include +#include + +class NullHashFunction : public HashFunction { + + public: + + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); + +}; + +class MurmurHash3HashFunction : public HashFunction { + + uint32_t seed; + +public : + + MurmurHash3HashFunction(uint32_t _seed) : seed(_seed) { } + + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); + +}; + +class MurmurHash3LogHashFunction : public HashFunction { + + uint32_t seed; + Rcpp::Environment e; + std::map inverse_mapping; + +public: + + MurmurHash3LogHashFunction(SEXP _e, uint32_t _seed) + : HashFunction(), seed(_seed), e(_e) + { } + + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); + +}; + +# endif // __HASH_FUNCTION_IMPLEMENTATION_HPP__ \ No newline at end of file diff --git a/src/hashed_model_matrix.h b/src/hashed_model_matrix.h index b3ab8c2..905fe1c 100644 --- a/src/hashed_model_matrix.h +++ b/src/hashed_model_matrix.h @@ -25,7 +25,7 @@ #include #include #include "callback.h" -#include "hash_function.h" +#include "hash_function_implementation.h" #include "vector_converter.h" #include "converters.h" diff --git a/src/split_callback.cpp b/src/split_callback.cpp index b879394..946526f 100644 --- a/src/split_callback.cpp +++ b/src/split_callback.cpp @@ -60,10 +60,10 @@ struct SplitCallbackFunctor : public CallbackFunctor { using namespace Rcpp; RCPP_MODULE(split_callback) { - + class_("callback") ; - + class_("split_callback") .derives("callback") .constructor() diff --git a/inst/include/vector_converter.h b/src/vector_converter.h similarity index 98% rename from 
inst/include/vector_converter.h rename to src/vector_converter.h index 33bd28c..d78e4dc 100644 --- a/inst/include/vector_converter.h +++ b/src/vector_converter.h @@ -21,7 +21,9 @@ #include "callback.h" #include "hash_function.h" -#include +#ifdef NOISY_DEBUG +#include +#endif struct VectorConverterParam; class VectorConverter; diff --git a/tests/test-jiebaR.R b/tests/test-jiebaR.R new file mode 100644 index 0000000..ab1f419 --- /dev/null +++ b/tests/test-jiebaR.R @@ -0,0 +1,49 @@ +if (require(RUnit) & Sys.getenv("TEST_JIEBAR") == "TRUE") { + library(FeatureHashing) + df <- data.frame(title = c( + "貶值取代降息? 台幣貶破33元", + "優生 培寶4款毒奶瓶下架", + " 秋節上國道 閃11塞車點", + "習近平訪美前…//中國戰機公海危險攔截美機", + "352億公開收購 日月光成矽品最大股東", + "驚 AT-3又出事 南投深山失聯 2飛官生死未卜", + "誰說該廢死的?怕死鄭捷首度道歉", + "歐習會前夕// 美國安顧問:反對片面改變台海現狀" + )) + init_jiebaR_callback() + m <- hashed.model.matrix(~ jiebaR(title), df, create.mapping = TRUE) + title_tokens <- names(hash.mapping(m)) + checkEquals(title_tokens, c("title4", "title股東", "title國道", "title中國", "title現狀", +"title…", "title閃", "title習近平", "title日", "title11", +"title驚", "title公開", "title億", "title又", "title:", +"title該", "title塞車", "title訪美", "title?", "title會", +"title公海", "title深山", "title片面", "title奶瓶", "title說", +"title成矽品", "title危險", "title台海", "title最大", +"title美國", "title貶值", "title上", "title下架", "title秋節", +"titleAT", "title352", "title生死未卜", "title收購", "title月光", +"title怕死", "title貶破", "title飛官", "title出事", "title取代", +"title道歉", "title歐習", "title33", "title ", "title款毒", +"title優生", "title顧問", "title前", "title前夕", "title廢死的", +"title反對", "title改變", "title點", "title培寶", "title台幣", +"title降息", "title美機", "title安", "title-", "title南投", +"title首度", "title戰機", "title鄭捷", "title/", "title元", +"title誰", "title攔截", "title2", "title失聯", "title3")) + m <- hashed.model.matrix(~ jiebaR(title, type = "hmm"), df, create.mapping = TRUE) + title_tokens <- names(hash.mapping(m)) + checkEquals(title_tokens, c("title4", "title改", "title鄭", "title股東", "title死", +"title…", 
"title海", "title上國道", "title閃", "title日", +"title11", "title現", "title首", "title驚", "title片", "title光成", +"title又", "title:", "title該", "title機公海", "title反", +"title習近", "title矽品", "title怕", "title生死", "title?", +"title捷", "title會", "title對", "title深山", "title奶瓶", +"title說", "title月", "title危險", "title最大", "title貶值", +"title下架", "title台", "title秋節", "titleAT", "title美前", +"title面", "title352", "title收購", "title狀", "title貶破", +"title飛官", "title歉", "title出事", "title取代", "title平訪", +"title歐習", "title億公開", "title33", "title未卜", "title中國戰", +"title ", "title款毒", "title優生", "title前夕", "title度", +"title美國安顧問", "title廢死的", "title變", "title塞車點", +"title培寶", "title台幣", "title降息", "title美機", "title-", +"title南投", "title道", "title/", "title元", "title誰", +"title攔截", "title2", "title失聯", "title3")) +} \ No newline at end of file diff --git a/vignettes/Callback.Rmd b/vignettes/Callback.Rmd new file mode 100644 index 0000000..97b47ce --- /dev/null +++ b/vignettes/Callback.Rmd @@ -0,0 +1,27 @@ +--- +title: "Register Callback for FeatureHashing" +author: "Wush Wu" +output: + rmarkdown::html_vignette: + css: vignette.css + number_sections: yes + toc: yes +date: "September 24, 2015" +vignette: > + %\VignetteIndexEntry{FeatureHashing} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +This is an introduction of registering callback for the formula interface of FeatureHashing. 
+ +## Demo + +## Getting Started + +### Implement Rcpp Module + +### Implement Generator + +### Register the Generator to the Formula Interface + From 11a742485072f7f945aa10d9f068defda07a7908 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Thu, 24 Sep 2015 23:33:01 +0800 Subject: [PATCH 12/14] install jiebaR from github --- .travis.yml | 1 + appveyor.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index daeabbf..52b57ab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,7 @@ before_install: - curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh - chmod 755 ./travis-tool.sh - ./travis-tool.sh bootstrap + - ./travis-tool.sh install_github qinwf/jiebaR - ./travis-tool.sh install_github jimhester/robustr - ./travis-tool.sh install_github jimhester/covr diff --git a/appveyor.yml b/appveyor.yml index 086cdaf..9862d36 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -13,6 +13,7 @@ install: build_script: - travis-tool.sh install_deps + - travis-tool.sh install_github qinwf/jiebaR test_script: - travis-tool.sh run_tests From ae532cba109ad1c80e097d449d76f530c4a0e082 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Thu, 24 Sep 2015 23:45:11 +0800 Subject: [PATCH 13/14] debug: warning: control reaches end of non-void function [-Wreturn-type] --- src/split_callback.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/split_callback.cpp b/src/split_callback.cpp index 946526f..30e5e15 100644 --- a/src/split_callback.cpp +++ b/src/split_callback.cpp @@ -36,6 +36,7 @@ struct SplitCallbackFunctor : public CallbackFunctor { case SplitType::Existence: return "existence"; } + throw std::logic_error("Invalid SplitType"); } virtual const std::vector operator()(const char* input) const { @@ -53,6 +54,7 @@ struct SplitCallbackFunctor : public CallbackFunctor { return tmp; } } + throw std::logic_error("Invalid SplitType"); } }; From e2d0b8e3652ed4f9a4d28a26f87ff1bebe5b863a Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Thu, 29 
Mar 2018 13:40:23 +0800 Subject: [PATCH 14/14] debug: cannot directly include pmurhashAPI.h --- inst/include/hash_function.h | 41 +++++++++++++++++++++++++++++++++++- src/hash_function.cpp | 1 - src/pmurhash32.c | 1 + 3 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 src/pmurhash32.c diff --git a/inst/include/hash_function.h b/inst/include/hash_function.h index 144ac3d..122b1ab 100644 --- a/inst/include/hash_function.h +++ b/inst/include/hash_function.h @@ -19,11 +19,50 @@ #ifndef __HASH_FUNCTION_HPP__ #define __HASH_FUNCTION_HPP__ -#include "pmurhashAPI.h" #include #include #include +#ifdef HAVE_VISIBILITY_ATTRIBUTE +# define attribute_hidden __attribute__ ((visibility ("hidden"))) +#else +# define attribute_hidden +#endif + +extern "C" { + + /* First look for special cases */ +#if defined(_MSC_VER) +#define MH_UINT32 unsigned long +#endif + +/* If the compiler says it's C99 then take its word for it */ +#if !defined(MH_UINT32) && ( \ + defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L ) +#include +#define MH_UINT32 uint32_t +#endif + +/* Otherwise try testing against max value macros from limit.h */ +#if !defined(MH_UINT32) +#include +#if (USHRT_MAX == 0xffffffffUL) +#define MH_UINT32 unsigned short +#elif (UINT_MAX == 0xffffffffUL) +#define MH_UINT32 unsigned int +#elif (ULONG_MAX == 0xffffffffUL) +#define MH_UINT32 unsigned long +#endif +#endif + +#if !defined(MH_UINT32) +#error Unable to determine type name for unsigned 32-bit int +#endif + +MH_UINT32 attribute_hidden PMurHash32(MH_UINT32 seed, const void *key, int len); + +} + class HashFunction { public: diff --git a/src/hash_function.cpp b/src/hash_function.cpp index d7881e5..5ee10aa 100644 --- a/src/hash_function.cpp +++ b/src/hash_function.cpp @@ -2,7 +2,6 @@ #include #include #include "hash_function.h" -#include "pmurhashAPI.h" #include "bswap_32.h" #include #include "hashed_model_matrix.h" diff --git a/src/pmurhash32.c b/src/pmurhash32.c new file mode 100644 index 
0000000..720d022 --- /dev/null +++ b/src/pmurhash32.c @@ -0,0 +1 @@ +#include