diff --git a/DESCRIPTION b/DESCRIPTION index 2f9bb1c..ccaa974 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: FeatureHashing Type: Package Title: Creates a Model Matrix via Feature Hashing with a Formula Interface -Version: 0.9.1.2 +Version: 0.10.0 Date: 2015-09-22 Authors@R: c( person("Wush", "Wu", email = "wush978@gmail.com", role = c("aut", "cre")), @@ -24,7 +24,8 @@ Imports: digest(>= 0.6.8), magrittr (>= 1.5) LinkingTo: Rcpp, digest(>= 0.6.8), BH -Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown +Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown, jiebaR(>= 0.5.1) +RcppModules: callback, split_callback SystemRequirements: C++11 BugReports: https://github.com/wush978/FeatureHashing/issues URL: https://github.com/wush978/FeatureHashing diff --git a/NAMESPACE b/NAMESPACE index 208e638..5b870ec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,17 +1,24 @@ # Generated by roxygen2 (4.1.1): do not edit by hand +export(generate_split_callback) export(hash.mapping) export(hash.sign) export(hash.size) export(hashed.interaction.value) export(hashed.model.matrix) export(hashed.value) +export(init_jiebaR_callback) export(intToRaw) +export(ls_special) +export(register_callback) +export(test_callback) import(digest) importClassesFrom(Matrix,dgCMatrix) importFrom(Matrix,Diagonal) importFrom(Matrix,colSums) -importFrom(Rcpp,evalCpp) +importFrom(Rcpp,cpp_object_initializer) +importFrom(Rcpp,loadModule) +importFrom(Rcpp,sourceCpp) importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(methods,as) diff --git a/R/RcppExports.R b/R/RcppExports.R index d29cab8..5774ed1 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -17,6 +17,15 @@ tomatrix <- function(m) { .Call('_FeatureHashing_tomatrix', PACKAGE = 'FeatureHashing', m) } +#'@title Test the callback function. +#'@param Rcallback external pointer. The pointer of the callback function. +#'@param input string. The input. 
+#'@return character +#'@export +test_callback <- function(Rcallback, input) { + .Call('_FeatureHashing_test_callback', PACKAGE = 'FeatureHashing', Rcallback, input) +} + #'@export hash.sign hash.sign <- function(src) { .Call('_FeatureHashing_xi', PACKAGE = 'FeatureHashing', src) diff --git a/R/callback.R b/R/callback.R new file mode 100644 index 0000000..f7c147d --- /dev/null +++ b/R/callback.R @@ -0,0 +1,65 @@ + +#'@export +#'@title Register Special Function for Formula Interface +#'@param special string. The name which will be used in formula interface. +#'@param callback_generator function which will create a callback. Please see the details. +#'@details The callback_generator is a function whose first argument is the +#'input data and the other arguments could be used to initialize the callback +#'function properly. The result should be a Rcpp module which derives the +#'`CallbackFunctor` class. Please see the vignette for details. +#'register_callback("split", generate_split_callback) +register_callback <- function(special, callback_generator) { + .callback[[special]] <- callback_generator + invisible(NULL) +} + +#'@title List the Registered Specials +#'@return character vector. The specials which could be used in the +#'formula interface. +#'@export +ls_special <- function() { + ls(.callback) +} + +#'@title Generate callback of split +#'@param input character vector. The input of split +#'@param delim string. \code{delim} will be used as delimiter for splitting +#'@param type string. One of \code{c("existence", "count")} +#'"count" indicates the number of occurrence of the token. "existence" indicates the boolean that whether the token exist or not. 
+#'@export +generate_split_callback <- function(input, delim = ",", type = c("existence", "count")) { + callback <- new(split_callback, input, delim, type[1]) + callback +} + +.callback <- new.env() +.callback[["split"]] <- generate_split_callback + +#'@title Initialize and register jiebaR to the formula interface +#'@details This function will register the callback of word segmentation +#'function provided by jiebaR to the formula interface. +#'For example, `~ jiebaR(...)` will use the feature of word segmentation +#'provided by jiebaR to segment a given column of the data. +#'The first argument of the jiebaR is a character which will be segmented. +#'The left arguments are the same as \code{\link[jiebaR]{worker}}. These +#'arguments will be used to initialize a jiebaR worker which will segment +#'the input data. +#' +#'@examples +#'\dontrun{ +#'library(FeatureHashing) +#'init_jiebaR_callback() +#'m <- hashed.model.matrix(~ jiebaR(title, type = "mix", df)) +#'# the column `df$title` will be feed into `worker <- worker(type = "mix")` +#'# the result of `worker <= df$title` will be hashed into the sparse matrix +#'# the result is `m` +#'} +#'@export +#'@importFrom Rcpp sourceCpp +init_jiebaR_callback <- function() { + if (!requireNamespace("jiebaR", character.only = TRUE)) stop("Please install the package jiebaR first") + tryCatch({ + sourceCpp(system.file("callback/jiebaR_callback.cpp", package = "FeatureHashing")) + }, finally = { + }) +} diff --git a/R/hashed.model.matrix.R b/R/hashed.model.matrix.R index 4e75f97..3a18211 100644 --- a/R/hashed.model.matrix.R +++ b/R/hashed.model.matrix.R @@ -214,7 +214,7 @@ hashed.model.matrix <- function(formula, data, hash.size = 2^18, transpose = FAL formula <- as.character(formula) %>% gsub(pattern = tf.idf.string, replacement = "type = \"count\"", x = .) 
%>% paste0(collapse = " ") %>% as.formula } - tf <- terms.formula(formula, data = data, specials = "split") + tf <- terms.formula(formula, data = data, specials = ls(.callback)) retval <- new(.CSCMatrix) .hashed.model.matrix.dataframe(tf, data, hash.size, transpose, retval, create.mapping, signed.hash, progress) class(retval) <- .CSCMatrix @@ -228,29 +228,51 @@ hashed.model.matrix <- function(formula, data, hash.size = 2^18, transpose = FAL } else if (tf.idf) tf.idf.transfo(retval) else retval } -# This is the function called from C to parse the \code{split} function. -parse_split <- function(text) { +# This is the function called from C to parse the special function. +parse_special <- function(text, special, df) { origin.keep.source <- options()$keep.source tryCatch({ options(keep.source = TRUE) p <- parse(text = text) tmp <- getParseData(p) reference_name <- tmp$text[which(tmp$token == "SYMBOL")] - if ("delim" %in% tmp$text) { - delim <- tmp$text[which(tmp$text == "delim")[1] + 2] - delim <- gsub(pattern = '"', replacement = '', delim) - } else { - # the default value of delim - delim <- "," + params <- list() + fname <- NULL + first_symbol <- NULL + start <- FALSE + for(i_symbol in seq_len(nrow(tmp))) { + if (tmp$token[i_symbol] != "SYMBOL_FUNCTION_CALL" & !start) next + start <- TRUE + switch(tmp$token[i_symbol], + "SYMBOL_FUNCTION_CALL" = { + fname <- tmp$text[i_symbol] + }, + "SYMBOL" = { + if (tmp$token[i_symbol - 1] == "EQ_SUB") next + value <- eval(parse(text = tmp$text[i_symbol]), envir = df) + params <- append(params, list(value)) + if (is.null(first_symbol)) first_symbol <- tmp$text[i_symbol] + }, + "STR_CONST" = { + if (tmp$token[i_symbol - 1] == "EQ_SUB") next + value <- eval(parse(text = tmp$text[i_symbol]), envir = parent.frame()) + params <- append(params, list(value)) + }, + "SYMBOL_SUB" = { + if (tmp$token[i_symbol + 1] != "EQ_SUB") next + element <- list() + name <- tmp$text[i_symbol] + value <- eval(parse(text = tmp$text[i_symbol + 2]), envir = 
df) + element[[name]] <- value + params <- append(params, element) + }, + next) } - if ("type" %in% tmp$text) { - type <- tmp$text[which(tmp$text == "type")[1] + 2] - type <- gsub(pattern = '"', replacement = '', type) - } else { - # the default value of type - type <- "existence" - } - list(reference_name = reference_name, delim = delim, type = type) + stopifnot(!is.null(fname)) + stopifnot(start) + retval <- do.call(.callback[[special]], params) + attr(retval, "rname") <- first_symbol + retval }, finally = {options(keep.source = origin.keep.source)}) } diff --git a/R/zzz.R b/R/zzz.R index 2ce3353..2c4abdc 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,7 +1,12 @@ #'@useDynLib FeatureHashing -#'@importFrom Rcpp evalCpp +#'@importFrom Rcpp loadModule cpp_object_initializer #'@import digest -.onLoad <- function(libname, pkgname) { } +.onLoad <- function(libname, pkgname) { + # loadRcppModules() +} + +loadModule("callback", TRUE) +loadModule("split_callback", TRUE) .onAttach <- function(libname, pkgname) { if (interactive()) { diff --git a/appveyor.yml b/appveyor.yml index 086cdaf..9862d36 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -13,6 +13,7 @@ install: build_script: - travis-tool.sh install_deps + - travis-tool.sh install_github qinwf/jiebaR test_script: - travis-tool.sh run_tests diff --git a/inst/callback/jiebaR_callback.cpp b/inst/callback/jiebaR_callback.cpp new file mode 100644 index 0000000..7d77864 --- /dev/null +++ b/inst/callback/jiebaR_callback.cpp @@ -0,0 +1,112 @@ +// [[Rcpp::depends(jiebaR)]] +// [[Rcpp::depends(FeatureHashing)]] + +#include "jiebaRAPI.h" +#include +#include + +using namespace Rcpp; + +struct jiebaRCallbackFunctor : public CallbackFunctor { + + enum Type { + MIX, + MP, + HMM, + QUERY, + KEY + }; + + Type type; + Environment cutter; + SEXP cutter_pointer; + + typedef SEXP (*Cut)(SEXP, SEXP); + + Cut cut; + + void set_type(std::string _type) { + if (_type.compare("mix") == 0) { + type = MIX; + } else if (_type.compare("mp") == 0) { + type 
= MP; + } else if (_type.compare("hmm") == 0) { + type = HMM; + } else if (_type.compare("query") == 0) { + type = QUERY; + } else if (_type.compare("key") == 0) { + type = KEY; + } else { + throw std::invalid_argument("Unknown type"); + } + } + + std::string get_type() { + switch (type) { + case MIX: + return "mix"; + case MP: + return "mp"; + case HMM: + return "hmm"; + case QUERY: + return "query"; + case KEY: + return "key"; + } + } + + void set_cut() { + std::string fname("jiebaR_"); + fname.append(get_type()); + fname.append("_cut"); + cut = reinterpret_cast(::R_GetCCallable("jiebaR", fname.c_str())); + } + + explicit jiebaRCallbackFunctor( + SEXP _src, + std::string _type, + SEXP _cutter + ) + : type(MIX), + cutter(_cutter), + cutter_pointer(NULL), + cut(NULL), + CallbackFunctor(_src) + { + set_type(_type); + set_cut(); + cutter_pointer = wrap(cutter["worker"]); + } + + virtual ~jiebaRCallbackFunctor() { } + + virtual const std::vector operator()(const char* input) const { + return as >((*cut)(wrap(input), cutter_pointer)); + } + +}; + +RCPP_MODULE(jiebaR_callback) { + + class_("callback") + ; + + class_("jiebaR_callback") + .derives("callback") + .constructor() + .property("type", &jiebaRCallbackFunctor::get_type, &jiebaRCallbackFunctor::set_type) + .field("cutter", &jiebaRCallbackFunctor::cutter) + ; + +} + +/***R +generate_jiebaR_callback <- function(input, type = "mix", ...) { + worker <- jiebaR::worker(type = type, ...) 
+ callback <- new(jiebaR_callback, input, type, worker) + callback +} + +FeatureHashing::register_callback("jiebaR", generate_jiebaR_callback) +*/ diff --git a/src/digest.c b/inst/include/callback.h similarity index 60% rename from src/digest.c rename to inst/include/callback.h index d02f6a3..4e217a2 100644 --- a/src/digest.c +++ b/inst/include/callback.h @@ -1,6 +1,6 @@ /* * This file is part of FeatureHashing - * Copyright (C) 2014-2015 Wush Wu + * Copyright (C) 2015 Wush Wu * * This program is free software: you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -16,9 +16,26 @@ * this program. If not, see . */ -#include "pmurhashAPI.h" -#include +#ifndef __CALLBACK_H__ +#define __CALLBACK_H__ -const uint32_t - MURMURHASH3_H_SEED = 3120602769LL, - MURMURHASH3_XI_SEED = 79193439LL; +#include +#include +#include + +class CallbackFunctor { + +public: + + // TODO: let src private + Rcpp::CharacterVector src; + bool decollision; + + CallbackFunctor(SEXP _src) : src(_src), decollision(false) { } + virtual ~CallbackFunctor() { } + + virtual const std::vector operator()(const char* input) const = 0; + +}; + +#endif //__CALLBACK_H__ \ No newline at end of file diff --git a/inst/include/hash_function.h b/inst/include/hash_function.h new file mode 100644 index 0000000..122b1ab --- /dev/null +++ b/inst/include/hash_function.h @@ -0,0 +1,74 @@ +/* + * This file is part of FeatureHashing + * Copyright (C) 2015 Wush Wu + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef __HASH_FUNCTION_HPP__ +#define __HASH_FUNCTION_HPP__ + +#include +#include +#include + +#ifdef HAVE_VISIBILITY_ATTRIBUTE +# define attribute_hidden __attribute__ ((visibility ("hidden"))) +#else +# define attribute_hidden +#endif + +extern "C" { + + /* First look for special cases */ +#if defined(_MSC_VER) +#define MH_UINT32 unsigned long +#endif + +/* If the compiler says it's C99 then take its word for it */ +#if !defined(MH_UINT32) && ( \ + defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L ) +#include +#define MH_UINT32 uint32_t +#endif + +/* Otherwise try testing against max value macros from limit.h */ +#if !defined(MH_UINT32) +#include +#if (USHRT_MAX == 0xffffffffUL) +#define MH_UINT32 unsigned short +#elif (UINT_MAX == 0xffffffffUL) +#define MH_UINT32 unsigned int +#elif (ULONG_MAX == 0xffffffffUL) +#define MH_UINT32 unsigned long +#endif +#endif + +#if !defined(MH_UINT32) +#error Unable to determine type name for unsigned 32-bit int +#endif + +MH_UINT32 attribute_hidden PMurHash32(MH_UINT32 seed, const void *key, int len); + +} + +class HashFunction { + +public: + + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false) = 0; + +}; + +#endif diff --git a/man/generate_split_callback.Rd b/man/generate_split_callback.Rd new file mode 100644 index 0000000..0963855 --- /dev/null +++ b/man/generate_split_callback.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/callback.R +\name{generate_split_callback} +\alias{generate_split_callback} +\title{Generate callback of split} +\usage{ +generate_split_callback(input, delim = ",", type = c("existence", "count")) +} +\arguments{ +\item{input}{character vector. The input of split} + +\item{delim}{string. 
\code{delim} will be used as delimiter for splitting} + +\item{type}{string. One of \code{c("existence", "count")} +"count" indicates the number of occurrence of the token. "existence" indicates the boolean that whether the token exist or not.} +} +\description{ +Generate callback of split +} + diff --git a/man/init_jiebaR_callback.Rd b/man/init_jiebaR_callback.Rd new file mode 100644 index 0000000..9f0d0e4 --- /dev/null +++ b/man/init_jiebaR_callback.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/callback.R +\name{init_jiebaR_callback} +\alias{init_jiebaR_callback} +\title{Initialize and register jiebaR to the formula interface} +\usage{ +init_jiebaR_callback() +} +\description{ +Initialize and register jiebaR to the formula interface +} +\details{ +This function will register the callback of word segmentation +function provided by jiebaR to the formula interface. +For example, `~ jiebaR(...)` will use the feature of word segmentation +provided by jiebaR to segment a given column of the data. +The first argument of the jiebaR is a character which will be segmented. +The left arguments are the same as \code{\link[jiebaR]{worker}}. These +arguments will be used to initialize a jiebaR worker which will segment +the input data. 
+} +\examples{ +\dontrun{ +library(FeatureHashing) +init_jiebaR_callback() +m <- hashed.model.matrix(~ jiebaR(title, type = "mix", df)) +# the column `df$title` will be feed into `worker <- worker(type = "mix")` +# the result of `worker <= df$title` will be hashed into the sparse matrix +# the result is `m` +} +} + diff --git a/man/ls_special.Rd b/man/ls_special.Rd new file mode 100644 index 0000000..c9a3d24 --- /dev/null +++ b/man/ls_special.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/callback.R +\name{ls_special} +\alias{ls_special} +\title{List the Registered Specials} +\usage{ +ls_special() +} +\value{ +character vector. The specials which could be used in the +formula interface. +} +\description{ +List the Registered Specials +} + diff --git a/man/register_callback.Rd b/man/register_callback.Rd new file mode 100644 index 0000000..8660d12 --- /dev/null +++ b/man/register_callback.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/callback.R +\name{register_callback} +\alias{register_callback} +\title{Register Special Function for Formula Interface} +\usage{ +register_callback(special, callback_generator) +} +\arguments{ +\item{special}{string. The name which will be used in formula interface.} + +\item{callback_generator}{function which will create a callback. Please see the details.} +} +\description{ +Register Special Function for Formula Interface +} +\details{ +The callback_generator is a function whose first argument is the +input data and the other arguments could be used to initialize the callback +function properly. The result should be a Rcpp module which derives the +`CallbackFunctor` class. Please see the vignette for details. 
+register_callback("split", generate_split_callback) +} + diff --git a/man/test_callback.Rd b/man/test_callback.Rd new file mode 100644 index 0000000..5c9e383 --- /dev/null +++ b/man/test_callback.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{test_callback} +\alias{test_callback} +\title{Test the callback function.} +\usage{ +test_callback(Rcallback, input) +} +\arguments{ +\item{Rcallback}{external pointer. The pointer of the callback function.} + +\item{input}{string. The input.} +} +\value{ +character +} +\description{ +Test the callback function. +} + diff --git a/src/Makevars b/src/Makevars index a7f3510..c78b353 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1 +1,3 @@ CXX_STD = CXX11 + +PKG_CPPFLAGS = -I../inst/include/ \ No newline at end of file diff --git a/src/Makevars.win b/src/Makevars.win index a7f3510..c78b353 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1 +1,3 @@ CXX_STD = CXX11 + +PKG_CPPFLAGS = -I../inst/include/ \ No newline at end of file diff --git a/src/Makevars~ b/src/Makevars~ new file mode 100644 index 0000000..a7f3510 --- /dev/null +++ b/src/Makevars~ @@ -0,0 +1 @@ +CXX_STD = CXX11 diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 0c46a1e..1d56e6c 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -50,6 +50,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// test_callback +SEXP test_callback(SEXP Rcallback, const std::string& input); +RcppExport SEXP _FeatureHashing_test_callback(SEXP RcallbackSEXP, SEXP inputSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type Rcallback(RcallbackSEXP); + Rcpp::traits::input_parameter< const std::string& >::type input(inputSEXP); + rcpp_result_gen = Rcpp::wrap(test_callback(Rcallback, input)); + return rcpp_result_gen; +END_RCPP +} // xi IntegerVector xi(CharacterVector src); RcppExport SEXP 
_FeatureHashing_xi(SEXP srcSEXP) { @@ -191,11 +203,15 @@ BEGIN_RCPP END_RCPP } +RcppExport SEXP _rcpp_module_boot_callback(); +RcppExport SEXP _rcpp_module_boot_split_callback(); + static const R_CallMethodDef CallEntries[] = { {"_FeatureHashing_pair_sort", (DL_FUNC) &_FeatureHashing_pair_sort, 2}, {"_FeatureHashing_merge", (DL_FUNC) &_FeatureHashing_merge, 2}, {"_FeatureHashing_todgCMatrix", (DL_FUNC) &_FeatureHashing_todgCMatrix, 1}, {"_FeatureHashing_tomatrix", (DL_FUNC) &_FeatureHashing_tomatrix, 1}, + {"_FeatureHashing_test_callback", (DL_FUNC) &_FeatureHashing_test_callback, 2}, {"_FeatureHashing_xi", (DL_FUNC) &_FeatureHashing_xi, 1}, {"_FeatureHashing_h", (DL_FUNC) &_FeatureHashing_h, 1}, {"_FeatureHashing_h2", (DL_FUNC) &_FeatureHashing_h2, 1}, @@ -207,6 +223,8 @@ static const R_CallMethodDef CallEntries[] = { {"_FeatureHashing_split_count", (DL_FUNC) &_FeatureHashing_split_count, 2}, {"_FeatureHashing_selectColumn", (DL_FUNC) &_FeatureHashing_selectColumn, 4}, {"_FeatureHashing_selectRow", (DL_FUNC) &_FeatureHashing_selectRow, 4}, + {"_rcpp_module_boot_callback", (DL_FUNC) &_rcpp_module_boot_callback, 0}, + {"_rcpp_module_boot_split_callback", (DL_FUNC) &_rcpp_module_boot_split_callback, 0}, {NULL, NULL, 0} }; diff --git a/src/callback.cpp b/src/callback.cpp new file mode 100644 index 0000000..24945aa --- /dev/null +++ b/src/callback.cpp @@ -0,0 +1,42 @@ +/* + * This file is part of FeatureHashing + * Copyright (C) 2015 Wush Wu + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#include "callback.h" +#include "converters.h" +#include "split.h" +#include + +using namespace Rcpp; + +//'@title Test the callback function. +//'@param Rcallback external pointer. The pointer of the callback function. +//'@param input string. The input. +//'@return character +//'@export +//[[Rcpp::export]] +SEXP test_callback(SEXP Rcallback, const std::string& input) { + CallbackFunctor* callback(as(Rcallback)); + return wrap((*callback)(input.c_str())); +} + +RCPP_MODULE(callback) { + + class_("callback") + ; + +} diff --git a/src/converters.cpp b/src/converters.cpp new file mode 100644 index 0000000..4540035 --- /dev/null +++ b/src/converters.cpp @@ -0,0 +1,53 @@ +#include "converters.h" + +const std::vector& CallbackConverter::get_feature(size_t i) { + SEXP pstr = STRING_ELT(psrc, i); + if (pstr == NA_STRING) { + feature_buffer.clear(); + } else { + const char* str = CHAR(pstr); + cache = f->operator()(str); + feature_buffer.resize(cache.size()); + std::transform(cache.begin(), cache.end(), feature_buffer.begin(), + [this](const std::string& s) { + return this->get_hashed_feature(this->h_main, s.c_str()); + }); + if (is_final) { + std::transform(feature_buffer.begin(), feature_buffer.end(), + feature_buffer.begin(), [this](uint32_t feature) { + return feature % this->hash_size; + }); + if (f->decollision) { + std::set tmp(feature_buffer.begin(), feature_buffer.end()); + feature_buffer.clear(); + feature_buffer.assign(tmp.begin(), tmp.end()); + } + } + } + return feature_buffer; +} + +const std::vector& CallbackConverter::get_value(size_t i) { + SEXP pstr = STRING_ELT(psrc, i); + if (pstr == NA_STRING) { + value_buffer.clear(); + } else { + const char* str = CHAR(pstr); + value_buffer.resize(cache.size()); + std::transform(cache.begin(), cache.end(), value_buffer.begin(), + [this](const std::string& s) { + return 
get_sign(get_hashed_feature(this->h_binary, s.c_str())); + }); +#ifdef NOISY_DEBUG + for(int j = 0;j < cache.size();j++) { + Rprintf("signed hash: %s ... got %zu\n", cache[j].c_str(), value_buffer[j]); + } +#endif + if (is_final & f->decollision) { + if (value_buffer.size() < feature_buffer.size()) throw std::logic_error("The length of value_buffer and feature_buffer go wrong!"); + value_buffer.resize(feature_buffer.size()); + } + } + return value_buffer; +} + diff --git a/src/converters.h b/src/converters.h new file mode 100644 index 0000000..3ff2159 --- /dev/null +++ b/src/converters.h @@ -0,0 +1,485 @@ +/* + * This file is part of FeatureHashing + * Copyright (C) 2015 Wush Wu + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef __CONVERTERS_HPP__ +#define __CONVERTERS_HPP__ + +#include +#include "vector_converter.h" +#include "split.h" + +class CharacterConverter; +class FactorConverter; +template +class DenseConverter; +class TagExistenceFactorConverter; +class TagExistenceCharacterConverter; +class TagCountFactorConverter; +class TagCountCharacterConverter; +class InteractionConverter; + +typedef std::shared_ptr pVectorConverter; +typedef std::shared_ptr pCharacterConverter; +typedef std::shared_ptr pFactorConverter; +typedef DenseConverter NumConverter; +typedef std::shared_ptr pNumConverter; +typedef DenseConverter IntConverter; +typedef std::shared_ptr pIntConverter; +typedef DenseConverter LogicalConverter; +typedef std::shared_ptr pLogicalConverter; +typedef std::shared_ptr pTagExistenceFactorConverter; +typedef std::shared_ptr pTagExistenceCharacterConverter; +typedef std::shared_ptr pTagCountFactorConverter; +typedef std::shared_ptr pTagCountCharacterConverter; +typedef std::vector< pVectorConverter > ConvertersVec; +typedef std::shared_ptr pInteractionConverter; + +class CharacterConverter : public VectorConverter { + + Rcpp::CharacterVector src; + SEXP psrc; + +public: + + explicit CharacterConverter(SEXP _src, const Param& param) + : VectorConverter(param), src(_src), psrc(wrap(src)) { + value_buffer.reserve(1); + feature_buffer.reserve(1); + } + + virtual ~CharacterConverter() { } + + virtual const std::vector& get_feature(size_t i) { + SEXP pstr = STRING_ELT(psrc, i); + if (pstr == NA_STRING) { + feature_buffer.clear(); + } else { + const char* str = CHAR(pstr); + feature_buffer.resize(1); + feature_buffer[0] = get_hashed_feature(h_main, str); + if (is_final) feature_buffer[0] = feature_buffer[0] % hash_size; + } + return feature_buffer; + } + + virtual const std::vector& get_value(size_t i) { + SEXP pstr = STRING_ELT(psrc, i); + if (pstr == NA_STRING) { + value_buffer.clear(); + } else { + const char* str = CHAR(pstr); + uint32_t sign_value = 
get_hashed_feature(h_binary, str); + value_buffer.resize(1); + value_buffer[0] = get_sign(sign_value); + } + return value_buffer; + } +}; + +class FactorConverter : public VectorConverter { + + Rcpp::IntegerVector src; + Rcpp::CharacterVector levels; + SEXP plevels; + +public: + + explicit FactorConverter(SEXP _src, const Param& param) + : VectorConverter(param), src(_src), levels(src.attr("levels")), plevels(wrap(levels)) { + value_buffer.reserve(1); + feature_buffer.reserve(1); + } + + virtual ~FactorConverter() { } + + virtual const std::vector& get_feature(size_t i) { + if (src[i] == NA_INTEGER) { + feature_buffer.clear(); + } else { + feature_buffer.resize(1); + const char* str = CHAR(STRING_ELT(plevels, src[i] - 1)); // R start from 1 and C start from 0 + feature_buffer[0] = get_hashed_feature(h_main, str); + if (is_final) feature_buffer[0] = feature_buffer[0] % hash_size; + } + return feature_buffer; + } + + virtual const std::vector& get_value(size_t i) { + if (src[i] == NA_INTEGER) { + value_buffer.clear(); + } else { + const char* str = CHAR(STRING_ELT(plevels, src[i] - 1)); // R start from 1 and C start from 0 + uint32_t sign_value = get_hashed_feature(h_binary, str); + value_buffer.resize(1); + value_buffer[0] = get_sign(sign_value); + } + return value_buffer; + } + +}; + +template +class DenseConverter : public VectorConverter { + + Rcpp::Vector src; + uint32_t value; + int sign_value; + + static bool isNA(ValueType x) { + switch(RType) { + case REALSXP: + return R_IsNA(x); + case INTSXP: + return x == NA_INTEGER; + case LGLSXP: + return x == NA_LOGICAL; + default: + throw std::logic_error("Invalid RType"); + } + } + +public: + + explicit DenseConverter(SEXP _src, const Param& param) + : VectorConverter(param), src(_src), value(get_hashed_feature(h_main, "")), + sign_value(get_sign(get_hashed_feature(h_binary, ""))) { + feature_buffer.reserve(1); + value_buffer.reserve(1); + } + + virtual ~DenseConverter() { } + + virtual const std::vector& 
get_feature(size_t i) { + if (isNA(src[i]) | src[i] == 0) { + feature_buffer.clear(); + } else { + feature_buffer.resize(1); + feature_buffer[0] = (is_final ? value % hash_size : value); + } + return feature_buffer; + } + + virtual const std::vector& get_value(size_t i) { + if (isNA(src[i]) | src[i] == 0) { + value_buffer.clear(); + } else { + value_buffer.resize(1); + value_buffer[0] = sign_value * src[i]; + } + return value_buffer; + } + +}; + +template +class TagConverter : public VectorConverter { + + +protected: + + std::string delim; + size_t cache_i; + CacheTagType cache_tags; + + virtual void get_tags(size_t i) = 0; + + std::vector split_tags(const std::string& src) { + std::vector temp(split(src, delim)); + temp.erase(std::remove(temp.begin(), temp.end(), ""), temp.end()); + return temp; + } + + virtual void decollision_feature(size_t i) { } + + virtual void decollision_value(size_t i) { } + +public: + + explicit TagConverter(const Param& param, const std::string& _delim) + : VectorConverter(param), delim(_delim), cache_i(-1) + { } + + virtual ~TagConverter() { } + + virtual const std::vector& get_feature(size_t i) { + get_tags(i); + feature_buffer.resize(cache_tags.size()); + size_t k = 0; + for(auto j = cache_tags.begin();j != cache_tags.end();j++) { + feature_buffer[k++] = (is_final ? 
get_hashed_feature(h_main, j->c_str()) % hash_size : get_hashed_feature(h_main, j->c_str())); + } + if (is_final) decollision_feature(i); + return feature_buffer; + } + + virtual const std::vector& get_value(size_t i) { + get_tags(i); + value_buffer.resize(cache_tags.size()); + size_t k = 0; + for(auto j = cache_tags.begin();j != cache_tags.end();j++) { + value_buffer[k++] = get_sign(get_hashed_feature(h_binary, j->c_str())); + } + if (is_final) decollision_value(i); + return value_buffer; + } + +}; + +class TagExistenceConverter : public TagConverter< std::set > { + + size_t decollision_mark; + +protected: + + virtual void decollision_feature(size_t i) { + std::set temp; + temp.insert(feature_buffer.begin(), feature_buffer.end()); + feature_buffer.clear(); + feature_buffer.assign(temp.begin(), temp.end()); + decollision_mark = i + 1; + } + + virtual void decollision_value(size_t i) { + if (decollision_mark != i + 1) throw std::logic_error("The order of decollision is unexpected"); + value_buffer.resize(feature_buffer.size()); + } + +public: + + TagExistenceConverter(const Param& param, const std::string& _delim) + : TagConverter >(param, _delim), decollision_mark(0) + { } + + virtual ~TagExistenceConverter() { } + +}; + +class TagExistenceFactorConverter : public TagExistenceConverter { + + Rcpp::IntegerVector src; + Rcpp::CharacterVector levels; + SEXP plevels; + std::vector cache_splitted; + +protected: + + virtual void get_tags(size_t i) { + if (i == cache_i) return; + if (src[i] == NA_INTEGER) { + cache_tags.clear(); + } else { + const char* str = CHAR(STRING_ELT(plevels, src[i] - 1)); + std::vector temp(split_tags(str)); + cache_splitted.swap(temp); + cache_tags.clear(); + cache_tags.insert(cache_splitted.begin(), cache_splitted.end()); + } + } + +public: + + explicit TagExistenceFactorConverter(SEXP _src, const Param& param, const std::string& _delim) + : TagExistenceConverter(param, _delim), src(_src), levels(src.attr("levels")), plevels(wrap(levels)) + { } 
+ + virtual ~TagExistenceFactorConverter() { } + +}; + +class TagExistenceCharacterConverter : public TagExistenceConverter { + + Rcpp::CharacterVector src; + SEXP psrc; + std::vector cache_splitted; + +protected: + + virtual void get_tags(size_t i) { + if (i == cache_i) return; + SEXP pstr = STRING_ELT(psrc, i); + if (pstr == NA_STRING) { + cache_tags.clear(); + } else { + const char* str = CHAR(STRING_ELT(psrc, i)); + std::vector temp(split_tags(str)); + cache_splitted.swap(temp); + cache_tags.clear(); + cache_tags.insert(cache_splitted.begin(), cache_splitted.end()); + } + } + +public: + + explicit TagExistenceCharacterConverter(SEXP _src, const Param& param, const std::string& _delim) + : TagExistenceConverter(param, _delim), src(_src), psrc(wrap(src)) + { } + + virtual ~TagExistenceCharacterConverter() { } + +}; + +class TagCountFactorConverter : public TagConverter< std::vector > { + + Rcpp::IntegerVector src; + Rcpp::CharacterVector levels; + SEXP plevels; + +protected: + + virtual void get_tags(size_t i) { + if (i == cache_i) return; + if (src[i] == NA_INTEGER) { + cache_tags.clear(); + } else { + const char* str = CHAR(STRING_ELT(plevels, src[i] - 1)); + std::vector temp(split_tags(str)); + cache_tags.swap(temp); + } + } + +public: + + explicit TagCountFactorConverter(SEXP _src, const Param& param, const std::string& _delim) + : TagConverter< std::vector >(param, _delim), src(_src), levels(src.attr("levels")), plevels(wrap(levels)) + { } + + virtual ~TagCountFactorConverter() { } + +}; + +class TagCountCharacterConverter : public TagConverter< std::vector > { + + Rcpp::CharacterVector src; + SEXP psrc; + +protected: + + virtual void get_tags(size_t i) { + if (i == cache_i) return; + SEXP pstr = STRING_ELT(psrc, i); + if (pstr == NA_STRING) { + cache_tags.clear(); + } else { + const char* str = CHAR(STRING_ELT(psrc, i)); + std::vector temp(split_tags(str)); + cache_tags.swap(temp); + } + } + +public: + + explicit TagCountCharacterConverter(SEXP _src, const 
Param& param, const std::string& _delim) + : TagConverter< std::vector >(param, _delim), src(_src), psrc(wrap(src)) + { } + + virtual ~TagCountCharacterConverter() { } + +}; + +class InteractionConverter : public VectorConverter { + + pVectorConverter a, b; + +public: + + explicit InteractionConverter(pVectorConverter _a, pVectorConverter _b, const Param& param) : + VectorConverter(param), a(_a), b(_b) { + a->is_final = false; + b->is_final = false; + } + + virtual ~InteractionConverter() { } + + virtual const std::vector& get_feature(size_t i) { + const std::vector &afeature_buffer(a->get_feature(i)), &bfeature_buffer(b->get_feature(i)); + feature_buffer.resize(afeature_buffer.size() * bfeature_buffer.size()); + value_buffer.resize(afeature_buffer.size() * bfeature_buffer.size()); + size_t l = 0; + if (is_final) { + for(auto j = 0;j < afeature_buffer.size();j++) { + for(auto k = 0;k < bfeature_buffer.size();k++) { + feature_buffer[l] = get_hashed_feature(h_main, afeature_buffer[j], bfeature_buffer[k]) % hash_size; + value_buffer[l] = get_sign(get_hashed_feature(h_binary, afeature_buffer[j], bfeature_buffer[k])); + l++; + } + } + } else { + for(auto j = 0;j < afeature_buffer.size();j++) { + for(auto k = 0;k < bfeature_buffer.size();k++) { + feature_buffer[l] = get_hashed_feature(h_main, afeature_buffer[j], bfeature_buffer[k]); + value_buffer[l] = get_sign(get_hashed_feature(h_binary, afeature_buffer[j], bfeature_buffer[k])); + l++; + } + } + } + return feature_buffer; + } + + virtual const std::vector& get_value(size_t i) { + const std::vector &avalue_buffer(a->get_value(i)), &bvalue_buffer(b->get_value(i)); + size_t l = 0; + for(auto j = 0;j < avalue_buffer.size();j++) { + for(auto k = 0;k < bvalue_buffer.size();k++) { + value_buffer[l] = avalue_buffer[j] * bvalue_buffer[k] * value_buffer[l]; + l++; + } + } + return value_buffer; + } + +private: + + uint32_t get_hashed_feature(HashFunction *h, uint32_t a, uint32_t b) { + uint32_t buf[2]; + #ifdef BOOST_BIG_ENDIAN 
+ buf[0] = bswap_32(a); + buf[1] = bswap_32(b); + #else + buf[0] = a; + buf[1] = b; + #endif + return (*h)(reinterpret_cast(buf), sizeof(uint32_t) * 2, true); + } + +}; + +class CallbackConverter : public VectorConverter { + + // TODO: refactor this + Rcpp::CharacterVector src; + const CallbackFunctor* f; + SEXP psrc; + std::vector< std::string > cache; + +public: + + CallbackConverter(const CallbackFunctor* _f, const Param& param) + : f(_f), src(_f->src), psrc(_f->src), VectorConverter(param) + { } + + virtual ~CallbackConverter() { } + + virtual const std::vector& get_feature(size_t i); + + virtual const std::vector& get_value(size_t i); + +}; + +RCPP_EXPOSED_CLASS(CallbackFunctor) + +#endif // __CONVERTERS_HPP__ \ No newline at end of file diff --git a/src/digestlocal.h b/src/digestlocal.h deleted file mode 100644 index 86fe0df..0000000 --- a/src/digestlocal.h +++ /dev/null @@ -1,63 +0,0 @@ -/** - * - * MurmurHash3 was written by Austin Appleby, and is placed in the public. - * - * This header links the implementation of murmurhash3 in digest3 to FeatureHashing. - * This was writting by Wush Wu, and also public domain. 
- * - */ - -#include -#include -#include -#include -#include - -#ifdef HAVE_VISIBILITY_ATTRIBUTE - # define attribute_hidden __attribute__ ((visibility ("hidden"))) -#else - # define attribute_hidden -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* First look for special cases */ -#if defined(_MSC_VER) - #define MH_UINT32 unsigned long -#endif - -/* If the compiler says it's C99 then take its word for it */ -#if !defined(MH_UINT32) && ( \ - defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L ) - #include - #define MH_UINT32 uint32_t -#endif - -/* Otherwise try testing against max value macros from limit.h */ -#if !defined(MH_UINT32) - #include - #if (USHRT_MAX == 0xffffffffUL) - #define MH_UINT32 unsigned short - #elif (UINT_MAX == 0xffffffffUL) - #define MH_UINT32 unsigned int - #elif (ULONG_MAX == 0xffffffffUL) - #define MH_UINT32 unsigned long - #endif -#endif - -#if !defined(MH_UINT32) - #error Unable to determine type name for unsigned 32-bit int -#endif - -/* I'm yet to work on a platform where 'unsigned char' is not 8 bits */ -#define MH_UINT8 unsigned char - -MH_UINT32 PMurHash32(MH_UINT32, const void*, int); - -extern const MH_UINT32 MURMURHASH3_H_SEED, MURMURHASH3_XI_SEED; - -#ifdef __cplusplus -} -#endif diff --git a/src/hash_internal.cpp b/src/hash_function.cpp similarity index 54% rename from src/hash_internal.cpp rename to src/hash_function.cpp index bc6aa40..5ee10aa 100644 --- a/src/hash_internal.cpp +++ b/src/hash_function.cpp @@ -1,27 +1,50 @@ -/* - * This file is part of FeatureHashing - * Copyright (C) 2014-2015 Wush Wu - * - * This program is free software: you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation, either version 3 of the License, or (at your option) - * any later version. 
- * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - #include #include -#include #include -#include #include "hash_function.h" +#include "bswap_32.h" +#include +#include "hashed_model_matrix.h" + +const uint32_t + MURMURHASH3_H_SEED = 3120602769LL, + MURMURHASH3_XI_SEED = 79193439LL; + +uint32_t NullHashFunction::operator()(const char* buf, int size, bool is_interaction) { + return 1; +} + +uint32_t MurmurHash3HashFunction::operator()(const char* buf, int size, bool is_interaction) { + return ::PMurHash32(seed, buf, size); +} + +uint32_t MurmurHash3LogHashFunction::operator()(const char* buf, int size, bool is_interaction) { + uint32_t retval = PMurHash32(seed, buf, size); + if (is_interaction) { + const uint32_t* src = reinterpret_cast(buf); + #ifdef BOOST_BIG_ENDIAN + if (inverse_mapping.find(bswap_32(src[0])) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); + if (inverse_mapping.find(bswap_32(src[1])) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); + std::string key(inverse_mapping[bswap_32(src[0])]); + key.append(":"); + key.append(inverse_mapping[bswap_32(src[1])]); + #else + if (inverse_mapping.find(src[0]) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); + if (inverse_mapping.find(src[1]) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); + std::string key(inverse_mapping[src[0]]); + key.append(":"); + key.append(inverse_mapping[src[1]]); + #endif + e[key.c_str()] = Rcpp::wrap((int) retval); + inverse_mapping[retval] = key; + } + else { + e[buf] = 
Rcpp::wrap((int) retval); + inverse_mapping[retval] = buf; + } + return retval; +} + using namespace Rcpp; //'@export hash.sign @@ -85,4 +108,3 @@ IntegerVector h2(CharacterVector src) { } return retval; } - diff --git a/src/hash_function.h b/src/hash_function.h deleted file mode 100644 index f2ea1d9..0000000 --- a/src/hash_function.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * This file is part of FeatureHashing - * Copyright (C) 2015 Wush Wu - * - * This program is free software: you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation, either version 3 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . 
- */ - -#ifndef __HASH_FUNCTION_HPP__ -#define __HASH_FUNCTION_HPP__ - -#include -#include "digestlocal.h" -#include "bswap_32.h" - -class HashFunction { - -public: - - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false) = 0; - -}; - -class NullHashFunction : public HashFunction { - - public: - - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false) { - return 1; - } - -}; - -class MurmurHash3HashFunction : public HashFunction { - - uint32_t seed; - -public : - - MurmurHash3HashFunction(uint32_t _seed) : seed(_seed) { } - - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false) { - return ::PMurHash32(seed, buf, size); - } -}; - -class MurmurHash3LogHashFunction : public HashFunction { - - uint32_t seed; - Rcpp::Environment e; - std::map inverse_mapping; - -public: - - MurmurHash3LogHashFunction(SEXP _e, uint32_t _seed) - : HashFunction(), seed(_seed), e(_e) - { } - - virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false) { - uint32_t retval = PMurHash32(seed, buf, size); - if (is_interaction) { - const uint32_t* src = reinterpret_cast(buf); - #ifdef BOOST_BIG_ENDIAN - if (inverse_mapping.find(bswap_32(src[0])) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); - if (inverse_mapping.find(bswap_32(src[1])) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); - std::string key(inverse_mapping[bswap_32(src[0])]); - key.append(":"); - key.append(inverse_mapping[bswap_32(src[1])]); - #else - if (inverse_mapping.find(src[0]) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); - if (inverse_mapping.find(src[1]) == inverse_mapping.end()) throw std::logic_error("interaction is hashed before main effect!"); - std::string key(inverse_mapping[src[0]]); - key.append(":"); - key.append(inverse_mapping[src[1]]); - #endif - 
e[key.c_str()] = Rcpp::wrap((int) retval); - inverse_mapping[retval] = key; - } - else { - e[buf] = Rcpp::wrap((int) retval); - inverse_mapping[retval] = buf; - } - return retval; - } - -}; - -#endif \ No newline at end of file diff --git a/src/hash_function_implementation.h b/src/hash_function_implementation.h new file mode 100644 index 0000000..8c71698 --- /dev/null +++ b/src/hash_function_implementation.h @@ -0,0 +1,43 @@ +#ifndef __HASH_FUNCTION_IMPLEMENTATION_HPP__ +#define __HASH_FUNCTION_IMPLEMENTATION_HPP__ + +#include +#include + +class NullHashFunction : public HashFunction { + + public: + + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); + +}; + +class MurmurHash3HashFunction : public HashFunction { + + uint32_t seed; + +public : + + MurmurHash3HashFunction(uint32_t _seed) : seed(_seed) { } + + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); + +}; + +class MurmurHash3LogHashFunction : public HashFunction { + + uint32_t seed; + Rcpp::Environment e; + std::map inverse_mapping; + +public: + + MurmurHash3LogHashFunction(SEXP _e, uint32_t _seed) + : HashFunction(), seed(_seed), e(_e) + { } + + virtual uint32_t operator()(const char* buf, int size, bool is_interaction = false); + +}; + +# endif // __HASH_FUNCTION_IMPLEMENTATION_HPP__ \ No newline at end of file diff --git a/src/hashed_model_matrix.cpp b/src/hashed_model_matrix.cpp index 72ef743..597dd01 100644 --- a/src/hashed_model_matrix.cpp +++ b/src/hashed_model_matrix.cpp @@ -47,15 +47,29 @@ const ConvertersVec get_converters( NumericMatrix tfactors(wrap(tf.attr("factors"))); CharacterVector reference_name, feature_name; Environment feature_hashing(Environment::namespace_env("FeatureHashing")); - Function parse_split(feature_hashing["parse_split"]); - std::set specials; + Function parse_special(feature_hashing["parse_special"]); + std::map specials; { List tmp(tf.attr("specials")); - SEXP ptag = tmp["split"]; - if (!Rf_isNull(ptag)) 
{ - IntegerVector tmpvec(ptag); - specials.insert(tmpvec.begin(), tmpvec.end()); + CharacterVector tmp_name(tmp.attr("names")); + for(int i = 0;i < tmp.size();i++) { + SEXP ptag = tmp[i]; + if (!Rf_isNull(ptag)) { + IntegerVector tmp_index(tmp[i]); + const char* callback_generator_name = CHAR(wrap(tmp_name[i])); +#ifdef NOISY_DEBUG + Rprintf("Extract generator: %s from .callback\n", CHAR(wrap(tmp_name[i]))); +#endif + std::for_each(tmp_index.begin(), tmp_index.end(), [&specials, &callback_generator_name](const int index) { + specials.insert(std::make_pair(index, callback_generator_name)); + }); + } } +#ifdef NOISY_DEBUG + for(auto i = specials.begin();i != specials.end();i++) { + Rprintf("special %s at index: %d\n", i->second.c_str(), i->first); + } +#endif } { List tmp(tfactors.attr("dimnames")); @@ -73,11 +87,12 @@ const ConvertersVec get_converters( #endif pVectorConverter p(NULL); try{ - if (specials.find(j + 1) == specials.end()) { + const auto j_special = specials.find(j + 1); + if (j_special == specials.end()) { if (reference_class.find(rname) == reference_class.end()) throw std::invalid_argument("Failed to find the column:"); const std::string& rclass(reference_class.find(rname)->second); #ifdef NOISY_DEBUG - Rprintf("%s\n", rclass.c_str()); + Rprintf("rclass: %s\n", rclass.c_str()); #endif Param param(rname, _h_main, _h_binary, hash_size); if (rclass.compare("factor") == 0) { @@ -111,56 +126,23 @@ const ConvertersVec get_converters( } else { #ifdef NOISY_DEBUG - Rprintf(" (parsing tag..) "); + Rprintf(" (parsing spetial..) 
text: %s special: %s\n", rname.c_str(), j_special->second.c_str()); #endif - List expression(parse_split(wrap(rname))); - rname.assign(as(expression["reference_name"])); + RObject callback_functor(parse_special(wrap(rname), wrap(j_special->second.c_str()), data)); + rname.assign(as(callback_functor.attr("rname"))); Param param(rname, _h_main, _h_binary, hash_size); #ifdef NOISY_DEBUG Rprintf(" (rname ==> %s) ", rname.c_str()); #endif - if (reference_class.find(rname) == reference_class.end()) throw std::invalid_argument("Failed to find the column: "); - const std::string& rclass(reference_class.find(rname)->second); - #ifdef NOISY_DEBUG - Rprintf("%s\n", rclass.c_str()); - #endif - std::string - delim(as(expression["delim"])), - type(as(expression["type"])); + if (reference_class.find(rname) == reference_class.end()) { + throw std::invalid_argument("The first argument of the callback should be one of the column name of the data"); + } #ifdef NOISY_DEBUG - Rprintf("delim: %s type: %s\n", delim.c_str(), type.c_str()); + Rprintf("Initialize CallbackConverter\n"); + Rprintf("Test h_main: %zu\n", (*param.h_main)("test", 4)); + Rprintf("Test h_binary: %zu\n", (*param.h_binary)("test", 4)); #endif - if (rclass.compare("factor") == 0) { - if (type.compare("existence") == 0) { - #ifdef NOISY_DEBUG - Rprintf("Initialize TagExistenceFactorConverter\n"); - #endif - p.reset(new TagExistenceFactorConverter(wrap(data[rname.c_str()]), param, delim)); - } else if (type.compare("count") == 0) { - #ifdef NOISY_DEBUG - Rprintf("Initialize TagCountFactorConverter\n"); - #endif - p.reset(new TagCountFactorConverter(wrap(data[rname.c_str()]), param, delim)); - } else { - throw std::invalid_argument("Non supported type at name: "); - } - } else if (rclass.compare("character") == 0) { - if (type.compare("existence") == 0) { - #ifdef NOISY_DEBUG - Rprintf("Initialize TagExistenceCharacterConverter\n"); - #endif - p.reset(new TagExistenceCharacterConverter(wrap(data[rname.c_str()]), param, 
delim)); - } else if (type.compare("count") == 0) { - #ifdef NOISY_DEBUG - Rprintf("Initialize TagCountCharacterConverter\n"); - #endif - p.reset(new TagCountCharacterConverter(wrap(data[rname.c_str()]), param, delim)); - } else { - throw std::invalid_argument("Non supported type at name: "); - } - } else { - throw std::invalid_argument("Non supported type at name: "); - } + p.reset(new CallbackConverter(as(callback_functor), param)); } } catch(std::invalid_argument& e) { std::string message(e.what()); @@ -308,4 +290,3 @@ SEXP hashed_model_matrix(RObject tf, DataFrameLike data, unsigned long hash_size SEXP hashed_model_matrix_dataframe(RObject tf, DataFrame data, unsigned long hash_size, bool transpose, S4 retval, bool keep_hashing_mapping, bool is_xi, bool progress) { return hashed_model_matrix(tf, data, hash_size, transpose, retval, keep_hashing_mapping, is_xi, progress); } - diff --git a/src/hashed_model_matrix.h b/src/hashed_model_matrix.h index 752109a..905fe1c 100644 --- a/src/hashed_model_matrix.h +++ b/src/hashed_model_matrix.h @@ -22,10 +22,24 @@ #include #include #include +#include #include -#include "hash_function.h" +#include "callback.h" +#include "hash_function_implementation.h" #include "vector_converter.h" -#include +#include "converters.h" + +#ifdef __cplusplus +extern "C" { +#endif + +uint32_t PMurHash32(uint32_t, const void*, int); + +extern const uint32_t MURMURHASH3_H_SEED, MURMURHASH3_XI_SEED; + +#ifdef __cplusplus +} +#endif typedef std::map< std::string, std::string > NameClassMapping; typedef std::vector< std::string > StrVec; diff --git a/src/pmurhash32.c b/src/pmurhash32.c new file mode 100644 index 0000000..720d022 --- /dev/null +++ b/src/pmurhash32.c @@ -0,0 +1 @@ +#include diff --git a/src/split.cpp b/src/split.cpp index b2b170b..4296028 100644 --- a/src/split.cpp +++ b/src/split.cpp @@ -26,7 +26,7 @@ std::vector split(const std::string& src, const std::string& delim) const char* end = std::strstr(start, delim.c_str()); std::vector 
retval; while(end != NULL) { - retval.push_back(std::string(start, end)); + if (end - start > 0) retval.push_back(std::string(start, end)); start = end + delim.size(); end = std::strstr(start, delim.c_str()); } diff --git a/src/split.h b/src/split.h index 4104c2c..8723a26 100644 --- a/src/split.h +++ b/src/split.h @@ -16,7 +16,15 @@ * this program. If not, see . */ +#ifndef __SPLIT_H__ +#define __SPLIT_H__ + #include #include +#include +#include +#include "callback.h" std::vector split(const std::string& src, const std::string& delim); + +#endif //__SPLIT_H__ diff --git a/src/split_callback.cpp b/src/split_callback.cpp new file mode 100644 index 0000000..30e5e15 --- /dev/null +++ b/src/split_callback.cpp @@ -0,0 +1,76 @@ +#include "callback.h" +#include "split.h" +#include + +struct SplitCallbackFunctor : public CallbackFunctor { + + enum SplitType { + Count, + Existence + }; + + std::string delim; + SplitType type; + + SplitCallbackFunctor(SEXP input, const std::string& _delim, const std::string& _type) + : delim(_delim), CallbackFunctor(input) + { + set_type(_type); + if (type == SplitType::Existence) decollision = true; + } + + virtual ~SplitCallbackFunctor() { } + + void set_type(std::string _type) { + if (_type.compare("count") == 0) { + type = SplitType::Count; + } else if (_type.compare("existence") == 0) { + type = SplitType::Existence; + } else throw std::invalid_argument("Not supported type"); + } + + std::string get_type() { + switch (type) { + case SplitType::Count: + return "count"; + case SplitType::Existence: + return "existence"; + } + throw std::logic_error("Invalid SplitType"); + } + + virtual const std::vector operator()(const char* input) const { + switch (type) { + case SplitType::Count: { + auto tmp(split(input, delim)); + tmp.erase(std::remove(tmp.begin(), tmp.end(), ""), tmp.end()); + return tmp; + } + case SplitType::Existence: { + std::vector tmp(split(input, delim)); + std::set tmp2(tmp.begin(), tmp.end()); + tmp2.erase(""); + 
tmp.assign(tmp2.begin(), tmp2.end()); + return tmp; + } + } + throw std::logic_error("Invalid SplitType"); + } + +}; + +using namespace Rcpp; + +RCPP_MODULE(split_callback) { + + class_("callback") + ; + + class_("split_callback") + .derives("callback") + .constructor() + .field("delim", &SplitCallbackFunctor::delim) + .property("type", &SplitCallbackFunctor::get_type, &SplitCallbackFunctor::set_type) + ; + +} \ No newline at end of file diff --git a/src/vector_converter.h b/src/vector_converter.h index 711cad4..d78e4dc 100644 --- a/src/vector_converter.h +++ b/src/vector_converter.h @@ -19,38 +19,16 @@ #ifndef __VECTOR_CONVERTER_HPP__ #define __VECTOR_CONVERTER_HPP__ +#include "callback.h" #include "hash_function.h" -#include "split.h" +#ifdef NOISY_DEBUG #include +#endif struct VectorConverterParam; class VectorConverter; -class CharacterConverter; -class FactorConverter; -template -class DenseConverter; -class TagExistenceFactorConverter; -class TagExistenceCharacterConverter; -class TagCountFactorConverter; -class TagCountCharacterConverter; -class InteractionConverter; typedef VectorConverterParam Param; -typedef std::shared_ptr pVectorConverter; -typedef std::shared_ptr pCharacterConverter; -typedef std::shared_ptr pFactorConverter; -typedef DenseConverter NumConverter; -typedef std::shared_ptr pNumConverter; -typedef DenseConverter IntConverter; -typedef std::shared_ptr pIntConverter; -typedef DenseConverter LogicalConverter; -typedef std::shared_ptr pLogicalConverter; -typedef std::shared_ptr pTagExistenceFactorConverter; -typedef std::shared_ptr pTagExistenceCharacterConverter; -typedef std::shared_ptr pTagCountFactorConverter; -typedef std::shared_ptr pTagCountCharacterConverter; -typedef std::vector< pVectorConverter > ConvertersVec; -typedef std::shared_ptr pInteractionConverter; /** * Paramter of initializing VectorConverter @@ -130,413 +108,5 @@ class VectorConverter { }; -class CharacterConverter : public VectorConverter { - Rcpp::CharacterVector 
src; - SEXP psrc; - -public: - - explicit CharacterConverter(SEXP _src, const Param& param) - : VectorConverter(param), src(_src), psrc(wrap(src)) { - value_buffer.reserve(1); - feature_buffer.reserve(1); - } - - virtual ~CharacterConverter() { } - - virtual const std::vector& get_feature(size_t i) { - SEXP pstr = STRING_ELT(psrc, i); - if (pstr == NA_STRING) { - feature_buffer.clear(); - } else { - const char* str = CHAR(pstr); - feature_buffer.resize(1); - feature_buffer[0] = get_hashed_feature(h_main, str); - if (is_final) feature_buffer[0] = feature_buffer[0] % hash_size; - } - return feature_buffer; - } - - virtual const std::vector& get_value(size_t i) { - SEXP pstr = STRING_ELT(psrc, i); - if (pstr == NA_STRING) { - value_buffer.clear(); - } else { - const char* str = CHAR(pstr); - uint32_t sign_value = get_hashed_feature(h_binary, str); - value_buffer.resize(1); - value_buffer[0] = get_sign(sign_value); - } - return value_buffer; - } -}; - -class FactorConverter : public VectorConverter { - - Rcpp::IntegerVector src; - Rcpp::CharacterVector levels; - SEXP plevels; - -public: - - explicit FactorConverter(SEXP _src, const Param& param) - : VectorConverter(param), src(_src), levels(src.attr("levels")), plevels(wrap(levels)) { - value_buffer.reserve(1); - feature_buffer.reserve(1); - } - - virtual ~FactorConverter() { } - - virtual const std::vector& get_feature(size_t i) { - if (src[i] == NA_INTEGER) { - feature_buffer.clear(); - } else { - feature_buffer.resize(1); - const char* str = CHAR(STRING_ELT(plevels, src[i] - 1)); // R start from 1 and C start from 0 - feature_buffer[0] = get_hashed_feature(h_main, str); - if (is_final) feature_buffer[0] = feature_buffer[0] % hash_size; - } - return feature_buffer; - } - - virtual const std::vector& get_value(size_t i) { - if (src[i] == NA_INTEGER) { - value_buffer.clear(); - } else { - const char* str = CHAR(STRING_ELT(plevels, src[i] - 1)); // R start from 1 and C start from 0 - uint32_t sign_value = 
get_hashed_feature(h_binary, str); - value_buffer.resize(1); - value_buffer[0] = get_sign(sign_value); - } - return value_buffer; - } - -}; - -template -class DenseConverter : public VectorConverter { - - Rcpp::Vector src; - uint32_t value; - int sign_value; - - static bool isNA(ValueType x) { - switch(RType) { - case REALSXP: - return R_IsNA(x); - case INTSXP: - return x == NA_INTEGER; - case LGLSXP: - return x == NA_LOGICAL; - default: - throw std::logic_error("Invalid RType"); - } - } - -public: - - explicit DenseConverter(SEXP _src, const Param& param) - : VectorConverter(param), src(_src), value(get_hashed_feature(h_main, "")), - sign_value(get_sign(get_hashed_feature(h_binary, ""))) { - feature_buffer.reserve(1); - value_buffer.reserve(1); - } - - virtual ~DenseConverter() { } - - virtual const std::vector& get_feature(size_t i) { - if (isNA(src[i]) | (src[i] == 0)) { - feature_buffer.clear(); - } else { - feature_buffer.resize(1); - feature_buffer[0] = (is_final ? value % hash_size : value); - } - return feature_buffer; - } - - virtual const std::vector& get_value(size_t i) { - if (isNA(src[i]) | (src[i] == 0)) { - value_buffer.clear(); - } else { - value_buffer.resize(1); - value_buffer[0] = sign_value * src[i]; - } - return value_buffer; - } - -}; - -template -class TagConverter : public VectorConverter { - - -protected: - - std::string delim; - size_t cache_i; - CacheTagType cache_tags; - - virtual void get_tags(size_t i) = 0; - - std::vector split_tags(const std::string& src) { - std::vector temp(split(src, delim)); - temp.erase(std::remove(temp.begin(), temp.end(), ""), temp.end()); - return temp; - } - - virtual void decollision_feature(size_t i) { } - - virtual void decollision_value(size_t i) { } - -public: - - explicit TagConverter(const Param& param, const std::string& _delim) - : VectorConverter(param), delim(_delim), cache_i(-1) - { } - - virtual ~TagConverter() { } - - virtual const std::vector& get_feature(size_t i) { - get_tags(i); - 
feature_buffer.resize(cache_tags.size()); - size_t k = 0; - for(auto j = cache_tags.begin();j != cache_tags.end();j++) { - feature_buffer[k++] = (is_final ? get_hashed_feature(h_main, j->c_str()) % hash_size : get_hashed_feature(h_main, j->c_str())); - } - if (is_final) decollision_feature(i); - return feature_buffer; - } - - virtual const std::vector& get_value(size_t i) { - get_tags(i); - value_buffer.resize(cache_tags.size()); - size_t k = 0; - for(auto j = cache_tags.begin();j != cache_tags.end();j++) { - value_buffer[k++] = get_sign(get_hashed_feature(h_binary, j->c_str())); - } - if (is_final) decollision_value(i); - return value_buffer; - } - -}; - -class TagExistenceConverter : public TagConverter< std::set > { - - size_t decollision_mark; - -protected: - - virtual void decollision_feature(size_t i) { - std::set temp; - temp.insert(feature_buffer.begin(), feature_buffer.end()); - feature_buffer.clear(); - feature_buffer.assign(temp.begin(), temp.end()); - decollision_mark = i + 1; - } - - virtual void decollision_value(size_t i) { - if (decollision_mark != i + 1) throw std::logic_error("The order of decollision is unexpected"); - value_buffer.resize(feature_buffer.size()); - } - -public: - - TagExistenceConverter(const Param& param, const std::string& _delim) - : TagConverter >(param, _delim), decollision_mark(0) - { } - - virtual ~TagExistenceConverter() { } - -}; - -class TagExistenceFactorConverter : public TagExistenceConverter { - - Rcpp::IntegerVector src; - Rcpp::CharacterVector levels; - SEXP plevels; - std::vector cache_splitted; - -protected: - - virtual void get_tags(size_t i) { - if (i == cache_i) return; - if (src[i] == NA_INTEGER) { - cache_tags.clear(); - } else { - const char* str = CHAR(STRING_ELT(plevels, src[i] - 1)); - std::vector temp(split_tags(str)); - cache_splitted.swap(temp); - cache_tags.clear(); - cache_tags.insert(cache_splitted.begin(), cache_splitted.end()); - } - } - -public: - - explicit TagExistenceFactorConverter(SEXP 
_src, const Param& param, const std::string& _delim) - : TagExistenceConverter(param, _delim), src(_src), levels(src.attr("levels")), plevels(wrap(levels)) - { } - - virtual ~TagExistenceFactorConverter() { } - -}; - -class TagExistenceCharacterConverter : public TagExistenceConverter { - - Rcpp::CharacterVector src; - SEXP psrc; - std::vector cache_splitted; - -protected: - - virtual void get_tags(size_t i) { - if (i == cache_i) return; - SEXP pstr = STRING_ELT(psrc, i); - if (pstr == NA_STRING) { - cache_tags.clear(); - } else { - const char* str = CHAR(STRING_ELT(psrc, i)); - std::vector temp(split_tags(str)); - cache_splitted.swap(temp); - cache_tags.clear(); - cache_tags.insert(cache_splitted.begin(), cache_splitted.end()); - } - } - -public: - - explicit TagExistenceCharacterConverter(SEXP _src, const Param& param, const std::string& _delim) - : TagExistenceConverter(param, _delim), src(_src), psrc(wrap(src)) - { } - - virtual ~TagExistenceCharacterConverter() { } - -}; - -class TagCountFactorConverter : public TagConverter< std::vector > { - - Rcpp::IntegerVector src; - Rcpp::CharacterVector levels; - SEXP plevels; - -protected: - - virtual void get_tags(size_t i) { - if (i == cache_i) return; - if (src[i] == NA_INTEGER) { - cache_tags.clear(); - } else { - const char* str = CHAR(STRING_ELT(plevels, src[i] - 1)); - std::vector temp(split_tags(str)); - cache_tags.swap(temp); - } - } - -public: - - explicit TagCountFactorConverter(SEXP _src, const Param& param, const std::string& _delim) - : TagConverter< std::vector >(param, _delim), src(_src), levels(src.attr("levels")), plevels(wrap(levels)) - { } - - virtual ~TagCountFactorConverter() { } - -}; - -class TagCountCharacterConverter : public TagConverter< std::vector > { - - Rcpp::CharacterVector src; - SEXP psrc; - -protected: - - virtual void get_tags(size_t i) { - if (i == cache_i) return; - SEXP pstr = STRING_ELT(psrc, i); - if (pstr == NA_STRING) { - cache_tags.clear(); - } else { - const char* str = 
CHAR(STRING_ELT(psrc, i)); - std::vector temp(split_tags(str)); - cache_tags.swap(temp); - } - } - -public: - - explicit TagCountCharacterConverter(SEXP _src, const Param& param, const std::string& _delim) - : TagConverter< std::vector >(param, _delim), src(_src), psrc(wrap(src)) - { } - - virtual ~TagCountCharacterConverter() { } - -}; - -class InteractionConverter : public VectorConverter { - - pVectorConverter a, b; - -public: - - explicit InteractionConverter(pVectorConverter _a, pVectorConverter _b, const Param& param) : - VectorConverter(param), a(_a), b(_b) { - a->is_final = false; - b->is_final = false; - } - - virtual ~InteractionConverter() { } - - virtual const std::vector& get_feature(size_t i) { - const std::vector &afeature_buffer(a->get_feature(i)), &bfeature_buffer(b->get_feature(i)); - feature_buffer.resize(afeature_buffer.size() * bfeature_buffer.size()); - value_buffer.resize(afeature_buffer.size() * bfeature_buffer.size()); - size_t l = 0; - if (is_final) { - for(auto j = 0;j < afeature_buffer.size();j++) { - for(auto k = 0;k < bfeature_buffer.size();k++) { - feature_buffer[l] = get_hashed_feature(h_main, afeature_buffer[j], bfeature_buffer[k]) % hash_size; - value_buffer[l] = get_sign(get_hashed_feature(h_binary, afeature_buffer[j], bfeature_buffer[k])); - l++; - } - } - } else { - for(auto j = 0;j < afeature_buffer.size();j++) { - for(auto k = 0;k < bfeature_buffer.size();k++) { - feature_buffer[l] = get_hashed_feature(h_main, afeature_buffer[j], bfeature_buffer[k]); - value_buffer[l] = get_sign(get_hashed_feature(h_binary, afeature_buffer[j], bfeature_buffer[k])); - l++; - } - } - } - return feature_buffer; - } - - virtual const std::vector& get_value(size_t i) { - const std::vector &avalue_buffer(a->get_value(i)), &bvalue_buffer(b->get_value(i)); - size_t l = 0; - for(auto j = 0;j < avalue_buffer.size();j++) { - for(auto k = 0;k < bvalue_buffer.size();k++) { - value_buffer[l] = avalue_buffer[j] * bvalue_buffer[k] * value_buffer[l]; - l++; - 
} - } - return value_buffer; - } - -private: - - uint32_t get_hashed_feature(HashFunction *h, uint32_t a, uint32_t b) { - uint32_t buf[2]; - #ifdef BOOST_BIG_ENDIAN - buf[0] = bswap_32(a); - buf[1] = bswap_32(b); - #else - buf[0] = a; - buf[1] = b; - #endif - return (*h)(reinterpret_cast(buf), sizeof(uint32_t) * 2, true); - } - -}; - #endif // __VECTOR_CONVERTER_HPP__ \ No newline at end of file diff --git a/tests/test-jiebaR.R b/tests/test-jiebaR.R new file mode 100644 index 0000000..ab1f419 --- /dev/null +++ b/tests/test-jiebaR.R @@ -0,0 +1,49 @@ +if (require(RUnit) & Sys.getenv("TEST_JIEBAR") == "TRUE") { + library(FeatureHashing) + df <- data.frame(title = c( + "貶值取代降息? 台幣貶破33元", + "優生 培寶4款毒奶瓶下架", + " 秋節上國道 閃11塞車點", + "習近平訪美前…//中國戰機公海危險攔截美機", + "352億公開收購 日月光成矽品最大股東", + "驚 AT-3又出事 南投深山失聯 2飛官生死未卜", + "誰說該廢死的?怕死鄭捷首度道歉", + "歐習會前夕// 美國安顧問:反對片面改變台海現狀" + )) + init_jiebaR_callback() + m <- hashed.model.matrix(~ jiebaR(title), df, create.mapping = TRUE) + title_tokens <- names(hash.mapping(m)) + checkEquals(title_tokens, c("title4", "title股東", "title國道", "title中國", "title現狀", +"title…", "title閃", "title習近平", "title日", "title11", +"title驚", "title公開", "title億", "title又", "title:", +"title該", "title塞車", "title訪美", "title?", "title會", +"title公海", "title深山", "title片面", "title奶瓶", "title說", +"title成矽品", "title危險", "title台海", "title最大", +"title美國", "title貶值", "title上", "title下架", "title秋節", +"titleAT", "title352", "title生死未卜", "title收購", "title月光", +"title怕死", "title貶破", "title飛官", "title出事", "title取代", +"title道歉", "title歐習", "title33", "title ", "title款毒", +"title優生", "title顧問", "title前", "title前夕", "title廢死的", +"title反對", "title改變", "title點", "title培寶", "title台幣", +"title降息", "title美機", "title安", "title-", "title南投", +"title首度", "title戰機", "title鄭捷", "title/", "title元", +"title誰", "title攔截", "title2", "title失聯", "title3")) + m <- hashed.model.matrix(~ jiebaR(title, type = "hmm"), df, create.mapping = TRUE) + title_tokens <- names(hash.mapping(m)) + checkEquals(title_tokens, 
c("title4", "title改", "title鄭", "title股東", "title死", +"title…", "title海", "title上國道", "title閃", "title日", +"title11", "title現", "title首", "title驚", "title片", "title光成", +"title又", "title:", "title該", "title機公海", "title反", +"title習近", "title矽品", "title怕", "title生死", "title?", +"title捷", "title會", "title對", "title深山", "title奶瓶", +"title說", "title月", "title危險", "title最大", "title貶值", +"title下架", "title台", "title秋節", "titleAT", "title美前", +"title面", "title352", "title收購", "title狀", "title貶破", +"title飛官", "title歉", "title出事", "title取代", "title平訪", +"title歐習", "title億公開", "title33", "title未卜", "title中國戰", +"title ", "title款毒", "title優生", "title前夕", "title度", +"title美國安顧問", "title廢死的", "title變", "title塞車點", +"title培寶", "title台幣", "title降息", "title美機", "title-", +"title南投", "title道", "title/", "title元", "title誰", +"title攔截", "title2", "title失聯", "title3")) +} \ No newline at end of file diff --git a/tests/test-split_callback.R b/tests/test-split_callback.R new file mode 100644 index 0000000..42ceac2 --- /dev/null +++ b/tests/test-split_callback.R @@ -0,0 +1,16 @@ +if (require(RUnit)) { + library(FeatureHashing) + callback <- generate_split_callback(letters, ",", "existence") + checkEquals(callback$delim, ",") + checkEquals(callback$type, "existence") + checkEquals(test_callback(callback, "a,b,a,c,d"), letters[1:4]) + + checkException(callback <- generate_split_callback(letters, ",", "ex")) + + callback <- generate_split_callback(letters, ",,", "existence") + checkEquals(test_callback(callback, "a,b,a,,c,d"), c("a,b,a", "c,d")) + + callback <- generate_split_callback(letters, ",", "count") + checkEquals(callback$type, "count") + checkEquals(test_callback(callback, x <- "a,b,a,c,d"), strsplit(x, ",")[[1]]) +} \ No newline at end of file diff --git a/vignettes/Callback.Rmd b/vignettes/Callback.Rmd new file mode 100644 index 0000000..97b47ce --- /dev/null +++ b/vignettes/Callback.Rmd @@ -0,0 +1,27 @@ +--- +title: "Register Callback for FeatureHashing" +author: "Wush Wu" 
+output: + rmarkdown::html_vignette: + css: vignette.css + number_sections: yes + toc: yes +date: "September 24, 2015" +vignette: > + %\VignetteIndexEntry{FeatureHashing} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +This is an introduction to registering callbacks for the formula interface of FeatureHashing. + +## Demo + +## Getting Started + +### Implement Rcpp Module + +### Implement Generator + +### Register the Generator to the Formula Interface +