Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: FeatureHashing
Type: Package
Title: Creates a Model Matrix via Feature Hashing with a Formula Interface
Version: 0.9.1.2
Version: 0.10.0
Date: 2015-09-22
Authors@R: c(
person("Wush", "Wu", email = "wush978@gmail.com", role = c("aut", "cre")),
Expand All @@ -24,7 +24,8 @@ Imports:
digest(>= 0.6.8),
magrittr (>= 1.5)
LinkingTo: Rcpp, digest(>= 0.6.8), BH
Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown
Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown, jiebaR(>= 0.5.1)
RcppModules: callback, split_callback
SystemRequirements: C++11
BugReports: https://github.com/wush978/FeatureHashing/issues
URL: https://github.com/wush978/FeatureHashing
Expand Down
9 changes: 8 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
# Generated by roxygen2 (4.1.1): do not edit by hand

export(generate_split_callback)
export(hash.mapping)
export(hash.sign)
export(hash.size)
export(hashed.interaction.value)
export(hashed.model.matrix)
export(hashed.value)
export(init_jiebaR_callback)
export(intToRaw)
export(ls_special)
export(register_callback)
export(test_callback)
import(digest)
importClassesFrom(Matrix,dgCMatrix)
importFrom(Matrix,Diagonal)
importFrom(Matrix,colSums)
importFrom(Rcpp,evalCpp)
importFrom(Rcpp,cpp_object_initializer)
importFrom(Rcpp,loadModule)
importFrom(Rcpp,sourceCpp)
importFrom(magrittr,"%<>%")
importFrom(magrittr,"%>%")
importFrom(methods,as)
Expand Down
9 changes: 9 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@ tomatrix <- function(m) {
.Call('_FeatureHashing_tomatrix', PACKAGE = 'FeatureHashing', m)
}

#'@title Test the callback function.
#'@param Rcallback external pointer. The pointer of the callback function.
#'@param input string. The input.
#'@return character
#'@export
test_callback <- function(Rcallback, input) {
.Call('_FeatureHashing_test_callback', PACKAGE = 'FeatureHashing', Rcallback, input)
}

#'@export hash.sign
hash.sign <- function(src) {
.Call('_FeatureHashing_xi', PACKAGE = 'FeatureHashing', src)
Expand Down
65 changes: 65 additions & 0 deletions R/callback.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@

#'@export
#'@title Register Special Function for Formula Interface
#'@param special string. The name which will be used in formula interface.
#'@param callback_generator function which will create a callback. Please see the details.
#'@details The callback_generator is a function whose first argument is the
#'input data and the other arguments could be used to initialize the callback
#'function properly. The result should be a Rcpp module which derives the
#'`CallbackFunctor` class. Please see the vignette for details.
#'register_callback("split", generate_split_callback)
register_callback <- function(special, callback_generator) {
.callback[[special]] <- callback_generator
invisible(NULL)
}

#'@title List the Registered Specials
#'@return character vector. The specials which could be used in the
#'formula interface.
#'@export
ls_special <- function() {
ls(.callback)
}

#'@title Generate callback of split
#'@param input character vector. The input of split
#'@param delim string. \code{delim} will be used as delimiter for splitting
#'@param type string. One of \code{c("existence", "count")}
#'"count" indicates the number of occurrence of the token. "existence" indicates the boolean that whether the token exist or not.
#'@export
generate_split_callback <- function(input, delim = ",", type = c("existence", "count")) {
callback <- new(split_callback, input, delim, type[1])
callback
}

.callback <- new.env()
.callback[["split"]] <- generate_split_callback

#'@title Initialize and register jiebaR to the formula interface
#'@details This function will register the callback of word segmentation
#'function provided by jiebaR to the formula interface.
#'For example, `~ jiebaR(...)` will use the feature of word segmentation
#'provided by jiebaR to segment a given column of the data.
#'The first argument of the jiebaR is a character which will be segmented.
#'The left arguments are the same as \code{\link[jiebaR]{worker}}. These
#'arguments will be used to initialize a jiebaR worker which will segment
#'the input data.
#'
#'@examples
#'\dontrun{
#'library(FeatureHashing)
#'init_jiebaR_callback()
#'m <- hashed.model.matrix(~ jiebaR(title, type = "mix", df))
#'# the column `df$title` will be feed into `worker <- worker(type = "mix")`
#'# the result of `worker <= df$title` will be hashed into the sparse matrix
#'# the result is `m`
#'}
#'@export
#'@importFrom Rcpp sourceCpp
init_jiebaR_callback <- function() {
if (!requireNamespace("jiebaR", character.only = TRUE)) stop("Please install the package jiebaR first")
tryCatch({
sourceCpp(system.file("callback/jiebaR_callback.cpp", package = "FeatureHashing"))
}, finally = {
})
}
56 changes: 39 additions & 17 deletions R/hashed.model.matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ hashed.model.matrix <- function(formula, data, hash.size = 2^18, transpose = FAL
formula <- as.character(formula) %>% gsub(pattern = tf.idf.string, replacement = "type = \"count\"", x = .) %>% paste0(collapse = " ") %>% as.formula
}

tf <- terms.formula(formula, data = data, specials = "split")
tf <- terms.formula(formula, data = data, specials = ls(.callback))
retval <- new(.CSCMatrix)
.hashed.model.matrix.dataframe(tf, data, hash.size, transpose, retval, create.mapping, signed.hash, progress)
class(retval) <- .CSCMatrix
Expand All @@ -228,29 +228,51 @@ hashed.model.matrix <- function(formula, data, hash.size = 2^18, transpose = FAL
} else if (tf.idf) tf.idf.transfo(retval) else retval
}

# This is the function called from C to parse the \code{split} function.
parse_split <- function(text) {
# This is the function called from C to parse the special function.
parse_special <- function(text, special, df) {
origin.keep.source <- options()$keep.source
tryCatch({
options(keep.source = TRUE)
p <- parse(text = text)
tmp <- getParseData(p)
reference_name <- tmp$text[which(tmp$token == "SYMBOL")]
if ("delim" %in% tmp$text) {
delim <- tmp$text[which(tmp$text == "delim")[1] + 2]
delim <- gsub(pattern = '"', replacement = '', delim)
} else {
# the default value of delim
delim <- ","
params <- list()
fname <- NULL
first_symbol <- NULL
start <- FALSE
for(i_symbol in seq_len(nrow(tmp))) {
if (tmp$token[i_symbol] != "SYMBOL_FUNCTION_CALL" & !start) next
start <- TRUE
switch(tmp$token[i_symbol],
"SYMBOL_FUNCTION_CALL" = {
fname <- tmp$text[i_symbol]
},
"SYMBOL" = {
if (tmp$token[i_symbol - 1] == "EQ_SUB") next
value <- eval(parse(text = tmp$text[i_symbol]), envir = df)
params <- append(params, list(value))
if (is.null(first_symbol)) first_symbol <- tmp$text[i_symbol]
},
"STR_CONST" = {
if (tmp$token[i_symbol - 1] == "EQ_SUB") next
value <- eval(parse(text = tmp$text[i_symbol]), envir = parent.frame())
params <- append(params, list(value))
},
"SYMBOL_SUB" = {
if (tmp$token[i_symbol + 1] != "EQ_SUB") next
element <- list()
name <- tmp$text[i_symbol]
value <- eval(parse(text = tmp$text[i_symbol + 2]), envir = df)
element[[name]] <- value
params <- append(params, element)
},
next)
}
if ("type" %in% tmp$text) {
type <- tmp$text[which(tmp$text == "type")[1] + 2]
type <- gsub(pattern = '"', replacement = '', type)
} else {
# the default value of type
type <- "existence"
}
list(reference_name = reference_name, delim = delim, type = type)
stopifnot(!is.null(fname))
stopifnot(start)
retval <- do.call(.callback[[special]], params)
attr(retval, "rname") <- first_symbol
retval
}, finally = {options(keep.source = origin.keep.source)})
}

Expand Down
9 changes: 7 additions & 2 deletions R/zzz.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
#'@useDynLib FeatureHashing
#'@importFrom Rcpp evalCpp
#'@importFrom Rcpp loadModule cpp_object_initializer
#'@import digest
.onLoad <- function(libname, pkgname) { }
.onLoad <- function(libname, pkgname) {
# loadRcppModules()
}

loadModule("callback", TRUE)
loadModule("split_callback", TRUE)

.onAttach <- function(libname, pkgname) {
if (interactive()) {
Expand Down
1 change: 1 addition & 0 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ install:

build_script:
- travis-tool.sh install_deps
- travis-tool.sh install_github qinwf/jiebaR

test_script:
- travis-tool.sh run_tests
Expand Down
112 changes: 112 additions & 0 deletions inst/callback/jiebaR_callback.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// [[Rcpp::depends(jiebaR)]]
// [[Rcpp::depends(FeatureHashing)]]

#include "jiebaRAPI.h"
#include <callback.h>
#include <Rcpp.h>

using namespace Rcpp;

struct jiebaRCallbackFunctor : public CallbackFunctor {

enum Type {
MIX,
MP,
HMM,
QUERY,
KEY
};

Type type;
Environment cutter;
SEXP cutter_pointer;

typedef SEXP (*Cut)(SEXP, SEXP);

Cut cut;

void set_type(std::string _type) {
if (_type.compare("mix") == 0) {
type = MIX;
} else if (_type.compare("mp") == 0) {
type = MP;
} else if (_type.compare("hmm") == 0) {
type = HMM;
} else if (_type.compare("query") == 0) {
type = QUERY;
} else if (_type.compare("key") == 0) {
type = KEY;
} else {
throw std::invalid_argument("Unknown type");
}
}

std::string get_type() {
switch (type) {
case MIX:
return "mix";
case MP:
return "mp";
case HMM:
return "hmm";
case QUERY:
return "query";
case KEY:
return "key";
}
}

void set_cut() {
std::string fname("jiebaR_");
fname.append(get_type());
fname.append("_cut");
cut = reinterpret_cast<Cut>(::R_GetCCallable("jiebaR", fname.c_str()));
}

explicit jiebaRCallbackFunctor(
SEXP _src,
std::string _type,
SEXP _cutter
)
: type(MIX),
cutter(_cutter),
cutter_pointer(NULL),
cut(NULL),
CallbackFunctor(_src)
{
set_type(_type);
set_cut();
cutter_pointer = wrap(cutter["worker"]);
}

virtual ~jiebaRCallbackFunctor() { }

virtual const std::vector<std::string> operator()(const char* input) const {
return as<std::vector<std::string> >((*cut)(wrap(input), cutter_pointer));
}

};

RCPP_MODULE(jiebaR_callback) {

class_<CallbackFunctor>("callback")
;

class_<jiebaRCallbackFunctor>("jiebaR_callback")
.derives<CallbackFunctor>("callback")
.constructor<SEXP, std::string, SEXP>()
.property("type", &jiebaRCallbackFunctor::get_type, &jiebaRCallbackFunctor::set_type)
.field("cutter", &jiebaRCallbackFunctor::cutter)
;

}

/***R
generate_jiebaR_callback <- function(input, type = "mix", ...) {
worker <- jiebaR::worker(type = type, ...)
callback <- new(jiebaR_callback, input, type, worker)
callback
}

FeatureHashing::register_callback("jiebaR", generate_jiebaR_callback)
*/
29 changes: 23 additions & 6 deletions src/digest.c → inst/include/callback.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* This file is part of FeatureHashing
* Copyright (C) 2014-2015 Wush Wu
* Copyright (C) 2015 Wush Wu
*
* This program is free software: you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
Expand All @@ -16,9 +16,26 @@
* this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include "pmurhashAPI.h"
#include <stdint.h>
#ifndef __CALLBACK_H__
#define __CALLBACK_H__

const uint32_t
MURMURHASH3_H_SEED = 3120602769LL,
MURMURHASH3_XI_SEED = 79193439LL;
#include <vector>
#include <string>
#include <Rcpp.h>

class CallbackFunctor {

public:

// TODO: let src private
Rcpp::CharacterVector src;
bool decollision;

CallbackFunctor(SEXP _src) : src(_src), decollision(false) { }
virtual ~CallbackFunctor() { }

virtual const std::vector<std::string> operator()(const char* input) const = 0;

};

#endif //__CALLBACK_H__
Loading