@@ -2,42 +2,48 @@ package classifier
 
 import (
 	"bufio"
-	"bytes"
 	"io"
-	"regexp"
 	"strings"
-	"unsafe"
 )
 
 // Tokenizer provides a common interface to tokenize documents
 type Tokenizer interface {
-	// Tokenize breaks the provided document into a token slice
-	Tokenize(r io.Reader) chan string
+	// Tokenize breaks the provided document into a channel of tokens
+	Tokenize(io.Reader) chan string
 }
 
-type regexTokenizer struct {
-	tokenizer *regexp.Regexp
-}
+// StdOption provides configuration settings for a StdTokenizer
+type StdOption func(*StdTokenizer)
 
-type stdTokenizer struct {
+// StdTokenizer provides a common document tokenizer that splits a
+// document by word boundaries
+type StdTokenizer struct {
+	transforms []Mapper
+	filters    []Predicate
+	bufferSize int
 }
 
 // NewTokenizer initializes a new standard Tokenizer instance
-func NewTokenizer() Tokenizer {
-	return &stdTokenizer{}
-}
-
-// NewRegexTokenizer initializes a new regular expression Tokenizer instance
-func NewRegexTokenizer() Tokenizer {
-	return &regexTokenizer{
-		tokenizer: regexp.MustCompile("\\W+"),
+func NewTokenizer(opts ...StdOption) *StdTokenizer {
+	tokenizer := &StdTokenizer{
+		bufferSize: 100,
+		transforms: []Mapper{
+			strings.ToLower,
+		},
+		filters: []Predicate{
+			IsNotStopWord,
+		},
+	}
+	for _, opt := range opts {
+		opt(tokenizer)
 	}
+	return tokenizer
 }
 
-func (t *stdTokenizer) Tokenize(r io.Reader) chan string {
+func (t *StdTokenizer) Tokenize(r io.Reader) chan string {
 	tokenizer := bufio.NewScanner(r)
 	tokenizer.Split(bufio.ScanWords)
-	tokens := make(chan string)
+	tokens := make(chan string, t.bufferSize)
 
 	go func() {
 		for tokenizer.Scan() {
@@ -46,27 +52,27 @@ func (t *stdTokenizer) Tokenize(r io.Reader) chan string {
 		close(tokens)
 	}()
 
-	return pipeline(tokens)
+	return t.pipeline(tokens)
 }
 
-// Tokenize extracts and normalizes all words from a text corpus
-func (t *regexTokenizer) Tokenize(r io.Reader) chan string {
-	buffer := new(bytes.Buffer)
-	buffer.ReadFrom(r)
-	b := buffer.Bytes()
-	doc := *(*string)(unsafe.Pointer(&b))
-	tokens := make(chan string)
-
-	go func() {
-		for _, token := range t.tokenizer.Split(doc, -1) {
-			tokens <- token
-		}
-		close(tokens)
-	}()
+func (t *StdTokenizer) pipeline(in chan string) chan string {
+	return Map(Filter(in, t.filters...), t.transforms...)
+}
 
-	return pipeline(tokens)
+func BufferSize(size int) StdOption {
+	return func(t *StdTokenizer) {
+		t.bufferSize = size
+	}
 }
 
-func pipeline(tokens chan string) chan string {
-	return Map(Filter(tokens, IsNotStopWord), strings.ToLower)
+func Transforms(m ...Mapper) StdOption {
+	return func(t *StdTokenizer) {
+		t.transforms = m
+	}
 }
+
+func Filters(f ...Predicate) StdOption {
+	return func(t *StdTokenizer) {
+		t.filters = f
+	}
+}
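
The pipeline method above composes Map and Filter helpers and the Mapper and Predicate types, which are defined elsewhere in the classifier package and are not part of this diff. A minimal sketch of what they might look like, purely as an assumption to make the token flow concrete (the real definitions may differ):

package classifier

// Mapper transforms a token into another token (e.g. strings.ToLower).
// Assumed shape; the package's actual definition is not shown in this diff.
type Mapper func(string) string

// Predicate reports whether a token should be kept (e.g. IsNotStopWord).
// Assumed shape; the package's actual definition is not shown in this diff.
type Predicate func(string) bool

// Filter drops any token that fails one of the predicates.
func Filter(in chan string, preds ...Predicate) chan string {
	out := make(chan string, cap(in))
	go func() {
		defer close(out)
		for token := range in {
			keep := true
			for _, p := range preds {
				if !p(token) {
					keep = false
					break
				}
			}
			if keep {
				out <- token
			}
		}
	}()
	return out
}

// Map applies each transform, in order, to every surviving token.
func Map(in chan string, fns ...Mapper) chan string {
	out := make(chan string, cap(in))
	go func() {
		defer close(out)
		for token := range in {
			for _, fn := range fns {
				token = fn(token)
			}
			out <- token
		}
	}()
	return out
}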
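
A rough usage sketch of the reworked API, assuming a placeholder import path and that Mapper is compatible with func(string) string as the []Mapper{strings.ToLower} default implies:

package main

import (
	"fmt"
	"strings"

	"example.com/classifier" // placeholder import path; substitute the real module path
)

func main() {
	// Defaults introduced by this change: buffer of 100, lower-casing, stop-word filtering.
	t := classifier.NewTokenizer()

	// The functional options let callers override each setting independently.
	custom := classifier.NewTokenizer(
		classifier.BufferSize(256),
		classifier.Transforms(strings.ToLower, strings.TrimSpace),
		classifier.Filters(classifier.IsNotStopWord),
	)

	// Tokenize returns a buffered channel, so consumers can range over it
	// while the scanner goroutine keeps producing tokens.
	for token := range t.Tokenize(strings.NewReader("The quick brown Fox jumps over the lazy dog")) {
		fmt.Println(token)
	}

	for token := range custom.Tokenize(strings.NewReader("Another small document")) {
		fmt.Println(token)
	}
}

Returning the concrete *StdTokenizer from NewTokenizer (rather than the Tokenizer interface) keeps the functional options usable while still satisfying the interface for callers that want it.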