
Commit df74ede

breaking changes: improved customization support
1 parent 1cd7f48 commit df74ede

5 files changed

Lines changed: 102 additions & 66 deletions

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-version=0.3.1
+version=0.4.0

func.go

Lines changed: 33 additions & 15 deletions
@@ -1,34 +1,52 @@
 package classifier

+const defaultBufferSize = 50
+
 // Predicate provides a predicate function
 type Predicate func(string) bool

 // Mapper provides a map function
 type Mapper func(string) string

-// Map applies f to each element of the supplied input slice
-func Map(vs chan string, f Mapper) chan string {
-	outstream := make(chan string)
+// Map applies f to each element of the supplied input channel
+func Map(vs chan string, f ...Mapper) chan string {
+	stream := make(chan string, defaultBufferSize)
+
 	go func() {
 		for v := range vs {
-			outstream <- f(v)
+			for _, fn := range f {
+				v = fn(v)
+			}
+			stream <- v
 		}
-		close(outstream)
+		close(stream)
 	}()
-	return outstream
+
+	return stream
 }

-// Filter removes elements from the input slice where the supplied predicate
+// Filter removes elements from the input channel where the supplied predicate
 // is satisfied
-func Filter(vs chan string, f Predicate) chan string {
-	outstream := make(chan string)
+// Filter is a Predicate aggregation
+func Filter(vs chan string, filters ...Predicate) chan string {
+	stream := make(chan string, defaultBufferSize)
+	apply := func(text string) bool {
+		for _, f := range filters {
+			if !f(text) {
+				return false
+			}
+		}
+		return true
+	}
+
 	go func() {
-		for v := range vs {
-			if f(v) {
-				outstream <- v
+		for text := range vs {
+			if apply(text) {
+				stream <- text
 			}
 		}
-		close(outstream)
+		close(stream)
 	}()
-	return outstream
-}
+
+	return stream
+}
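
Since Map and Filter are now variadic and operate on buffered channels, several Mappers and Predicates can be chained in a single call. A minimal usage sketch follows; the input values and the length predicate are illustrative, and the module path is assumed from the package name:

package main

import (
	"fmt"
	"strings"

	"github.com/n3integration/classifier" // assumed module path
)

func main() {
	in := make(chan string, 3)
	in <- "Go"
	in <- "a"
	in <- "Gopher"
	close(in)

	// Every supplied Predicate must pass for a value to survive Filter.
	longEnough := func(s string) bool { return len(s) > 1 }

	// Mappers are applied in order to each surviving value.
	out := classifier.Map(classifier.Filter(in, longEnough), strings.ToLower)

	for v := range out {
		fmt.Println(v) // prints "go", then "gopher"
	}
}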

naive/naive.go

Lines changed: 25 additions & 9 deletions
@@ -12,27 +12,42 @@ import (
 // ErrNotClassified indicates that a document could not be classified
 var ErrNotClassified = errors.New("unable to classify document")

+// Option provides a functional setting for the Classifier
+type Option func(c *Classifier) error
+
 // Classifier implements a naive bayes classifier
 type Classifier struct {
 	feat2cat  map[string]map[string]int
 	catCount  map[string]int
 	tokenizer classifier.Tokenizer
-	sync.RWMutex
+	mu        sync.RWMutex
 }

-// New initializes a new naive Classifier
-func New() *Classifier {
-	return &Classifier{
+// New initializes a new naive Classifier using the standard tokenizer
+func New(opts ...Option) *Classifier {
+	c := &Classifier{
 		feat2cat:  make(map[string]map[string]int),
 		catCount:  make(map[string]int),
 		tokenizer: classifier.NewTokenizer(),
 	}
+	for _, opt := range opts {
+		opt(c)
+	}
+	return c
+}
+
+// Tokenizer overrides the classifier's default Tokenizer
+func Tokenizer(t classifier.Tokenizer) Option {
+	return func(c *Classifier) error {
+		c.tokenizer = t
+		return nil
+	}
 }

 // Train provides supervisory training to the classifier
 func (c *Classifier) Train(r io.Reader, category string) error {
-	c.Lock()
-	defer c.Unlock()
+	c.mu.Lock()
+	defer c.mu.Unlock()

 	for feature := range c.tokenizer.Tokenize(r) {
 		c.addFeature(feature, category)
@@ -55,8 +70,8 @@ func (c *Classifier) Classify(r io.Reader) (string, error) {
 	classification := ""
 	probabilities := make(map[string]float64)

-	c.RLock()
-	defer c.RUnlock()
+	c.mu.RLock()
+	defer c.mu.RUnlock()

 	for _, category := range c.categories() {
 		probabilities[category] = c.probability(r, category)
@@ -65,6 +80,7 @@ func (c *Classifier) Classify(r io.Reader) (string, error) {
 			classification = category
 		}
 	}
+
 	if classification == "" {
 		return "", ErrNotClassified
 	}
@@ -152,5 +168,5 @@ func (c *Classifier) docProbability(r io.Reader, category string) float64 {
 }

 func asReader(text string) io.Reader {
-	return bytes.NewBuffer([]byte(text))
+	return bytes.NewBufferString(text)
 }
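
The exported Tokenizer Option is the new customization hook on the naive classifier. A sketch of constructing a classifier with an explicit tokenizer; the import paths are assumed and the training samples are illustrative:

package main

import (
	"fmt"
	"strings"

	"github.com/n3integration/classifier"       // assumed module path
	"github.com/n3integration/classifier/naive" // assumed module path
)

func main() {
	// Without options, New falls back to classifier.NewTokenizer();
	// here the default is passed explicitly via the Tokenizer Option.
	c := naive.New(naive.Tokenizer(classifier.NewTokenizer()))

	_ = c.Train(strings.NewReader("cheap meds online now"), "spam")
	_ = c.Train(strings.NewReader("team meeting agenda attached"), "ham")

	category, err := c.Classify(strings.NewReader("agenda for the team meeting"))
	if err != nil {
		fmt.Println("not classified:", err) // ErrNotClassified when no category wins
		return
	}
	fmt.Println(category)
}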

tokens.go

Lines changed: 43 additions & 37 deletions
@@ -2,42 +2,48 @@ package classifier

 import (
 	"bufio"
-	"bytes"
 	"io"
-	"regexp"
 	"strings"
-	"unsafe"
 )

 // Tokenizer provides a common interface to tokenize documents
 type Tokenizer interface {
-	// Tokenize breaks the provided document into a token slice
-	Tokenize(r io.Reader) chan string
+	// Tokenize breaks the provided document into a channel of tokens
+	Tokenize(io.Reader) chan string
 }

-type regexTokenizer struct {
-	tokenizer *regexp.Regexp
-}
+// StdOption provides configuration settings for a StdTokenizer
+type StdOption func(*StdTokenizer)

-type stdTokenizer struct {
+// StdTokenizer provides a common document tokenizer that splits a
+// document by word boundaries
+type StdTokenizer struct {
+	transforms []Mapper
+	filters    []Predicate
+	bufferSize int
 }

 // NewTokenizer initializes a new standard Tokenizer instance
-func NewTokenizer() Tokenizer {
-	return &stdTokenizer{}
-}
-
-// NewRegexTokenizer initializes a new regular expression Tokenizer instance
-func NewRegexTokenizer() Tokenizer {
-	return &regexTokenizer{
-		tokenizer: regexp.MustCompile("\\W+"),
+func NewTokenizer(opts ...StdOption) *StdTokenizer {
+	tokenizer := &StdTokenizer{
+		bufferSize: 100,
+		transforms: []Mapper{
+			strings.ToLower,
+		},
+		filters: []Predicate{
+			IsNotStopWord,
+		},
+	}
+	for _, opt := range opts {
+		opt(tokenizer)
 	}
+	return tokenizer
 }

-func (t *stdTokenizer) Tokenize(r io.Reader) chan string {
+func (t *StdTokenizer) Tokenize(r io.Reader) chan string {
 	tokenizer := bufio.NewScanner(r)
 	tokenizer.Split(bufio.ScanWords)
-	tokens := make(chan string)
+	tokens := make(chan string, t.bufferSize)

 	go func() {
 		for tokenizer.Scan() {
@@ -46,27 +52,27 @@ func (t *stdTokenizer) Tokenize(r io.Reader) chan string {
 		close(tokens)
 	}()

-	return pipeline(tokens)
+	return t.pipeline(tokens)
 }

-// Tokenize extracts and normalizes all words from a text corpus
-func (t *regexTokenizer) Tokenize(r io.Reader) chan string {
-	buffer := new(bytes.Buffer)
-	buffer.ReadFrom(r)
-	b := buffer.Bytes()
-	doc := *(*string)(unsafe.Pointer(&b))
-	tokens := make(chan string)
-
-	go func() {
-		for _, token := range t.tokenizer.Split(doc, -1) {
-			tokens <- token
-		}
-		close(tokens)
-	}()
+func (t *StdTokenizer) pipeline(in chan string) chan string {
+	return Map(Filter(in, t.filters...), t.transforms...)
+}

-	return pipeline(tokens)
+func BufferSize(size int) StdOption {
+	return func(t *StdTokenizer) {
+		t.bufferSize = size
+	}
 }

-func pipeline(tokens chan string) chan string {
-	return Map(Filter(tokens, IsNotStopWord), strings.ToLower)
+func Transforms(m ...Mapper) StdOption {
+	return func(t *StdTokenizer) {
+		t.transforms = m
+	}
 }
+
+func Filters(f ...Predicate) StdOption {
+	return func(t *StdTokenizer) {
+		t.filters = f
+	}
+}
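
StdTokenizer replaces both the unexported stdTokenizer and the removed regex variant. Note that BufferSize, Transforms, and Filters each overwrite the corresponding default (buffer 100, strings.ToLower, IsNotStopWord) rather than appending to it, and that filters run before transforms in the pipeline. A configuration sketch with an assumed module path:

package main

import (
	"fmt"
	"strings"

	"github.com/n3integration/classifier" // assumed module path
)

func main() {
	t := classifier.NewTokenizer(
		classifier.BufferSize(32),                    // smaller channel buffer than the default 100
		classifier.Transforms(strings.ToLower),       // replaces the default transform list
		classifier.Filters(classifier.IsNotStopWord), // replaces the default filter list
	)

	// Stop words such as "the" are filtered out before the
	// remaining tokens are lowercased by the transform stage.
	for token := range t.Tokenize(strings.NewReader("the Quick Brown Fox")) {
		fmt.Println(token) // quick, brown, fox
	}
}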

tokens_test.go

Lines changed: 0 additions & 4 deletions
@@ -16,10 +16,6 @@ func TestTokenize(t *testing.T) {
 		tokens := NewTokenizer().Tokenize(toReader(text))
 		doTokenizeTest(t, tokens)
 	})
-	t.Run("Regexp Tokenizer", func(t *testing.T) {
-		tokens := NewRegexTokenizer().Tokenize(toReader(text))
-		doTokenizeTest(t, tokens)
-	})
 }

 func toReader(text string) io.Reader {
