From e4fbb485c12cf401c1afabe8258d6e5b1d190984 Mon Sep 17 00:00:00 2001 From: Dmitry Russkikh Date: Sat, 27 Sep 2025 18:26:25 +0300 Subject: [PATCH 1/5] Remove ScoreFunc completely --- options.go | 46 +------------------------------------------- spellchecker.go | 5 ----- spellchecker_test.go | 9 --------- 3 files changed, 1 insertion(+), 59 deletions(-) diff --git a/options.go b/options.go index 1aef963..4dea81e 100644 --- a/options.go +++ b/options.go @@ -18,11 +18,7 @@ func (s *Spellchecker) WithOpts(opts ...OptionFunc) error { } } - if s.scoreFunc != nil { - s.dict.filterFunc = wrapScoreFunc(s.scoreFunc, s.maxErrors) - } else { - s.dict.filterFunc = s.filterFunc - } + s.dict.filterFunc = s.filterFunc return nil } @@ -65,21 +61,6 @@ func WithFilterFunc(f FilterFunc) OptionFunc { } } -// ScoreFunc custom scoring function type -// -// Deprecated: use FilterFunc instead -type ScoreFunc func(src []rune, candidate []rune, distance int, cnt uint) float64 - -// WithScoreFunc specify a function that will be used for scoring -// -// Deprecated: use WithFilterFunc instead -func WithScoreFunc(f ScoreFunc) OptionFunc { - return func(s *Spellchecker) error { - s.scoreFunc = f - return nil - } -} - func defaultFilterFunc(maxErrors int) FilterFunc { return func(src, candidate []rune, count uint) (float64, bool) { distance, prefixLen, suffixLen := levenshtein.Calculate(src, candidate, 0, 1, 1, 1) @@ -92,28 +73,3 @@ func defaultFilterFunc(maxErrors int) FilterFunc { return 1 / (1 + float64(distance*distance)) * mult, true } } - -func wrapScoreFunc(f ScoreFunc, maxErrors int) FilterFunc { - return func(src, candidate []rune, count uint) (float64, bool) { - distance, _, _ := levenshtein.Calculate(src, candidate, 0, 1, 1, 1) - if distance > maxErrors { - return 0, false - } - - return f(src, candidate, distance, count), true - } -} - -var defaultScoreFunc ScoreFunc = func(src, candidate []rune, distance int, cnt uint) float64 { - mult := math.Log1p(float64(cnt)) - // if first letters are the same, increase score - if src[0] == candidate[0] { - mult *= 1.5 - // if second letters are the same too, increase score even more - if len(src) > 1 && len(candidate) > 1 && src[1] == candidate[1] { - mult *= 1.5 - } - } - - return 1 / (1 + float64(distance*distance)) * mult -} diff --git a/spellchecker.go b/spellchecker.go index 9b6feba..a85fd63 100644 --- a/spellchecker.go +++ b/spellchecker.go @@ -18,7 +18,6 @@ type Spellchecker struct { dict *dictionary splitter bufio.SplitFunc filterFunc FilterFunc - scoreFunc ScoreFunc maxErrors int } @@ -34,10 +33,6 @@ func New(alphabet string, opts ...OptionFunc) (*Spellchecker, error) { } } - if result.scoreFunc != nil { - result.filterFunc = wrapScoreFunc(result.scoreFunc, result.maxErrors) - } - dict, err := newDictionary(alphabet, result.filterFunc, result.maxErrors) if err != nil { return nil, err diff --git a/spellchecker_test.go b/spellchecker_test.go index 9361058..e7ccffe 100644 --- a/spellchecker_test.go +++ b/spellchecker_test.go @@ -235,15 +235,6 @@ func Test_Spellchecker_Fix(t *testing.T) { require.Equal(t, "problem", result) } -func Test_Spellchecker_Fix_ScoreFunc(t *testing.T) { - s := newSampleSpellchecker() - s.WithOpts(WithScoreFunc(defaultScoreFunc)) - - result, err := s.Fix("problam") - require.NoError(t, err) - require.Equal(t, "problem", result) -} - func Test_Spellchecker_Suggest(t *testing.T) { s := newSampleSpellchecker() result, err := s.Suggest("arang", 5) From 47fa8e65d54015279a71c948ee1e1f879b26d796 Mon Sep 17 00:00:00 2001 From: Dmitry Russkikh Date: Sat, 27 Sep 2025 18:39:10 +0300 Subject: [PATCH 2/5] Do not store FilterFunc in dictionary --- dictionary.go | 30 +++++++++++++----------------- dictionary_test.go | 6 +++--- options.go | 2 -- save.go | 3 ++- save_test.go | 2 +- spellchecker.go | 8 ++++---- 6 files changed, 23 insertions(+), 28 deletions(-) diff --git a/dictionary.go b/dictionary.go index 222d4ed..b61ca22 100644 --- a/dictionary.go +++ b/dictionary.go @@ -20,25 +20,22 @@ type dictionary struct { counts map[uint32]uint index map[uint64][]uint32 - - filterFunc FilterFunc } -func newDictionary(ab string, filterFunc FilterFunc, maxErrors int) (*dictionary, error) { +func newDictionary(ab string, maxErrors int) (*dictionary, error) { alphabet, err := newAlphabet(ab) if err != nil { return nil, err } return &dictionary{ - maxErrors: maxErrors, - alphabet: alphabet, - nextID: idSeq(0), - ids: make(map[string]uint32), - words: make(map[uint32][]rune), - counts: make(map[uint32]uint), - index: make(map[uint64][]uint32), - filterFunc: filterFunc, + maxErrors: maxErrors, + alphabet: alphabet, + nextID: idSeq(0), + ids: make(map[string]uint32), + words: make(map[uint32][]rune), + counts: make(map[uint32]uint), + index: make(map[uint64][]uint32), }, nil } @@ -81,7 +78,7 @@ type Match struct { Score float64 } -func (d *dictionary) find(word string, n int) []Match { +func (d *dictionary) find(word string, n int, fn FilterFunc) []Match { if d.maxErrors <= 0 { return nil } @@ -93,7 +90,7 @@ func (d *dictionary) find(word string, n int) []Match { // check for transposition or exact match and do early termination if found // (the most common mistake is a transposition of letters) - d.fillWithCandidates(result, wordRunes, sum(bmSrc)) + d.fillWithCandidates(result, wordRunes, sum(bmSrc), fn) if result.Len() != 0 { return result.DrainSorted() } @@ -101,7 +98,7 @@ func (d *dictionary) find(word string, n int) []Match { bitmaps := bitmapsPool.Get().(map[uint64]struct{}) d.computeCandidateBitmaps(bitmaps, bmSrc, d.maxErrors) for bm := range bitmaps { - d.fillWithCandidates(result, wordRunes, bm) + d.fillWithCandidates(result, wordRunes, bm, fn) } releaseBitmaps(bitmaps) @@ -131,7 +128,7 @@ func (d *dictionary) computeCandidateBitmaps(bitmaps map[uint64]struct{}, src bi dfs(src.Clone(), 0, 0) } -func (d *dictionary) fillWithCandidates(result *priorityQueue, wordRunes []rune, bm uint64) { +func (d *dictionary) fillWithCandidates(result *priorityQueue, wordRunes []rune, bm uint64, filter FilterFunc) { ids := d.index[bm] for _, id := range ids { docWord, ok := d.words[id] @@ -139,7 +136,7 @@ func (d *dictionary) fillWithCandidates(result *priorityQueue, wordRunes []rune, continue } - score, ok := d.filterFunc(wordRunes, docWord, d.counts[id]) + score, ok := filter(wordRunes, docWord, d.counts[id]) if !ok { continue } @@ -209,7 +206,6 @@ func (d *dictionary) UnmarshalBinary(data []byte) error { d.index = dictData.Index d.maxErrors = dictData.MaxErrors - d.filterFunc = defaultFilterFunc(dictData.MaxErrors) var max uint32 for _, id := range d.ids { diff --git a/dictionary_test.go b/dictionary_test.go index 938ae82..95cf57d 100644 --- a/dictionary_test.go +++ b/dictionary_test.go @@ -7,7 +7,7 @@ import ( ) func Test_dictionary_id(t *testing.T) { - dict, err := newDictionary(DefaultAlphabet, nil, DefaultMaxErrors) + dict, err := newDictionary(DefaultAlphabet, DefaultMaxErrors) require.NoError(t, err) t.Run("must return 0 for unexisting word", func(t *testing.T) { @@ -24,7 +24,7 @@ func Test_dictionary_id(t *testing.T) { func Test_dictionary_add(t *testing.T) { t.Run("must add word to dictionary index", func(t *testing.T) { - dict, err := newDictionary(DefaultAlphabet, nil, DefaultMaxErrors) + dict, err := newDictionary(DefaultAlphabet, DefaultMaxErrors) require.NoError(t, err) id, err := dict.add("qwe", 1) @@ -49,7 +49,7 @@ func Test_dictionary_add(t *testing.T) { func Test_Dictionary_Inc(t *testing.T) { t.Run("must increase counter value", func(t *testing.T) { - dict, err := newDictionary(DefaultAlphabet, nil, DefaultMaxErrors) + dict, err := newDictionary(DefaultAlphabet, DefaultMaxErrors) dict.counts[1] = 0 require.NoError(t, err) diff --git a/options.go b/options.go index 4dea81e..78ecf7f 100644 --- a/options.go +++ b/options.go @@ -18,8 +18,6 @@ func (s *Spellchecker) WithOpts(opts ...OptionFunc) error { } } - s.dict.filterFunc = s.filterFunc - return nil } diff --git a/save.go b/save.go index 657045d..9d22ee8 100644 --- a/save.go +++ b/save.go @@ -31,6 +31,7 @@ func Load(reader io.Reader) (*Spellchecker, error) { } return &Spellchecker{ - dict: data.Dict, + dict: data.Dict, + filterFunc: defaultFilterFunc(data.Dict.maxErrors), }, nil } diff --git a/save_test.go b/save_test.go index d697e09..9487b59 100644 --- a/save_test.go +++ b/save_test.go @@ -29,7 +29,7 @@ func Test_Spellchecker_Save(t *testing.T) { require.EqualValues(t, m1.dict.maxErrors, m2.dict.maxErrors) require.EqualValues(t, m1.dict.nextID(), m2.dict.nextID()) - matches := m2.dict.find("orange", 1) + matches := m2.dict.find("orange", 1, m2.filterFunc) require.Len(t, matches, 1) require.Equal(t, matches[0].Value, "orange") require.Greater(t, matches[0].Score, 0.0) diff --git a/spellchecker.go b/spellchecker.go index a85fd63..9ab0804 100644 --- a/spellchecker.go +++ b/spellchecker.go @@ -33,7 +33,7 @@ func New(alphabet string, opts ...OptionFunc) (*Spellchecker, error) { } } - dict, err := newDictionary(alphabet, result.filterFunc, result.maxErrors) + dict, err := newDictionary(alphabet, result.maxErrors) if err != nil { return nil, err } @@ -115,7 +115,7 @@ func (s *Spellchecker) Fix(word string) (string, error) { return word, nil } - hits := s.dict.find(word, 1) + hits := s.dict.find(word, 1, s.filterFunc) if len(hits) == 0 { return word, ErrUnknownWord } @@ -132,7 +132,7 @@ func (s *Spellchecker) Suggest(word string, n int) ([]string, error) { return []string{word}, nil } - hits := s.dict.find(word, n) + hits := s.dict.find(word, n, s.filterFunc) if len(hits) == 0 { return []string{word}, ErrUnknownWord } @@ -161,6 +161,6 @@ func (s *Spellchecker) SuggestScore(word string, n int) SuggestionResult { } return SuggestionResult{ - Suggestions: s.dict.find(word, n), + Suggestions: s.dict.find(word, n, s.filterFunc), } } From b7065a95d225ae694345abfaf5cfda27da6f9811 Mon Sep 17 00:00:00 2001 From: Dmitry Russkikh Date: Sat, 27 Sep 2025 18:56:15 +0300 Subject: [PATCH 3/5] Remove several API methods --- README.md | 18 ++++++++------- spellchecker.go | 53 +++++++------------------------------------- spellchecker_test.go | 31 +++++++++++--------------- 3 files changed, 31 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 492d426..b004857 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Yet another spellchecker written in go. ## Installation ``` -go get -v github.com/f1monkey/spellchecker +go get -v github.com/f1monkey/spellchecker/v2 ``` ## Usage @@ -41,15 +41,18 @@ func main() { panic(err) } + // The weight increases the likelihood that the word will be chosen as a correction. + weight := uint(1) + // Load data from any io.Reader in, err := os.Open("data/sample.txt") if err != nil { panic(err) } - sc.AddFrom(in) + sc.AddFrom(weight, in) // Add words manually - sc.Add("lock", "stock", "and", "two", "smoking", "barrels") + sc.Add(weight, "lock", "stock", "and", "two", "smoking", "barrels") // Check if a word is valid result := sc.IsCorrect("coffee") @@ -140,9 +143,9 @@ goos: linux goarch: amd64 pkg: github.com/f1monkey/spellchecker cpu: 13th Gen Intel(R) Core(TM) i9-13980HX -Benchmark_Norvig1-32 348 3385868 ns/op 74.44 success_percent 201.0 success_words 270.0 total_words 830803 B/op 15504 allocs/op +Benchmark_Norvig1-32 379 3099977 ns/op 74.44 success_percent 201.0 success_words 270.0 total_words 820251 B/op 15234 allocs/op PASS -ok github.com/f1monkey/spellchecker 3.723s +ok github.com/f1monkey/spellchecker 3.740s ``` #### [Test set 2](http://norvig.com/spell-testset2.txt): @@ -154,8 +157,7 @@ goos: linux goarch: amd64 pkg: github.com/f1monkey/spellchecker cpu: 13th Gen Intel(R) Core(TM) i9-13980HX -Benchmark_Norvig2-32 231 4935406 ns/op 71.25 success_percent 285.0 success_words 400.0 total_words 1270755 B/op 21801 allocs/op +Benchmark_Norvig2-32 219 4916738 ns/op 71.25 success_percent 285.0 success_words 400.0 total_words 1257491 B/op 21401 allocs/op PASS -ok github.com/f1monkey/spellchecker 4.057s - +ok github.com/f1monkey/spellchecker 3.919s ``` diff --git a/spellchecker.go b/spellchecker.go index 9ab0804..3890535 100644 --- a/spellchecker.go +++ b/spellchecker.go @@ -44,7 +44,7 @@ func New(alphabet string, opts ...OptionFunc) (*Spellchecker, error) { } // AddFrom reads input, splits it with spellchecker splitter func and adds words to the dictionary -func (m *Spellchecker) AddFrom(input io.Reader) error { +func (m *Spellchecker) AddFrom(weight uint, input io.Reader) error { words := make([]string, 1000) i := 0 for item := range readInput(input, m.splitter) { @@ -53,7 +53,7 @@ func (m *Spellchecker) AddFrom(input io.Reader) error { } if i == len(words) { - m.Add(words...) + m.Add(weight, words...) i = 0 } words[i] = item.word @@ -61,29 +61,14 @@ func (m *Spellchecker) AddFrom(input io.Reader) error { } if i > 0 { - m.Add(words[:i]...) + m.Add(weight, words[:i]...) } return nil } -// Add adds provided words to the dictionary -func (m *Spellchecker) Add(words ...string) { - m.mtx.Lock() - defer m.mtx.Unlock() - - for _, word := range words { - if id := m.dict.id(word); id > 0 { - m.dict.inc(id, 1) - continue - } - - m.dict.add(word, 1) - } -} - -// AddWeight adds provided words to the dictionary with a custom weight -func (m *Spellchecker) AddWeight(weight uint, words ...string) { +// Add adds provided words to the dictionary with a custom weight +func (m *Spellchecker) Add(weight uint, words ...string) { m.mtx.Lock() defer m.mtx.Unlock() @@ -123,36 +108,14 @@ func (s *Spellchecker) Fix(word string) (string, error) { return hits[0].Value, nil } -// Suggest find top n suggestions for the word -func (s *Spellchecker) Suggest(word string, n int) ([]string, error) { - s.mtx.RLock() - defer s.mtx.RUnlock() - - if s.dict.has(word) { - return []string{word}, nil - } - - hits := s.dict.find(word, n, s.filterFunc) - if len(hits) == 0 { - return []string{word}, ErrUnknownWord - } - - result := make([]string, len(hits)) - for i, h := range hits { - result[i] = h.Value - } - - return result, nil -} - type SuggestionResult struct { - ExactMatch bool + ExactMatch bool // if true, the word is correct Suggestions []Match } -// SuggestScore find top n suggestions for the word. +// Suggest find top n suggestions for the word. // Returns spellchecker scores along with words -func (s *Spellchecker) SuggestScore(word string, n int) SuggestionResult { +func (s *Spellchecker) Suggest(word string, n int) SuggestionResult { s.mtx.RLock() defer s.mtx.RUnlock() diff --git a/spellchecker_test.go b/spellchecker_test.go index e7ccffe..1d81005 100644 --- a/spellchecker_test.go +++ b/spellchecker_test.go @@ -3,7 +3,6 @@ package spellchecker import ( "bufio" "errors" - "fmt" "os" "strings" "testing" @@ -47,7 +46,7 @@ func newFullSpellchecker() *Spellchecker { panic(err) } - err = s.AddFrom(f) + err = s.AddFrom(1, f) if err != nil { panic(err) } @@ -66,7 +65,7 @@ func newSampleSpellchecker() *Spellchecker { panic(err) } - err = s.AddFrom(f) + err = s.AddFrom(1, f) if err != nil { panic(err) } @@ -171,18 +170,21 @@ func benchmarkNorvig(b *testing.B, dataPath string) { } b.StartTimer() - result, err := m.Suggest(word, 10) + result := m.Suggest(word, 10) b.StopTimer() - if err != nil && !errors.Is(err, ErrUnknownWord) { - fmt.Println(err) - } if i == 0 { total++ - if len(result) > 0 && result[0] == item.expected { + if result.ExactMatch && word == item.expected { + ok++ + continue + } + + if len(result.Suggestions) > 0 && result.Suggestions[0].Value == item.expected { ok++ continue } + // got := "" // if len(result) > 0 { // got = result[0] @@ -235,17 +237,10 @@ func Test_Spellchecker_Fix(t *testing.T) { require.Equal(t, "problem", result) } -func Test_Spellchecker_Suggest(t *testing.T) { - s := newSampleSpellchecker() - result, err := s.Suggest("arang", 5) - require.NoError(t, err) - require.Equal(t, []string{"orange", "range"}, result) -} - func Test_Spellchecker_SuggestScore(t *testing.T) { t.Run("fix", func(t *testing.T) { s := newSampleSpellchecker() - result := s.SuggestScore("arang", 5) + result := s.Suggest("arang", 5) require.Equal(t, SuggestionResult{ Suggestions: []Match{ {Value: "orange", Score: 0.2772588722239781}, @@ -256,13 +251,13 @@ func Test_Spellchecker_SuggestScore(t *testing.T) { t.Run("valid word", func(t *testing.T) { s := newSampleSpellchecker() - result := s.SuggestScore("orange", 5) + result := s.Suggest("orange", 5) require.Equal(t, SuggestionResult{ExactMatch: true}, result) }) t.Run("unknown word", func(t *testing.T) { s := newSampleSpellchecker() - result := s.SuggestScore("qwerty", 5) + result := s.Suggest("qwerty", 5) require.Equal(t, SuggestionResult{Suggestions: []Match{}}, result) }) } From 96b229856dd2d6ec6694b236ca49afdfe5cff188 Mon Sep 17 00:00:00 2001 From: Dmitry Russkikh Date: Sat, 27 Sep 2025 20:42:16 +0300 Subject: [PATCH 4/5] Remove option functions, allow to provide filterFunc and maxErrors on every Fix(), Suggest() call --- README.md | 35 ++++++---------- dictionary.go | 62 ++++++++++------------------- dictionary_test.go | 6 +-- options.go | 95 +++++++++++++++++++++++++------------------- reader.go | 14 ------- save.go | 3 +- save_test.go | 3 +- spellchecker.go | 71 ++++++++++++++------------------- spellchecker_test.go | 45 +++++++++------------ 9 files changed, 144 insertions(+), 190 deletions(-) diff --git a/README.md b/README.md index b004857..b71ab4a 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,6 @@ func main() { // Create a new instance sc, err := spellchecker.New( "abcdefghijklmnopqrstuvwxyz1234567890", // allowed symbols, other symbols will be ignored - spellchecker.WithMaxErrors(2) // see options.go ) if err != nil { panic(err) @@ -49,28 +48,30 @@ func main() { if err != nil { panic(err) } - sc.AddFrom(weight, in) + + sc.AddFrom(&spellchecker.AddOptions{Weight: weight}, in) + // OR + sc.AddFrom(nil, in) // Add words manually - sc.Add(weight, "lock", "stock", "and", "two", "smoking", "barrels") + sc.Add(nil, "lock", "stock", "and", "two", "smoking", "barrels") // Check if a word is valid result := sc.IsCorrect("coffee") fmt.Println(result) // true // Correct a single word - fixed, err := sc.Fix("awepon") - if err != nil && !errors.Is(err, spellchecker.ErrUnknownWord) { - panic(err) - } + fixed, isCorrect := sc.Fix(nil, "awepon") + fmt.Println(isCorrect) // false fmt.Println(fixed) // weapon // Find up to 10 suggestions for a word - matches, err := sc.Suggest("rang", 10) - if err != nil && !errors.Is(err, spellchecker.ErrUnknownWord) { - panic(err) - } + matches := sc.Suggest(nil, "rang", 10) fmt.Println(matches) // [range, orange] + + if len(os.Args) < 2 { + log.Fatal("dict path must be provided") + } ``` ### Options @@ -116,17 +117,7 @@ You can provide a custom scoring function if needed: // handle err } - // After loading a spellchecker from a file, - // you need to set the function again: - sc, err = spellchecker.Load(inFile) - if err != nil { - // handle err - } - - err = sc.WithOpts(spellchecker.WithFilterFunc(fn)) - if err != nil { - // handle err - } + sc.Fix(fn, "word") ``` diff --git a/dictionary.go b/dictionary.go index b61ca22..cfd7eb4 100644 --- a/dictionary.go +++ b/dictionary.go @@ -11,9 +11,8 @@ import ( ) type dictionary struct { - maxErrors int - alphabet alphabet - nextID func() uint32 + alphabet alphabet + nextID func() uint32 words map[uint32][]rune ids map[string]uint32 @@ -22,20 +21,19 @@ type dictionary struct { index map[uint64][]uint32 } -func newDictionary(ab string, maxErrors int) (*dictionary, error) { +func newDictionary(ab string) (*dictionary, error) { alphabet, err := newAlphabet(ab) if err != nil { return nil, err } return &dictionary{ - maxErrors: maxErrors, - alphabet: alphabet, - nextID: idSeq(0), - ids: make(map[string]uint32), - words: make(map[uint32][]rune), - counts: make(map[uint32]uint), - index: make(map[uint64][]uint32), + alphabet: alphabet, + nextID: idSeq(0), + ids: make(map[string]uint32), + words: make(map[uint32][]rune), + counts: make(map[uint32]uint), + index: make(map[uint64][]uint32), }, nil } @@ -78,8 +76,8 @@ type Match struct { Score float64 } -func (d *dictionary) find(word string, n int, fn FilterFunc) []Match { - if d.maxErrors <= 0 { +func (d *dictionary) find(word string, n int, maxErrors int, fn FilterFunc) []Match { + if maxErrors <= 0 { return nil } @@ -96,7 +94,7 @@ func (d *dictionary) find(word string, n int, fn FilterFunc) []Match { } bitmaps := bitmapsPool.Get().(map[uint64]struct{}) - d.computeCandidateBitmaps(bitmaps, bmSrc, d.maxErrors) + d.computeCandidateBitmaps(bitmaps, bmSrc, maxErrors) for bm := range bitmaps { d.fillWithCandidates(result, wordRunes, bm, fn) } @@ -152,25 +150,21 @@ var _ encoding.BinaryMarshaler = (*dictionary)(nil) var _ encoding.BinaryUnmarshaler = (*dictionary)(nil) type dictData struct { - Alphabet alphabet - IDs map[string]uint32 - Words map[uint32]string - WordRunes map[uint32][]rune - Counts map[uint32]uint + Alphabet alphabet + IDs map[string]uint32 + Words map[uint32][]rune + Counts map[uint32]uint Index map[uint64][]uint32 - - MaxErrors int } func (d *dictionary) MarshalBinary() ([]byte, error) { data := &dictData{ - Alphabet: d.alphabet, - IDs: d.ids, - WordRunes: d.words, - Counts: d.counts, - Index: d.index, - MaxErrors: d.maxErrors, + Alphabet: d.alphabet, + IDs: d.ids, + Words: d.words, + Counts: d.counts, + Index: d.index, } buf := &bytes.Buffer{} @@ -192,20 +186,8 @@ func (d *dictionary) UnmarshalBinary(data []byte) error { d.alphabet = dictData.Alphabet d.ids = dictData.IDs d.counts = dictData.Counts - - // compatibility with previous versions - if len(dictData.Words) > 0 { - wordRunes := make(map[uint32][]rune, len(dictData.Words)) - for k, v := range dictData.Words { - wordRunes[k] = []rune(v) - } - d.words = wordRunes - } else { - d.words = dictData.WordRunes - } - d.index = dictData.Index - d.maxErrors = dictData.MaxErrors + d.words = dictData.Words var max uint32 for _, id := range d.ids { diff --git a/dictionary_test.go b/dictionary_test.go index 95cf57d..0311fa2 100644 --- a/dictionary_test.go +++ b/dictionary_test.go @@ -7,7 +7,7 @@ import ( ) func Test_dictionary_id(t *testing.T) { - dict, err := newDictionary(DefaultAlphabet, DefaultMaxErrors) + dict, err := newDictionary(DefaultAlphabet) require.NoError(t, err) t.Run("must return 0 for unexisting word", func(t *testing.T) { @@ -24,7 +24,7 @@ func Test_dictionary_id(t *testing.T) { func Test_dictionary_add(t *testing.T) { t.Run("must add word to dictionary index", func(t *testing.T) { - dict, err := newDictionary(DefaultAlphabet, DefaultMaxErrors) + dict, err := newDictionary(DefaultAlphabet) require.NoError(t, err) id, err := dict.add("qwe", 1) @@ -49,7 +49,7 @@ func Test_dictionary_add(t *testing.T) { func Test_Dictionary_Inc(t *testing.T) { t.Run("must increase counter value", func(t *testing.T) { - dict, err := newDictionary(DefaultAlphabet, DefaultMaxErrors) + dict, err := newDictionary(DefaultAlphabet) dict.counts[1] = 0 require.NoError(t, err) diff --git a/options.go b/options.go index 78ecf7f..c23019d 100644 --- a/options.go +++ b/options.go @@ -2,61 +2,61 @@ package spellchecker import ( "bufio" + "bytes" "math" + "regexp" "github.com/agext/levenshtein" ) -// WithOpt set spellchecker options -func (s *Spellchecker) WithOpts(opts ...OptionFunc) error { - s.mtx.Lock() - defer s.mtx.Unlock() +const DefaultMaxErrors = 2 - for _, o := range opts { - if err := o(s); err != nil { - return err - } - } +type FilterFunc func(src, candidate []rune, count uint) (float64, bool) + +type SearchOptions struct { + // MaxErrors — the maximum allowed difference in bits + // between the "search word" and a "dictionary word". + // - deletion is a 1-bit change (proble → problem) + // - insertion is a 1-bit change (problemm → problem) + // - substitution is a 2-bit change (problam → problem) + // - transposition is a 0-bit change (problme → problem) + // + // It is not recommended to set this value greater than 2, + // as it can significantly affect performance. + MaxErrors int - return nil + // FilterFunc compares the source word with a candidate word. + // It returns the candidate's score and a boolean flag. + // If the flag is false, the candidate will be completely filtered out. + FilterFunc FilterFunc } -// WithSplitter set splitter func for AddFrom() reader -func WithSplitter(f bufio.SplitFunc) OptionFunc { - return func(s *Spellchecker) error { - s.splitter = f - return nil - } +var defaultSearchOptions = &SearchOptions{ + MaxErrors: DefaultMaxErrors, + FilterFunc: defaultFilterFunc(DefaultMaxErrors), } -// WithMaxErrors sets maxErrors — the maximum allowed difference in bits -// between the "search word" and a "dictionary word". -// - deletion is a 1-bit change (proble → problem) -// - insertion is a 1-bit change (problemm → problem) -// - substitution is a 2-bit change (problam → problem) -// - transposition is a 0-bit change (problme → problem) -// -// It is not recommended to set this value greater than 2, -// as it can significantly affect performance. -func WithMaxErrors(maxErrors int) OptionFunc { - return func(s *Spellchecker) error { - s.maxErrors = maxErrors - - return nil - } +type AddOptions struct { + Weight uint + // Splitter is a splitter func for AddFrom() reader + Splitter bufio.SplitFunc } -// FilterFunc compares the source word with a candidate word. -// It returns the candidate's score and a boolean flag. -// If the flag is false, the candidate will be completely filtered out. -type FilterFunc func(src, candidate []rune, count uint) (float64, bool) +var defaultAddOptions = &AddOptions{ + Weight: 1, + Splitter: defaultSplitter, +} -// WithFilterFunc set custom scoring function -func WithFilterFunc(f FilterFunc) OptionFunc { - return func(s *Spellchecker) error { - s.filterFunc = f - return nil +var wordSymbols = regexp.MustCompile(`[-\pL]+`) + +func defaultSplitter(data []byte, atEOF bool) (advance int, token []byte, err error) { + advance, token, err = bufio.ScanWords(data, atEOF) + if err != nil { + return } + token = bytes.ToLower(token) + + return advance, wordSymbols.Find(token), nil } func defaultFilterFunc(maxErrors int) FilterFunc { @@ -71,3 +71,18 @@ func defaultFilterFunc(maxErrors int) FilterFunc { return 1 / (1 + float64(distance*distance)) * mult, true } } + +func applyDefaults(opts *SearchOptions) *SearchOptions { + if opts == nil { + opts = defaultSearchOptions + } else { + if opts.MaxErrors == 0 { + opts.MaxErrors = DefaultMaxErrors + } + if opts.FilterFunc == nil { + opts.FilterFunc = defaultFilterFunc(opts.MaxErrors) + } + } + + return opts +} diff --git a/reader.go b/reader.go index 5014139..1f801d6 100644 --- a/reader.go +++ b/reader.go @@ -2,9 +2,7 @@ package spellchecker import ( "bufio" - "bytes" "io" - "regexp" ) type readData struct { @@ -12,18 +10,6 @@ type readData struct { err error } -var wordSymbols = regexp.MustCompile(`[-\pL]+`) - -func defaultSplitter(data []byte, atEOF bool) (advance int, token []byte, err error) { - advance, token, err = bufio.ScanWords(data, atEOF) - if err != nil { - return - } - token = bytes.ToLower(token) - - return advance, wordSymbols.Find(token), nil -} - func readInput(input io.Reader, splitter bufio.SplitFunc) <-chan readData { if splitter == nil { splitter = defaultSplitter diff --git a/save.go b/save.go index 9d22ee8..657045d 100644 --- a/save.go +++ b/save.go @@ -31,7 +31,6 @@ func Load(reader io.Reader) (*Spellchecker, error) { } return &Spellchecker{ - dict: data.Dict, - filterFunc: defaultFilterFunc(data.Dict.maxErrors), + dict: data.Dict, }, nil } diff --git a/save_test.go b/save_test.go index 9487b59..75df267 100644 --- a/save_test.go +++ b/save_test.go @@ -26,10 +26,9 @@ func Test_Spellchecker_Save(t *testing.T) { require.NoError(t, err) require.EqualValues(t, m1.dict.id("green"), m2.dict.id("green")) - require.EqualValues(t, m1.dict.maxErrors, m2.dict.maxErrors) require.EqualValues(t, m1.dict.nextID(), m2.dict.nextID()) - matches := m2.dict.find("orange", 1, m2.filterFunc) + matches := m2.dict.find("orange", 1, 2, defaultFilterFunc(2)) require.Len(t, matches, 1) require.Equal(t, matches[0].Value, "orange") require.Greater(t, matches[0].Score, 0.0) diff --git a/spellchecker.go b/spellchecker.go index 3890535..b9df6d0 100644 --- a/spellchecker.go +++ b/spellchecker.go @@ -1,59 +1,42 @@ package spellchecker import ( - "bufio" - "fmt" "io" "sync" ) -const DefaultMaxErrors = 2 - -// OptionFunc option setter -type OptionFunc func(s *Spellchecker) error - type Spellchecker struct { mtx sync.RWMutex - dict *dictionary - splitter bufio.SplitFunc - filterFunc FilterFunc - maxErrors int + dict *dictionary } -func New(alphabet string, opts ...OptionFunc) (*Spellchecker, error) { - result := &Spellchecker{ - maxErrors: DefaultMaxErrors, - filterFunc: defaultFilterFunc(DefaultMaxErrors), - } - - for _, o := range opts { - if err := o(result); err != nil { - return nil, err - } - } - - dict, err := newDictionary(alphabet, result.maxErrors) +func New(alphabet string) (*Spellchecker, error) { + dict, err := newDictionary(alphabet) if err != nil { return nil, err } - result.dict = dict + result := &Spellchecker{dict: dict} return result, nil } // AddFrom reads input, splits it with spellchecker splitter func and adds words to the dictionary -func (m *Spellchecker) AddFrom(weight uint, input io.Reader) error { +func (m *Spellchecker) AddFrom(opts *AddOptions, input io.Reader) error { + if opts == nil { + opts = defaultAddOptions + } + words := make([]string, 1000) i := 0 - for item := range readInput(input, m.splitter) { + for item := range readInput(input, opts.Splitter) { if item.err != nil { return item.err } if i == len(words) { - m.Add(weight, words...) + m.Add(opts, words...) i = 0 } words[i] = item.word @@ -61,29 +44,31 @@ func (m *Spellchecker) AddFrom(weight uint, input io.Reader) error { } if i > 0 { - m.Add(weight, words[:i]...) + m.Add(opts, words[:i]...) } return nil } // Add adds provided words to the dictionary with a custom weight -func (m *Spellchecker) Add(weight uint, words ...string) { +func (m *Spellchecker) Add(opts *AddOptions, words ...string) { m.mtx.Lock() defer m.mtx.Unlock() + if opts == nil { + opts = defaultAddOptions + } + for _, word := range words { if id := m.dict.id(word); id > 0 { - m.dict.inc(id, weight) + m.dict.inc(id, opts.Weight) continue } - m.dict.add(word, weight) + m.dict.add(word, opts.Weight) } } -var ErrUnknownWord = fmt.Errorf("unknown word") - // IsCorrect check if provided word is in the dictionary func (s *Spellchecker) IsCorrect(word string) bool { s.mtx.RLock() @@ -92,20 +77,22 @@ func (s *Spellchecker) IsCorrect(word string) bool { return s.dict.has(word) } -func (s *Spellchecker) Fix(word string) (string, error) { +func (s *Spellchecker) Fix(opts *SearchOptions, word string) (string, bool) { s.mtx.RLock() defer s.mtx.RUnlock() if s.dict.has(word) { - return word, nil + return word, true } - hits := s.dict.find(word, 1, s.filterFunc) + opts = applyDefaults(opts) + + hits := s.dict.find(word, 1, opts.MaxErrors, opts.FilterFunc) if len(hits) == 0 { - return word, ErrUnknownWord + return word, false } - return hits[0].Value, nil + return hits[0].Value, false } type SuggestionResult struct { @@ -115,7 +102,7 @@ type SuggestionResult struct { // Suggest find top n suggestions for the word. // Returns spellchecker scores along with words -func (s *Spellchecker) Suggest(word string, n int) SuggestionResult { +func (s *Spellchecker) Suggest(opts *SearchOptions, word string, n int) SuggestionResult { s.mtx.RLock() defer s.mtx.RUnlock() @@ -123,7 +110,9 @@ func (s *Spellchecker) Suggest(word string, n int) SuggestionResult { return SuggestionResult{ExactMatch: true} } + opts = applyDefaults(opts) + return SuggestionResult{ - Suggestions: s.dict.find(word, n, s.filterFunc), + Suggestions: s.dict.find(word, n, opts.MaxErrors, opts.FilterFunc), } } diff --git a/spellchecker_test.go b/spellchecker_test.go index 1d81005..60e3d40 100644 --- a/spellchecker_test.go +++ b/spellchecker_test.go @@ -46,7 +46,7 @@ func newFullSpellchecker() *Spellchecker { panic(err) } - err = s.AddFrom(1, f) + err = s.AddFrom(nil, f) if err != nil { panic(err) } @@ -65,7 +65,7 @@ func newSampleSpellchecker() *Spellchecker { panic(err) } - err = s.AddFrom(1, f) + err = s.AddFrom(nil, f) if err != nil { panic(err) } @@ -93,7 +93,7 @@ func Benchmark_Spellchecker_Fix_3(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - m.Fix("tee") + m.Fix(nil, "tee") } } @@ -102,7 +102,7 @@ func Benchmark_Spellchecker_Fix_6_Transposition(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - m.Fix("oragne") + m.Fix(nil, "oragne") } } @@ -111,7 +111,7 @@ func Benchmark_Spellchecker_Fix_6_Replacement(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - m.Fix("problam") + m.Fix(nil, "problam") } } @@ -170,7 +170,7 @@ func benchmarkNorvig(b *testing.B, dataPath string) { } b.StartTimer() - result := m.Suggest(word, 10) + result := m.Suggest(nil, word, 10) b.StopTimer() if i == 0 { @@ -204,23 +204,9 @@ func benchmarkNorvig(b *testing.B, dataPath string) { } func Test_NewSpellchecker(t *testing.T) { - t.Run("must be able to create a spellchecker without any options", func(t *testing.T) { - s, err := New(DefaultAlphabet) - require.NoError(t, err) - require.NotNil(t, s.dict) - }) - t.Run("must be able to create a spellchecker with custom splitter", func(t *testing.T) { - s, err := New(DefaultAlphabet, WithSplitter(bufio.ScanRunes)) - require.NoError(t, err) - require.NotNil(t, s.splitter) - }) -} - -func Test_Spellchecker_WithOpts(t *testing.T) { s, err := New(DefaultAlphabet) require.NoError(t, err) - s.WithOpts(WithSplitter(bufio.ScanLines)) - require.NotNil(t, s.splitter) + require.NotNil(t, s.dict) } func Test_Spellchecker_IsCorrect(t *testing.T) { @@ -232,15 +218,22 @@ func Test_Spellchecker_IsCorrect(t *testing.T) { func Test_Spellchecker_Fix(t *testing.T) { s := newSampleSpellchecker() - result, err := s.Fix("problam") - require.NoError(t, err) + result, isCorrect := s.Fix(nil, "problam") + require.False(t, isCorrect) + require.Equal(t, "problem", result) +} + +func Test_Spellchecker_Fix_CustomOptions(t *testing.T) { + s := newSampleSpellchecker() + result, isCorrect := s.Fix(&SearchOptions{MaxErrors: 2}, "problam") + require.False(t, isCorrect) require.Equal(t, "problem", result) } func Test_Spellchecker_SuggestScore(t *testing.T) { t.Run("fix", func(t *testing.T) { s := newSampleSpellchecker() - result := s.Suggest("arang", 5) + result := s.Suggest(nil, "arang", 5) require.Equal(t, SuggestionResult{ Suggestions: []Match{ {Value: "orange", Score: 0.2772588722239781}, @@ -251,13 +244,13 @@ func Test_Spellchecker_SuggestScore(t *testing.T) { t.Run("valid word", func(t *testing.T) { s := newSampleSpellchecker() - result := s.Suggest("orange", 5) + result := s.Suggest(nil, "orange", 5) require.Equal(t, SuggestionResult{ExactMatch: true}, result) }) t.Run("unknown word", func(t *testing.T) { s := newSampleSpellchecker() - result := s.Suggest("qwerty", 5) + result := s.Suggest(nil, "qwerty", 5) require.Equal(t, SuggestionResult{Suggestions: []Match{}}, result) }) } From c68f23d269e84ae5fcc760d6a2538fac25e5b85d Mon Sep 17 00:00:00 2001 From: Dmitry Russkikh Date: Sat, 27 Sep 2025 21:04:25 +0300 Subject: [PATCH 5/5] V2 pkg --- .github/workflows/test.yaml | 2 +- README.md | 8 ++++---- go.mod | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b080d3d..82fda90 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -12,7 +12,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v3 with: - go-version: 1.19 + go-version: 1.25 - name: Test run: go test ./... diff --git a/README.md b/README.md index b71ab4a..42c4d67 100644 --- a/README.md +++ b/README.md @@ -134,9 +134,9 @@ goos: linux goarch: amd64 pkg: github.com/f1monkey/spellchecker cpu: 13th Gen Intel(R) Core(TM) i9-13980HX -Benchmark_Norvig1-32 379 3099977 ns/op 74.44 success_percent 201.0 success_words 270.0 total_words 820251 B/op 15234 allocs/op +Benchmark_Norvig1-32 357 3305052 ns/op 74.44 success_percent 201.0 success_words 270.0 total_words 768899 B/op 13302 allocs/op PASS -ok github.com/f1monkey/spellchecker 3.740s +ok github.com/f1monkey/spellchecker 3.801s ``` #### [Test set 2](http://norvig.com/spell-testset2.txt): @@ -148,7 +148,7 @@ goos: linux goarch: amd64 pkg: github.com/f1monkey/spellchecker cpu: 13th Gen Intel(R) Core(TM) i9-13980HX -Benchmark_Norvig2-32 219 4916738 ns/op 71.25 success_percent 285.0 success_words 400.0 total_words 1257491 B/op 21401 allocs/op +Benchmark_Norvig2-32 236 5257185 ns/op 71.25 success_percent 285.0 success_words 400.0 total_words 1201260 B/op 19346 allocs/op PASS -ok github.com/f1monkey/spellchecker 3.919s +ok github.com/f1monkey/spellchecker 4.350s ``` diff --git a/go.mod b/go.mod index 09f0604..d16fc79 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/f1monkey/spellchecker -go 1.19 +go 1.24 require ( github.com/agext/levenshtein v1.2.3