diff --git a/README.md b/README.md index c4ff5ae..a7e8037 100644 --- a/README.md +++ b/README.md @@ -74,4 +74,59 @@ matches, err := model.CosN(expr, 1) if err != nil { log.Fatalf("error evaluating cosine similarity: %v", err) } + +// Create two expressions. +x := word2vec.Expr{"king": 1.0} +y := word2vec.Expr{"queen": 1.0} + +// Compute similarity between the expressions +cosineSimilarity, err := model.Cos(x, y) +if err != nil { + log.Fatalf("error evaluating cosine similarity: %v", err) +} + +``` + +If you only want to compute similarity between some words, but do not need to find the n words most similar to a given word, +you can use a lazy model, which initializes faster and uses less memory: + +```go +// Lazy load the model from an io.Reader (i.e. a file). +model, err := word2vec.LazyFromReader(r) +if err != nil { + log.Fatalf("error loading model: %v", err) +} + +// Create two expressions. +x := word2vec.Expr{"king": 1.0} +y := word2vec.Expr{"queen": 1.0} + +// Compute similarity between the expressions +cosineSimilarity, err := model.Cos(x, y) +if err != nil { + log.Fatalf("error evaluating cosine similarity: %v", err) +} + +``` + +Below is a benchmark between the normal and the lazy model using a model file with 456.976 (26 ^ 4) words of 300 dimensions. +You can run the benchmark with your own model by setting the var ```filename``` in ```word2vec_bench_test.go``` to the path of your binary model. 
+ + +``` +BenchmarkLazyModel +BenchmarkLazyModel/InitializeEager +BenchmarkLazyModel/InitializeEager-4 1 4217573971 ns/op 1193506272 B/op 1370948 allocs/op +BenchmarkLazyModel/InitializeLazy +BenchmarkLazyModel/InitializeLazy-4 4 321390344 ns/op 31409760 B/op 456996 allocs/op +BenchmarkLazyModel/LoadVectorEager +BenchmarkLazyModel/LoadVectorEager-4 4769024 266.2 ns/op 400 B/op 2 allocs/op +BenchmarkLazyModel/LoadVectorLazy +BenchmarkLazyModel/LoadVectorLazy-4 151197 8163 ns/op 3032 B/op 6 allocs/op + +4217573971ns = 4,21s +321390344ns = 0,32s +1193506272B = 1,19GB +31409760B = 0,03GB(31MB) + ``` diff --git a/cmd/word-server/main.go b/cmd/word-server/main.go index 6661a67..3f5031a 100644 --- a/cmd/word-server/main.go +++ b/cmd/word-server/main.go @@ -14,10 +14,12 @@ import ( ) var listen, modelPath string +var lazy bool func init() { flag.StringVar(&listen, "listen", "localhost:1234", "bind `address` for HTTP server") flag.StringVar(&modelPath, "model", "", "`path` to binary model data") + flag.BoolVar(&lazy, "lazy", false, "lazily load the model (disables cos-n support); defaults to false, which eagerly loads it") } func main() { @@ -28,7 +30,11 @@ func main() { os.Exit(1) } - log.Println("Loading model...") + if lazy { + log.Println("Lazy loading model...") + } else { + log.Println("Loading model...") + } f, err := os.Open(modelPath) if err != nil { fmt.Printf("error opening binary model data file: %v\n", err) @@ -36,9 +42,16 @@ func main() { } defer f.Close() - m, err := word2vec.FromReader(f) + var m *word2vec.Model + + if lazy { + m, err = word2vec.LazyFromReader(f) + } else { + m, err = word2vec.FromReader(f) + } if err != nil { fmt.Printf("error reading binary model data: %v\n", err) + f.Close() os.Exit(1) } diff --git a/offset_counter.go b/offset_counter.go new file mode 100644 index 0000000..136d347 --- /dev/null +++ b/offset_counter.go @@ -0,0 +1,60 @@ +package word2vec + +import "bufio" + +type offsetCounter struct { + offset int64 + reader *bufio.Reader 
+} + +func (o *offsetCounter) Discard(n int) (discarded int, err error) { + discarded, err = o.reader.Discard(n) + o.offset += int64(discarded) + return +} + +func (o *offsetCounter) ReadByte() (byte, error) { + b, err := o.reader.ReadByte() + if err == nil { + o.offset++ + } + return b, err +} + +func (o *offsetCounter) ReadRune() (r rune, size int, err error) { + r, size, err = o.reader.ReadRune() + o.offset += int64(size) + return +} + +func (o *offsetCounter) ReadSlice(delim byte) (line []byte, err error) { + line, err = o.reader.ReadSlice(delim) + o.offset += int64(len(line)) + return +} + +func (o *offsetCounter) ReadBytes(delim byte) ([]byte, error) { + b, err := o.reader.ReadBytes(delim) + o.offset += int64(len(b)) + return b, err +} + +func (o *offsetCounter) ReadString(delim byte) (string, error) { + s, err := o.reader.ReadString(delim) + o.offset += int64(len(s)) + return s, err +} + +func (o *offsetCounter) Read(p []byte) (n int, err error) { + n, err = o.reader.Read(p) + o.offset += int64(n) + return +} + +func (o *offsetCounter) UnreadByte() error { + err := o.reader.UnreadByte() + if err == nil { + o.offset-- + } + return err +} diff --git a/word2vec.go b/word2vec.go index 875015e..8ad2fd3 100644 --- a/word2vec.go +++ b/word2vec.go @@ -5,16 +5,20 @@ package word2vec // import "code.sajari.com/word2vec" import ( "bufio" "encoding/binary" + "errors" "fmt" "io" + "log" "sync" ) // Model is a type which represents a word2vec Model and implements the Coser // and Mapper interfaces. type Model struct { - dim int - words map[string]Vector + dim int + words map[string]Vector + lazyWords map[string]int64 + reader LazyReader } var ( @@ -22,7 +26,10 @@ var ( _ Mapper = (*Model)(nil) ) -// FromReader creates a Model using the binary model data provided by the io.Reader. +// FromReader creates a Model using the binary model data provided by the +// io.Reader. 
It loads all vectors on memory for faster access and to be able to +// find n most similar words but uses more memory and takes longer to initialize +// If you don't need to find n most similar words, consider using LazyFromReader func FromReader(r io.Reader) (*Model, error) { br := bufio.NewReader(r) var size, dim int @@ -73,6 +80,65 @@ func FromReader(r io.Reader) (*Model, error) { return m, nil } +const f32Len = 4 + +type LazyReader interface { + io.Reader + io.ReaderAt +} + +// LazyFromReader creates a lazy Model using the binary model data provided by +// the io.Reader. Returns an error on CosN and MultiCosN(Model) If you need CosN +// and MultiCosN(Model), use FromReader It loads vectors from the underlying +// io.ReaderAt, usually *os.File, only when needed, making it slower but using +// less memory and initializing faster +func LazyFromReader(r LazyReader) (*Model, error) { + br := &offsetCounter{reader: bufio.NewReader(r)} + var size, dim int + n, err := fmt.Fscanln(br, &size, &dim) + if err != nil { + return nil, err + } + if n != 2 { + return nil, fmt.Errorf("could not extract size/dim from binary model data") + } + + m := &Model{ + lazyWords: make(map[string]int64, size), + dim: dim, + reader: r, + } + + for i := 0; i < size; i++ { + w, err := br.ReadString(' ') + if err != nil { + return nil, err + } + w = w[:len(w)-1] + + m.lazyWords[w] = br.offset + + _, err = br.Discard(dim * f32Len) + if err != nil { + return nil, err + } + + b, err := br.ReadByte() + if err != nil { + if i == size-1 && err == io.EOF { + break + } + return nil, err + } + if b != byte('\n') { + if err := br.UnreadByte(); err != nil { + return nil, err + } + } + } + return m, nil +} + // Vector is a type which represents a word vector. type Vector []float32 @@ -137,6 +203,9 @@ type Coser interface { // Size returns the number of words in the model. 
func (m *Model) Size() int { + if m.lazyWords != nil { + return len(m.lazyWords) + } return len(m.words) } @@ -155,9 +224,28 @@ type Mapper interface { // Unknown words are ignored. func (m *Model) Map(words []string) map[string]Vector { result := make(map[string]Vector) - for _, w := range words { - if v, ok := m.words[w]; ok { - result[w] = v + if m.words != nil { + for _, w := range words { + if v, ok := m.words[w]; ok { + result[w] = v + } + } + } else { + for _, w := range words { + if off, ok := m.lazyWords[w]; ok { + + r := io.NewSectionReader(m.reader, off, int64(m.dim*f32Len)) + + v := make(Vector, m.dim) + if err := binary.Read(r, binary.LittleEndian, v); err != nil { + log.Printf("word2vec: LazyModel.Map read %s %s", w, err) //todo return the error and change the api? + continue + } + + v.Normalise() + + result[w] = v + } } } return result @@ -196,11 +284,25 @@ func (m *Model) Coses(pairs [][2]Expr) ([]float32, error) { func (m *Model) Eval(expr Expr) (Vector, error) { v := Vector(make([]float32, m.dim)) for w, c := range expr { - u, ok := m.words[w] - if !ok { - return nil, &NotFoundError{w} + if m.words != nil { + u, ok := m.words[w] + if !ok { + return nil, &NotFoundError{w} + } + v.Add(c, u) + } else { + off, ok := m.lazyWords[w] + if !ok { + return nil, &NotFoundError{w} + } + r := io.NewSectionReader(m.reader, off, int64(m.dim*f32Len)) + + u := Vector(make([]float32, m.dim)) + if err := binary.Read(r, binary.LittleEndian, u); err != nil { + return nil, err + } + v.Add(c, u) } - v.Add(c, u) } v.Normalise() return v, nil @@ -216,6 +318,10 @@ type Match struct { // CosN computes the n most similar words to the expression. Returns an error if the // expression could not be evaluated. 
func (m *Model) CosN(e Expr, n int) ([]Match, error) { + if m.words == nil { + return nil, errors.New("CosN not supported on lazy model") + } + if n == 0 { return nil, nil } @@ -278,6 +384,11 @@ type multiMatches struct { // MultiCosN takes a list of expressions and computes the // n most similar words for each. func MultiCosN(m *Model, exprs []Expr, n int) ([][]Match, error) { + + if m.words == nil { + return nil, errors.New("MultiCosN not supported on lazy model") + } + if n == 0 { return make([][]Match, len(exprs)), nil } diff --git a/word2vec_bench_test.go b/word2vec_bench_test.go new file mode 100644 index 0000000..fd8ea4a --- /dev/null +++ b/word2vec_bench_test.go @@ -0,0 +1,143 @@ +package word2vec + +import ( + "bytes" + "encoding/binary" + "fmt" + "os" + "path/filepath" + "testing" +) + +var filename = "" //set this to the path of your model file to a more precise bench + +func BenchmarkLazyModel(b *testing.B) { + + var ( + f *os.File + err error + ) + + if filename == "" { + f, err = os.Create(filepath.Join(b.TempDir(), "go-word2vec-bench.bin")) + if err != nil { + b.Fatal(err) + } + defer f.Close() + + dim := 300 + numWords := 26 * 26 * 26 * 26 + + buf := bytes.NewBuffer(make([]byte, 0, 500_000)) + + _, err := fmt.Fprintln(buf, numWords, dim) + if err != nil { + b.Fatal(err) + } + + emptyVector := make([]float32, dim) + + for i := 'a'; i <= 'z'; i++ { + for j := 'a'; j <= 'z'; j++ { + for k := 'a'; k <= 'z'; k++ { + for l := 'a'; l <= 'z'; l++ { + _, err := fmt.Fprintf(buf, "%s%s%s%s ", string(i), string(j), string(k), string(l)) + if err != nil { + b.Fatal("unexpected error writing word") + } + err = binary.Write(buf, binary.LittleEndian, emptyVector) + if err != nil { + b.Fatal("unexpected error writing vector") + } + _, err = fmt.Fprintf(buf, "\n") + if err != nil { + b.Fatal("unexpected error writing new line") + } + } + } + } + } + bb := buf.Bytes() + + _, err = f.Write(bb) + if err != nil { + b.Fatal(err) + } + + _, err = f.Seek(0, 0) + if err != 
nil { + b.Fatal(err) + } + + } else { + f, err = os.Open(filename) + if err != nil { + b.Fatal(err) + } + defer f.Close() + } + + b.Run("InitializeEager", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, err := FromReader(f) + if err != nil { + b.Fatal(err) + } + _, err = f.Seek(0, 0) + if err != nil { + b.Fatal(err) + } + } + }) + + b.Run("InitializeLazy", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, err := LazyFromReader(f) + if err != nil { + b.Fatal(err) + } + _, err = f.Seek(0, 0) + if err != nil { + b.Fatal(err) + } + } + }) + + m, err := FromReader(f) + if err != nil { + b.Fatal(err) + } + + b.Run("LoadVectorEager", func(b *testing.B) { + + for i := 0; i < b.N; i++ { + r := m.Map([]string{"abcd"}) + if len(r) == 0 { + b.Fatal("empty return") + } + } + + }) + + _, err = f.Seek(0, 0) + if err != nil { + b.Fatal(err) + } + + l, err := LazyFromReader(f) + if err != nil { + b.Fatal(err) + } + + b.Run("LoadVectorLazy", func(b *testing.B) { + + for i := 0; i < b.N; i++ { + r := l.Map([]string{"abcd"}) + if len(r) == 0 { + b.Fatal("empty return") + } + } + + }) + +} diff --git a/word2vec_test.go b/word2vec_test.go index ee92e02..acebb37 100644 --- a/word2vec_test.go +++ b/word2vec_test.go @@ -89,24 +89,13 @@ func TestFromReader(t *testing.T) { fmt.Fprintf(buf, "\n") } - m, err := FromReader(bytes.NewReader(buf.Bytes())) + b := buf.Bytes() + + m, err := FromReader(bytes.NewReader(b)) if err != nil { t.Errorf("unexpected error from FromReader: %v", err) } - if m.Size() != len(vecs) { - t.Errorf("m.Size() = %d, expected %d", m.Size(), len(vecs)) - } - - if m.Dim() != 2 { - t.Errorf("m.Dim() = %d, expected 2", m.Dim()) - } - - mVecs := m.Map([]string{"hello", "world"}) - if !reflect.DeepEqual(vecs, mVecs) { - t.Errorf("m.Map() = %v, expected %v", mVecs, vecs) - } - x := Expr{"hello": 1.0} expectedMatches := []Match{ {Word: "hello", Score: 1.0}, @@ -124,14 +113,47 @@ func TestFromReader(t *testing.T) { t.Errorf("m.CosN(x, 2) = %v, expected: %v", matches, 
expectedMatches) } - y := Expr{"world": 1.0} - expectedCos := float32(0.0) - c, err := m.Cos(x, y) + l, err := LazyFromReader(bytes.NewReader(b)) if err != nil { - t.Errorf("unexpected error from m.Cos(x, y): %v", err) + t.Errorf("unexpected error from LazyFromReader: %v", err) + } + + _, err = l.CosN(nil, 0) + if err == nil { + t.Error("was expecting an error calling CosN on lazy model") } - if c != expectedCos { - t.Errorf("Cos(x, y) = %f, expected %f", c, expectedCos) + + _, err = MultiCosN(l, nil, 0) + if err == nil { + t.Error("was expecting an error calling MultiCosN on lazy model") + } + + models := []*Model{m, l} + + for _, m := range models { + + if m.Size() != len(vecs) { + t.Errorf("m.Size() = %d, expected %d", m.Size(), len(vecs)) + } + + if m.Dim() != 2 { + t.Errorf("m.Dim() = %d, expected 2", m.Dim()) + } + + mVecs := m.Map([]string{"hello", "world"}) + if !reflect.DeepEqual(vecs, mVecs) { + t.Errorf("m.Map() = %v, expected %v", mVecs, vecs) + } + + y := Expr{"world": 1.0} + expectedCos := float32(0.0) + c, err := m.Cos(x, y) + if err != nil { + t.Errorf("unexpected error from m.Cos(x, y): %v", err) + } + if c != expectedCos { + t.Errorf("Cos(x, y) = %f, expected %f", c, expectedCos) + } } }