From e82ce4bab28ec65d7246c4e484653f869da45922 Mon Sep 17 00:00:00 2001
From: Rojin Dumre
Date: Sat, 8 Feb 2025 11:27:11 +1100
Subject: [PATCH] bm25 ranking function implemented

---
Review notes (below the "---" line, so not part of the commit message):

- SearchDocumentsBM25 ranks with ts_rank_cd, PostgreSQL's cover-density
  ranking, not BM25. The query name is kept so the generated identifiers
  and their callers are unchanged; the SQL comment now states the ranking
  that is actually used.
- ORDER BY gains d.id as a tiebreaker so LIMIT 50 returns a stable set of
  rows when ranks tie.
- $1 is passed to to_tsquery, which expects tsquery syntax ('foo & bar');
  free text containing spaces raises a syntax error. If callers pass raw
  user input, consider websearch_to_tsquery (TODO: confirm with callers).
- query.sql now ends with a newline.
- Line breaks and indentation were restored from a flattened copy of this
  patch; use `git apply --ignore-whitespace` if context lines do not match.

 internal/store/db.go        |  2 +-
 internal/store/models.go    |  2 +-
 internal/store/query.sql.go | 53 ++++++++++++++++++++++++++++++++++++-
 query.sql                   | 13 +++++++++
 store/schema.sql            | 28 ++++++++++++++++++++
 5 files changed, 95 insertions(+), 3 deletions(-)

diff --git a/internal/store/db.go b/internal/store/db.go
index 401f0c3..3fe36cf 100644
--- a/internal/store/db.go
+++ b/internal/store/db.go
@@ -1,6 +1,6 @@
 // Code generated by sqlc. DO NOT EDIT.
 // versions:
-//   sqlc v1.27.0
+//   sqlc v1.28.0
 
 package store
 
diff --git a/internal/store/models.go b/internal/store/models.go
index 36a8bb1..26d0281 100644
--- a/internal/store/models.go
+++ b/internal/store/models.go
@@ -1,6 +1,6 @@
 // Code generated by sqlc. DO NOT EDIT.
 // versions:
-//   sqlc v1.27.0
+//   sqlc v1.28.0
 
 package store
 
diff --git a/internal/store/query.sql.go b/internal/store/query.sql.go
index 5193cd8..71027cf 100644
--- a/internal/store/query.sql.go
+++ b/internal/store/query.sql.go
@@ -1,6 +1,6 @@
 // Code generated by sqlc. DO NOT EDIT.
 // versions:
-//   sqlc v1.27.0
+//   sqlc v1.28.0
 // source: query.sql
 
 package store
@@ -466,3 +466,54 @@ func (q *Queries) ListDocuments(ctx context.Context) ([]Document, error) {
 	}
 	return items, nil
 }
+
+const searchDocumentsBM25 = `-- name: SearchDocumentsBM25 :many
+SELECT
+    d.id,
+    d.file_path,
+    d.total_chunk_size,
+    d.created_at,
+    d.updated_at,
+    -- Cover-density rank (ts_rank_cd); PostgreSQL has no built-in BM25:
+    ts_rank_cd(d.text_searchable_column, to_tsquery('english', $1)) AS rank
+FROM documents d
+WHERE d.text_searchable_column @@ to_tsquery('english', $1)
+ORDER BY rank DESC, d.id
+LIMIT 50
+`
+
+type SearchDocumentsBM25Row struct {
+	ID             int64
+	FilePath       string
+	TotalChunkSize pgtype.Int8
+	CreatedAt      pgtype.Timestamptz
+	UpdatedAt      pgtype.Timestamptz
+	Rank           float32
+}
+
+func (q *Queries) SearchDocumentsBM25(ctx context.Context, toTsquery string) ([]SearchDocumentsBM25Row, error) {
+	rows, err := q.db.Query(ctx, searchDocumentsBM25, toTsquery)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	var items []SearchDocumentsBM25Row
+	for rows.Next() {
+		var i SearchDocumentsBM25Row
+		if err := rows.Scan(
+			&i.ID,
+			&i.FilePath,
+			&i.TotalChunkSize,
+			&i.CreatedAt,
+			&i.UpdatedAt,
+			&i.Rank,
+		); err != nil {
+			return nil, err
+		}
+		items = append(items, i)
+	}
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	return items, nil
+}
diff --git a/query.sql b/query.sql
index 8321ef6..d3e4769 100755
--- a/query.sql
+++ b/query.sql
@@ -115,3 +115,16 @@ JOIN documents d ON d.id = e.document_id
 WHERE (sqlc.narg(metadata_hash)::text IS NULL OR e.metadata_hash = sqlc.narg(metadata_hash)::text)
 ORDER BY e.created_at DESC;
 
+-- name: SearchDocumentsBM25 :many
+SELECT
+    d.id,
+    d.file_path,
+    d.total_chunk_size,
+    d.created_at,
+    d.updated_at,
+    -- Cover-density rank (ts_rank_cd); PostgreSQL has no built-in BM25:
+    ts_rank_cd(d.text_searchable_column, to_tsquery('english', $1)) AS rank
+FROM documents d
+WHERE d.text_searchable_column @@ to_tsquery('english', $1)
+ORDER BY rank DESC, d.id
+LIMIT 50;
diff --git a/store/schema.sql b/store/schema.sql
index 1dcaded..c9ebc4c 100644
--- a/store/schema.sql
+++ b/store/schema.sql
@@ -274,6 +274,18 @@ ALTER TABLE public.documents OWNER TO admin;
 -- Name: documents_id_seq; Type: SEQUENCE; Schema: public; Owner: admin
 --
 
+-- a tsvector column for full-text search in the documents table
+
+ALTER TABLE public.documents
+ADD COLUMN text_searchable_column tsvector;
+
+
+-- just converting file_path for now
+
+UPDATE public.documents
+SET text_searchable_column = to_tsvector('english', file_path);
+
+
 CREATE SEQUENCE public.documents_id_seq
     START WITH 1
     INCREMENT BY 1
@@ -557,3 +569,19 @@ ALTER TABLE ONLY public.embeddings
 -- PostgreSQL database dump complete
 --
 
+-- creating a gin index to speed up text search operations
+CREATE INDEX idx_text_searchable_column
+ON public.documents USING gin (text_searchable_column);
+
+
+-- trigger to automatically update text_searchable_column
+    CREATE TRIGGER documents_tsvector_update
+    BEFORE INSERT OR UPDATE ON public.documents
+    FOR EACH ROW
+    EXECUTE FUNCTION tsvector_update_trigger(
+        text_searchable_column,
+        'pg_catalog.english',
+        file_path
+    );
+
+