diff --git a/src/vouch/health.py b/src/vouch/health.py index ac3fe510..9698dc9d 100644 --- a/src/vouch/health.py +++ b/src/vouch/health.py @@ -183,7 +183,7 @@ def rebuild_index(store: KBStore) -> dict: def _rebuild_embeddings(store: KBStore) -> None: try: - from .embeddings import get_embedder + from .embeddings import content_hash, get_embedder embedder = get_embedder() except Exception: return @@ -201,9 +201,20 @@ def _rebuild_embeddings(store: KBStore) -> None: for i in range(0, len(texts), batch_size): batch = texts[i:i + batch_size] vecs = embedder.encode_batch([t[2] for t in batch]) - for (kind, eid, _), row in zip(batch, vecs, strict=True): - index_db.index_embedding(conn, kind=kind, id=eid, - vec=row.tolist()) + for (kind, eid, text), row in zip(batch, vecs, strict=True): + # Write to `embedding_index` — the table `search_embedding()` + # reads and `reset()` (called at the top of rebuild) clears. + # `index_embedding()` targeted the legacy `embeddings` table, + # which no reader queries, so every rebuild/reindex silently + # left semantic search empty until each artifact was next + # re-written. Mirror the live `KBStore._embed_and_store` + # metadata so the per-content/model skip-check stays consistent. + index_db.put_embedding( + conn, kind=kind, id=eid, vec=row.tolist(), + content_hash=content_hash(text), + model=embedder.name, model_version=embedder.version, + dim=embedder.dim, + ) # --- helpers used by `vouch discover` (CLI) ------------------------------- diff --git a/tests/embeddings/test_search.py b/tests/embeddings/test_search.py index 108e46e1..ae7b0e21 100644 --- a/tests/embeddings/test_search.py +++ b/tests/embeddings/test_search.py @@ -49,6 +49,28 @@ def test_put_page_writes_embedding(store: KBStore) -> None: assert rec[0].shape == (8,) +def test_rebuild_index_repopulates_semantic_index(store: KBStore) -> None: + # rebuild_index resets embedding_index (the table search reads) and then + # re-embeds. The re-embedding must land in embedding_index, not the legacy + # `embeddings` table — otherwise semantic search silently returns nothing + # after every `vouch index` / `reindex` / import-apply. + from vouch import health + + src = store.put_source(b"e") + store.put_claim(Claim(id="c1", text="alpha beta gamma", evidence=[src.id])) + store.put_page(Page(id="p1", title="Title", body="page body")) + store.put_entity(Entity(id="e1", name="JWT", type=EntityType.CONCEPT)) + + health.rebuild_index(store) + + # Each artifact's embedding must be readable from embedding_index after the + # rebuild (these are None before the fix, because the rebuild wrote them to + # the dead `embeddings` table that no reader queries). + assert index_db.get_embedding(store.kb_dir, kind="claim", id="c1") is not None + assert index_db.get_embedding(store.kb_dir, kind="page", id="p1") is not None + assert index_db.get_embedding(store.kb_dir, kind="entity", id="e1") is not None + + def test_put_source_writes_embedding(store: KBStore) -> None: src = store.put_source(b"content bytes here", title="src1") rec = index_db.get_embedding(store.kb_dir, kind="source", id=src.id)