Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions src/vouch/health.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def rebuild_index(store: KBStore) -> dict:

def _rebuild_embeddings(store: KBStore) -> None:
try:
from .embeddings import get_embedder
from .embeddings import content_hash, get_embedder
embedder = get_embedder()
except Exception:
return
Expand All @@ -201,9 +201,20 @@ def _rebuild_embeddings(store: KBStore) -> None:
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
vecs = embedder.encode_batch([t[2] for t in batch])
for (kind, eid, _), row in zip(batch, vecs, strict=True):
index_db.index_embedding(conn, kind=kind, id=eid,
vec=row.tolist())
for (kind, eid, text), row in zip(batch, vecs, strict=True):
# Write to `embedding_index` — the table `search_embedding()`
# reads and `reset()` (called at the top of rebuild) clears.
# `index_embedding()` targeted the legacy `embeddings` table,
# which no reader queries, so every rebuild/reindex silently
# left semantic search empty until each artifact was next
# re-written. Mirror the live `KBStore._embed_and_store`
# metadata so the per-content/model skip-check stays consistent.
index_db.put_embedding(

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Rebuild every semantic artifact kind

reset() has already deleted embedding_index, but this new put_embedding loop only writes the claim/page/entity tuples assembled above. The live write path and search_embedding() also support source, relation, and evidence embeddings. After a plain kb.index_rebuild or import-apply, those rows are therefore lost, and semantic-only searches over those artifact types return nothing until a separate kb.reindex_embeddings run or a rewrite of each artifact. Please rebuild all artifact kinds that _embed_and_store creates.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Record rebuilt embedding model metadata

reset() deletes the index_meta embedding keys, and this new direct rebuild path repopulates embedding_index without the set_embedding_meta call used by _embed_and_store/backfill_embeddings. After kb.index_rebuild or import-apply, embedding stats report no model/dim and detect_mismatch() has no stored model to compare, so a later model switch can leave these rebuilt vectors in the old vector space without warning. Please update index_meta after a successful embedding rebuild.

Useful? React with 👍 / 👎.

conn, kind=kind, id=eid, vec=row.tolist(),
content_hash=content_hash(text),

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P3 Badge Use live text when hashing rebuilt embeddings

For rebuilt pages/entities, this stores a hash of the text assembled above, whose fields are joined with single spaces, whereas the normal KBStore._embed_and_store path hashes and embeds f"{title}\n\n{body}" and f"{name}\n\n{description or ''}". After kb.index_rebuild, those rows therefore look stale to backfill_embeddings and future write hooks, and they are embedded from different text than live writes produce. Use the same text extractor as the live path before persisting content_hash.

Useful? React with 👍 / 👎.

model=embedder.name, model_version=embedder.version,
dim=embedder.dim,
)


# --- helpers used by `vouch discover` (CLI) -------------------------------
Expand Down
22 changes: 22 additions & 0 deletions tests/embeddings/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,28 @@ def test_put_page_writes_embedding(store: KBStore) -> None:
assert rec[0].shape == (8,)


def test_rebuild_index_repopulates_semantic_index(store: KBStore) -> None:
    """Embeddings must be readable from ``embedding_index`` after a rebuild.

    ``rebuild_index`` resets ``embedding_index`` (the table search reads) and
    then re-embeds. If the re-embedding lands in the legacy ``embeddings``
    table instead, semantic search silently returns nothing after every
    ``vouch index`` / ``reindex`` / import-apply.
    """
    from vouch import health

    # Seed one artifact of each kind that the assertions below look up.
    source = store.put_source(b"e")
    claim = Claim(id="c1", text="alpha beta gamma", evidence=[source.id])
    store.put_claim(claim)
    page = Page(id="p1", title="Title", body="page body")
    store.put_page(page)
    entity = Entity(id="e1", name="JWT", type=EntityType.CONCEPT)
    store.put_entity(entity)

    health.rebuild_index(store)

    # Before the fix these lookups returned None, because the rebuild wrote
    # to the dead `embeddings` table that no reader queries.
    expected = (("claim", "c1"), ("page", "p1"), ("entity", "e1"))
    for kind, artifact_id in expected:
        record = index_db.get_embedding(store.kb_dir, kind=kind, id=artifact_id)
        assert record is not None


def test_put_source_writes_embedding(store: KBStore) -> None:
src = store.put_source(b"content bytes here", title="src1")
rec = index_db.get_embedding(store.kb_dir, kind="source", id=src.id)
Expand Down