From 6cacdd62a858dbc0ffe02673d2837007f3a05f6f Mon Sep 17 00:00:00 2001 From: Dmitry Voropaev Date: Mon, 22 Jun 2026 11:08:13 +0300 Subject: [PATCH] feat: per-module LLM-grounded descriptions (kb describe) Extend the key-gated `kb describe` pass to describe each first-party module (file), not just `api_route` / `entity` artifacts. A module is not an artifact, so it is enumerated from its span occurrences at the snapshot SHA (`store.queries.module_targets` -> `ModuleTarget`) and grounded on ALL of the file's spans (module + its classes/functions/imports). `describe.py` is refactored to a shared `_describe_one(...)` reused by the artifact loop and a new module loop; module descriptions use `target_kind="module"` and logical key `desc:module:<module>`. The prompt source body is capped while validation still runs over every span. No new invariants: the same deterministic sub-property gate (`grounding.validate_claims`) drops any claim whose cited symbol does not occur in the file's spans, so a module is described only if a real symbol survives. The `semantic_grounding` HARD gate is extended with the module path (adversarial fabricated claim dropped; a module with no matching symbol gets no description). Headline gate count stays nine. README / DESIGN / CHANGELOG updated; `kb index` stays offline. --- CHANGELOG.md | 11 +- DESIGN.md | 10 +- README.md | 6 +- src/kb/eval/semantic_grounding_test.py | 52 ++++++++-- src/kb/extract/semantic/describe.py | 136 +++++++++++++++++-------- src/kb/store/queries.py | 42 ++++++++ 6 files changed, 198 insertions(+), 59 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a7395e..b58a9de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,10 +17,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `description` artifact is stored only if something survives, grounded on the same spans (`extraction_method = "llm_grounded"`, `model_id` + `prompt_version` in the artifact key). 
Surfaced via MCP `get_knowledge` / `search_knowledge`. Uses `kb.llm` (Anthropic default, OpenAI optional). +- **Per-module descriptions** (`kb describe`, second slice): the same pass now also describes each + first-party module (file). A module is not an artifact, so it is enumerated from its span + occurrences (`store.queries.module_targets`) and grounded on **all** of the file's spans + (module + classes/functions/imports); `target_kind="module"`, logical key `desc:module:<module>`. + The same span-validation gate applies, so a module gets a description only if a cited symbol + actually occurs in the file — no new invariants. The `semantic_grounding` HARD gate is extended + with the module path (adversarial claim dropped; a module with no matching symbol gets nothing). - **Semantic grounding HARD gate** (`kb.eval.semantic_grounding_test`): runs the describer on a **stub** LLM (no API key) and asserts an adversarial fabricated claim is dropped while the grounded - claim is stored — the DESIGN §9 semantic floor, enforced deterministically in CI. Headline HARD - gates: eight → **nine**. + claim is stored — on both the artifact and the module path — the DESIGN §9 semantic floor, + enforced deterministically in CI. Headline HARD gates: eight → **nine**. ## [0.3.0] - 2026-06-21 diff --git a/DESIGN.md b/DESIGN.md index d8a10ef..d157287 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -288,9 +288,11 @@ rejected. **Verbalized LLM confidence is never used as the score.** > > *Implemented (first slice):* the `kb describe` describer enforces this floor — > `kb.extract.semantic.grounding.validate_claims` drops any claim whose cited symbol is absent from -> the artifact's grounding spans; an artifact with no surviving claim is not stored. The gate is -> deterministic, so `semantic_grounding_test` enforces it in CI (stub LLM, no API key), including an -> adversarial fabricated claim that must be dropped. +> the target's grounding spans; a target with no surviving claim is not stored. 
It covers +> `api_route`/`entity` artifacts and **per-module (file) descriptions** (a module is enumerated from +> its span occurrences and grounded on *all* of the file's spans). The gate is deterministic, so +> `semantic_grounding_test` enforces it in CI (stub LLM, no API key), including an adversarial +> fabricated claim that must be dropped — on both the artifact and the module path. --- @@ -325,7 +327,7 @@ freshness(current|stale@sha)`, with a deterministic tie-break for reproducible e | `kb.eval` | Tiered eval; deterministic tiers gate CI. | pytest over SHA-pinned golden repos | | `kb.mcp` | Read-only MCP server; provenance-carrying records; budget-aware assembly. | FastMCP (pinned), Pydantic models | | `kb.daemon` | Orchestration + CLI: index a repo @ SHA, run extractors in order, write snapshot, host MCP. | typer | -| `kb.extract.semantic` | **First slice shipped:** `kb describe` — LLM-grounded NL descriptions of routes/entities, each claim validated against the artifact's spans by a deterministic sub-property gate (`grounding.validate_claims`); separate key-gated pass, never on `index`. *Deferred:* the grounded business-process extractor (entrypoints → call-graph slice → sinks → LLM labeler → span-binding validator). | thin LLM adapter (`kb.llm`); later: `PathEngine` (call-graph), YAML sink registry | +| `kb.extract.semantic` | **First slice shipped:** `kb describe` — LLM-grounded NL descriptions of routes/entities/modules (modules grounded on all of the file's spans), each claim validated against the target's spans by a deterministic sub-property gate (`grounding.validate_claims`); separate key-gated pass, never on `index`. *Deferred:* the grounded business-process extractor (entrypoints → call-graph slice → sinks → LLM labeler → span-binding validator). 
| thin LLM adapter (`kb.llm`); later: `PathEngine` (call-graph), YAML sink registry | --- diff --git a/README.md b/README.md index b728483..e7dc21e 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ flowchart LR - **Read-only MCP server** — `find_provenance`, `get_knowledge`, and `search_knowledge`, each returning provenance-carrying units (method + confidence + freshness). - **pgvector embeddings + semantic search** — a replaceable embedding provider (sentence-transformers by default, OpenAI optional) populated by a separate `kb embed` pass; torch stays out of the index path. - **A frozen RAG-over-source baseline** and the **Tier-3 knowledge-vs-RAG recall gate** — the honest A/B that backs the "knowledge > RAG" thesis. -- **LLM-grounded descriptions** — an optional, key-gated `kb describe` pass has an LLM write NL summaries for routes/entities; every claim is validated against the artifact's own spans by a deterministic sub-property gate, so ungrounded claims are *dropped* (the anti-hallucination invariant, with a model in the loop). Stored as `extraction_method = "llm_grounded"`, grounded on the same spans. +- **LLM-grounded descriptions** — an optional, key-gated `kb describe` pass has an LLM write NL summaries for routes, entities, and modules (per file, grounded on all of the file's spans); every claim is validated against the target's own spans by a deterministic sub-property gate, so ungrounded claims are *dropped* (the anti-hallucination invariant, with a model in the loop). Stored as `extraction_method = "llm_grounded"`, grounded on the same spans. - **Nine HARD CI eval gates** (see [Development](#development)). - **A nightly LLM-judged A/B** (optional, key-gated, **non-gating**) — an answerer LLM answers each question from knowbase's grounded context vs a RAG-over-source context, and a judge LLM scores **answer accuracy** (against hand-written gold) + **hallucination**. Tracked metrics on top of recall; it never blocks CI. 
@@ -149,7 +149,7 @@ uv run kb embed --db-url # separate pass: populate artifact emb uv run kb describe --db-url # separate, key-gated pass (ANTHROPIC_API_KEY / OPENAI_API_KEY) ``` -`kb describe` has an LLM (via `kb.llm`, `KB_LLM_PROVIDER` in {`anthropic`,`openai`}) write a short NL summary + structured claims for each route/entity in the latest snapshot. **Every claim is validated against that artifact's own grounding spans** — claims citing a symbol not in the code are dropped, and a `description` artifact is stored only if something survives, grounded on the same spans (`extraction_method = "llm_grounded"`). It needs an API key, never runs on `kb index`, and the deterministic grounding gate is exercised in CI without a key (stub LLM). +`kb describe` has an LLM (via `kb.llm`, `KB_LLM_PROVIDER` in {`anthropic`,`openai`}) write a short NL summary + structured claims for each route, entity, and module (per file, grounded on all of the file's spans) in the latest snapshot. **Every claim is validated against that target's own grounding spans** — claims citing a symbol not in the code are dropped, and a `description` artifact is stored only if something survives, grounded on the same spans (`extraction_method = "llm_grounded"`). It needs an API key, never runs on `kb index`, and the deterministic grounding gate is exercised in CI without a key (stub LLM). ### Serve to an AI agent (MCP) @@ -218,7 +218,7 @@ A Python package `kb` (uv, src-layout). Modules and their responsibilities: | `kb.mcp` | Read-only MCP server and its provenance-carrying records: `find_provenance`, `get_knowledge`, `search_knowledge`. | | `kb.embed` | Replaceable embedding adapters (sentence-transformers default, OpenAI optional) + snapshot population. Torch isolated behind the `embed` extra and a lazy import. | | `kb.rag` | The frozen pgvector RAG-over-source baseline — the "other arm" of the knowledge-vs-RAG A/B (no provenance, no grounding). 
| -| `kb.extract.semantic` | LLM-grounded extraction (`kb describe`): NL descriptions of routes/entities with a deterministic sub-property gate (`grounding.validate_claims`) that drops any claim not backed by the artifact's spans. Separate key-gated pass; never on `index`. | +| `kb.extract.semantic` | LLM-grounded extraction (`kb describe`): NL descriptions of routes/entities/modules with a deterministic sub-property gate (`grounding.validate_claims`) that drops any claim not backed by the target's spans. Separate key-gated pass; never on `index`. | | `kb.daemon.cli` | The `kb` CLI: `index`, `migrate`, `embed`, `describe`, `serve` (MCP), and `introspect` — all functional. | | `kb.eval` | Nine HARD CI gates (identity reproducibility, adversarial grounding, Tier-1 import oracle, Tier-1 API oracle, Tier-1 entities oracle, Tier-3 knowledge-vs-RAG recall, Tier-4 one-hop invalidation, invariants, semantic grounding floor) plus the supporting MCP / embed / store suite. | diff --git a/src/kb/eval/semantic_grounding_test.py b/src/kb/eval/semantic_grounding_test.py index 2e09cea..aae72c7 100644 --- a/src/kb/eval/semantic_grounding_test.py +++ b/src/kb/eval/semantic_grounding_test.py @@ -11,6 +11,7 @@ import json from pathlib import Path +from typing import Any from sqlalchemy import Engine, select @@ -61,18 +62,12 @@ def test_validator_drops_fabricated_symbol() -> None: assert [c["symbol"] for c in dropped] == [FAKE] -def test_describe_stores_only_grounded_claims(engine: Engine, tmp_path: Path) -> None: - sha = _index(engine, tmp_path) - result = describe_snapshot(engine, sha, _StubProvider()) - - assert result.described > 0 - assert result.dropped_claims > 0 # the fabricated claim was dropped on every artifact - +def _description_rows(engine: Engine, sha: str) -> list[Any]: join = m.snapshot_entry.join( m.artifact, m.artifact.c.artifact_id == m.snapshot_entry.c.artifact_id ) with engine.connect() as conn: - rows = conn.execute( + return conn.execute( select( 
m.artifact.c.logical_key, m.artifact.c.payload, @@ -81,7 +76,18 @@ def test_describe_stores_only_grounded_claims(engine: Engine, tmp_path: Path) -> .select_from(join) .where(m.snapshot_entry.c.sha == sha, m.artifact.c.kind == "description") ).all() - assert rows + + +def test_describe_stores_only_grounded_claims(engine: Engine, tmp_path: Path) -> None: + sha = _index(engine, tmp_path) + result = describe_snapshot(engine, sha, _StubProvider()) + + assert result.described > 0 + assert result.dropped_claims > 0 # the fabricated claim was dropped on every artifact + + rows = _description_rows(engine, sha) + assert rows + with engine.connect() as conn: for row in rows: symbols = [c["symbol"] for c in row.payload["claims"]] assert REAL in symbols # the grounded claim survives @@ -89,3 +95,31 @@ def test_describe_stores_only_grounded_claims(engine: Engine, tmp_path: Path) -> assert row.is_deterministic is False # surfaced as llm_grounded prov_files = {p.file_path for p in provenance_for_artifact(conn, sha, row.logical_key)} assert prov_files # grounded on its target's spans (>= 1 file) + + +def test_module_descriptions_are_grounded(engine: Engine, tmp_path: Path) -> None: + """The same span-validation gate covers per-module descriptions (DESIGN.md §9).""" + sha = _index(engine, tmp_path) + describe_snapshot(engine, sha, _StubProvider()) + + rows = _description_rows(engine, sha) + modules = { + r.payload["target_logical_key"]: r + for r in rows + if r.payload["target_kind"] == "module" + } + assert modules # modules are described, not just artifacts + + # Modules where OrderOut occurs (defined in app.schemas, imported/used in app.routes) ARE + # described; the fabricated symbol is dropped on the module path too (adversarial). 
+ described = set(modules) + assert "app.schemas" in described or "app.routes" in described + for row in modules.values(): + symbols = [c["symbol"] for c in row.payload["claims"]] + assert REAL in symbols + assert FAKE not in symbols + + # Modules with no occurrence of OrderOut (e.g. app.main, app.__init__) get NO description: + # every claim was a hallucination relative to the file's spans, so nothing is stored. + assert "app.main" not in described + assert "app.__init__" not in described diff --git a/src/kb/extract/semantic/describe.py b/src/kb/extract/semantic/describe.py index 51e29a7..7bd67ac 100644 --- a/src/kb/extract/semantic/describe.py +++ b/src/kb/extract/semantic/describe.py @@ -1,9 +1,10 @@ """LLM-grounded NL descriptions over a snapshot — a separate, key-gated pass (DESIGN.md §4, §9). -For each ``api_route`` / ``entity`` artifact, an LLM writes a short summary plus structured claims; -each claim is validated against the artifact's own grounding spans (``grounding.validate_claims``), -unvalidated claims are dropped, and — if anything survives — a ``description`` artifact is stored -grounded on the SAME spans (role ``describes``, ``is_deterministic=False``). Never on the +For each ``api_route`` / ``entity`` artifact AND each first-party module (file), an LLM writes a +short summary plus structured claims; each claim is validated against the target's own grounding +spans (``grounding.validate_claims``), unvalidated claims are dropped, and — if anything survives — +a ``description`` artifact is stored grounded on the SAME spans (role ``describes``, +``is_deterministic=False``). Modules are grounded on ALL of the file's spans. Never on the ``kb index`` path. Idempotent per (model, prompt): ``artifact_id`` folds in model_id + prompt. 
""" @@ -14,19 +15,20 @@ from dataclasses import dataclass from typing import Any -from sqlalchemy import Engine, select +from sqlalchemy import Connection, Engine, select from kb.extract.base import DerivedEdge, ExtractedArtifact from kb.extract.semantic.grounding import validate_claims from kb.llm.providers import LLMProvider from kb.store import models as m -from kb.store.queries import spans_for_artifact +from kb.store.queries import ArtifactSpanRow, module_targets, spans_for_artifact from kb.store.writer import write_grounded_artifact, write_snapshot_entry EXTRACTOR_ID = "llm_describe" EXTRACTOR_VERSION = "1" PROMPT_VERSION = "1" DESCRIBE_KINDS = ("api_route", "entity") +_BODY_CAP = 6000 # prompt source-span body cap (validation still runs over every span) _SYSTEM = ( "You describe a code artifact using ONLY the provided source spans. Respond with STRICT JSON " @@ -44,7 +46,7 @@ class DescribeResult: def describe_snapshot(engine: Engine, sha: str, provider: LLMProvider) -> DescribeResult: - """Generate grounded descriptions for the snapshot's api_route / entity artifacts.""" + """Generate grounded descriptions for the snapshot's api_route/entity artifacts and modules.""" join = m.snapshot_entry.join( m.artifact, m.artifact.c.artifact_id == m.snapshot_entry.c.artifact_id ) @@ -64,45 +66,97 @@ def describe_snapshot(engine: Engine, sha: str, provider: LLMProvider) -> Descri spans = spans_for_artifact(conn, sha, target.logical_key) if not spans: continue - prompt = _build_prompt(target.kind, target.payload, spans) - data = _parse_json(provider.complete(_SYSTEM, prompt, max_tokens=600)) - if data is None: - continue - raw_claims = [c for c in data.get("claims", []) if isinstance(c, dict)] - kept, dropped = validate_claims( - raw_claims, [s.raw_text for s in spans], [s.fq_symbol_path for s in spans] - ) - dropped_total += len(dropped) - if not kept: - continue # nothing grounded survives -> store nothing (anti-hallucination) - artifact = ExtractedArtifact( - 
kind="description", + stored, dropped = _describe_one( + conn, + sha, + provider, logical_key=f"desc:{target.logical_key}", - payload={ - "target_logical_key": target.logical_key, - "target_kind": target.kind, - "summary": str(data.get("summary", ""))[:500], - "claims": kept, - "dropped_claims": len(dropped), - }, - derived_from=[DerivedEdge(s.span_id, "describes") for s in spans], - extractor_id=EXTRACTOR_ID, - extractor_version=EXTRACTOR_VERSION, - prompt_version=PROMPT_VERSION, - model_id=provider.model_id, - is_deterministic=False, - confidence=len(kept) / (len(kept) + len(dropped)), + target_logical_key=target.logical_key, + target_kind=target.kind, + facts=target.payload, + spans=spans, ) - artifact_id = write_grounded_artifact(conn, artifact) - write_snapshot_entry(conn, sha, artifact.logical_key, artifact_id) - described += 1 + described += int(stored) + dropped_total += dropped + + for module in module_targets(conn, sha): + stored, dropped = _describe_one( + conn, + sha, + provider, + logical_key=f"desc:module:{module.module}", + target_logical_key=module.module, + target_kind="module", + facts={"module": module.module, "file_path": module.file_path}, + spans=module.spans, + ) + described += int(stored) + dropped_total += dropped return DescribeResult(sha=sha, described=described, dropped_claims=dropped_total) -def _build_prompt(kind: str, payload: dict[str, Any], spans: list[Any]) -> str: - facts = json.dumps(payload, default=str)[:800] - body = "\n\n".join(f"# {s.fq_symbol_path}\n{s.raw_text}" for s in spans) - return f"Artifact kind: {kind}\nKnown facts: {facts}\n\nSource spans:\n{body}" +def _describe_one( + conn: Connection, + sha: str, + provider: LLMProvider, + *, + logical_key: str, + target_logical_key: str, + target_kind: str, + facts: dict[str, Any], + spans: list[ArtifactSpanRow], +) -> tuple[bool, int]: + """Describe one target (artifact or module) from its grounding spans. + + Returns ``(stored, dropped_count)``. 
A ``description`` artifact is stored (grounded on the + spans, role ``describes``) only if >= 1 claim survives span-validation; otherwise nothing is + stored (anti-hallucination). Idempotent per (model, prompt). + """ + prompt = _build_prompt(target_kind, facts, spans) + data = _parse_json(provider.complete(_SYSTEM, prompt, max_tokens=600)) + if data is None: + return False, 0 + raw_claims = [c for c in data.get("claims", []) if isinstance(c, dict)] + kept, dropped = validate_claims( + raw_claims, [s.raw_text for s in spans], [s.fq_symbol_path for s in spans] + ) + if not kept: + return False, len(dropped) # nothing grounded survives -> store nothing + artifact = ExtractedArtifact( + kind="description", + logical_key=logical_key, + payload={ + "target_logical_key": target_logical_key, + "target_kind": target_kind, + "summary": str(data.get("summary", ""))[:500], + "claims": kept, + "dropped_claims": len(dropped), + }, + derived_from=[DerivedEdge(s.span_id, "describes") for s in spans], + extractor_id=EXTRACTOR_ID, + extractor_version=EXTRACTOR_VERSION, + prompt_version=PROMPT_VERSION, + model_id=provider.model_id, + is_deterministic=False, + confidence=len(kept) / (len(kept) + len(dropped)), + ) + artifact_id = write_grounded_artifact(conn, artifact) + write_snapshot_entry(conn, sha, artifact.logical_key, artifact_id) + return True, len(dropped) + + +def _build_prompt(kind: str, facts: dict[str, Any], spans: list[ArtifactSpanRow]) -> str: + facts_json = json.dumps(facts, default=str)[:800] + parts: list[str] = [] + used = 0 + for s in spans: + block = f"# {s.fq_symbol_path}\n{s.raw_text}" + if parts and used + len(block) > _BODY_CAP: + break + parts.append(block) + used += len(block) + body = "\n\n".join(parts) + return f"Artifact kind: {kind}\nKnown facts: {facts_json}\n\nSource spans:\n{body}" def _parse_json(raw: str) -> dict[str, Any] | None: diff --git a/src/kb/store/queries.py b/src/kb/store/queries.py index 1d4e896..b6b5da2 100644 --- 
a/src/kb/store/queries.py +++ b/src/kb/store/queries.py @@ -232,6 +232,48 @@ def spans_for_artifact(conn: Connection, sha: str, logical_key: str) -> list[Art return [ArtifactSpanRow(r.span_id, r.fq_symbol_path, r.raw_text) for r in rows] +@dataclass(frozen=True) +class ModuleTarget: + module: str # the file's module span fq path (e.g. "app.schemas") + file_path: str + spans: list[ArtifactSpanRow] # ALL of the file's spans (module + classes/functions/imports) + + +def module_targets(conn: Connection, sha: str) -> list[ModuleTarget]: + """First-party modules in ``sha``'s snapshot, each with all of its spans. + + A module is not an artifact, so it is enumerated from its span occurrences at ``sha`` (which are + first-party-only — the pipeline parses solely files under the first-party root). Feeds the + LLM-grounded module describer: the file's spans are both the prompt context and the + deterministic ground truth its claims are validated against (DESIGN.md §9). + """ + join = m.code_span.join(m.span_occurrence, m.span_occurrence.c.span_id == m.code_span.c.span_id) + rows = conn.execute( + select( + m.code_span.c.span_id, + m.code_span.c.span_kind, + m.code_span.c.fq_symbol_path, + m.span_occurrence.c.raw_text, + m.span_occurrence.c.file_path, + ) + .select_from(join) + .where(m.span_occurrence.c.sha == sha) + .order_by(m.span_occurrence.c.file_path, m.code_span.c.fq_symbol_path) + ).all() + by_file: dict[str, list[Any]] = {} + for row in rows: + by_file.setdefault(row.file_path, []).append(row) + targets: list[ModuleTarget] = [] + for file_path, file_rows in by_file.items(): + module = next( + (r.fq_symbol_path for r in file_rows if r.span_kind == "module"), + file_rows[0].fq_symbol_path, + ) + spans = [ArtifactSpanRow(r.span_id, r.fq_symbol_path, r.raw_text) for r in file_rows] + targets.append(ModuleTarget(module=module, file_path=file_path, spans=spans)) + return targets + + def _like_literal(value: str, suffix: str) -> str: escaped = value.replace("\\", 
"\\\\").replace("%", "\\%").replace("_", "\\_") return escaped + suffix