From 54e3b22cfde22301a1880e73066b14e0e0324ebc Mon Sep 17 00:00:00 2001 From: dripsmvcp <138900956+dripsmvcp@users.noreply.github.com> Date: Wed, 17 Jun 2026 10:58:10 +0900 Subject: [PATCH] feat(eval): recall-quality eval harness with CI baseline gate (issue #226) Add vouch eval recall: score kb.context retrieval against a labeled query set (P@k, R@k, MRR, nDCG), compare against a committed baseline, and fail CI on a >5% P@5 regression. Pure-Python metrics; deterministic report; starter labeled set + fixture KB + eval workflow. --- .github/workflows/eval.yml | 39 ++++ CHANGELOG.md | 8 + eval/baseline.json | 166 ++++++++++++++++++ eval/fixture-kb/.vouch/.gitignore | 3 + .../.vouch/claims/api-rate-limit.yaml | 17 ++ eval/fixture-kb/.vouch/claims/auth-jwt.yaml | 17 ++ .../.vouch/claims/auth-session.yaml | 17 ++ .../fixture-kb/.vouch/claims/cache-redis.yaml | 17 ++ .../.vouch/claims/db-migrations.yaml | 17 ++ .../fixture-kb/.vouch/claims/db-postgres.yaml | 17 ++ eval/fixture-kb/.vouch/claims/deploy-ci.yaml | 17 ++ .../.vouch/claims/deploy-docker.yaml | 17 ++ .../.vouch/claims/search-embeddings.yaml | 17 ++ .../fixture-kb/.vouch/claims/search-fts5.yaml | 17 ++ eval/fixture-kb/.vouch/config.yaml | 13 ++ .../content | 1 + .../meta.yaml | 12 ++ eval/queries.jsonl | 10 ++ src/vouch/cli.py | 22 +++ src/vouch/eval/__init__.py | 22 +++ src/vouch/eval/recall.py | 132 ++++++++++++++ tests/test_eval_recall.py | 134 ++++++++++++++ 22 files changed, 732 insertions(+) create mode 100644 .github/workflows/eval.yml create mode 100644 eval/baseline.json create mode 100644 eval/fixture-kb/.vouch/.gitignore create mode 100644 eval/fixture-kb/.vouch/claims/api-rate-limit.yaml create mode 100644 eval/fixture-kb/.vouch/claims/auth-jwt.yaml create mode 100644 eval/fixture-kb/.vouch/claims/auth-session.yaml create mode 100644 eval/fixture-kb/.vouch/claims/cache-redis.yaml create mode 100644 eval/fixture-kb/.vouch/claims/db-migrations.yaml create mode 100644 
eval/fixture-kb/.vouch/claims/db-postgres.yaml create mode 100644 eval/fixture-kb/.vouch/claims/deploy-ci.yaml create mode 100644 eval/fixture-kb/.vouch/claims/deploy-docker.yaml create mode 100644 eval/fixture-kb/.vouch/claims/search-embeddings.yaml create mode 100644 eval/fixture-kb/.vouch/claims/search-fts5.yaml create mode 100644 eval/fixture-kb/.vouch/config.yaml create mode 100644 eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/content create mode 100644 eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/meta.yaml create mode 100644 eval/queries.jsonl create mode 100644 src/vouch/eval/__init__.py create mode 100644 src/vouch/eval/recall.py create mode 100644 tests/test_eval_recall.py diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml new file mode 100644 index 0000000..150b6f2 --- /dev/null +++ b/.github/workflows/eval.yml @@ -0,0 +1,39 @@ +name: eval + +# Gate retrieval quality: score kb.context against the committed labeled set +# and fail on a P@5 regression beyond tolerance vs eval/baseline.json. Runs +# only when retrieval code changes. 
+on: + pull_request: + paths: + - "src/vouch/embeddings/**" + - "src/vouch/context.py" + - "src/vouch/eval/**" + - "eval/**" + +jobs: + recall: + name: recall eval + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: install + run: | + python -m pip install --upgrade pip + pip install -e '.[dev]' + + - name: build fixture index + working-directory: eval/fixture-kb + run: python -m vouch.cli reindex + + - name: recall eval (fail on P@5 regression > 5%) + working-directory: eval/fixture-kb + run: >- + python -m vouch.cli eval recall ../queries.jsonl + --k 5 --baseline ../baseline.json --max-regression 0.05 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d4fff9..a80f5ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,14 @@ All notable changes to vouch are documented here. Format follows ## [Unreleased] +### Added +- `vouch eval recall QUERIES` — score `kb.context` retrieval against a + labeled query set with pure-Python P@k / R@k / MRR / nDCG, compare against a + committed `eval/baseline.json`, and fail CI on a P@5 regression beyond + tolerance (default 0.05 absolute, i.e. five percentage points). Ships a starter labeled set, a reproducible fixture + KB under `eval/fixture-kb/`, and an `eval` workflow gating retrieval changes + (#226).
+ ## [0.1.0] — 2026-05-26 ### Packaging diff --git a/eval/baseline.json b/eval/baseline.json new file mode 100644 index 0000000..f446660 --- /dev/null +++ b/eval/baseline.json @@ -0,0 +1,166 @@ +{ + "k": 5, + "n_queries": 10, + "macro": { + "p_at_k": 0.18, + "r_at_k": 0.9, + "mrr": 0.9, + "ndcg_at_k": 0.9 + }, + "per_query": [ + { + "query": "JWT bearer token authentication", + "expected": [ + "auth-jwt" + ], + "ranked": [ + "auth-jwt" + ], + "scores": { + "p_at_k": 0.2, + "r_at_k": 1.0, + "mrr": 1.0, + "ndcg_at_k": 1.0 + } + }, + { + "query": "how long until a session expires", + "expected": [ + "auth-session" + ], + "ranked": [ + "deploy-docker", + "cache-redis" + ], + "scores": { + "p_at_k": 0.0, + "r_at_k": 0.0, + "mrr": 0.0, + "ndcg_at_k": 0.0 + } + }, + { + "query": "PostgreSQL primary datastore", + "expected": [ + "db-postgres" + ], + "ranked": [ + "db-postgres" + ], + "scores": { + "p_at_k": 0.2, + "r_at_k": 1.0, + "mrr": 1.0, + "ndcg_at_k": 1.0 + } + }, + { + "query": "Alembic database migrations on deploy", + "expected": [ + "db-migrations" + ], + "ranked": [ + "db-migrations", + "deploy-ci" + ], + "scores": { + "p_at_k": 0.2, + "r_at_k": 1.0, + "mrr": 1.0, + "ndcg_at_k": 1.0 + } + }, + { + "query": "Redis cache TTL for query results", + "expected": [ + "cache-redis" + ], + "ranked": [ + "cache-redis" + ], + "scores": { + "p_at_k": 0.2, + "r_at_k": 1.0, + "mrr": 1.0, + "ndcg_at_k": 1.0 + } + }, + { + "query": "SQLite FTS5 search fallback", + "expected": [ + "search-fts5" + ], + "ranked": [ + "search-fts5", + "search-embeddings" + ], + "scores": { + "p_at_k": 0.2, + "r_at_k": 1.0, + "mrr": 1.0, + "ndcg_at_k": 1.0 + } + }, + { + "query": "sentence transformer semantic embeddings", + "expected": [ + "search-embeddings" + ], + "ranked": [ + "search-embeddings", + "search-fts5" + ], + "scores": { + "p_at_k": 0.2, + "r_at_k": 1.0, + "mrr": 1.0, + "ndcg_at_k": 1.0 + } + }, + { + "query": "continuous integration ruff mypy pytest", + "expected": [ + "deploy-ci" + 
], + "ranked": [ + "deploy-ci" + ], + "scores": { + "p_at_k": 0.2, + "r_at_k": 1.0, + "mrr": 1.0, + "ndcg_at_k": 1.0 + } + }, + { + "query": "Docker multi-stage image deployment", + "expected": [ + "deploy-docker" + ], + "ranked": [ + "deploy-docker" + ], + "scores": { + "p_at_k": 0.2, + "r_at_k": 1.0, + "mrr": 1.0, + "ndcg_at_k": 1.0 + } + }, + { + "query": "public API rate limit per minute", + "expected": [ + "api-rate-limit" + ], + "ranked": [ + "api-rate-limit" + ], + "scores": { + "p_at_k": 0.2, + "r_at_k": 1.0, + "mrr": 1.0, + "ndcg_at_k": 1.0 + } + } + ] +} diff --git a/eval/fixture-kb/.vouch/.gitignore b/eval/fixture-kb/.vouch/.gitignore new file mode 100644 index 0000000..d8df5b6 --- /dev/null +++ b/eval/fixture-kb/.vouch/.gitignore @@ -0,0 +1,3 @@ +proposed/ +state.db +state.db-* diff --git a/eval/fixture-kb/.vouch/claims/api-rate-limit.yaml b/eval/fixture-kb/.vouch/claims/api-rate-limit.yaml new file mode 100644 index 0000000..cde0c33 --- /dev/null +++ b/eval/fixture-kb/.vouch/claims/api-rate-limit.yaml @@ -0,0 +1,17 @@ +id: api-rate-limit +text: The public API rate limits clients to one hundred requests per minute +type: observation +status: working +confidence: 0.7 +evidence: +- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +entities: [] +supersedes: [] +superseded_by: null +contradicts: [] +scope: project +tags: [] +created_at: '2026-06-17T01:54:02.972683Z' +updated_at: '2026-06-17T01:54:02.972684Z' +last_confirmed_at: null +approved_by: null diff --git a/eval/fixture-kb/.vouch/claims/auth-jwt.yaml b/eval/fixture-kb/.vouch/claims/auth-jwt.yaml new file mode 100644 index 0000000..fbaf03f --- /dev/null +++ b/eval/fixture-kb/.vouch/claims/auth-jwt.yaml @@ -0,0 +1,17 @@ +id: auth-jwt +text: Authentication uses JWT bearer tokens signed with RS256 +type: observation +status: working +confidence: 0.7 +evidence: +- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +entities: [] +supersedes: [] +superseded_by: null 
+contradicts: [] +scope: project +tags: [] +created_at: '2026-06-17T01:54:02.969406Z' +updated_at: '2026-06-17T01:54:02.969407Z' +last_confirmed_at: null +approved_by: null diff --git a/eval/fixture-kb/.vouch/claims/auth-session.yaml b/eval/fixture-kb/.vouch/claims/auth-session.yaml new file mode 100644 index 0000000..2e5e0b4 --- /dev/null +++ b/eval/fixture-kb/.vouch/claims/auth-session.yaml @@ -0,0 +1,17 @@ +id: auth-session +text: Sessions expire after thirty minutes of inactivity +type: observation +status: working +confidence: 0.7 +evidence: +- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +entities: [] +supersedes: [] +superseded_by: null +contradicts: [] +scope: project +tags: [] +created_at: '2026-06-17T01:54:02.969829Z' +updated_at: '2026-06-17T01:54:02.969829Z' +last_confirmed_at: null +approved_by: null diff --git a/eval/fixture-kb/.vouch/claims/cache-redis.yaml b/eval/fixture-kb/.vouch/claims/cache-redis.yaml new file mode 100644 index 0000000..e66c21a --- /dev/null +++ b/eval/fixture-kb/.vouch/claims/cache-redis.yaml @@ -0,0 +1,17 @@ +id: cache-redis +text: Redis caches hot query results with a sixty second TTL +type: observation +status: working +confidence: 0.7 +evidence: +- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +entities: [] +supersedes: [] +superseded_by: null +contradicts: [] +scope: project +tags: [] +created_at: '2026-06-17T01:54:02.970908Z' +updated_at: '2026-06-17T01:54:02.970909Z' +last_confirmed_at: null +approved_by: null diff --git a/eval/fixture-kb/.vouch/claims/db-migrations.yaml b/eval/fixture-kb/.vouch/claims/db-migrations.yaml new file mode 100644 index 0000000..3412062 --- /dev/null +++ b/eval/fixture-kb/.vouch/claims/db-migrations.yaml @@ -0,0 +1,17 @@ +id: db-migrations +text: Database migrations are applied with Alembic on deploy +type: observation +status: working +confidence: 0.7 +evidence: +- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +entities: [] +supersedes: 
[] +superseded_by: null +contradicts: [] +scope: project +tags: [] +created_at: '2026-06-17T01:54:02.970535Z' +updated_at: '2026-06-17T01:54:02.970535Z' +last_confirmed_at: null +approved_by: null diff --git a/eval/fixture-kb/.vouch/claims/db-postgres.yaml b/eval/fixture-kb/.vouch/claims/db-postgres.yaml new file mode 100644 index 0000000..e7cccc1 --- /dev/null +++ b/eval/fixture-kb/.vouch/claims/db-postgres.yaml @@ -0,0 +1,17 @@ +id: db-postgres +text: The primary datastore is PostgreSQL fourteen +type: observation +status: working +confidence: 0.7 +evidence: +- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +entities: [] +supersedes: [] +superseded_by: null +contradicts: [] +scope: project +tags: [] +created_at: '2026-06-17T01:54:02.970189Z' +updated_at: '2026-06-17T01:54:02.970190Z' +last_confirmed_at: null +approved_by: null diff --git a/eval/fixture-kb/.vouch/claims/deploy-ci.yaml b/eval/fixture-kb/.vouch/claims/deploy-ci.yaml new file mode 100644 index 0000000..02211ce --- /dev/null +++ b/eval/fixture-kb/.vouch/claims/deploy-ci.yaml @@ -0,0 +1,17 @@ +id: deploy-ci +text: Continuous integration runs ruff, mypy and pytest on every push +type: observation +status: working +confidence: 0.7 +evidence: +- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +entities: [] +supersedes: [] +superseded_by: null +contradicts: [] +scope: project +tags: [] +created_at: '2026-06-17T01:54:02.971958Z' +updated_at: '2026-06-17T01:54:02.971958Z' +last_confirmed_at: null +approved_by: null diff --git a/eval/fixture-kb/.vouch/claims/deploy-docker.yaml b/eval/fixture-kb/.vouch/claims/deploy-docker.yaml new file mode 100644 index 0000000..947813d --- /dev/null +++ b/eval/fixture-kb/.vouch/claims/deploy-docker.yaml @@ -0,0 +1,17 @@ +id: deploy-docker +text: The service ships as a multi-stage Docker image +type: observation +status: working +confidence: 0.7 +evidence: +- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +entities: [] 
+supersedes: [] +superseded_by: null +contradicts: [] +scope: project +tags: [] +created_at: '2026-06-17T01:54:02.972305Z' +updated_at: '2026-06-17T01:54:02.972306Z' +last_confirmed_at: null +approved_by: null diff --git a/eval/fixture-kb/.vouch/claims/search-embeddings.yaml b/eval/fixture-kb/.vouch/claims/search-embeddings.yaml new file mode 100644 index 0000000..7430a97 --- /dev/null +++ b/eval/fixture-kb/.vouch/claims/search-embeddings.yaml @@ -0,0 +1,17 @@ +id: search-embeddings +text: Semantic search uses sentence-transformer embeddings +type: observation +status: working +confidence: 0.7 +evidence: +- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +entities: [] +supersedes: [] +superseded_by: null +contradicts: [] +scope: project +tags: [] +created_at: '2026-06-17T01:54:02.971611Z' +updated_at: '2026-06-17T01:54:02.971611Z' +last_confirmed_at: null +approved_by: null diff --git a/eval/fixture-kb/.vouch/claims/search-fts5.yaml b/eval/fixture-kb/.vouch/claims/search-fts5.yaml new file mode 100644 index 0000000..e22a285 --- /dev/null +++ b/eval/fixture-kb/.vouch/claims/search-fts5.yaml @@ -0,0 +1,17 @@ +id: search-fts5 +text: Search falls back to SQLite FTS5 when embeddings are absent +type: observation +status: working +confidence: 0.7 +evidence: +- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +entities: [] +supersedes: [] +superseded_by: null +contradicts: [] +scope: project +tags: [] +created_at: '2026-06-17T01:54:02.971266Z' +updated_at: '2026-06-17T01:54:02.971267Z' +last_confirmed_at: null +approved_by: null diff --git a/eval/fixture-kb/.vouch/config.yaml b/eval/fixture-kb/.vouch/config.yaml new file mode 100644 index 0000000..b26b474 --- /dev/null +++ b/eval/fixture-kb/.vouch/config.yaml @@ -0,0 +1,13 @@ +version: 1 +review: + require_human_approval: true +retrieval: + backends: + - fts5 + - substring + default_limit: 10 +agents: + recommended_loop: + - kb.search before writing + - kb.propose_* with citations + - 
human review via vouch pending/show/approve diff --git a/eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/content b/eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/content new file mode 100644 index 0000000..cf61ab2 --- /dev/null +++ b/eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/content @@ -0,0 +1 @@ +vouch architecture notes - fixture evidence for the recall eval \ No newline at end of file diff --git a/eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/meta.yaml b/eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/meta.yaml new file mode 100644 index 0000000..e8a4658 --- /dev/null +++ b/eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/meta.yaml @@ -0,0 +1,12 @@ +id: 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +type: file +locator: 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +title: null +hash: 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e +immutable: true +scope: project +byte_size: 63 +media_type: text/plain +created_at: '2026-06-17T01:54:02.968499Z' +metadata: {} +tags: [] diff --git a/eval/queries.jsonl b/eval/queries.jsonl new file mode 100644 index 0000000..1381da6 --- /dev/null +++ b/eval/queries.jsonl @@ -0,0 +1,10 @@ +{"query": "JWT bearer token authentication", "expected": ["auth-jwt"]} +{"query": "how long until a session expires", "expected": ["auth-session"]} +{"query": "PostgreSQL primary datastore", "expected": ["db-postgres"]} +{"query": "Alembic database migrations on deploy", "expected": ["db-migrations"]} +{"query": "Redis cache TTL for query results", "expected": ["cache-redis"]} +{"query": "SQLite FTS5 search fallback", "expected": ["search-fts5"]} +{"query": "sentence transformer semantic embeddings", 
@eval_group.command("recall")
@click.argument("queries", type=click.Path(exists=True, dir_okay=False))
@click.option("--k", default=5, show_default=True, type=click.IntRange(min=1),
              help="Rank cutoff used for P@k / R@k / nDCG@k.")
@click.option("--baseline", default=None, type=click.Path(exists=True, dir_okay=False),
              help="Baseline report JSON; fail on a P@k regression beyond tolerance.")
@click.option("--max-regression", default=0.05, show_default=True,
              type=click.FloatRange(min=0.0),
              help="Largest tolerated absolute drop in macro P@k versus the baseline.")
def eval_recall(queries: str, k: int, baseline: str | None,
                max_regression: float) -> None:
    """Score kb.context retrieval against a labeled query set (P@k/R@k/MRR/nDCG).

    Prints the JSON report to stdout. When ``--baseline`` is given, the report's
    macro P@k is compared against the baseline file: the verdict goes to stderr,
    and a regression beyond ``--max-regression`` exits non-zero via
    ``click.ClickException``.
    """
    from .eval.recall import compare_baseline, run_recall

    store = _load_store()
    with _cli_errors():
        report = run_recall(store, queries, k=k)
    click.echo(json.dumps(report, indent=2))

    if baseline is None:
        return

    try:
        base = json.loads(Path(baseline).read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both JSONDecodeError and UnicodeDecodeError; report
        # a broken baseline as a CLI error instead of a raw traceback.
        raise click.ClickException(f"cannot read baseline {baseline}: {exc}") from exc

    ok, message = compare_baseline(report, base, max_regression=max_regression)
    if not ok:
        # ClickException prints the message to stderr itself, so echoing it
        # first (as before) showed the failure twice.
        raise click.ClickException(message)
    click.echo(message, err=True)
+ +Scores the live `kb.context` retrieval (`build_context_pack`) against a +labeled query set and reports P@k / R@k / MRR / nDCG, plus a baseline +comparison used by CI to gate retrieval regressions. +""" + +from __future__ import annotations + +from .recall import ( + compare_baseline, + load_queries, + run_recall, + score_query, +) + +__all__ = [ + "compare_baseline", + "load_queries", + "run_recall", + "score_query", +] diff --git a/src/vouch/eval/recall.py b/src/vouch/eval/recall.py new file mode 100644 index 0000000..177d4d2 --- /dev/null +++ b/src/vouch/eval/recall.py @@ -0,0 +1,132 @@ +"""Recall-quality eval: score `build_context_pack` against a labeled set. + +Pure-Python metrics (no numpy). The ranked result for a query is the +ordered list of `items` ids returned by `build_context_pack`; expected ids +are the human-labeled relevant claim ids for that query. +""" + +from __future__ import annotations + +import json +import math +from pathlib import Path +from typing import TYPE_CHECKING, Any, cast + +from ..context import build_context_pack + +if TYPE_CHECKING: + from ..storage import KBStore + +_METRICS = ("p_at_k", "r_at_k", "mrr", "ndcg_at_k") + + +def load_queries(path: str | Path) -> list[dict[str, Any]]: + """Read a JSONL labeled query set. + + Each line is ``{"query": str, "expected": [, ...]}``. The key + ``expected_ids`` is accepted as an alias for ``expected``. Blank lines are + skipped. 
+ """ + queries: list[dict[str, Any]] = [] + with Path(path).open(encoding="utf-8") as fh: + for line in fh: + if not line.strip(): + continue + row = json.loads(line) + expected = row.get("expected", row.get("expected_ids", [])) + queries.append({"query": row["query"], "expected": list(expected)}) + return queries + + +def score_query( + ranked_ids: list[str], expected: list[str], *, k: int = 5 +) -> dict[str, float]: + """Compute P@k, R@k, MRR and nDCG@k for one ranked list vs expected ids.""" + rel = set(expected) + top = ranked_ids[:k] + hits = sum(1 for rid in top if rid in rel) + + p_at_k = hits / k if k else 0.0 + r_at_k = hits / len(rel) if rel else 0.0 + + mrr = 0.0 + for i, rid in enumerate(ranked_ids, start=1): + if rid in rel: + mrr = 1.0 / i + break + + dcg = 0.0 + for i, rid in enumerate(top, start=1): + if rid in rel: + dcg += 1.0 / math.log2(i + 1) + ideal = sum(1.0 / math.log2(i + 1) for i in range(1, min(len(rel), k) + 1)) + ndcg = dcg / ideal if ideal > 0 else 0.0 + + return {"p_at_k": p_at_k, "r_at_k": r_at_k, "mrr": mrr, "ndcg_at_k": ndcg} + + +def _macro(per_query: list[dict[str, Any]]) -> dict[str, float]: + if not per_query: + return dict.fromkeys(_METRICS, 0.0) + n = len(per_query) + return {m: sum(q["scores"][m] for q in per_query) / n for m in _METRICS} + + +def run_recall( + store: KBStore, queries_path: str | Path, *, k: int = 5 +) -> dict[str, Any]: + """Score retrieval over a labeled query set and return a report. + + For each query the ranked result is the ordered ``items`` ids from + ``build_context_pack(store, query=q, limit=max(k, 10))``. The report is + deterministic (queries preserve input order; metrics are pure functions). 
+ """ + queries = load_queries(queries_path) + limit = max(k, 10) + per_query: list[dict[str, Any]] = [] + for row in queries: + pack = cast( + dict[str, Any], + build_context_pack(store, query=row["query"], limit=limit), + ) + ranked_ids = [item["id"] for item in pack["items"]] + scores = score_query(ranked_ids, row["expected"], k=k) + per_query.append( + { + "query": row["query"], + "expected": row["expected"], + "ranked": ranked_ids, + "scores": scores, + } + ) + return { + "k": k, + "n_queries": len(per_query), + "macro": _macro(per_query), + "per_query": per_query, + } + + +def compare_baseline( + report: dict[str, Any], baseline: dict[str, Any], *, max_regression: float = 0.05 +) -> tuple[bool, str]: + """Compare a fresh report against a committed baseline on macro P@k. + + Returns ``(ok, message)``. Not ok when the report's macro ``p_at_k`` falls + below ``baseline.macro.p_at_k - max_regression``. + """ + cur = float(report["macro"]["p_at_k"]) + base = float(baseline["macro"]["p_at_k"]) + floor = base - max_regression + delta = cur - base + if cur < floor: + return ( + False, + f"P@{report['k']} regression: {cur:.4f} < baseline {base:.4f} " + f"- tol {max_regression:.4f} = {floor:.4f} (delta {delta:+.4f})", + ) + return ( + True, + f"P@{report['k']} ok: {cur:.4f} vs baseline {base:.4f} " + f"(delta {delta:+.4f}, tol {max_regression:.4f})", + ) diff --git a/tests/test_eval_recall.py b/tests/test_eval_recall.py new file mode 100644 index 0000000..2d7641f --- /dev/null +++ b/tests/test_eval_recall.py @@ -0,0 +1,134 @@ +"""Recall-quality eval harness: metrics, run_recall, baseline gate.""" + +from __future__ import annotations + +import json +import math +from pathlib import Path + +import pytest + +from vouch import health +from vouch.eval.recall import ( + compare_baseline, + load_queries, + run_recall, + score_query, +) +from vouch.models import Claim +from vouch.storage import KBStore + + +def test_score_query_hand_computed() -> None: + ranked = ["a", "x", 
"b", "y", "z", "c"] + expected = ["a", "b", "c"] + out = score_query(ranked, expected, k=5) + # Top-5 = a,x,b,y,z -> 2 of 5 relevant. + assert out["p_at_k"] == 2 / 5 + # 2 of 3 expected appear in top-5. + assert out["r_at_k"] == 2 / 3 + # First relevant hit is at rank 1. + assert out["mrr"] == 1.0 + # DCG: a@1 (1/log2 2) + b@3 (1/log2 4). c is at rank 6, outside k=5. + dcg = 1.0 / math.log2(2) + 1.0 / math.log2(4) + ideal = 1.0 / math.log2(2) + 1.0 / math.log2(3) + 1.0 / math.log2(4) + assert out["ndcg_at_k"] == dcg / ideal + + +def test_score_query_no_hits() -> None: + out = score_query(["x", "y"], ["a"], k=5) + assert out == {"p_at_k": 0.0, "r_at_k": 0.0, "mrr": 0.0, "ndcg_at_k": 0.0} + + +def test_score_query_mrr_uses_full_ranking() -> None: + # Relevant id is at rank 3 (outside k for P@k, but MRR scans full list). + out = score_query(["x", "y", "a", "z"], ["a"], k=2) + assert out["mrr"] == 1 / 3 + assert out["p_at_k"] == 0.0 + + +def test_load_queries_accepts_both_keys(tmp_path: Path) -> None: + p = tmp_path / "q.jsonl" + p.write_text( + json.dumps({"query": "one", "expected": ["c1"]}) + + "\n\n" + + json.dumps({"query": "two", "expected_ids": ["c2", "c3"]}) + + "\n", + encoding="utf-8", + ) + rows = load_queries(p) + assert rows == [ + {"query": "one", "expected": ["c1"]}, + {"query": "two", "expected": ["c2", "c3"]}, + ] + + +def test_run_recall_deterministic_report(tmp_path: Path) -> None: + store = KBStore.init(tmp_path) + src = store.put_source(b"evidence") + store.put_claim(Claim(id="auth-jwt", text="auth uses JWT bearer tokens", + evidence=[src.id])) + store.put_claim(Claim(id="db-postgres", text="datastore is PostgreSQL", + evidence=[src.id])) + store.put_claim(Claim(id="cache-redis", text="Redis caches query results", + evidence=[src.id])) + health.rebuild_index(store) + + qpath = tmp_path / "queries.jsonl" + qpath.write_text( + "\n".join( + json.dumps(row) + for row in ( + {"query": "JWT", "expected": ["auth-jwt"]}, + {"query": "PostgreSQL", 
"expected": ["db-postgres"]}, + {"query": "Redis", "expected": ["cache-redis"]}, + ) + ) + + "\n", + encoding="utf-8", + ) + + report = run_recall(store, qpath, k=5) + assert report["k"] == 5 + assert report["n_queries"] == 3 + # Each query's single relevant claim is the top FTS5 hit. + for pq in report["per_query"]: + assert pq["ranked"][0] == pq["expected"][0] + assert pq["scores"]["mrr"] == 1.0 + assert pq["scores"]["r_at_k"] == 1.0 + assert pq["scores"]["p_at_k"] == 1 / 5 + assert report["macro"]["mrr"] == 1.0 + assert report["macro"]["p_at_k"] == pytest.approx(1 / 5) + + # Determinism: identical inputs yield an identical report. + assert run_recall(store, qpath, k=5) == report + + +def test_compare_baseline_flags_regression() -> None: + report = {"k": 5, "macro": {"p_at_k": 0.40}} + baseline = {"k": 5, "macro": {"p_at_k": 0.50}} + ok, msg = compare_baseline(report, baseline, max_regression=0.05) + assert ok is False + assert "regression" in msg + + +def test_compare_baseline_within_tolerance() -> None: + report = {"k": 5, "macro": {"p_at_k": 0.47}} + baseline = {"k": 5, "macro": {"p_at_k": 0.50}} + ok, msg = compare_baseline(report, baseline, max_regression=0.05) + assert ok is True + assert "ok" in msg + + +def test_compare_baseline_improvement_is_ok() -> None: + report = {"k": 5, "macro": {"p_at_k": 0.80}} + baseline = {"k": 5, "macro": {"p_at_k": 0.50}} + ok, _ = compare_baseline(report, baseline, max_regression=0.05) + assert ok is True + + +def test_committed_query_set_is_loadable() -> None: + repo_root = Path(__file__).resolve().parents[1] + rows = load_queries(repo_root / "eval" / "queries.jsonl") + assert len(rows) >= 6 + assert all(r["query"] and r["expected"] for r in rows)