From 54e3b22cfde22301a1880e73066b14e0e0324ebc Mon Sep 17 00:00:00 2001
From: dripsmvcp <138900956+dripsmvcp@users.noreply.github.com>
Date: Wed, 17 Jun 2026 10:58:10 +0900
Subject: [PATCH] feat(eval): recall-quality eval harness with CI baseline gate
 (issue #226)

Add vouch eval recall: score kb.context retrieval against a labeled query set (P@k, R@k, MRR, nDCG), compare against a committed baseline, and fail CI when macro P@5 drops more than 0.05 (absolute) below that baseline. Pure-Python metrics; deterministic report; starter labeled set + fixture KB + eval workflow.
---
 .github/workflows/eval.yml                    |  39 ++++
 CHANGELOG.md                                  |   8 +
 eval/baseline.json                            | 166 ++++++++++++++++++
 eval/fixture-kb/.vouch/.gitignore             |   3 +
 .../.vouch/claims/api-rate-limit.yaml         |  17 ++
 eval/fixture-kb/.vouch/claims/auth-jwt.yaml   |  17 ++
 .../.vouch/claims/auth-session.yaml           |  17 ++
 .../fixture-kb/.vouch/claims/cache-redis.yaml |  17 ++
 .../.vouch/claims/db-migrations.yaml          |  17 ++
 .../fixture-kb/.vouch/claims/db-postgres.yaml |  17 ++
 eval/fixture-kb/.vouch/claims/deploy-ci.yaml  |  17 ++
 .../.vouch/claims/deploy-docker.yaml          |  17 ++
 .../.vouch/claims/search-embeddings.yaml      |  17 ++
 .../fixture-kb/.vouch/claims/search-fts5.yaml |  17 ++
 eval/fixture-kb/.vouch/config.yaml            |  13 ++
 .../content                                   |   1 +
 .../meta.yaml                                 |  12 ++
 eval/queries.jsonl                            |  10 ++
 src/vouch/cli.py                              |  22 +++
 src/vouch/eval/__init__.py                    |  22 +++
 src/vouch/eval/recall.py                      | 132 ++++++++++++++
 tests/test_eval_recall.py                     | 134 ++++++++++++++
 22 files changed, 732 insertions(+)
 create mode 100644 .github/workflows/eval.yml
 create mode 100644 eval/baseline.json
 create mode 100644 eval/fixture-kb/.vouch/.gitignore
 create mode 100644 eval/fixture-kb/.vouch/claims/api-rate-limit.yaml
 create mode 100644 eval/fixture-kb/.vouch/claims/auth-jwt.yaml
 create mode 100644 eval/fixture-kb/.vouch/claims/auth-session.yaml
 create mode 100644 eval/fixture-kb/.vouch/claims/cache-redis.yaml
 create mode 100644 eval/fixture-kb/.vouch/claims/db-migrations.yaml
 create mode 100644 eval/fixture-kb/.vouch/claims/db-postgres.yaml
 create mode 100644 eval/fixture-kb/.vouch/claims/deploy-ci.yaml
 create mode 100644 eval/fixture-kb/.vouch/claims/deploy-docker.yaml
 create mode 100644 eval/fixture-kb/.vouch/claims/search-embeddings.yaml
 create mode 100644 eval/fixture-kb/.vouch/claims/search-fts5.yaml
 create mode 100644 eval/fixture-kb/.vouch/config.yaml
 create mode 100644 eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/content
 create mode 100644 eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/meta.yaml
 create mode 100644 eval/queries.jsonl
 create mode 100644 src/vouch/eval/__init__.py
 create mode 100644 src/vouch/eval/recall.py
 create mode 100644 tests/test_eval_recall.py

diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
new file mode 100644
index 0000000..150b6f2
--- /dev/null
+++ b/.github/workflows/eval.yml
@@ -0,0 +1,39 @@
+name: eval
+
+# Gate retrieval quality: score kb.context against the committed labeled set
+# and fail on a P@5 regression beyond tolerance vs eval/baseline.json. Runs
+# only when retrieval code changes.
+on:
+  pull_request:
+    paths:
+      - "src/vouch/embeddings/**"
+      - "src/vouch/context.py"
+      - "src/vouch/eval/**"
+      - "eval/**"
+
+jobs:
+  recall:
+    name: recall eval
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: pip
+
+      - name: install
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e '.[dev]'
+
+      - name: build fixture index
+        working-directory: eval/fixture-kb
+        run: python -m vouch.cli reindex
+
+      - name: recall eval (fail on P@5 regression > 5%)
+        working-directory: eval/fixture-kb
+        run: >-
+          python -m vouch.cli eval recall ../queries.jsonl
+          --k 5 --baseline ../baseline.json --max-regression 0.05
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6d4fff9..a80f5ba 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,14 @@ All notable changes to vouch are documented here. Format follows
 
 ## [Unreleased]
 
+### Added
+- `vouch eval recall <queries.jsonl>` — score `kb.context` retrieval against a
+  labeled query set with pure-Python P@k / R@k / MRR / nDCG, compare against a
+  committed `eval/baseline.json`, and fail CI on a P@5 regression beyond
+  tolerance (default 0.05, absolute). Ships a starter labeled set, a reproducible fixture
+  KB under `eval/fixture-kb/`, and an `eval` workflow gating retrieval changes
+  (#226).
+
 ## [0.1.0] — 2026-05-26
 
 ### Packaging
diff --git a/eval/baseline.json b/eval/baseline.json
new file mode 100644
index 0000000..f446660
--- /dev/null
+++ b/eval/baseline.json
@@ -0,0 +1,166 @@
+{
+  "k": 5,
+  "n_queries": 10,
+  "macro": {
+    "p_at_k": 0.18,
+    "r_at_k": 0.9,
+    "mrr": 0.9,
+    "ndcg_at_k": 0.9
+  },
+  "per_query": [
+    {
+      "query": "JWT bearer token authentication",
+      "expected": [
+        "auth-jwt"
+      ],
+      "ranked": [
+        "auth-jwt"
+      ],
+      "scores": {
+        "p_at_k": 0.2,
+        "r_at_k": 1.0,
+        "mrr": 1.0,
+        "ndcg_at_k": 1.0
+      }
+    },
+    {
+      "query": "how long until a session expires",
+      "expected": [
+        "auth-session"
+      ],
+      "ranked": [
+        "deploy-docker",
+        "cache-redis"
+      ],
+      "scores": {
+        "p_at_k": 0.0,
+        "r_at_k": 0.0,
+        "mrr": 0.0,
+        "ndcg_at_k": 0.0
+      }
+    },
+    {
+      "query": "PostgreSQL primary datastore",
+      "expected": [
+        "db-postgres"
+      ],
+      "ranked": [
+        "db-postgres"
+      ],
+      "scores": {
+        "p_at_k": 0.2,
+        "r_at_k": 1.0,
+        "mrr": 1.0,
+        "ndcg_at_k": 1.0
+      }
+    },
+    {
+      "query": "Alembic database migrations on deploy",
+      "expected": [
+        "db-migrations"
+      ],
+      "ranked": [
+        "db-migrations",
+        "deploy-ci"
+      ],
+      "scores": {
+        "p_at_k": 0.2,
+        "r_at_k": 1.0,
+        "mrr": 1.0,
+        "ndcg_at_k": 1.0
+      }
+    },
+    {
+      "query": "Redis cache TTL for query results",
+      "expected": [
+        "cache-redis"
+      ],
+      "ranked": [
+        "cache-redis"
+      ],
+      "scores": {
+        "p_at_k": 0.2,
+        "r_at_k": 1.0,
+        "mrr": 1.0,
+        "ndcg_at_k": 1.0
+      }
+    },
+    {
+      "query": "SQLite FTS5 search fallback",
+      "expected": [
+        "search-fts5"
+      ],
+      "ranked": [
+        "search-fts5",
+        "search-embeddings"
+      ],
+      "scores": {
+        "p_at_k": 0.2,
+        "r_at_k": 1.0,
+        "mrr": 1.0,
+        "ndcg_at_k": 1.0
+      }
+    },
+    {
+      "query": "sentence transformer semantic embeddings",
+      "expected": [
+        "search-embeddings"
+      ],
+      "ranked": [
+        "search-embeddings",
+        "search-fts5"
+      ],
+      "scores": {
+        "p_at_k": 0.2,
+        "r_at_k": 1.0,
+        "mrr": 1.0,
+        "ndcg_at_k": 1.0
+      }
+    },
+    {
+      "query": "continuous integration ruff mypy pytest",
+      "expected": [
+        "deploy-ci"
+      ],
+      "ranked": [
+        "deploy-ci"
+      ],
+      "scores": {
+        "p_at_k": 0.2,
+        "r_at_k": 1.0,
+        "mrr": 1.0,
+        "ndcg_at_k": 1.0
+      }
+    },
+    {
+      "query": "Docker multi-stage image deployment",
+      "expected": [
+        "deploy-docker"
+      ],
+      "ranked": [
+        "deploy-docker"
+      ],
+      "scores": {
+        "p_at_k": 0.2,
+        "r_at_k": 1.0,
+        "mrr": 1.0,
+        "ndcg_at_k": 1.0
+      }
+    },
+    {
+      "query": "public API rate limit per minute",
+      "expected": [
+        "api-rate-limit"
+      ],
+      "ranked": [
+        "api-rate-limit"
+      ],
+      "scores": {
+        "p_at_k": 0.2,
+        "r_at_k": 1.0,
+        "mrr": 1.0,
+        "ndcg_at_k": 1.0
+      }
+    }
+  ]
+}
diff --git a/eval/fixture-kb/.vouch/.gitignore b/eval/fixture-kb/.vouch/.gitignore
new file mode 100644
index 0000000..d8df5b6
--- /dev/null
+++ b/eval/fixture-kb/.vouch/.gitignore
@@ -0,0 +1,3 @@
+proposed/
+state.db
+state.db-*
diff --git a/eval/fixture-kb/.vouch/claims/api-rate-limit.yaml b/eval/fixture-kb/.vouch/claims/api-rate-limit.yaml
new file mode 100644
index 0000000..cde0c33
--- /dev/null
+++ b/eval/fixture-kb/.vouch/claims/api-rate-limit.yaml
@@ -0,0 +1,17 @@
+id: api-rate-limit
+text: The public API rate limits clients to one hundred requests per minute
+type: observation
+status: working
+confidence: 0.7
+evidence:
+- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+entities: []
+supersedes: []
+superseded_by: null
+contradicts: []
+scope: project
+tags: []
+created_at: '2026-06-17T01:54:02.972683Z'
+updated_at: '2026-06-17T01:54:02.972684Z'
+last_confirmed_at: null
+approved_by: null
diff --git a/eval/fixture-kb/.vouch/claims/auth-jwt.yaml b/eval/fixture-kb/.vouch/claims/auth-jwt.yaml
new file mode 100644
index 0000000..fbaf03f
--- /dev/null
+++ b/eval/fixture-kb/.vouch/claims/auth-jwt.yaml
@@ -0,0 +1,17 @@
+id: auth-jwt
+text: Authentication uses JWT bearer tokens signed with RS256
+type: observation
+status: working
+confidence: 0.7
+evidence:
+- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+entities: []
+supersedes: []
+superseded_by: null
+contradicts: []
+scope: project
+tags: []
+created_at: '2026-06-17T01:54:02.969406Z'
+updated_at: '2026-06-17T01:54:02.969407Z'
+last_confirmed_at: null
+approved_by: null
diff --git a/eval/fixture-kb/.vouch/claims/auth-session.yaml b/eval/fixture-kb/.vouch/claims/auth-session.yaml
new file mode 100644
index 0000000..2e5e0b4
--- /dev/null
+++ b/eval/fixture-kb/.vouch/claims/auth-session.yaml
@@ -0,0 +1,17 @@
+id: auth-session
+text: Sessions expire after thirty minutes of inactivity
+type: observation
+status: working
+confidence: 0.7
+evidence:
+- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+entities: []
+supersedes: []
+superseded_by: null
+contradicts: []
+scope: project
+tags: []
+created_at: '2026-06-17T01:54:02.969829Z'
+updated_at: '2026-06-17T01:54:02.969829Z'
+last_confirmed_at: null
+approved_by: null
diff --git a/eval/fixture-kb/.vouch/claims/cache-redis.yaml b/eval/fixture-kb/.vouch/claims/cache-redis.yaml
new file mode 100644
index 0000000..e66c21a
--- /dev/null
+++ b/eval/fixture-kb/.vouch/claims/cache-redis.yaml
@@ -0,0 +1,17 @@
+id: cache-redis
+text: Redis caches hot query results with a sixty second TTL
+type: observation
+status: working
+confidence: 0.7
+evidence:
+- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+entities: []
+supersedes: []
+superseded_by: null
+contradicts: []
+scope: project
+tags: []
+created_at: '2026-06-17T01:54:02.970908Z'
+updated_at: '2026-06-17T01:54:02.970909Z'
+last_confirmed_at: null
+approved_by: null
diff --git a/eval/fixture-kb/.vouch/claims/db-migrations.yaml b/eval/fixture-kb/.vouch/claims/db-migrations.yaml
new file mode 100644
index 0000000..3412062
--- /dev/null
+++ b/eval/fixture-kb/.vouch/claims/db-migrations.yaml
@@ -0,0 +1,17 @@
+id: db-migrations
+text: Database migrations are applied with Alembic on deploy
+type: observation
+status: working
+confidence: 0.7
+evidence:
+- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+entities: []
+supersedes: []
+superseded_by: null
+contradicts: []
+scope: project
+tags: []
+created_at: '2026-06-17T01:54:02.970535Z'
+updated_at: '2026-06-17T01:54:02.970535Z'
+last_confirmed_at: null
+approved_by: null
diff --git a/eval/fixture-kb/.vouch/claims/db-postgres.yaml b/eval/fixture-kb/.vouch/claims/db-postgres.yaml
new file mode 100644
index 0000000..e7cccc1
--- /dev/null
+++ b/eval/fixture-kb/.vouch/claims/db-postgres.yaml
@@ -0,0 +1,17 @@
+id: db-postgres
+text: The primary datastore is PostgreSQL fourteen
+type: observation
+status: working
+confidence: 0.7
+evidence:
+- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+entities: []
+supersedes: []
+superseded_by: null
+contradicts: []
+scope: project
+tags: []
+created_at: '2026-06-17T01:54:02.970189Z'
+updated_at: '2026-06-17T01:54:02.970190Z'
+last_confirmed_at: null
+approved_by: null
diff --git a/eval/fixture-kb/.vouch/claims/deploy-ci.yaml b/eval/fixture-kb/.vouch/claims/deploy-ci.yaml
new file mode 100644
index 0000000..02211ce
--- /dev/null
+++ b/eval/fixture-kb/.vouch/claims/deploy-ci.yaml
@@ -0,0 +1,17 @@
+id: deploy-ci
+text: Continuous integration runs ruff, mypy and pytest on every push
+type: observation
+status: working
+confidence: 0.7
+evidence:
+- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+entities: []
+supersedes: []
+superseded_by: null
+contradicts: []
+scope: project
+tags: []
+created_at: '2026-06-17T01:54:02.971958Z'
+updated_at: '2026-06-17T01:54:02.971958Z'
+last_confirmed_at: null
+approved_by: null
diff --git a/eval/fixture-kb/.vouch/claims/deploy-docker.yaml b/eval/fixture-kb/.vouch/claims/deploy-docker.yaml
new file mode 100644
index 0000000..947813d
--- /dev/null
+++ b/eval/fixture-kb/.vouch/claims/deploy-docker.yaml
@@ -0,0 +1,17 @@
+id: deploy-docker
+text: The service ships as a multi-stage Docker image
+type: observation
+status: working
+confidence: 0.7
+evidence:
+- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+entities: []
+supersedes: []
+superseded_by: null
+contradicts: []
+scope: project
+tags: []
+created_at: '2026-06-17T01:54:02.972305Z'
+updated_at: '2026-06-17T01:54:02.972306Z'
+last_confirmed_at: null
+approved_by: null
diff --git a/eval/fixture-kb/.vouch/claims/search-embeddings.yaml b/eval/fixture-kb/.vouch/claims/search-embeddings.yaml
new file mode 100644
index 0000000..7430a97
--- /dev/null
+++ b/eval/fixture-kb/.vouch/claims/search-embeddings.yaml
@@ -0,0 +1,17 @@
+id: search-embeddings
+text: Semantic search uses sentence-transformer embeddings
+type: observation
+status: working
+confidence: 0.7
+evidence:
+- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+entities: []
+supersedes: []
+superseded_by: null
+contradicts: []
+scope: project
+tags: []
+created_at: '2026-06-17T01:54:02.971611Z'
+updated_at: '2026-06-17T01:54:02.971611Z'
+last_confirmed_at: null
+approved_by: null
diff --git a/eval/fixture-kb/.vouch/claims/search-fts5.yaml b/eval/fixture-kb/.vouch/claims/search-fts5.yaml
new file mode 100644
index 0000000..e22a285
--- /dev/null
+++ b/eval/fixture-kb/.vouch/claims/search-fts5.yaml
@@ -0,0 +1,17 @@
+id: search-fts5
+text: Search falls back to SQLite FTS5 when embeddings are absent
+type: observation
+status: working
+confidence: 0.7
+evidence:
+- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+entities: []
+supersedes: []
+superseded_by: null
+contradicts: []
+scope: project
+tags: []
+created_at: '2026-06-17T01:54:02.971266Z'
+updated_at: '2026-06-17T01:54:02.971267Z'
+last_confirmed_at: null
+approved_by: null
diff --git a/eval/fixture-kb/.vouch/config.yaml b/eval/fixture-kb/.vouch/config.yaml
new file mode 100644
index 0000000..b26b474
--- /dev/null
+++ b/eval/fixture-kb/.vouch/config.yaml
@@ -0,0 +1,13 @@
+version: 1
+review:
+  require_human_approval: true
+retrieval:
+  backends:
+  - fts5
+  - substring
+  default_limit: 10
+agents:
+  recommended_loop:
+  - kb.search before writing
+  - kb.propose_* with citations
+  - human review via vouch pending/show/approve
diff --git a/eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/content b/eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/content
new file mode 100644
index 0000000..cf61ab2
--- /dev/null
+++ b/eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/content
@@ -0,0 +1 @@
+vouch architecture notes - fixture evidence for the recall eval
\ No newline at end of file
diff --git a/eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/meta.yaml b/eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/meta.yaml
new file mode 100644
index 0000000..e8a4658
--- /dev/null
+++ b/eval/fixture-kb/.vouch/sources/1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e/meta.yaml
@@ -0,0 +1,12 @@
+id: 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+type: file
+locator: 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+title: null
+hash: 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
+immutable: true
+scope: project
+byte_size: 63
+media_type: text/plain
+created_at: '2026-06-17T01:54:02.968499Z'
+metadata: {}
+tags: []
diff --git a/eval/queries.jsonl b/eval/queries.jsonl
new file mode 100644
index 0000000..1381da6
--- /dev/null
+++ b/eval/queries.jsonl
@@ -0,0 +1,10 @@
+{"query": "JWT bearer token authentication", "expected": ["auth-jwt"]}
+{"query": "how long until a session expires", "expected": ["auth-session"]}
+{"query": "PostgreSQL primary datastore", "expected": ["db-postgres"]}
+{"query": "Alembic database migrations on deploy", "expected": ["db-migrations"]}
+{"query": "Redis cache TTL for query results", "expected": ["cache-redis"]}
+{"query": "SQLite FTS5 search fallback", "expected": ["search-fts5"]}
+{"query": "sentence transformer semantic embeddings", "expected": ["search-embeddings"]}
+{"query": "continuous integration ruff mypy pytest", "expected": ["deploy-ci"]}
+{"query": "Docker multi-stage image deployment", "expected": ["deploy-docker"]}
+{"query": "public API rate limit per minute", "expected": ["api-rate-limit"]}
diff --git a/src/vouch/cli.py b/src/vouch/cli.py
index 100dcdc..e32ac87 100644
--- a/src/vouch/cli.py
+++ b/src/vouch/cli.py
@@ -667,6 +667,28 @@ def eval_embedding(queries: str, metric: str) -> None:
         click.echo(f"{m_name}\t{v:.4f}")
 
 
+@eval_group.command("recall")
+@click.argument("queries", type=click.Path(exists=True, dir_okay=False))
+@click.option("--k", default=5, show_default=True, type=click.IntRange(min=1))
+@click.option("--baseline", default=None, type=click.Path(exists=True, dir_okay=False),
+              help="Baseline report JSON; fail on a P@k regression beyond tolerance.")
+@click.option("--max-regression", default=0.05, show_default=True, type=float)
+def eval_recall(queries: str, k: int, baseline: str | None,
+                max_regression: float) -> None:
+    """Score kb.context retrieval against a labeled query set (P@k/R@k/MRR/nDCG)."""
+    from .eval.recall import compare_baseline, run_recall
+    store = _load_store()
+    with _cli_errors():
+        report = run_recall(store, queries, k=k)
+    click.echo(json.dumps(report, indent=2))
+    if baseline is not None:
+        base = json.loads(Path(baseline).read_text(encoding="utf-8"))
+        ok, message = compare_baseline(report, base, max_regression=max_regression)
+        if not ok:
+            raise click.ClickException(message)  # click prints it to stderr
+        click.echo(message, err=True)
+
+
 @cli.command()
 @click.option("--embeddings/--no-embeddings", default=False,
               help="Rebuild the embedding index in addition to FTS5.")
diff --git a/src/vouch/eval/__init__.py b/src/vouch/eval/__init__.py
new file mode 100644
index 0000000..de519c1
--- /dev/null
+++ b/src/vouch/eval/__init__.py
@@ -0,0 +1,22 @@
+"""Retrieval-quality evaluation harness.
+
+Scores the live `kb.context` retrieval (`build_context_pack`) against a
+labeled query set and reports P@k / R@k / MRR / nDCG, plus a baseline
+comparison used by CI to gate retrieval regressions.
+"""
+
+from __future__ import annotations
+
+from .recall import (
+    compare_baseline,
+    load_queries,
+    run_recall,
+    score_query,
+)
+
+__all__ = [
+    "compare_baseline",
+    "load_queries",
+    "run_recall",
+    "score_query",
+]
diff --git a/src/vouch/eval/recall.py b/src/vouch/eval/recall.py
new file mode 100644
index 0000000..177d4d2
--- /dev/null
+++ b/src/vouch/eval/recall.py
@@ -0,0 +1,132 @@
+"""Recall-quality eval: score `build_context_pack` against a labeled set.
+
+Pure-Python metrics (no numpy). The ranked result for a query is the
+ordered list of `items` ids returned by `build_context_pack`; expected ids
+are the human-labeled relevant claim ids for that query.
+"""
+
+from __future__ import annotations
+
+import json
+import math
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, cast
+
+from ..context import build_context_pack
+
+if TYPE_CHECKING:
+    from ..storage import KBStore
+
+_METRICS = ("p_at_k", "r_at_k", "mrr", "ndcg_at_k")
+
+
+def load_queries(path: str | Path) -> list[dict[str, Any]]:
+    """Read a JSONL labeled query set.
+
+    Each line is ``{"query": str, "expected": [<claim_id>, ...]}``. The key
+    ``expected_ids`` is accepted as an alias for ``expected``, and a bare
+    string is read as a single id. Blank lines are skipped.
+    """
+    queries: list[dict[str, Any]] = []
+    with Path(path).open(encoding="utf-8") as fh:
+        # Parse lazily, line by line, ignoring whitespace-only lines.
+        for row in (json.loads(line) for line in fh if line.strip()):
+            expected = row.get("expected", row.get("expected_ids", []))
+            # list("auth-jwt") would explode a lone id into characters.
+            ids = [expected] if isinstance(expected, str) else list(expected)
+            queries.append({"query": row["query"], "expected": ids})
+    return queries
+
+
+def score_query(
+    ranked_ids: list[str], expected: list[str], *, k: int = 5
+) -> dict[str, float]:
+    """Compute P@k, R@k, MRR and nDCG@k for one ranked list vs expected ids.
+
+    Repeated ids in ``ranked_ids`` keep only their first (best) rank, so one
+    relevant id is never counted twice; ``k <= 0`` scores an empty top-k.
+    """
+    rel = set(expected)
+    ranked = list(dict.fromkeys(ranked_ids))  # order-preserving de-duplication
+    k = max(k, 0)  # a negative k would otherwise slice from the end of the list
+    # 1-based ranks of every relevant id, over the full ranking.
+    ranks = [i for i, rid in enumerate(ranked, start=1) if rid in rel]
+    top_ranks = [i for i in ranks if i <= k]
+
+    p_at_k = len(top_ranks) / k if k else 0.0
+    r_at_k = len(top_ranks) / len(rel) if rel else 0.0
+    mrr = 1.0 / ranks[0] if ranks else 0.0  # MRR scans past the top-k window
+
+    dcg = 0.0
+    for i in top_ranks:
+        dcg += 1.0 / math.log2(i + 1)
+    ideal = sum(1.0 / math.log2(i + 1) for i in range(1, min(len(rel), k) + 1))
+    ndcg = dcg / ideal if ideal > 0 else 0.0
+    return {"p_at_k": p_at_k, "r_at_k": r_at_k, "mrr": mrr, "ndcg_at_k": ndcg}
+
+
+def _macro(per_query: list[dict[str, Any]]) -> dict[str, float]:
+    """Average every metric over the queries; all zeros when there are none."""
+    if count := len(per_query):
+        return {m: sum(e["scores"][m] for e in per_query) / count for m in _METRICS}
+    return dict.fromkeys(_METRICS, 0.0)
+
+
+def run_recall(
+    store: KBStore, queries_path: str | Path, *, k: int = 5
+) -> dict[str, Any]:
+    """Evaluate retrieval on a labeled query file and build the report dict.
+
+    Every query is sent through ``build_context_pack`` with
+    ``limit=max(k, 10)``; the ids of the returned ``items``, in order, form the
+    ranking that ``score_query`` grades. Rows are reported in input order.
+    """
+    # Ask for at least 10 results even when k is smaller.
+    fetch_limit = max(k, 10)
+    results: list[dict[str, Any]] = []
+    for entry in load_queries(queries_path):
+        text, wanted = entry["query"], entry["expected"]
+        pack = cast(
+            dict[str, Any],
+            build_context_pack(store, query=text, limit=fetch_limit),
+        )
+        ranking = [hit["id"] for hit in pack["items"]]
+        results.append(
+            {
+                "query": text,
+                "expected": wanted,
+                "ranked": ranking,
+                "scores": score_query(ranking, wanted, k=k),
+            }
+        )
+    return {
+        "k": k,
+        "n_queries": len(results),
+        "macro": _macro(results),
+        "per_query": results,
+    }
+
+
+def compare_baseline(
+    report: dict[str, Any], baseline: dict[str, Any], *, max_regression: float = 0.05
+) -> tuple[bool, str]:
+    """Compare a fresh report against a committed baseline on macro P@k.
+
+    Returns ``(ok, message)``. Not ok when the two reports use a different ``k``
+    or macro ``p_at_k`` is below ``baseline.macro.p_at_k - max_regression``.
+    """
+    k = report["k"]
+    if baseline.get("k", k) != k:  # P@k at different cutoffs is not comparable
+        return False, f"k mismatch: report k={k}, baseline k={baseline['k']}"
+    cur = float(report["macro"]["p_at_k"])
+    base = float(baseline["macro"]["p_at_k"])
+    floor = base - max_regression  # absolute tolerance, not a percentage of base
+    delta = cur - base
+    if cur < floor - 1e-9:  # slack: float rounding at the boundary still passes
+        return (
+            False,
+            f"P@{k} regression: {cur:.4f} < baseline {base:.4f} "
+            f"- tol {max_regression:.4f} = {floor:.4f} (delta {delta:+.4f})",
+        )
+    detail = f"(delta {delta:+.4f}, tol {max_regression:.4f})"
+    return True, f"P@{k} ok: {cur:.4f} vs baseline {base:.4f} {detail}"
diff --git a/tests/test_eval_recall.py b/tests/test_eval_recall.py
new file mode 100644
index 0000000..2d7641f
--- /dev/null
+++ b/tests/test_eval_recall.py
@@ -0,0 +1,134 @@
+"""Recall-quality eval harness: metrics, run_recall, baseline gate."""
+
+from __future__ import annotations
+
+import json
+import math
+from pathlib import Path
+
+import pytest
+
+from vouch import health
+from vouch.eval.recall import (
+    compare_baseline,
+    load_queries,
+    run_recall,
+    score_query,
+)
+from vouch.models import Claim
+from vouch.storage import KBStore
+
+
+def test_score_query_hand_computed() -> None:
+    ranked = ["a", "x", "b", "y", "z", "c"]
+    expected = ["a", "b", "c"]
+    out = score_query(ranked, expected, k=5)
+    # Top-5 = a,x,b,y,z -> 2 of 5 relevant.
+    assert out["p_at_k"] == pytest.approx(2 / 5)
+    # 2 of 3 expected appear in top-5.
+    assert out["r_at_k"] == pytest.approx(2 / 3)
+    # First relevant hit is at rank 1.
+    assert out["mrr"] == 1.0
+    # DCG: a@1 (1/log2 2) + b@3 (1/log2 4). c is at rank 6, outside k=5.
+    dcg = 1.0 / math.log2(2) + 1.0 / math.log2(4)
+    ideal = 1.0 / math.log2(2) + 1.0 / math.log2(3) + 1.0 / math.log2(4)
+    assert out["ndcg_at_k"] == pytest.approx(dcg / ideal)
+
+
+def test_score_query_no_hits() -> None:
+    out = score_query(["x", "y"], ["a"], k=5)
+    assert out == {"p_at_k": 0.0, "r_at_k": 0.0, "mrr": 0.0, "ndcg_at_k": 0.0}
+
+
+def test_score_query_mrr_uses_full_ranking() -> None:
+    # Relevant id is at rank 3 (outside k for P@k, but MRR scans full list).
+    out = score_query(["x", "y", "a", "z"], ["a"], k=2)
+    assert out["mrr"] == 1 / 3
+    assert out["p_at_k"] == 0.0
+
+
+def test_load_queries_accepts_both_keys(tmp_path: Path) -> None:
+    p = tmp_path / "q.jsonl"
+    p.write_text(
+        json.dumps({"query": "one", "expected": ["c1"]})
+        + "\n\n"
+        + json.dumps({"query": "two", "expected_ids": ["c2", "c3"]})
+        + "\n",
+        encoding="utf-8",
+    )
+    rows = load_queries(p)
+    assert rows == [
+        {"query": "one", "expected": ["c1"]},
+        {"query": "two", "expected": ["c2", "c3"]},
+    ]
+
+
+def test_run_recall_deterministic_report(tmp_path: Path) -> None:
+    store = KBStore.init(tmp_path)
+    src = store.put_source(b"evidence")
+    store.put_claim(Claim(id="auth-jwt", text="auth uses JWT bearer tokens",
+                          evidence=[src.id]))
+    store.put_claim(Claim(id="db-postgres", text="datastore is PostgreSQL",
+                          evidence=[src.id]))
+    store.put_claim(Claim(id="cache-redis", text="Redis caches query results",
+                          evidence=[src.id]))
+    health.rebuild_index(store)
+
+    qpath = tmp_path / "queries.jsonl"
+    qpath.write_text(
+        "\n".join(
+            json.dumps(row)
+            for row in (
+                {"query": "JWT", "expected": ["auth-jwt"]},
+                {"query": "PostgreSQL", "expected": ["db-postgres"]},
+                {"query": "Redis", "expected": ["cache-redis"]},
+            )
+        )
+        + "\n",
+        encoding="utf-8",
+    )
+
+    report = run_recall(store, qpath, k=5)
+    assert report["k"] == 5
+    assert report["n_queries"] == 3
+    # Each query's single relevant claim is the top FTS5 hit.
+    for pq in report["per_query"]:
+        assert pq["ranked"][0] == pq["expected"][0]
+        assert pq["scores"]["mrr"] == 1.0
+        assert pq["scores"]["r_at_k"] == 1.0
+        assert pq["scores"]["p_at_k"] == 1 / 5
+    assert report["macro"]["mrr"] == 1.0
+    assert report["macro"]["p_at_k"] == pytest.approx(1 / 5)
+
+    # Determinism: identical inputs yield an identical report.
+    assert run_recall(store, qpath, k=5) == report
+
+
+def test_compare_baseline_flags_regression() -> None:
+    report = {"k": 5, "macro": {"p_at_k": 0.40}}
+    baseline = {"k": 5, "macro": {"p_at_k": 0.50}}
+    ok, msg = compare_baseline(report, baseline, max_regression=0.05)
+    assert ok is False
+    assert "regression" in msg
+
+
+def test_compare_baseline_within_tolerance() -> None:
+    report = {"k": 5, "macro": {"p_at_k": 0.47}}
+    baseline = {"k": 5, "macro": {"p_at_k": 0.50}}
+    ok, msg = compare_baseline(report, baseline, max_regression=0.05)
+    assert ok is True
+    assert "ok" in msg
+
+
+def test_compare_baseline_improvement_is_ok() -> None:
+    report = {"k": 5, "macro": {"p_at_k": 0.80}}
+    baseline = {"k": 5, "macro": {"p_at_k": 0.50}}
+    ok, _ = compare_baseline(report, baseline, max_regression=0.05)
+    assert ok is True
+
+
+def test_committed_query_set_is_loadable() -> None:
+    repo_root = Path(__file__).resolve().parents[1]
+    rows = load_queries(repo_root / "eval" / "queries.jsonl")
+    assert len(rows) >= 6
+    assert all(r["query"] and r["expected"] for r in rows)