Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions .github/workflows/eval.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
name: eval

# Gate retrieval quality: score kb.context against the committed labeled set
# and fail on a P@5 regression beyond tolerance vs eval/baseline.json. Runs
# only when retrieval code, the eval fixtures, or this workflow itself change.
on:
  pull_request:
    paths:
      - "src/vouch/embeddings/**"
      - "src/vouch/context.py"
      - "src/vouch/eval/**"
      - "eval/**"
      # Re-run the gate whenever the gate's own definition is edited.
      - ".github/workflows/eval.yml"

# The job only reads the repository, so restrict the token to read access.
permissions:
  contents: read

jobs:
  recall:
    name: recall eval
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version a string instead of a float.
          python-version: "3.12"
          cache: pip

      - name: install
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[dev]'

      # Rebuild the fixture KB's index before scoring. The fixture's
      # .gitignore excludes state.db, which is presumably this index.
      - name: build fixture index
        working-directory: eval/fixture-kb
        run: python -m vouch.cli reindex

      # Paths are relative to eval/fixture-kb; `>-` folds the two lines below
      # into one command line. --max-regression 0.05 is the 5% tolerance.
      - name: recall eval (fail on P@5 regression > 5%)
        working-directory: eval/fixture-kb
        run: >-
          python -m vouch.cli eval recall ../queries.jsonl
          --k 5 --baseline ../baseline.json --max-regression 0.05
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@ All notable changes to vouch are documented here. Format follows

## [Unreleased]

### Added
- `vouch eval recall <queries.jsonl>` — score `kb.context` retrieval against a
labeled query set with pure-Python P@k / R@k / MRR / nDCG, compare against a
committed `eval/baseline.json`, and fail CI on a P@5 regression beyond
tolerance (default 5%). Ships a starter labeled set, a reproducible fixture
KB under `eval/fixture-kb/`, and an `eval` workflow gating retrieval changes
(#226).

### Fixed
- `vouch serve` now fails fast with a clear `vouch init` hint when no `.vouch/` KB is present, instead of starting a server that immediately misbehaves (#95).

Expand Down
166 changes: 166 additions & 0 deletions eval/baseline.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
{
"k": 5,
"n_queries": 10,
"macro": {
"p_at_k": 0.18,
"r_at_k": 0.9,
"mrr": 0.9,
"ndcg_at_k": 0.9
},
"per_query": [
{
"query": "JWT bearer token authentication",
"expected": [
"auth-jwt"
],
"ranked": [
"auth-jwt"
],
"scores": {
"p_at_k": 0.2,
"r_at_k": 1.0,
"mrr": 1.0,
"ndcg_at_k": 1.0
}
},
{
"query": "how long until a session expires",
"expected": [
"auth-session"
],
"ranked": [
"deploy-docker",
"cache-redis"
],
"scores": {
"p_at_k": 0.0,
"r_at_k": 0.0,
"mrr": 0.0,
"ndcg_at_k": 0.0
}
},
{
"query": "PostgreSQL primary datastore",
"expected": [
"db-postgres"
],
"ranked": [
"db-postgres"
],
"scores": {
"p_at_k": 0.2,
"r_at_k": 1.0,
"mrr": 1.0,
"ndcg_at_k": 1.0
}
},
{
"query": "Alembic database migrations on deploy",
"expected": [
"db-migrations"
],
"ranked": [
"db-migrations",
"deploy-ci"
],
"scores": {
"p_at_k": 0.2,
"r_at_k": 1.0,
"mrr": 1.0,
"ndcg_at_k": 1.0
}
},
{
"query": "Redis cache TTL for query results",
"expected": [
"cache-redis"
],
"ranked": [
"cache-redis"
],
"scores": {
"p_at_k": 0.2,
"r_at_k": 1.0,
"mrr": 1.0,
"ndcg_at_k": 1.0
}
},
{
"query": "SQLite FTS5 search fallback",
"expected": [
"search-fts5"
],
"ranked": [
"search-fts5",
"search-embeddings"
],
"scores": {
"p_at_k": 0.2,
"r_at_k": 1.0,
"mrr": 1.0,
"ndcg_at_k": 1.0
}
},
{
"query": "sentence transformer semantic embeddings",
"expected": [
"search-embeddings"
],
"ranked": [
"search-embeddings",
"search-fts5"
],
"scores": {
"p_at_k": 0.2,
"r_at_k": 1.0,
"mrr": 1.0,
"ndcg_at_k": 1.0
}
},
{
"query": "continuous integration ruff mypy pytest",
"expected": [
"deploy-ci"
],
"ranked": [
"deploy-ci"
],
"scores": {
"p_at_k": 0.2,
"r_at_k": 1.0,
"mrr": 1.0,
"ndcg_at_k": 1.0
}
},
{
"query": "Docker multi-stage image deployment",
"expected": [
"deploy-docker"
],
"ranked": [
"deploy-docker"
],
"scores": {
"p_at_k": 0.2,
"r_at_k": 1.0,
"mrr": 1.0,
"ndcg_at_k": 1.0
}
},
{
"query": "public API rate limit per minute",
"expected": [
"api-rate-limit"
],
"ranked": [
"api-rate-limit"
],
"scores": {
"p_at_k": 0.2,
"r_at_k": 1.0,
"mrr": 1.0,
"ndcg_at_k": 1.0
}
}
]
}
3 changes: 3 additions & 0 deletions eval/fixture-kb/.vouch/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Local, generated state of the fixture KB; only the claim files are committed.
# NOTE(review): state.db looks like the index that CI rebuilds with
# `vouch reindex`, and state.db-* its sidecar files -- confirm.
proposed/
state.db
state.db-*
17 changes: 17 additions & 0 deletions eval/fixture-kb/.vouch/claims/api-rate-limit.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Fixture claim for the recall eval. eval/baseline.json lists this id as the
# only expected result for the query "public API rate limit per minute".
id: api-rate-limit
text: The public API rate limits clients to one hundred requests per minute
type: observation
status: working
confidence: 0.7
evidence:
- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
entities: []
supersedes: []
superseded_by: null
contradicts: []
scope: project
tags: []
created_at: '2026-06-17T01:54:02.972683Z'
updated_at: '2026-06-17T01:54:02.972684Z'
last_confirmed_at: null
approved_by: null
17 changes: 17 additions & 0 deletions eval/fixture-kb/.vouch/claims/auth-jwt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Fixture claim for the recall eval. eval/baseline.json lists this id as the
# only expected result for the query "JWT bearer token authentication".
id: auth-jwt
text: Authentication uses JWT bearer tokens signed with RS256
type: observation
status: working
confidence: 0.7
evidence:
- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
entities: []
supersedes: []
superseded_by: null
contradicts: []
scope: project
tags: []
created_at: '2026-06-17T01:54:02.969406Z'
updated_at: '2026-06-17T01:54:02.969407Z'
last_confirmed_at: null
approved_by: null
17 changes: 17 additions & 0 deletions eval/fixture-kb/.vouch/claims/auth-session.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Fixture claim for the recall eval. eval/baseline.json lists this id as the
# only expected result for the query "how long until a session expires".
# The committed baseline records a miss for that query (all scores 0.0), so
# this claim is the one known retrieval gap in the baseline.
id: auth-session
text: Sessions expire after thirty minutes of inactivity
type: observation
status: working
confidence: 0.7
evidence:
- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
entities: []
supersedes: []
superseded_by: null
contradicts: []
scope: project
tags: []
created_at: '2026-06-17T01:54:02.969829Z'
updated_at: '2026-06-17T01:54:02.969829Z'
last_confirmed_at: null
approved_by: null
17 changes: 17 additions & 0 deletions eval/fixture-kb/.vouch/claims/cache-redis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Fixture claim for the recall eval. eval/baseline.json lists this id as the
# only expected result for the query "Redis cache TTL for query results".
# The baseline also shows it ranked, incorrectly, for the session-expiry query.
id: cache-redis
text: Redis caches hot query results with a sixty second TTL
type: observation
status: working
confidence: 0.7
evidence:
- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
entities: []
supersedes: []
superseded_by: null
contradicts: []
scope: project
tags: []
created_at: '2026-06-17T01:54:02.970908Z'
updated_at: '2026-06-17T01:54:02.970909Z'
last_confirmed_at: null
approved_by: null
17 changes: 17 additions & 0 deletions eval/fixture-kb/.vouch/claims/db-migrations.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Fixture claim for the recall eval. eval/baseline.json lists this id as the
# only expected result for the query "Alembic database migrations on deploy";
# the baseline ranks it first, ahead of deploy-ci.
id: db-migrations
text: Database migrations are applied with Alembic on deploy
type: observation
status: working
confidence: 0.7
evidence:
- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
entities: []
supersedes: []
superseded_by: null
contradicts: []
scope: project
tags: []
created_at: '2026-06-17T01:54:02.970535Z'
updated_at: '2026-06-17T01:54:02.970535Z'
last_confirmed_at: null
approved_by: null
17 changes: 17 additions & 0 deletions eval/fixture-kb/.vouch/claims/db-postgres.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Fixture claim for the recall eval. eval/baseline.json lists this id as the
# only expected result for the query "PostgreSQL primary datastore".
id: db-postgres
text: The primary datastore is PostgreSQL fourteen
type: observation
status: working
confidence: 0.7
evidence:
- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
entities: []
supersedes: []
superseded_by: null
contradicts: []
scope: project
tags: []
created_at: '2026-06-17T01:54:02.970189Z'
updated_at: '2026-06-17T01:54:02.970190Z'
last_confirmed_at: null
approved_by: null
17 changes: 17 additions & 0 deletions eval/fixture-kb/.vouch/claims/deploy-ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Fixture claim for the recall eval. eval/baseline.json lists this id as the
# only expected result for the query "continuous integration ruff mypy pytest".
# The baseline also shows it ranked second for the Alembic migrations query.
id: deploy-ci
text: Continuous integration runs ruff, mypy and pytest on every push
type: observation
status: working
confidence: 0.7
evidence:
- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
entities: []
supersedes: []
superseded_by: null
contradicts: []
scope: project
tags: []
created_at: '2026-06-17T01:54:02.971958Z'
updated_at: '2026-06-17T01:54:02.971958Z'
last_confirmed_at: null
approved_by: null
17 changes: 17 additions & 0 deletions eval/fixture-kb/.vouch/claims/deploy-docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Fixture claim for the recall eval. eval/baseline.json lists this id as the
# only expected result for the query "Docker multi-stage image deployment".
# The baseline also shows it ranked, incorrectly, for the session-expiry query.
id: deploy-docker
text: The service ships as a multi-stage Docker image
type: observation
status: working
confidence: 0.7
evidence:
- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
entities: []
supersedes: []
superseded_by: null
contradicts: []
scope: project
tags: []
created_at: '2026-06-17T01:54:02.972305Z'
updated_at: '2026-06-17T01:54:02.972306Z'
last_confirmed_at: null
approved_by: null
17 changes: 17 additions & 0 deletions eval/fixture-kb/.vouch/claims/search-embeddings.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Fixture claim for the recall eval. eval/baseline.json lists this id as the
# only expected result for the query "sentence transformer semantic
# embeddings"; the baseline ranks it first, ahead of search-fts5.
id: search-embeddings
text: Semantic search uses sentence-transformer embeddings
type: observation
status: working
confidence: 0.7
evidence:
- 1410e7845543213186bddf486561536087d01cbefcf7f0c35d0fe6e7b008113e
entities: []
supersedes: []
superseded_by: null
contradicts: []
scope: project
tags: []
created_at: '2026-06-17T01:54:02.971611Z'
updated_at: '2026-06-17T01:54:02.971611Z'
last_confirmed_at: null
approved_by: null
Loading
Loading