vouchdev · plind-junior · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,12 @@ All notable changes to vouch are documented here. Format follows
 ## [Unreleased]
 
 ### Added
+- Entity-salience retrieval reflex: a per-session, in-memory ring buffer of
+  recent caller queries drives a zero-LLM substring/FTS entity pass that
+  attaches top-K matched claim candidates as `_meta.vouch_salience` on
+  `kb_context` read responses. Config-gated via `retrieval.reflex`
+  (`enabled`/`window`/`top_k`); the buffer is never persisted and resets on
+  `session_end` (#223).
 - `vouch eval recall <queries.jsonl>` — score `kb.context` retrieval against a
   labeled query set with pure-Python P@k / R@k / MRR / nDCG, compare against a
   committed `eval/baseline.json`, and fail CI on a P@5 regression beyond

diff --git a/src/vouch/jsonl_server.py b/src/vouch/jsonl_server.py
@@ -28,6 +28,7 @@
 
 from . import audit, bundle, health, volunteer_context
 from . import lifecycle as life
+from . import salience as salience_mod
 from . import sessions as sess_mod
 from . import verify as verify_mod
 from .capabilities import capabilities as build_caps
@@ -157,11 +158,25 @@ def _h_search(p: dict) -> dict:
     }
 
 
+def _load_cfg(store: KBStore) -> dict:
+    try:
+        loaded = yaml.safe_load((store.kb_dir / "config.yaml").read_text())
+    except Exception:
+        return {}
+    return loaded if isinstance(loaded, dict) else {}
+
 
 def _h_context(p: dict) -> dict:
-    return build_context_pack(  # type: ignore[return-value]
-        _store(),
-        query=p["task"],
+    store = _store()
+    query = p["task"]
+    cfg = _load_cfg(store)
+    session_id = p.get("session_id")
+    if session_id:
+        _, window, _ = salience_mod.reflex_cfg(cfg)
+        salience_mod.record_query(session_id, query, window=window)
+    result: dict = build_context_pack(  # type: ignore[assignment]
+        store,
+        query=query,
         limit=int(p.get("limit", 10)),
         max_chars=int(p["max_chars"]) if p.get("max_chars") is not None else None,
         min_items=int(p.get("min_items", 0)),
@@ -171,6 +186,7 @@ def _h_context(p: dict) -> dict:
         project=p.get("project"),
         agent=p.get("agent"),
     )
+    return salience_mod.attach_salience(result, store, session_id, cfg)
 
 
 def _h_read_page(p: dict) -> dict:

diff --git a/src/vouch/salience.py b/src/vouch/salience.py
@@ -0,0 +1,199 @@
+"""Entity-salience retrieval reflex — auto-prefetch claim candidates.
+
+Keeps a per-session, in-memory ring buffer of the recent query strings a
+caller passed to the read path. On each read it runs a zero-LLM entity pass
+(substring match on entity name/aliases, plus FTS via the existing index
+when available) over the buffered queries and attaches the top-K matched
+entities — with the claims that reference them — as ``_meta.vouch_salience``.
+
+Design constraints (issue #223):
+  - zero LLM calls — substring + FTS only;
+  - per-session state only, keyed by ``session_id``;
+  - the ring buffer is NEVER written to disk;
+  - buffers expire after 30 minutes of inactivity.
+
+Config (read defensively from ``.vouch/config.yaml``, like the rest of the
+codebase — no pydantic Config model):
+  - ``retrieval.reflex.enabled`` (default True)
+  - ``retrieval.reflex.window`` (default 8)
+  - ``retrieval.reflex.top_k`` (default 3)
+"""
+
+from __future__ import annotations
+
+import sqlite3
+import time
+from collections import deque
+from dataclasses import dataclass, field
+from typing import Any
+
+from . import index_db
+from .storage import KBStore
+
+DEFAULT_WINDOW = 8
+DEFAULT_TOP_K = 3
+_EXPIRY_SECONDS = 30 * 60
+
+
+@dataclass
+class _SessionBuffer:
+    queries: deque[str]
+    last_active: float = field(default_factory=time.monotonic)
+
+
+# Module-level in-memory state. Keyed by session_id. Never persisted.
+_BUFFERS: dict[str, _SessionBuffer] = {}
+
+
+def _expire_stale(now: float) -> None:
+    stale = [
+        sid for sid, buf in _BUFFERS.items()
+        if now - buf.last_active > _EXPIRY_SECONDS
+    ]
+    for sid in stale:
+        del _BUFFERS[sid]
+
+
+def record_query(session_id: str, query: str, *, window: int = DEFAULT_WINDOW) -> None:
+    """Append ``query`` to this session's in-memory ring buffer."""
+    query = (query or "").strip()
+    if not session_id or not query:
+        return
+    now = time.monotonic()
+    _expire_stale(now)
+    window = max(1, window)
+    buf = _BUFFERS.get(session_id)
+    if buf is None or buf.queries.maxlen != window:
+        existing = list(buf.queries) if buf is not None else []
+        buf = _SessionBuffer(queries=deque(existing, maxlen=window))
+        _BUFFERS[session_id] = buf
+    buf.queries.append(query)
+    buf.last_active = now
+
+
+def reset_session(session_id: str) -> None:
+    """Clear a session's buffer — called when the session ends."""
+    _BUFFERS.pop(session_id, None)
+
+
+def _buffered_queries(session_id: str) -> list[str]:
+    now = time.monotonic()
+    _expire_stale(now)
+    buf = _BUFFERS.get(session_id)
+    if buf is None:
+        return []
+    buf.last_active = now
+    return list(buf.queries)
+
+
+def _fts_entity_ids(store: KBStore, query: str) -> list[str]:
+    """Entity ids the FTS index matches for ``query`` (best-effort)."""
+    try:
+        hits = index_db.search(store.kb_dir, query, limit=50)
+    except sqlite3.Error:
+        return []
+    return [hid for kind, hid, _snip, _score in hits if kind == "entity"]
+
+
+def _substring_entity_ids(entities: list[Any], query: str) -> list[str]:
+    """Entity ids whose name or any alias appears in ``query`` (or vice versa)."""
+    q = query.casefold()
+    matched: list[str] = []
+    for ent in entities:
+        needles = [ent.name, *ent.aliases]
+        for needle in needles:
+            n = (needle or "").casefold()
+            if n and (n in q or q in n):
+                matched.append(ent.id)
+                break
+    return matched
+
+
+def compute_salience(
+    store: KBStore, session_id: str, *, top_k: int = DEFAULT_TOP_K
+) -> list[dict]:
+    """Rank buffered-query entity matches; return top-K salience records.
+
+    Each record is ``{"entity_id", "claim_count", "top_claim_id"}`` where
+    ``claim_count`` is the number of claims referencing the entity and
+    ``top_claim_id`` is the highest-relevance claim (or None).
+    """
+    queries = _buffered_queries(session_id)
+    if not queries:
+        return []
+
+    entities = store.list_entities()
+    if not entities:
+        return []
+    by_id = {ent.id: ent for ent in entities}
+
+    # Score entities by how many buffered queries match them (substring + FTS).
+    scores: dict[str, int] = {}
+    for query in queries:
+        hits = set(_substring_entity_ids(entities, query))
+        hits.update(eid for eid in _fts_entity_ids(store, query) if eid in by_id)
+        for eid in hits:
+            scores[eid] = scores.get(eid, 0) + 1
+
+    if not scores:
+        return []
+
+    # Claims referencing each matched entity, by claim id (for stable picking).
+    claims_by_entity: dict[str, list[str]] = {}
+    for claim in store.list_claims():
+        for eid in claim.entities:
+            if eid in scores:
+                claims_by_entity.setdefault(eid, []).append(claim.id)
+
+    ranked = sorted(
+        scores,
+        key=lambda eid: (scores[eid], len(claims_by_entity.get(eid, [])), eid),
+        reverse=True,
+    )
+
+    out: list[dict] = []
+    for eid in ranked[: max(0, top_k)]:
+        claim_ids = sorted(claims_by_entity.get(eid, []))
+        out.append({
+            "entity_id": eid,
+            "claim_count": len(claim_ids),
+            "top_claim_id": claim_ids[0] if claim_ids else None,
+        })
+    return out
+
+
+def reflex_cfg(cfg: dict) -> tuple[bool, int, int]:
+    """Read ``retrieval.reflex`` config defensively. Returns (enabled, window, top_k)."""
+    retrieval = cfg.get("retrieval") if isinstance(cfg, dict) else None
+    reflex = retrieval.get("reflex") if isinstance(retrieval, dict) else None
+    if not isinstance(reflex, dict):
+        reflex = {}
+
+    enabled = reflex.get("enabled", True)
+    enabled = bool(enabled) if isinstance(enabled, bool) else True
+
+    window = reflex.get("window", DEFAULT_WINDOW)
+    window = window if isinstance(window, int) and window > 0 else DEFAULT_WINDOW
+
+    top_k = reflex.get("top_k", DEFAULT_TOP_K)
+    top_k = top_k if isinstance(top_k, int) and top_k > 0 else DEFAULT_TOP_K
+
+    return enabled, window, top_k
+
+
+def attach_salience(
+    result: dict, store: KBStore, session_id: str | None, cfg: dict
+) -> dict:
+    """Attach ``_meta.vouch_salience`` to ``result`` when the reflex applies.
+
+    No-op (returns ``result`` unchanged, no field added) when the reflex is
+    disabled, no ``session_id`` is given, or the session buffer is empty.
+    """
+    enabled, _window, top_k = reflex_cfg(cfg)
+    if not enabled or not session_id:
+        return result
+    salience = compute_salience(store, session_id, top_k=top_k)
+    if not salience:
+        return result
+    result.setdefault("_meta", {})["vouch_salience"] = salience
+    return result
diff --git a/src/vouch/server.py b/src/vouch/server.py
@@ -16,10 +16,12 @@
 from pathlib import Path
 from typing import Any
 
+import yaml
 from mcp.server.fastmcp import FastMCP
 
 from . import audit, bundle, health, volunteer_context
 from . import lifecycle as life
+from . import salience as salience_mod
 from . import sessions as sess_mod
 from . import verify as verify_mod
 from .capabilities import capabilities as build_caps
@@ -171,22 +173,41 @@ def _to_dicts(h: list[tuple[str, str, str, float]], used: str) -> dict[str, Any]
     raise ValueError(f"unknown backend: {backend}")
 
 
+def _load_cfg(store: KBStore) -> dict[str, Any]:
+    try:
+        loaded = yaml.safe_load((store.kb_dir / "config.yaml").read_text())
+    except Exception:
+        return {}
+    return loaded if isinstance(loaded, dict) else {}
+
+
 @mcp.tool()
 def kb_context(
     task: str,
     limit: int = 10,
     max_chars: int | None = None,
     min_items: int = 0,
     require_citations: bool = False,
+    session_id: str | None = None,
     project: str | None = None,
     agent: str | None = None,
 ) -> dict[str, Any]:
-    """Build a ContextPack ready to inject into an agent prompt."""
-    return build_context_pack(  # type: ignore[return-value]
-        _store(), query=task, limit=limit, max_chars=max_chars,
+    """Build a ContextPack ready to inject into an agent prompt.
+
+    When ``session_id`` is given, the entity-salience reflex records the query
+    and may attach ``_meta.vouch_salience`` (see ``salience`` module).
+    """
+    store = _store()
+    cfg = _load_cfg(store)
+    if session_id:
+        _, window, _ = salience_mod.reflex_cfg(cfg)
+        salience_mod.record_query(session_id, task, window=window)
+    result: dict[str, Any] = build_context_pack(  # type: ignore[assignment]
+        store, query=task, limit=limit, max_chars=max_chars,
         min_items=min_items, require_citations=require_citations,
         project=project, agent=agent,
     )
+    return salience_mod.attach_salience(result, store, session_id, cfg)
 
 
 @mcp.tool()

diff --git a/src/vouch/sessions.py b/src/vouch/sessions.py
@@ -39,6 +39,7 @@ def session_start(store: KBStore, *, agent: str, task: str | None = None,
 
 def session_end(store: KBStore, session_id: str, *, note: str | None = None) -> Session:
     sess = store.get_session(session_id)
+    salience.reset_session(session_id)
     if sess.ended_at is not None:
         return sess  # idempotent
     sess.ended_at = datetime.now(UTC)