diff --git a/.gitignore b/.gitignore
index b7476d0a..9595dd13 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,3 +58,4 @@ htmlcov/
 pytest_cache/
 *.log
 repositories/
+logs/
diff --git a/api/analyzers/analyzer.py b/api/analyzers/analyzer.py
index 64d49004..0564606b 100644
--- a/api/analyzers/analyzer.py
+++ b/api/analyzers/analyzer.py
@@ -57,6 +57,11 @@ def resolve(self, files: dict[Path, File], lsp: SyncLanguageServer, file_path: P
             locations = lsp.request_definition(str(file_path), node.start_point.row, node.start_point.column)
             return [(files[Path(self.resolve_path(location['absolutePath'], path))], files[Path(self.resolve_path(location['absolutePath'], path))].tree.root_node.descendant_for_point_range(Point(location['range']['start']['line'], location['range']['start']['character']), Point(location['range']['end']['line'], location['range']['end']['character']))) for location in locations if location and Path(self.resolve_path(location['absolutePath'], path)) in files]
         except Exception as e:
+            import logging
+            logging.getLogger(__name__).warning(
+                "resolve() failed for %s @%d:%d: %s",
+                file_path, node.start_point.row, node.start_point.column, e,
+            )
             return []
         
     @abstractmethod
diff --git a/api/analyzers/source_analyzer.py b/api/analyzers/source_analyzer.py
index 4186f358..ead8707a 100644
--- a/api/analyzers/source_analyzer.py
+++ b/api/analyzers/source_analyzer.py
@@ -134,7 +134,27 @@ def second_pass(self, graph: Graph, files: list[Path], path: Path) -> None:
         else:
             lsps[".java"] = NullLanguageServer()
         if any(path.rglob('*.py')):
-            config = MultilspyConfig.from_dict({"code_language": "python", "environment_path": f"{path}/venv"})
+            import sys
+            py_venv = path / "venv"
+            py_dotvenv = path / ".venv"
+            if py_venv.is_dir() and (py_venv / "bin" / "python").exists():
+                env_path = str(py_venv)
+            elif py_dotvenv.is_dir() and (py_dotvenv / "bin" / "python").exists():
+                env_path = str(py_dotvenv)
+            else:
+                # Fall back to the host's Python environment so jedi has a
+                # valid interpreter to introspect; otherwise every
+                # request_definition() raises InvalidPythonEnvironment and
+                # we'd silently produce a graph with zero CALLS edges.
+                env_path = str(Path(sys.executable).resolve().parent.parent)
+                logging.info(
+                    "No venv at %s; falling back to host env %s for jedi LSP",
+                    path, env_path,
+                )
+            config = MultilspyConfig.from_dict({
+                "code_language": "python",
+                "environment_path": env_path,
+            })
             lsps[".py"] = SyncLanguageServer.create(config, logger, str(path))
         else:
             lsps[".py"] = NullLanguageServer()
@@ -146,7 +166,16 @@ def second_pass(self, graph: Graph, files: list[Path], path: Path) -> None:
         with lsps[".java"].start_server(), lsps[".py"].start_server(), lsps[".cs"].start_server():
             files_len = len(self.files)
             for i, file_path in enumerate(files):
-                file = self.files[file_path]
+                file = self.files.get(file_path)
+                if file is None:
+                    # first_pass skipped this file (e.g. parse error, empty,
+                    # or ignored after entering the candidate list). Skip
+                    # in second_pass too instead of crashing the whole index.
+                    logging.warning(
+                        "second_pass: %s not in files map (first_pass skipped it); skipping",
+                        file_path,
+                    )
+                    continue
                 logging.info(f'Processing file ({i + 1}/{files_len}): {file_path}')
                 for _, entity in file.entities.items():
                     entity.resolved_symbol(lambda key, symbol, fp=file_path: analyzers[fp.suffix].resolve_symbol(self.files, lsps[fp.suffix], fp, path, key, symbol))
diff --git a/bench/agents/code_graph_mcp_adapter.py b/bench/agents/code_graph_mcp_adapter.py
new file mode 100644
index 00000000..9a6347bd
--- /dev/null
+++ b/bench/agents/code_graph_mcp_adapter.py
@@ -0,0 +1,163 @@
+"""MCP-transport adapter to cgraph-mcp for the benchmark.
+
+Sibling of `code_graph_adapter.py` (HTTP). Where the HTTP adapter talks
+to the host FastAPI service over the network, this one spawns the
+`cgraph-mcp` stdio MCP server in-process via the official MCP Python
+SDK and dispatches tool calls over JSON-RPC.
+
+This gives us a second, real-world benchmark track that exercises the
+exact same transport agents (Claude Code, Cursor, …) will use in
+production. Tool names match the 8-tool MCP surface
+(`index_repo`, `search_code`, `get_callers`, `get_callees`,
+`get_dependencies`, `impact_analysis`, `find_path`, `ask`).
+
+Each call spawns a fresh server, runs the call, and exits. That's
+~0.5-1s overhead per call but keeps the model trivially safe to call
+from a bash shim (one process per invocation, no shared state).
+A future optimisation could persist the server across calls via a
+side-channel daemon, but per-call spawn matches how external agents
+actually use MCP servers today.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+from typing import Any
+
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+
+
+DEFAULT_TIMEOUT_SEC = 60.0
+
+
+def _env_for_mcp() -> dict[str, str]:
+    """Return the environment handed to the spawned cgraph-mcp process.
+
+    The caller's environment is copied as-is; FALKORDB_HOST and
+    FALKORDB_PORT get local defaults only when they are not already
+    set, so values exported by the runner are left untouched.
+    """
+    spawned_env = dict(os.environ)
+    for key, fallback in (("FALKORDB_HOST", "127.0.0.1"), ("FALKORDB_PORT", "6379")):
+        spawned_env.setdefault(key, fallback)
+    return spawned_env
+
+
+def _extract(result: Any) -> Any:
+    """Turn a CallToolResult into a plain JSON-serialisable value.
+
+    Servers may carry the payload in `structuredContent`, as a JSON text
+    chunk, or both. The first non-empty text chunk wins: it is returned
+    parsed when it is valid JSON and verbatim otherwise. Only when no
+    text chunk exists is `structuredContent` used, unwrapping a dict
+    whose sole key is `"result"`.
+    """
+    first_text = next(
+        (c.text for c in result.content if getattr(c, "text", None)), None)
+    if first_text is not None:
+        try:
+            return json.loads(first_text)
+        except json.JSONDecodeError:
+            return first_text
+    structured = getattr(result, "structuredContent", None)
+    if isinstance(structured, dict) and list(structured) == ["result"]:
+        return structured["result"]
+    return structured
+
+
+async def _call_tool_async(name: str, arguments: dict[str, Any], timeout: float) -> Any:
+    """Spawn cgraph-mcp over stdio, run one tool call, return its payload.
+
+    ``timeout`` bounds the handshake and the tool call separately; an
+    MCP-level error result comes back as ``{"error": payload}``.
+    """
+    server = StdioServerParameters(command="cgraph-mcp", args=[], env=_env_for_mcp())
+    async with stdio_client(server) as (rx, tx), ClientSession(rx, tx) as session:
+        await asyncio.wait_for(session.initialize(), timeout=timeout)
+        reply = await asyncio.wait_for(session.call_tool(name, arguments), timeout=timeout)
+        body = _extract(reply)
+        return {"error": body} if getattr(reply, "isError", False) else body
+
+
+def call_tool(name: str, arguments: dict[str, Any], *, timeout: float = DEFAULT_TIMEOUT_SEC) -> Any:
+    """Sync entry point for the bash shim: drives one async tool call to completion via ``asyncio.run``."""
+    return asyncio.run(_call_tool_async(name, arguments, timeout))
+
+
+# ── Top-level convenience wrappers ─────────────────────────────────────
+# Names map 1:1 onto MCP tool names (and onto bench/tools/code_graph_mcp/
+# tools.yaml entries). Kwargs mirror each tool's MCP arg schema.
+
+
+def index_repo(path_or_url: str, branch: str | None = None, ignore: list[str] | None = None) -> dict[str, Any]:
+    """Invoke the ``index_repo`` tool; optional args left as None are not sent."""
+    payload: dict[str, Any] = {"path_or_url": path_or_url}
+    optional = {"branch": branch, "ignore": ignore}
+    # Only forward what the caller actually supplied.
+    payload.update({key: val for key, val in optional.items() if val is not None})
+    return call_tool("index_repo", payload)
+
+
+def search_code(prefix: str, project: str, branch: str | None = None, limit: int = 10) -> Any:
+    """Invoke the ``search_code`` tool; ``branch`` is sent only when given."""
+    payload: dict[str, Any] = {"prefix": prefix, "project": project, "limit": limit}
+    payload.update({} if branch is None else {"branch": branch})
+    return call_tool("search_code", payload)
+
+
+def _neighbors(tool: str, symbol_id: int, project: str, branch: str | None, limit: int) -> Any:
+    """Shared body of the per-symbol tools; ``branch`` is sent only when given."""
+    payload: dict[str, Any] = {"symbol_id": symbol_id, "project": project, "limit": limit}
+    payload.update({} if branch is None else {"branch": branch})
+    return call_tool(tool, payload)
+
+
+def get_callers(symbol_id: int, project: str, branch: str | None = None, limit: int = 50) -> Any:
+    return _neighbors("get_callers", symbol_id, project, branch, limit)  # forwards to the "get_callers" MCP tool
+
+
+def get_callees(symbol_id: int, project: str, branch: str | None = None, limit: int = 50) -> Any:
+    return _neighbors("get_callees", symbol_id, project, branch, limit)  # forwards to the "get_callees" MCP tool
+
+
+def get_dependencies(symbol_id: int, project: str, branch: str | None = None, limit: int = 50) -> Any:
+    return _neighbors("get_dependencies", symbol_id, project, branch, limit)  # forwards to the "get_dependencies" MCP tool
+
+
+def impact_analysis(
+    symbol_id: int,
+    project: str,
+    branch: str | None = None,
+    direction: str = "IN",
+    depth: int = 3,
+) -> Any:
+    """Invoke the ``impact_analysis`` tool.
+
+    ``branch`` is forwarded only when it is not None; every other
+    argument is always sent.
+    """
+    payload: dict[str, Any] = dict(
+        symbol_id=symbol_id, project=project, direction=direction, depth=depth)
+    payload.update({} if branch is None else {"branch": branch})
+    return call_tool("impact_analysis", payload)
+
+
+def find_path(source_id: int, dest_id: int, project: str, branch: str | None = None) -> Any:
+    """Invoke the ``find_path`` tool for a source/destination id pair.
+
+    ``branch`` is forwarded only when it is not None.
+    """
+    payload: dict[str, Any] = dict(source_id=source_id, dest_id=dest_id, project=project)
+    if branch is not None:
+        payload["branch"] = branch
+    return call_tool("find_path", payload)
+
+
+def ask(question: str, project: str, branch: str | None = None) -> Any:
+    """Invoke the ``ask`` tool; ``branch`` is sent only when given."""
+    payload: dict[str, Any] = {"question": question, "project": project}
+    payload.update({} if branch is None else {"branch": branch})
+    return call_tool("ask", payload)
diff --git a/bench/agents/lsp_adapter.py b/bench/agents/lsp_adapter.py
index aee8f2e6..3247862f 100644
--- a/bench/agents/lsp_adapter.py
+++ b/bench/agents/lsp_adapter.py
@@ -131,6 +131,7 @@ def __init__(self, repo_root: str | Path, language: str = "python",
         self.shim = shim
         self._env_path = environment_path
         self._server: Any | None = None  # SyncLanguageServer
+        self._cm: Any | None = None  # live start_server() context (persistent mode)
 
     # ----- lifecycle ------------------------------------------------------
 
@@ -166,6 +167,34 @@ def server_running(self) -> Iterator["LSPClient"]:
             finally:
                 self._server = None
 
+    # ----- persistent lifecycle (for a long-lived MCP server) -------------
+
+    def start(self) -> "LSPClient":
+        """Bring up a long-lived language server and return ``self``.
+
+        Idempotent: when a server is already attached nothing is spawned.
+        The ``start_server()`` context is entered by hand and kept on
+        ``self._cm`` so that ``stop()`` can exit it later; the caller
+        must invoke ``stop()`` at shutdown.
+        """
+        if self._server is None:
+            lsp = self._build_server()
+            ctx = lsp.start_server()
+            # Entered manually: the server has to outlive this call.
+            ctx.__enter__()
+            self._server = lsp
+            self._cm = ctx
+        return self
+
+    def stop(self) -> None:
+        """Stop the persistent server; state is cleared even if teardown raises."""
+        cm, self._cm = getattr(self, "_cm", None), None
+        try:
+            if cm is not None:
+                cm.__exit__(None, None, None)
+        finally:
+            self._server = None
+
     # ----- relative path normalization -----------------------------------
 
     def _rel(self, file_path: str) -> str:
diff --git a/bench/analysis/__init__.py b/bench/analysis/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/bench/analysis/adopt_audit/edit_critical_overrides.json b/bench/analysis/adopt_audit/edit_critical_overrides.json
new file mode 100644
index 00000000..0967ef42
--- /dev/null
+++ b/bench/analysis/adopt_audit/edit_critical_overrides.json
@@ -0,0 +1 @@
+{}
diff --git a/bench/analysis/adopt_controls.py b/bench/analysis/adopt_controls.py
new file mode 100644
index 00000000..8d754c8c
--- /dev/null
+++ b/bench/analysis/adopt_controls.py
@@ -0,0 +1,429 @@
+"""Negative-control + relabel infrastructure for the adoption-calibration
+experiment (Lane 1). All FREE / offline -- no API, no network.
+
+This module implements the prereg controls (see
+``files/prereg-adoption-calibration.md`` ss3, ss6, ss7) that make the
+"overfitting boundary" *sufficient* rather than merely necessary:
+
+  1. edit-critical relabel (ss7) -- only EDIT-CRITICAL gold count as FN, so a
+     lever is not rewarded for reproducing the patch footprint. Path/type
+     heuristic first guess + a frozen manual-override JSON.
+
+  2. GRAPH-WRONG selection (ss3) -- the subset of tasks whose top-ranked (rank-1)
+     graph hit is verified non-gold. Tests whether a lever can still correctly
+     DROP under a misleading #1. Pure offline scan of cached runs.
+
+  3. NOISY distractor manifest (ss3, ss6) -- a deterministic, seeded set of K
+     plausible-but-false sibling candidates per task, for injection into the
+     LIVE MCP output at pilot run-time. On CACHED data the agent never saw these
+     files, so they would always score TN; therefore the FREE deliverable here
+     is the *manifest generator* + validity assertions + a coverage report, NOT
+     an offline NOISY score. The live NOISY arm is deferred to the pilot.
+
+Scoping note (per rubber-duck): NOISY is a ROBUSTNESS PROBE, not evidence that
+the injected junk matches the graph's real false-positive distribution (the
+hardened cache has only FP=2/213 real FPs -- too few to characterize). Report
+the real FPs alongside any NOISY result so a reader can judge the gap.
+
+Offline mapping trick: the per-task worktrees under
+``<batch>/worktrees/code_graph/loc-<hash>`` are git-sanitized (no .git), but each
+code_graph run's ``stdout.jsonl`` references its own ``loc-<hash>`` exactly once.
+So task -> worktree is recovered by grepping the run log (unique per task, no
+same-repo collision), and the gold file's *directory siblings at base_commit*
+are read straight off the worktree filesystem. No base_commit SHA or HF load
+needed.
+"""
+
+from __future__ import annotations
+
+import argparse
+import ast
+import difflib
+import json
+import re
+from pathlib import Path
+
+from bench.analysis.exposure_adoption import (
+    analyze_batch,
+    candidate_calibration,
+    classify_run,
+    row_stdout_path,
+    surfaced_files,
+)
+
+DEFAULT_SEED = 1234
+DEFAULT_K = 2
+
+_LOC_RE = re.compile(r"loc-[0-9a-f]{16}")
+
+# Incidental-gold path markers (prereg ss7): test-only, fixture, migration,
+# generated, or docs files; anything else defaults to EDIT-CRITICAL unless an
+# override says otherwise. ``[^/]*`` lets ``.pb.py`` match as a filename suffix.
+_INCIDENTAL_PATH_RE = re.compile(
+    r"(^|/)(tests?|testing|test_[^/]*|[^/]*_test\.py|conftest\.py|fixtures?|"
+    r"migrations?|_generated|generated|[^/]*\.pb\.py|docs?|examples?)(/|$|\.)",
+    re.IGNORECASE,
+)
+# Files we never inject as distractors (not real "plausible source siblings"):
+# tests/fixtures/migrations, caches, and package markers (__init__.py / dunder
+# files) which are near-universal and not credible edit locations.
+_NONSOURCE_DISTRACTOR_RE = re.compile(
+    r"(^|/)(tests?|conftest\.py|fixtures?|migrations?|__pycache__|"
+    r"[^/]*_test\.py|test_[^/]*\.py|__[a-z0-9_]+__\.py|_?version\.py|setup\.py)(/|$)",
+    re.IGNORECASE,
+)
+
+
+# ---------------------------------------------------------------------------
+# task -> worktree mapping (offline, via the run log)
+# ---------------------------------------------------------------------------
+def _run_stdout(batch_root: Path, model: str, task: str,
+                prompt_mode: str = "nudged") -> Path | None:
+    """Find ``task``'s code_graph ``stdout.jsonl`` (None if absent); ties resolve by sorted path."""
+    base = batch_root / "runs" / model / "localize" / prompt_mode / "code_graph"
+    cand = base / task / "logs" / "stdout.jsonl"
+    if cand.exists():
+        return cand
+    # glob() order is filesystem-dependent; sort so the pick is reproducible.
+    hits = sorted(base.glob(f"{task}/**/stdout.jsonl"))
+    if not hits:  # fall back to any mode dir
+        hits = sorted((batch_root / "runs" / model).glob(f"*/*/code_graph/{task}/**/stdout.jsonl"))
+    return hits[0] if hits else None
+
+
+def map_task_worktree(batch_root: Path, model: str, task: str,
+                      prompt_mode: str = "nudged") -> Path | None:
+    """Resolve ``task`` to its on-disk worktree directory, or None.
+
+    The run log must reference exactly one distinct ``loc-<hash>``; zero or
+    several hashes, a missing log, or a missing directory all yield None.
+    """
+    log_path = _run_stdout(batch_root, model, task, prompt_mode)
+    if log_path is None:
+        return None
+    hashes = set(_LOC_RE.findall(log_path.read_text()))
+    if len(hashes) != 1:
+        return None
+    worktree = batch_root / "worktrees" / "code_graph" / next(iter(hashes))
+    return worktree if worktree.exists() else None
+
+
+# ---------------------------------------------------------------------------
+# edit-critical relabel (prereg ss7)
+# ---------------------------------------------------------------------------
+def load_overrides(path: Path | None) -> dict[str, dict[str, str]]:
+    """Read the frozen manual-audit overrides from ``path``.
+
+    Expected JSON shape: ``{task_id: {gold_file: "critical"|"incidental"}}``.
+    A ``None`` or non-existent path means heuristic-only, i.e. ``{}``.
+    """
+    if path is not None and path.exists():
+        raw = json.loads(path.read_text())
+        return {task: dict(labels) for task, labels in raw.items()}
+    return {}
+
+
+def edit_critical_split(
+    gold_files: list[str],
+    task: str | None = None,
+    overrides: dict[str, dict[str, str]] | None = None,
+) -> tuple[list[str], list[str]]:
+    """Partition gold files into ``(edit_critical, incidental)`` lists.
+
+    A per-file manual override for ``task`` takes precedence. Without one,
+    a path matching ``_INCIDENTAL_PATH_RE`` is incidental and anything else
+    is edit-critical. Input order is preserved within each list.
+    """
+    task_labels = (overrides or {}).get(task or "", {})
+    buckets: dict[bool, list[str]] = {True: [], False: []}
+    for gold in gold_files:
+        verdict = task_labels.get(gold)
+        if verdict is None:
+            verdict = "incidental" if _INCIDENTAL_PATH_RE.search(gold) else "critical"
+        buckets[verdict == "incidental"].append(gold)
+    return buckets[False], buckets[True]
+
+
+# ---------------------------------------------------------------------------
+# GRAPH-WRONG selection (prereg ss3)
+# ---------------------------------------------------------------------------
+def select_graph_wrong(runs: list[dict], gold_by_task: dict[str, list[str]],
+                       batch_root: Path, model: str,
+                       prompt_mode: str = "nudged") -> list[dict]:
+    """List the runs whose rank-1 surfaced file is not a gold file.
+
+    Rank-1 is the file with the smallest non-None ``best_rank`` in the task's
+    run log; runs with no log or no ranked file are skipped. Each result is
+    ``{task, run_idx, rank1, is_wrong}`` with ``is_wrong`` always True.
+    NOTE(review): the log is looked up by task only, not per run -- confirm.
+    """
+    wrong: list[dict] = []
+    for run in runs:
+        task = run.get("task")
+        log_path = _run_stdout(batch_root, model, task, prompt_mode)
+        if log_path is None:
+            continue
+        ranks = {f: v["best_rank"] for f, v in surfaced_files(log_path).items()
+                 if v["best_rank"] is not None}
+        rank1 = min(ranks, key=ranks.get, default=None)
+        if rank1 is None or rank1 in set(gold_by_task.get(task, [])):
+            continue
+        wrong.append({"task": task, "run_idx": run.get("run_idx"),
+                      "rank1": rank1, "is_wrong": True})
+    return wrong
+
+
+# ---------------------------------------------------------------------------
+# NOISY distractor manifest (prereg ss3, ss6)
+# ---------------------------------------------------------------------------
+def gold_symbols_offline(worktree: Path, gold_file: str) -> list[str]:
+    """Collect function and class names defined in a gold file on disk.
+
+    The file is parsed with ``ast``; every def / async def / class found
+    by ``ast.walk`` is reported, at any nesting depth, except dunder
+    names. Non-``.py`` paths and unreadable or unparseable files give
+    ``[]``.
+    """
+    if not gold_file.endswith(".py"):
+        return []
+    source_path = worktree / gold_file
+    try:
+        module = ast.parse(source_path.read_text())
+    except (OSError, SyntaxError, ValueError):
+        return []
+    named = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
+    # Dunder names (__init__, __repr__, ...) are near-universal and would
+    # spuriously match package files / generic stems, so they are dropped.
+    return [
+        node.name for node in ast.walk(module)
+        if isinstance(node, named)
+        and not (node.name.startswith("__") and node.name.endswith("__"))
+    ]
+
+
+def _similarity(stem: str, targets: list[str]) -> float:
+    """Best case-insensitive SequenceMatcher ratio of ``stem`` against any non-empty target."""
+    ratios = [difflib.SequenceMatcher(None, stem.lower(), t.lower()).ratio() for t in targets if t]
+    return max(ratios, default=0.0)
+
+
+def sibling_distractors(
+    worktree: Path,
+    gold_files: list[str],
+    incidental: list[str],
+    k: int = DEFAULT_K,
+    seed: int = DEFAULT_SEED,
+) -> list[dict]:
+    """Deterministic K plausible-but-false sibling distractors for a task.
+
+    Pool = ``.py`` files sitting directly in each gold file's directory under
+    ``worktree``, minus gold/incidental files and anything matching
+    ``_NONSOURCE_DISTRACTOR_RE``. Ranked by best name-similarity to a gold
+    stem or gold AST symbol; ties fall back to a seed-derived similarity key,
+    then to path. Returns the top-k as ``[{file, score, similar_to}]``.
+    """
+    gold_set = set(gold_files) | set(incidental)
+    # similarity targets: gold stems + gold symbols
+    targets: list[str] = []
+    for g in gold_files:
+        targets.append(Path(g).stem)
+        targets.extend(gold_symbols_offline(worktree, g))
+
+    pool: dict[str, float] = {}
+    for g in gold_files:
+        gdir = str(Path(g).parent)
+        ddir = worktree / gdir
+        if not ddir.is_dir():
+            continue
+        for child in sorted(ddir.iterdir()):  # sorted: scan order must not depend on the filesystem
+            if not child.is_file() or not child.name.endswith(".py"):
+                continue
+            rel = str(Path(gdir) / child.name) if gdir != "." else child.name
+            if rel in gold_set:
+                continue
+            if _NONSOURCE_DISTRACTOR_RE.search(rel):
+                continue
+            score = _similarity(child.stem, targets)
+            # keep the best score if a file is reachable from >1 gold dir
+            if rel not in pool or score > pool[rel]:
+                pool[rel] = score
+
+    def _tiebreak(item: tuple[str, float]) -> tuple:
+        rel, score = item
+        # seed enters only as a SequenceMatcher ratio against the path: deterministic, not random
+        h = difflib.SequenceMatcher(None, f"{seed}", rel).ratio()
+        return (-score, -h, rel)
+
+    ranked = sorted(pool.items(), key=_tiebreak)
+    chosen = ranked[:k]
+    return [{"file": rel, "score": round(score, 4),
+             "similar_to": _closest_target(Path(rel).stem, targets)}
+            for rel, score in chosen]
+
+
+def _closest_target(stem: str, targets: list[str]) -> str | None:
+    """Return the target most similar to ``stem`` (earliest wins ties), or None if there are none."""
+    needle = stem.lower()
+    scored = ((difflib.SequenceMatcher(None, needle, t.lower()).ratio(), t) for t in targets)
+    return max(scored, key=lambda pair: pair[0], default=(0.0, None))[1]
+
+
+def build_noisy_manifest(
+    results_path: Path,
+    *,
+    k: int = DEFAULT_K,
+    seed: int = DEFAULT_SEED,
+    overrides_path: Path | None = None,
+) -> dict:
+    """Build the deterministic NOISY injection manifest + coverage report.
+
+    Emits up to K sibling distractors for each distinct code_graph task (first
+    row wins). Distractors are checked to be distinct, non-gold and on disk;
+    a violation raises AssertionError. Returns ``{k, seed, coverage,
+    manifest}`` with the manifest keyed by task id.
+    """
+    batch_root = results_path.parent.parent
+    model = results_path.parent.name
+    rows = [json.loads(ln) for ln in results_path.read_text().splitlines() if ln.strip()]
+    cg = [r for r in rows if r.get("config") == "code_graph"]
+    overrides = load_overrides(overrides_path)
+
+    manifest: dict[str, dict] = {}
+    coverage = {"tasks": 0, "full_k": 0, "partial": 0, "empty": 0, "no_worktree": 0}
+    for r in cg:
+        task = r["task_id"]
+        if task in manifest:  # every handled task gets an entry, so this dedups repeat rows
+            continue
+        coverage["tasks"] += 1
+        gold = r.get("gold_files", [])
+        _crit, incidental = edit_critical_split(gold, task, overrides)
+        wt = map_task_worktree(batch_root, model, task)
+        if wt is None:
+            coverage["no_worktree"] += 1
+            manifest[task] = {"distractors": [], "note": "no_worktree"}
+            continue
+        distractors = sibling_distractors(wt, gold, incidental, k=k, seed=seed)
+        # validity checks: explicit raises (not ``assert``) so they survive ``python -O``
+        files = [d["file"] for d in distractors]
+        if len(files) != len(set(files)):
+            raise AssertionError(f"dup distractor for {task}")
+        if set(files) & set(gold):
+            raise AssertionError(f"gold leaked into distractors for {task}")
+        missing = [f for f in files if not (wt / f).is_file()]
+        if missing:
+            raise AssertionError(f"distractor not on disk: {missing[0]}")
+        n = len(distractors)
+        coverage["full_k" if n >= k else ("empty" if n == 0 else "partial")] += 1
+        manifest[task] = {"worktree": wt.name, "distractors": distractors,
+                          "gold_files": gold, "incidental": incidental}
+    return {"k": k, "seed": seed, "coverage": coverage, "manifest": manifest}
+
+
+# ---------------------------------------------------------------------------
+# edit-critical recall sensitivity (heuristic-only vs +overrides)
+# ---------------------------------------------------------------------------
+def _rescore_with_labels(results_path: Path,
+                         overrides: dict[str, dict[str, str]] | None,
+                         prompt_mode: str | None = None) -> dict:
+    """Recompute candidate calibration with per-task edit-critical labels.
+
+    Only ``code_graph`` rows are scored, optionally narrowed to one
+    ``prompt_mode``. Each row's log comes from ``row_stdout_path`` (full row
+    identity), and rows without a log are skipped.
+    """
+    model_dir = results_path.parent
+    batch_root, model = model_dir.parent, model_dir.name
+    rows = [json.loads(raw) for raw in results_path.read_text().splitlines() if raw.strip()]
+    scored = []
+    for row in rows:
+        if row.get("config") != "code_graph":
+            continue
+        if prompt_mode is not None and row.get("prompt_mode") != prompt_mode:
+            continue
+        log_path = row_stdout_path(batch_root, model, row)
+        if not log_path:
+            continue
+        task, gold = row.get("task_id"), row.get("gold_files", [])
+        critical, _ = edit_critical_split(gold, task, overrides)
+        verdict = classify_run(log_path, gold, row.get("pred_files", []), edit_critical=critical)
+        verdict["task"] = task
+        verdict["run_idx"] = row.get("run_idx")
+        scored.append(verdict)
+    return candidate_calibration(scored)
+
+
+def recall_sensitivity(results_path: Path, overrides_path: Path | None) -> dict:
+    """Report macro metrics under three gold-labelling schemes.
+
+    Covers all-gold-critical (``analyze_batch`` minus errored runs),
+    heuristic-only and heuristic+overrides, plus the override count.
+    """
+    overrides = load_overrides(overrides_path)
+    clean_runs = [run for run in analyze_batch(results_path)["per_run"] if "error" not in run]
+    schemes = {"all_critical": candidate_calibration(clean_runs),
+               "heuristic_only": _rescore_with_labels(results_path, None),
+               "heuristic_plus_audit": _rescore_with_labels(results_path, overrides)}
+    return {**{name: res["macro"] for name, res in schemes.items()},
+            "n_overrides": sum(map(len, overrides.values()))}
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+def main() -> None:
+    """CLI entry point: print the three control reports, optionally dump JSON."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("results", type=Path, help="path to a code_graph results.jsonl")
+    parser.add_argument("--overrides", type=Path,
+                        default=Path(__file__).parent / "adopt_audit" / "edit_critical_overrides.json")
+    parser.add_argument("--k", type=int, default=DEFAULT_K)
+    parser.add_argument("--seed", type=int, default=DEFAULT_SEED)
+    parser.add_argument("--json", type=Path, help="write full manifest+report JSON here")
+    opts = parser.parse_args()
+
+    clean_runs = [run for run in analyze_batch(opts.results)["per_run"] if "error" not in run]
+    rows = [json.loads(raw) for raw in opts.results.read_text().splitlines() if raw.strip()]
+    gold_by_task = {row["task_id"]: row.get("gold_files", [])
+                    for row in rows if row.get("config") == "code_graph"}
+
+    model, batch_root = opts.results.parent.name, opts.results.parent.parent
+
+    print("==== EDIT-CRITICAL RELABEL (recall sensitivity) ====")
+    sens = recall_sensitivity(opts.results, opts.overrides)
+
+    def _m(d):
+        cells = {key: " n/a" if d[key] is None else f"{d[key]:.3f}"
+                 for key in ("precision", "recall", "f1")}
+        return f"P={cells['precision']} R={cells['recall']} F1={cells['f1']}"
+
+    print(f"  all-gold-critical    : {_m(sens['all_critical'])}")
+    print(f"  heuristic-only       : {_m(sens['heuristic_only'])}")
+    print(f"  heuristic+audit (n={sens['n_overrides']:>2}): {_m(sens['heuristic_plus_audit'])}")
+
+    print("\n==== GRAPH-WRONG SUBSET (rank-1 surfaced file is non-gold) ====")
+    gw = select_graph_wrong(clean_runs, gold_by_task, batch_root, model)
+    gw_tasks = sorted({hit["task"] for hit in gw})
+    print(f"  graph-wrong runs: {len(gw)}  |  distinct tasks: {len(gw_tasks)}")
+    for hit in gw:
+        print(f"    {hit['task']:34s} idx={hit['run_idx']}  rank1={hit['rank1']}")
+
+    print("\n==== NOISY DISTRACTOR MANIFEST (deterministic, for run-time injection) ====")
+    noisy = build_noisy_manifest(opts.results, k=opts.k, seed=opts.seed,
+                                 overrides_path=opts.overrides)
+    cov = noisy["coverage"]
+    print(f"  k={noisy['k']} seed={noisy['seed']}  tasks={cov['tasks']}  "
+          f"full_k={cov['full_k']} partial={cov['partial']} "
+          f"empty={cov['empty']} no_worktree={cov['no_worktree']}")
+    for task, entry in noisy["manifest"].items():
+        ds = ", ".join(f"{Path(d['file']).name}({d['score']})" for d in entry.get("distractors", []))
+        print(f"    {task:34s} -> {ds or entry.get('note', '(none)')}")
+
+    if opts.json:
+        payload = {"recall_sensitivity": sens, "graph_wrong": gw,
+                   "noisy": noisy}
+        opts.json.write_text(json.dumps(payload, indent=2))
+        print(f"\nwrote {opts.json}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/analysis/adopt_diag.py b/bench/analysis/adopt_diag.py
new file mode 100644
index 00000000..6fee65fc
--- /dev/null
+++ b/bench/analysis/adopt_diag.py
@@ -0,0 +1,345 @@
+"""Per-arm diagnostics for the Lane 1 adoption-calibration pilot.
+
+Compares the CTRL / SEM / RAT arms (``prompt_mode`` in ``adopt-ctrl`` /
+``adopt-sem`` / ``adopt-rat``) on one results.jsonl, all on the code_graph +
+localize track. Reports, side by side per arm:
+
+* candidate-level calibration (macro, **macro_strict** = prereg PRIMARY, micro)
+  over ALL surfaced candidates, plus the same restricted to the **GRAPH-WRONG**
+  control subset (rank-1 graph hit verified non-gold);
+* exposure / adoption aggregates and **per-arm exposure drift** (the harness is
+  agent-driven, so SEM/RAT may surface different candidate sets than CTRL --
+  prereg amendment "identical candidate sets is measured, not forced");
+* token deltas (median total / output / **visible_output = output - reasoning**
+  / input / premium_requests / turns) so the RAT thinking-vs-calibration
+  confound is attributable;
+* a **RAT compliance audit**: did the agent emit the mandated ``KEEP``/``DROP``
+  lines, and is the final answer consistent with them (no DROP file kept).
+
+The GRAPH-WRONG subset is selected ONCE from a reference arm (default
+``adopt-ctrl``) and the SAME task set is applied to every arm, so the control is
+fixed across arms rather than re-derived per arm.
+
+Run:
+    uv run python -m bench.analysis.adopt_diag <results.jsonl> [--json out.json]
+        [--ref-arm adopt-ctrl]
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import statistics
+from pathlib import Path
+
+from bench.analysis.adopt_controls import select_graph_wrong
+from bench.analysis.exposure_adoption import (
+    analyze_batch,
+    candidate_calibration,
+    row_stdout_path,
+)
+
+ARM_PROMPT_MODES = ("adopt-ctrl", "adopt-sem", "adopt-rat")
+
+# A KEEP/DROP decision line from the RAT step. The prompt format is
+# ``KEEP <file> — <reason>`` / ``DROP <file> — <reason>``; the file may be
+# wrapped in backticks and the dash is an em dash, en dash or hyphen. We only
+# need the decision verb and the path token, so we accept any leading list
+# marker / bullet and stop at the first whitespace, backtick, em/en dash or colon.
+_RAT_LINE = re.compile(
+    r"^\s*[-*>\d.)\]\s]*`?\s*(KEEP|DROP)\b[\s:`]*([^\s`—–:]+)",
+    re.IGNORECASE,
+)
+
+
+def _norm(path: str) -> str:
+    """Normalize to a repo-relative, forward-slash path (mirrors copilot_runner)."""
+    cleaned = path.strip().strip("'\"`").strip().replace("\\", "/")
+    while cleaned[:2] == "./":
+        cleaned = cleaned[2:]
+    # Diff-style markers: a leading "a/" is removed first, then a leading "b/".
+    for marker in ("a/", "b/"):
+        cleaned = cleaned[len(marker):] if cleaned.startswith(marker) else cleaned
+    return cleaned.lstrip("/")
+
+
+def parse_rat_decisions(agent_text: str) -> dict[str, str]:
+    """Extract the agent's KEEP/DROP verdicts as ``{normalized file: "keep"|"drop"}``.
+
+    Each line is matched against ``_RAT_LINE``; non-matching lines are skipped.
+    A later verdict for the same normalized file overwrites an earlier one, and
+    matches whose path normalizes to an empty string are discarded.
+    """
+    verdicts: dict[str, str] = {}
+    matches = (_RAT_LINE.match(raw) for raw in agent_text.splitlines())
+    for hit in matches:
+        if hit is None:
+            continue
+        target = _norm(hit.group(2))
+        if not target:
+            continue
+        verdicts[target] = "keep" if hit.group(1).lower() == "keep" else "drop"
+    return verdicts
+
+
+def rat_audit(agent_text: str, pred_files: list[str]) -> dict:
+    """Audit one RAT run: was the keep/drop step emitted and then honored?
+
+    * ``compliant`` -- at least one KEEP/DROP decision line was parsed.
+    * ``consistent`` -- the final answer contains no file marked DROP.
+    * ``dropped_but_kept`` -- sorted DROP files that are still in the answer.
+    * ``kept_omitted`` -- sorted KEEP files missing from the final answer.
+    """
+    verdicts = parse_rat_decisions(agent_text)
+    answer = {_norm(p) for p in (pred_files or [])}
+    keep_set: set[str] = set()
+    drop_set: set[str] = set()
+    for path, verdict in verdicts.items():
+        (keep_set if verdict == "keep" else drop_set).add(path)
+    conflicts = sorted(drop_set & answer)
+    return {
+        "compliant": len(verdicts) > 0,
+        "n_keep": len(keep_set),
+        "n_drop": len(drop_set),
+        "consistent": len(conflicts) == 0,
+        "dropped_but_kept": conflicts,
+        "kept_omitted": sorted(keep_set - answer),
+    }
+
+
+def _median(xs: list[float]) -> float | None:
+    present = list(filter(lambda v: v is not None, xs))  # None entries are ignored
+    return None if not present else round(statistics.median(present), 1)
+
+
+def token_summary(rows: list[dict]) -> dict:
+    """Median token / step usage across an arm's completed rows.
+
+    ``visible_output`` is output minus reasoning tokens, so a RAT calibration win
+    can be separated from "the model just thought/typed more". A missing or
+    null ``reasoning_tokens`` counts as 0; rows without output tokens are skipped.
+    """
+    def col(key: str) -> list[float]:
+        return [r[key] for r in rows if r.get(key) is not None]
+
+    visible = [
+        r["output_tokens"] - (r.get("reasoning_tokens") or 0)  # JSON null -> 0, not TypeError
+        for r in rows if r.get("output_tokens") is not None
+    ]
+    return {
+        "n_rows": len(rows),
+        "median_total_tokens": _median(col("total_tokens")),
+        "median_output_tokens": _median(col("output_tokens")),
+        "median_visible_output_tokens": _median(visible),
+        "median_reasoning_tokens": _median(col("reasoning_tokens")),
+        "median_input_tokens": _median(col("input_tokens")),
+        "median_premium_requests": _median(col("premium_requests")),
+        "median_num_turns": _median(col("num_turns")),
+    }
+
+
+def _agent_text_for(batch_root: Path, model: str, row: dict) -> str:
+    """Return the run's saved agent_text.txt contents, or "" when unavailable."""
+    stdout_path = row_stdout_path(batch_root, model, row)
+    if stdout_path is not None:
+        # The text file is looked up next to the directory holding the stdout log:
+        # run_dir/logs/stdout.jsonl -> run_dir/agent_text.txt
+        text_file = stdout_path.parent.parent / "agent_text.txt"
+        if text_file.exists():
+            return text_file.read_text(errors="replace")
+    return ""
+
+
+def _subset_calibration(runs: list[dict], tasks: set[str]) -> dict:
+    return candidate_calibration(list(filter(lambda run: run.get("task") in tasks, runs)))
+
+
+def arm_diagnostics(
+    results_path: Path,
+    arm: str,
+    *,
+    gold_by_task: dict[str, list[str]],  # NOTE(review): not read anywhere in this body
+    graph_wrong_tasks: set[str],
+) -> dict:
+    """Exposure, calibration, token and (``adopt-rat`` only) audit stats for one prompt_mode arm."""
+    rows = [json.loads(ln) for ln in results_path.read_text().splitlines() if ln.strip()]
+    model = results_path.parent.name  # results.jsonl lives at <batch_root>/<model>/
+    batch_root = results_path.parent.parent
+
+    arm_rows = [  # completed code_graph + localize rows of this arm (token stats, RAT audit)
+        r
+        for r in rows
+        if r.get("config") == "code_graph"
+        and r.get("mode") == "localize"
+        and r.get("prompt_mode") == arm
+        and r.get("completed")
+    ]
+
+    batch = analyze_batch(results_path, prompt_mode=arm, mode="localize")
+    runs = [r for r in batch["per_run"] if "error" not in r]  # skip entries carrying an "error" key
+
+    cal_all = candidate_calibration(runs)
+    cal_gw = _subset_calibration(runs, graph_wrong_tasks)
+
+    tot_gold = sum(r["n_gold"] for r in runs)
+    tot_surf = sum(r["n_surfaced"] for r in runs)
+    tot_surf_adopt = sum(r["n_surfaced_adopted"] for r in runs)
+
+    out: dict = {
+        "arm": arm,
+        "n_runs": len(runs),
+        "exposure": {
+            "gold_run_x_gold": tot_gold,
+            "surfaced": tot_surf,
+            "surfaced_adopted": tot_surf_adopt,
+            "exposure_recall": round(tot_surf / tot_gold, 4) if tot_gold else None,
+            "adoption_rate": round(tot_surf_adopt / tot_surf, 4) if tot_surf else None,
+        },
+        "calibration_clean": cal_all,
+        "calibration_graph_wrong": cal_gw,
+        "tokens": token_summary(arm_rows),
+    }
+
+    if arm == "adopt-rat":
+        audits = []
+        for r in arm_rows:
+            a = rat_audit(_agent_text_for(batch_root, model, r), r.get("pred_files", []))
+            a["task"] = r.get("task_id")
+            a["run_idx"] = r.get("run_idx")
+            audits.append(a)
+        n = len(audits) or 1  # denominator; 1 avoids dividing by zero (rates are then 0.0)
+        out["rat_audit"] = {
+            "n": len(audits),
+            "compliance_rate": round(sum(a["compliant"] for a in audits) / n, 4),
+            "consistency_rate": round(sum(a["consistent"] for a in audits) / n, 4),
+            "n_dropped_but_kept": sum(len(a["dropped_but_kept"]) for a in audits),
+            "n_kept_omitted": sum(len(a["kept_omitted"]) for a in audits),
+            "per_run": audits,
+        }
+    return out
+
+
+def _build_gold_by_task(rows: list[dict]) -> dict[str, list[str]]:
+    """Collect ``{task_id: gold_files}``; the first row with both values wins."""
+    by_task: dict[str, list[str]] = {}
+    for row in rows:
+        task_id, files = row.get("task_id"), row.get("gold_files")
+        if task_id and files:
+            by_task.setdefault(task_id, list(files))
+    return by_task
+
+
+def diagnose(results_path: Path, *, ref_arm: str = "adopt-ctrl") -> dict:
+    """Build the report dict: arms present, the frozen GRAPH-WRONG tasks, per-arm diagnostics."""
+    rows = [json.loads(ln) for ln in results_path.read_text().splitlines() if ln.strip()]
+    model = results_path.parent.name
+    batch_root = results_path.parent.parent
+    gold_by_task = _build_gold_by_task(rows)
+
+    present = [  # arms, in ARM_PROMPT_MODES order, that have at least one code_graph row
+        a
+        for a in ARM_PROMPT_MODES
+        if any(
+            r.get("prompt_mode") == a and r.get("config") == "code_graph"
+            for r in rows
+        )
+    ]
+
+    # Freeze the GRAPH-WRONG control task set ONCE from the reference arm so the
+    # same tasks are scored across all arms. Fall back to the first present arm.
+    sel_arm = ref_arm if ref_arm in present else (present[0] if present else ref_arm)
+    ref_batch = analyze_batch(results_path, prompt_mode=sel_arm, mode="localize")
+    ref_runs = [r for r in ref_batch["per_run"] if "error" not in r]
+    gw = select_graph_wrong(
+        ref_runs, gold_by_task, batch_root, model, prompt_mode=sel_arm
+    )
+    graph_wrong_tasks = {g["task"] for g in gw}
+
+    arms = {  # one arm_diagnostics() result per present arm, all sharing graph_wrong_tasks
+        a: arm_diagnostics(
+            results_path, a,
+            gold_by_task=gold_by_task, graph_wrong_tasks=graph_wrong_tasks,
+        )
+        for a in present
+    }
+    return {
+        "results": str(results_path),
+        "model": model,
+        "arms_present": present,
+        "graph_wrong": {"ref_arm": sel_arm, "tasks": sorted(graph_wrong_tasks)},
+        "arms": arms,
+    }
+
+
+def _f(x) -> str:
+    return "  n/a" if not isinstance(x, (int, float)) else format(x, ".3f")
+
+
+def _print_report(rep: dict) -> None:
+    present = rep["arms_present"]
+    if not present:  # nothing to tabulate
+        print("no adopt-* arms found in results")
+        return
+    gw = rep["graph_wrong"]
+    print(f"model: {rep['model']}   arms: {', '.join(present)}")
+    print(f"GRAPH-WRONG subset ({len(gw['tasks'])} tasks, ref={gw['ref_arm']}): "
+          f"{gw['tasks']}")
+
+    hdr = f"\n{'metric':32s} " + " ".join(f"{a.replace('adopt-',''):>10s}" for a in present)
+    print(hdr)
+    print("-" * len(hdr))  # NOTE: len(hdr) includes the leading "\n" of hdr
+
+    def row(label: str, getter) -> None:  # one table line: label + one right-aligned cell per arm
+        cells = " ".join(f"{getter(rep['arms'][a]):>10s}" for a in present)
+        print(f"{label:32s} {cells}")
+
+    row("runs", lambda d: str(d["n_runs"]))
+    row("exposure_recall", lambda d: _f(d["exposure"]["exposure_recall"]))
+    row("adoption_rate", lambda d: _f(d["exposure"]["adoption_rate"]))
+    row("CLEAN macro_strict F1 (PRIMARY)",
+        lambda d: _f(d["calibration_clean"]["macro_strict"]["f1"]))
+    row("CLEAN macro F1", lambda d: _f(d["calibration_clean"]["macro"]["f1"]))
+    row("CLEAN macro precision",
+        lambda d: _f(d["calibration_clean"]["macro"]["precision"]))
+    row("CLEAN macro recall",
+        lambda d: _f(d["calibration_clean"]["macro"]["recall"]))
+    row("GRAPH-WRONG macro precision",
+        lambda d: _f(d["calibration_graph_wrong"]["macro"]["precision"]))
+    row("GRAPH-WRONG macro_strict F1",
+        lambda d: _f(d["calibration_graph_wrong"]["macro_strict"]["f1"]))
+    row("median total tokens",
+        lambda d: _f(d["tokens"]["median_total_tokens"]))
+    row("median visible-output tokens",
+        lambda d: _f(d["tokens"]["median_visible_output_tokens"]))
+    row("median reasoning tokens",
+        lambda d: _f(d["tokens"]["median_reasoning_tokens"]))
+    row("median turns", lambda d: _f(d["tokens"]["median_num_turns"]))
+
+    if "adopt-rat" in present:  # the keep/drop audit is only produced for the RAT arm
+        ra = rep["arms"]["adopt-rat"].get("rat_audit", {})
+        print("\nRAT keep/drop audit:")
+        print(f"  compliance (emitted KEEP/DROP) : {_f(ra.get('compliance_rate'))}"
+              f"  over n={ra.get('n')}")
+        print(f"  consistency (no DROP kept)     : {_f(ra.get('consistency_rate'))}")
+        print(f"  dropped-but-kept conflicts     : {ra.get('n_dropped_but_kept')}")
+        print(f"  kept-then-omitted (erosion)    : {ra.get('n_kept_omitted')}")
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__)  # module docstring doubles as --help text
+    ap.add_argument("results", type=Path)
+    ap.add_argument("--json", type=Path)
+    ap.add_argument("--ref-arm", default="adopt-ctrl",
+                    help="arm whose runs fix the GRAPH-WRONG task subset")
+    args = ap.parse_args()
+
+    rep = diagnose(args.results, ref_arm=args.ref_arm)
+    _print_report(rep)
+    if args.json:  # optional: also dump the full report dict as JSON
+        args.json.write_text(json.dumps(rep, indent=2))
+        print(f"\nwrote {args.json}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/analysis/aggregate.py b/bench/analysis/aggregate.py
new file mode 100644
index 00000000..beef1d6f
--- /dev/null
+++ b/bench/analysis/aggregate.py
@@ -0,0 +1,185 @@
+"""Robust, task-weighted aggregation for the localization benchmark.
+
+WHY (per rubber-duck): the raw per-row means are confounded and outlier-driven:
+
+  1. REPLICATE confound -- decision instances are run multiple times (run_idx
+     0/1/2) while controls run once, so a naive mean over rows weights some
+     tasks 3x. Fix: average replicates WITHIN each (task, config) first
+     ("per-task cell"), then take the macro-mean ACROSS tasks. Every task then
+     carries equal weight regardless of replicate count.
+
+  2. TOKEN tail -- a single runaway trajectory (>1M input tokens) dominates the
+     arithmetic mean. Fix: report median + mean + p90 + max + #runaways(>500k)
+     + a winsorized mean (p90 cap) for SENSITIVITY ONLY (never as the headline,
+     never silently dropping data).
+
+  3. CONNECTIVITY stratum -- recall on graph-connected gold is the only stratum
+     where the graph can mechanically help. Fix: join the connectivity label
+     (all_connected / partial / unconnected) per task and report recall per
+     stratum per arm.
+
+Headline accuracy metric = task-weighted macro-mean of ``file_recall`` (and the
+strict ``file_all_found`` set-exact metric) per config. Headline token metric =
+per-task median input tokens + the robust tail stats.
+
+CLI:
+    python -m bench.analysis.aggregate <results.jsonl> [--conn conn.json] \
+        [--json out.json] [--runaway 500000]
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+
+def _percentile(xs: list[float], q: float) -> float:
+    """Linear-interpolation percentile of ``xs`` (``q`` in [0,1]); empty -> 0.0."""
+    if not xs:
+        return 0.0
+    ordered = sorted(xs)
+    count = len(ordered)
+    if count == 1:
+        return float(ordered[0])
+    rank = q * (count - 1)
+    below = int(rank)
+    if below + 1 >= count:
+        return float(ordered[-1])
+    return float(ordered[below] + (ordered[below + 1] - ordered[below]) * (rank - below))
+
+
+def _winsorized_mean(xs: list[float], cap_q: float = 0.90) -> float:
+    """Mean of ``xs`` with every value above its cap_q percentile lowered to it."""
+    if len(xs) == 0:
+        return 0.0
+    ceiling = _percentile(xs, cap_q)
+    return statistics.fmean([ceiling if ceiling < x else x for x in xs])
+
+
+def _cells(rows: list[dict], field: str) -> dict[tuple[str, str], float]:
+    """Mean of ``field`` per (config, task_id) cell; rows where it is None are skipped."""
+    grouped: dict[tuple[str, str], list[float]] = {}
+    for row in rows:
+        value = row.get(field)
+        if value is not None:
+            cell = (row["config"], row["task_id"])
+            grouped.setdefault(cell, []).append(float(value))
+    return {cell: statistics.fmean(vals) for cell, vals in grouped.items()}
+
+
+def _per_config(cells: dict[tuple[str, str], float]) -> dict[str, list[float]]:
+    grouped: dict[str, list[float]] = defaultdict(list)
+    for key, cell_mean in cells.items():
+        grouped[key[0]].append(cell_mean)  # key is (config, task); group by config only
+    return grouped
+
+
+def aggregate(results_path: Path, conn_path: Path | None,
+              runaway: float = 500_000) -> dict:
+    rows = [json.loads(l) for l in results_path.read_text().splitlines() if l.strip()]
+    configs = sorted({r["config"] for r in rows})
+
+    conn_label: dict[str, str] = {}
+    if conn_path and conn_path.exists():  # no/absent file: every task falls in stratum "unknown"
+        for c in json.loads(conn_path.read_text()):
+            conn_label[c["task"]] = c.get("label", "unknown")
+
+    # --- task-weighted accuracy (recall + strict all-found) ---
+    recall_cells = _cells(rows, "file_recall")
+    allfound_cells = _cells(rows, "file_all_found")
+    recall_by_cfg = _per_config(recall_cells)
+    allfound_by_cfg = _per_config(allfound_cells)
+
+    # --- token cell means (per task) for robust stats ---
+    intok_cells = _cells(rows, "input_tokens")
+    intok_by_cfg = _per_config(intok_cells)
+
+    # raw per-row input tokens (for tail stats that should see every runaway)
+    intok_rows_by_cfg: dict[str, list[float]] = defaultdict(list)
+    for r in rows:
+        v = r.get("input_tokens")
+        if v is not None:
+            intok_rows_by_cfg[r["config"]].append(float(v))
+
+    summary = {}
+    for cfg in configs:
+        rec = recall_by_cfg.get(cfg, [])
+        allf = allfound_by_cfg.get(cfg, [])
+        intask = intok_by_cfg.get(cfg, [])
+        inrows = intok_rows_by_cfg.get(cfg, [])
+        summary[cfg] = {
+            "n_tasks": len(rec),  # number of tasks that have a file_recall cell for this config
+            "n_rows": sum(1 for r in rows if r["config"] == cfg),
+            "recall_task_weighted": round(statistics.fmean(rec), 4) if rec else None,
+            "all_found_task_weighted": round(statistics.fmean(allf), 4) if allf else None,
+            "tokens": {
+                "median_per_task": round(statistics.median(intask)) if intask else None,
+                "mean_per_task": round(statistics.fmean(intask)) if intask else None,
+                "p90_per_task": round(_percentile(intask, 0.90)) if intask else None,
+                "max_row": round(max(inrows)) if inrows else None,
+                "n_runaways": sum(1 for x in inrows if x > runaway),  # counted on raw rows
+                "winsorized_mean_per_task": round(_winsorized_mean(intask)) if intask else None,
+            },
+        }
+
+    # --- recall per connectivity stratum per config ---
+    strata: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
+    for (cfg, task), v in recall_cells.items():
+        strata[conn_label.get(task, "unknown")][cfg].append(v)
+    by_stratum = {
+        stratum: {
+            cfg: {"n": len(vals), "recall": round(statistics.fmean(vals), 4)}
+            for cfg, vals in cfgmap.items()
+        }
+        for stratum, cfgmap in strata.items()
+    }
+
+    return {
+        "configs": configs,
+        "runaway_threshold": runaway,
+        "summary": summary,
+        "by_connectivity_stratum": by_stratum,
+        "connectivity_labels": conn_label,
+    }
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("results", type=Path)
+    ap.add_argument("--conn", type=Path, help="connectivity.py json output")
+    ap.add_argument("--json", type=Path)
+    ap.add_argument("--runaway", type=float, default=500_000)
+    args = ap.parse_args()
+
+    out = aggregate(args.results, args.conn, args.runaway)
+
+    print("=== TASK-WEIGHTED ACCURACY + ROBUST TOKENS (per config) ===")
+    hdr = (f"{'config':16s} {'tasks':5s} {'rows':4s} {'recall':7s} {'allfnd':7s} "
+           f"{'tok_med':9s} {'tok_mean':9s} {'tok_p90':9s} {'tok_max':10s} {'runaway':7s}")
+    print(hdr)
+    for cfg in out["configs"]:
+        s = out["summary"][cfg]
+        t = s["tokens"]
+        print(f"{cfg:16s} {s['n_tasks']!s:5s} {s['n_rows']!s:4s} "  # !s: values may be None
+              f"{s['recall_task_weighted']!s:7s} {s['all_found_task_weighted']!s:7s} "
+              f"{t['median_per_task']!s:9s} {t['mean_per_task']!s:9s} "
+              f"{t['p90_per_task']!s:9s} {t['max_row']!s:10s} {t['n_runaways']!s:7s}")
+
+    print("\n=== RECALL BY CONNECTIVITY STRATUM ===")
+    for stratum, cfgmap in sorted(out["by_connectivity_stratum"].items()):
+        print(f"\n[{stratum}]")
+        for cfg in out["configs"]:
+            d = cfgmap.get(cfg)
+            if d:  # configs with no recall cell in this stratum are not printed
+                print(f"  {cfg:16s} n={d['n']:<3d} recall={d['recall']}")
+
+    if args.json:
+        args.json.write_text(json.dumps(out, indent=2))
+        print(f"\nwrote {args.json}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/analysis/cg_report.py b/bench/analysis/cg_report.py
new file mode 100644
index 00000000..c62cb465
--- /dev/null
+++ b/bench/analysis/cg_report.py
@@ -0,0 +1,97 @@
+"""Per-instance code_graph-vs-reference reporter for the cg-n5 micro-cycle.
+
+Given the code_graph cache dir and a task_id, prints:
+  * the code_graph result row (recall / acc@1 / tokens / tool usage)
+  * the FROZEN reference rows (copilot_no_mcp + lsp) for the same task
+  * a compact agent trace (tool steps) reconstructed via bench.analysis.trace
+
+Usage:
+    python -m bench.analysis.cg_report <task_id>
+    python -m bench.analysis.cg_report --list      # show which tasks have results
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+CG_RESULTS = Path("bench/cache/cg-n5-cooverride/claude-opus-4.8/results.jsonl")
+REF_RESULTS = Path("bench/cache/ref-n5-baseline-lsp/claude-opus-4.8/results.jsonl")
+CG_RUNS = Path("bench/cache/cg-n5-cooverride/runs/claude-opus-4.8/localize/nudged/code_graph")
+
+
+def _load(path: Path) -> list[dict]:
+    if path.exists():  # a missing results file reads as "no rows"
+        return [json.loads(ln) for ln in path.read_text().splitlines() if ln.strip()]
+    return []
+
+
+def _fmt_row(r: dict) -> str:
+    """Render one result row as two report lines; absent fields print as ``None``."""
+    tools = r.get("tool_calls_by_name") or {}
+    tool_str = ", ".join(f"{k}={v}" for k, v in sorted(tools.items())) or "(none)"
+    return (
+        f"  config={r.get('config')!s:<16} recall={r.get('file_recall')!s:<5} "
+        f"acc@1={r.get('acc_at_1')!s:<5} acc@5={r.get('acc_at_5')!s:<5} "
+        f"in_tok={r.get('input_tokens')!s:<8} out_tok={r.get('output_tokens')!s:<6} "
+        f"premium={r.get('premium_requests')!s:<4} wall={r.get('wall_clock_sec')}s\n"
+        f"      tools: {tool_str}  first={r.get('first_tool')} "
+        f"graph_calls={r.get('graph_calls')} outcome={r.get('outcome')} "
+        f"timed_out={r.get('timed_out')} leak={r.get('network_leak')}"
+    )
+
+
+def _recall(r: dict) -> str:
+    recall = r.get("file_recall")
+    return str(recall) if recall is not None else "?"
+
+
+def report(task_id: str) -> None:
+    cg = [r for r in _load(CG_RESULTS) if r.get("task_id") == task_id]
+    ref = [r for r in _load(REF_RESULTS) if r.get("task_id") == task_id]
+    print("=" * 78)
+    print(f"INSTANCE: {task_id}")
+    print("=" * 78)
+    if cg:  # gold files are read from the first code_graph row only
+        gold = cg[0].get("gold_files") or cg[0].get("gold")
+        print(f"gold_files: {gold}")
+    print("\n--- code_graph (THIS run, co-override) ---")
+    for r in cg:
+        print(_fmt_row(r))
+        pred = r.get("pred_files")
+        print(f"      pred: {pred}")
+    print("\n--- FROZEN reference ---")
+    for r in sorted(ref, key=lambda x: x.get("config", "")):  # reference rows ordered by config
+        print(_fmt_row(r))
+
+    # Trace: print a pre-rendered trace.md if one exists; it is not generated here.
+    run_dir = CG_RUNS / task_id
+    print(f"\n--- agent trace ({run_dir}) ---")
+    tr = run_dir / "trace.md"
+    if tr.exists():
+        print(tr.read_text())
+    else:
+        print(f"  (no trace.md yet at {tr}; run: python -m bench.analysis.trace {run_dir})")
+
+
+def list_done() -> None:
+    cg = _load(CG_RESULTS)
+    print(f"code_graph results so far: {len(cg)}")
+    for r in cg:  # ``!s`` keeps a row without task_id from raising inside the format spec
+        print(f"  {r.get('task_id')!s:<40} recall={_recall(r)} outcome={r.get('outcome')}")
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("task_id", nargs="?")
+    ap.add_argument("--list", action="store_true")
+    args = ap.parse_args()
+    if args.list or not args.task_id:  # no task given: fall back to the listing
+        list_done()
+        return
+    report(args.task_id)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/analysis/connectivity.py b/bench/analysis/connectivity.py
new file mode 100644
index 00000000..cfc192f2
--- /dev/null
+++ b/bench/analysis/connectivity.py
@@ -0,0 +1,275 @@
+"""Graph-connectivity stratification for the localization benchmark.
+
+WHY: ``swe_bench.is_structural`` only checks that the gold patch spans >=2 files
+or >=2 directories. It does NOT verify that those gold files are actually
+connected in the code graph. So an instance like ``jupyterhub__oauthenticator-764``
+(gold = ``oauthenticator/google.py`` + ``setup.py``) counts as "structural" even
+though ``setup.py`` has no code edges to the auth module — code_graph cannot
+surface it via structure by construction. Evaluating a graph-traversal tool on
+such instances dilutes signal and makes false negatives uninterpretable.
+
+This module assigns each instance a ``graph_connected_gold`` label computed from
+the STATIC graph edges (independent of ``search_code`` ranking, to avoid
+circularity). The label answers: "does the graph even contain a structural path
+between the gold files?" — i.e. is there structural signal available for the
+tool to exploit, separate from whether the agent adopts it.
+
+PRE-REGISTERED DEFINITION (fixed before reading results):
+  * Edge set for file<->file adjacency (undirected):
+      - direct:        File -[IMPORTS]- File
+      - symbol-bridge: File -[DEFINES]-> sym -[CALLS|EXTENDS|OVERRIDES]- sym <-[DEFINES]- File
+  * Max depth: D = 2 file-hops.
+  * Labels:
+      - gold_missing      : >=1 gold file is not present as a File node in the graph
+      - all_connected     : all present gold files fall in ONE connected component
+                            (reachable within <=D hops) -- full structural signal
+      - partial_connected : >=1 gold-gold pair connected, but not all -- partial signal
+      - unconnected       : >=2 gold files present, no gold-gold pair connected -- NO signal
+      - single            : only 1 gold file (no multi-file structure to traverse)
+
+Framing (per rubber-duck): the label means "graph has structural signal
+available", NOT "task is inherently structural". Using the same graph that is
+under test is acceptable under that framing.
+
+Offline: reads gold files from the benchmark results.jsonl; queries the already
+-indexed FalkorDB graphs on port 6380. No HuggingFace reload, no LLM, $0.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from collections import defaultdict
+from pathlib import Path
+
+import redis
+
+FALKOR_HOST = "localhost"
+FALKOR_PORT = 6380
+GRAPH_FMT = "code:{task}__loc:_default"
+
+# Pre-registered traversal parameters.
+MAX_DEPTH = 2          # file-level hops
+NEIGHBOR_CAP = 500     # per-query fanout cap (guards hub files); flagged if hit
+VISIT_CAP = 8000       # total BFS frontier cap per source file
+
+_IMPORTS_Q = (
+    "MATCH (a:File)-[:IMPORTS]-(b:File) WHERE a.path = $p AND b.path <> $p "
+    "RETURN DISTINCT b.path LIMIT $cap"
+)
+_BRIDGE_Q = (
+    "MATCH (a:File)-[:DEFINES]->(s)-[:CALLS|EXTENDS|OVERRIDES]-(t)"
+    "<-[:DEFINES]-(b:File) WHERE a.path = $p AND b.path <> a.path "
+    "RETURN DISTINCT b.path LIMIT $cap"
+)
+
+
+def _graph_query(r: redis.Redis, graph: str, cypher: str, params: dict):
+    """Execute ``cypher`` via GRAPH.QUERY, passing ``params`` in a ``CYPHER k=v`` header."""
+    def render(value) -> str:
+        if not isinstance(value, str):
+            return f"{value}"
+        # Backslashes first, then double quotes, so added escapes are not re-escaped.
+        return '"' + value.replace("\\", "\\\\").replace('"', '\\"') + '"'
+
+    header = " ".join(f"{k}={render(v)}" for k, v in params.items())
+    query = f"CYPHER {header} {cypher}" if params else cypher
+    return r.execute_command("GRAPH.QUERY", graph, query)
+
+
+def _all_file_paths(r: redis.Redis, graph: str) -> list[str]:
+    reply = _graph_query(r, graph, "MATCH (f:File) RETURN f.path", {})
+    return list(map(lambda record: record[0], reply[1]))  # reply[1] holds the result rows
+
+
+def _resolve_gold(gold_files: list[str], file_paths: list[str]) -> dict[str, str | None]:
+    """Resolve repo-relative gold paths to File-node paths by ``/<relpath>`` suffix.
+
+    A gold path maps to the shortest node path ending in ``"/" + relpath`` (any
+    leading slashes on the gold path are ignored), or to ``None`` if none does.
+    Keys of the returned dict are the gold paths exactly as given.
+    """
+    def lookup(rel: str) -> str | None:
+        tail = "/" + rel.lstrip("/")
+        hits = [node for node in file_paths if node.endswith(tail)]
+        return min(hits, key=len) if hits else None
+
+    return {gold: lookup(gold) for gold in gold_files}
+
+
+def _file_neighbors(r: redis.Redis, graph: str, fpath: str,
+                    cap: int = NEIGHBOR_CAP) -> tuple[set[str], bool]:
+    """Files one hop from ``fpath`` through the IMPORTS and symbol-bridge queries.
+
+    Returns ``(neighbors, capped)``; ``capped`` is True when either query
+    returned at least ``cap`` rows. ``fpath`` itself is never a neighbor.
+    """
+    replies = [
+        _graph_query(r, graph, q, {"p": fpath, "cap": cap})[1]
+        for q in (_IMPORTS_Q, _BRIDGE_Q)
+    ]
+    neighbors = {row[0] for rows in replies for row in rows}
+    neighbors.discard(fpath)
+    return neighbors, any(len(rows) >= cap for rows in replies)
+
+
+def _reachable(r: redis.Redis, graph: str, src: str, targets: set[str],
+               depth: int = MAX_DEPTH) -> set[str]:
+    """Level-by-level BFS of at most ``depth`` file-hops from ``src``; returns the
+    ``targets`` reached. Stops after a level once all are found or VISIT_CAP is passed."""
+    found: set[str] = set()
+    visited = {src}
+    frontier = {src}
+    for _ in range(depth):
+        nxt: set[str] = set()
+        for node in frontier:
+            neigh, _capped = _file_neighbors(r, graph, node)  # fanout-cap flag is not used here
+            for n in neigh:
+                if n in targets:
+                    found.add(n)
+                if n not in visited:
+                    visited.add(n)
+                    nxt.add(n)
+            if len(visited) > VISIT_CAP:  # stop expanding this level once the cap is exceeded
+                break
+        if found >= targets or len(visited) > VISIT_CAP:  # superset test: every target reached
+            break
+        frontier = nxt
+    return found
+
+
+class _UF:
+    """Union-find over hashable items; ``p`` maps each item to its parent."""
+    def __init__(self, items):
+        self.p = {item: item for item in items}
+
+    def find(self, x):
+        while self.p[x] != x:
+            self.p[x] = x = self.p[self.p[x]]  # point x at its grandparent, then step there
+        return x
+
+    def union(self, a, b):
+        self.p[self.find(a)] = self.find(b)
+
+    def groups(self):
+        by_root = defaultdict(list)
+        for item in self.p:
+            by_root[self.find(item)].append(item)
+        return list(by_root.values())
+
+
+def classify_instance(r: redis.Redis, task: str, gold_files: list[str]) -> dict:
+    """Compute the connectivity stratum (``label``) and supporting details for one instance."""
+    graph = GRAPH_FMT.format(task=task)
+    py_gold = [g for g in gold_files if g.endswith(".py")]
+    result = {
+        "task": task,
+        "gold_files": gold_files,
+        "n_gold": len(gold_files),
+        "n_gold_py": len(py_gold),
+    }
+    try:
+        file_paths = _all_file_paths(r, graph)
+    except redis.exceptions.ResponseError as e:
+        result["label"] = "graph_missing"  # the query itself failed; error text kept below
+        result["error"] = str(e)
+        return result
+
+    resolved = _resolve_gold(gold_files, file_paths)
+    present = {g: p for g, p in resolved.items() if p}
+    missing = [g for g, p in resolved.items() if not p]
+    result["gold_present"] = sorted(present)
+    result["gold_missing_from_graph"] = sorted(missing)
+
+    if len(gold_files) < 2:  # checked before ``missing``: a lone gold file is always "single"
+        result["label"] = "single"
+        return result
+    if missing:
+        # Still compute connectivity among the present ones for context, but the
+        # instance cannot be fully won via structure.
+        result["label"] = "gold_missing"
+
+    # pairwise connectivity among present gold files via union-find
+    present_paths = present  # {gold_rel: node_path}
+    target_by_node = {p: g for g, p in present_paths.items()}
+    uf = _UF(list(present_paths))
+    edges: list[tuple[str, str]] = []
+    glist = list(present_paths.items())
+    for i, (g_a, p_a) in enumerate(glist):  # NOTE(review): index ``i`` is unused
+        others = {p for _, p in glist if p != p_a}
+        if not others:
+            continue
+        reached = _reachable(r, graph, p_a, others)
+        for node in reached:
+            g_b = target_by_node[node]
+            uf.union(g_a, g_b)
+            edges.append((g_a, g_b))
+    comps = uf.groups()
+    result["components"] = [sorted(c) for c in comps]
+    result["connected_pairs"] = sorted({tuple(sorted(e)) for e in edges})
+    result["isolated_gold"] = sorted(
+        g for c in comps if len(c) == 1 for g in c
+    )
+
+    if missing:
+        return result  # label already 'gold_missing'
+    if len(comps) == 1:
+        result["label"] = "all_connected"
+    elif edges:
+        result["label"] = "partial_connected"
+    else:
+        result["label"] = "unconnected"
+    return result
+
+
+def load_gold_from_results(results_path: Path) -> dict[str, list[str]]:
+    """Read ``{task id: gold_files}`` from results.jsonl; the last row per task wins.
+
+    The id is ``task_id`` or, failing that, ``instance_id``. Blank lines and rows
+    lacking an id or a non-empty ``gold_files`` are ignored.
+    """
+    records = (json.loads(ln) for ln in results_path.read_text().splitlines() if ln.strip())
+    keyed = ((rec.get("task_id") or rec.get("instance_id"), rec) for rec in records)
+    return {
+        tid: rec["gold_files"] for tid, rec in keyed if tid and rec.get("gold_files")
+    }
+
+
+def classify_results(results_path: Path,
+                     host: str = FALKOR_HOST, port: int = FALKOR_PORT) -> list[dict]:
+    client = redis.Redis(host=host, port=port, decode_responses=True)
+    by_task = sorted(load_gold_from_results(results_path).items())  # stable task-id order
+    return [classify_instance(client, task, files) for task, files in by_task]
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("results", type=Path, help="path to results.jsonl")
+    ap.add_argument("--json", type=Path, help="write full classification JSON here")
+    ap.add_argument("--port", type=int, default=FALKOR_PORT)
+    args = ap.parse_args()
+
+    rows = classify_results(args.results, port=args.port)
+
+    print(f"{'task':34s} {'label':18s} {'gold':4s} present/connected")
+    counts: dict[str, int] = defaultdict(int)
+    for row in rows:
+        counts[row["label"]] += 1
+        present = len(row.get("gold_present", []))
+        comps = row.get("components", [])  # absent for "single" / "graph_missing" rows
+        conn = "-" if not comps else "+".join(str(len(c)) for c in sorted(comps, key=len, reverse=True))
+        miss = row.get("gold_missing_from_graph", [])
+        flag = f"  MISSING:{','.join(Path(m).name for m in miss)}" if miss else ""
+        print(f"{row['task']:34s} {row['label']:18s} {row['n_gold']:<4d} "
+              f"{present}/[{conn}]{flag}")
+    print("\nstratum counts:")
+    for label, n in sorted(counts.items(), key=lambda x: -x[1]):  # most frequent label first
+        print(f"  {label:18s} {n}")
+
+    if args.json:
+        args.json.write_text(json.dumps(rows, indent=2))
+        print(f"\nwrote {args.json}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/analysis/embed_probe.py b/bench/analysis/embed_probe.py
new file mode 100644
index 00000000..19ac4433
--- /dev/null
+++ b/bench/analysis/embed_probe.py
@@ -0,0 +1,117 @@
+"""Semantic-embedding arm for the retrieval probe (Phase A fix #1).
+
+Lexical retrieval (bm25/tfidf in retrieval_probe.py) scores ~2x the current
+name-prefix interface but stays below the live agent's no-tool recall (0.61) on these
+pretraining-saturated repos. The deciding question for fix #1 is whether a
+SEMANTIC `description -> file` retriever closes that gap.
+
+This embeds per-file text (path + symbol names + truncated bodies/docstrings)
+with a small local HF model (no API, $0) and cosine-ranks files against the
+problem statement. Same 20 instances, same scoring as retrieval_probe.
+"""
+
+from __future__ import annotations
+
+import sys
+from collections import Counter
+
+import numpy as np
+import redis
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+from bench.analysis.retrieval_probe import (
+    FALKOR_PORT,
+    KS,
+    fetch_graph,
+    score,
+)
+from bench.datasets import swe_bench
+
+MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # model id handed to from_pretrained
+MAX_LEN = 256  # tokenizer truncation limit (max_length) per text
+BATCH = 64  # number of texts encoded per forward pass
+PER_FILE_CHARS = 1200  # character cap on the natural-language-ish snippet per file
+
+
+def build_file_text(files, bodytok, symbols) -> dict[str, str]:
+    """Build one short text snippet per file: path words, then up to 80 symbol
+    names, then up to 300 body tokens, truncated to ``PER_FILE_CHARS`` chars."""
+    names_by_file: dict[str, list[str]] = {}
+    for sym_name, sym_file in symbols:
+        names_by_file.setdefault(sym_file, []).append(sym_name)
+
+    def describe(path: str) -> str:
+        words = path.replace("/", " ").replace("_", " ").replace(".py", "")
+        parts = [words, " ".join(names_by_file.get(path, [])[:80]), " ".join(bodytok.get(path, [])[:300])]
+        return " . ".join(parts)[:PER_FILE_CHARS]
+
+    return {path: describe(path) for path in files}
+
+
+class Embedder:
+    """Mean-pooled, L2-normalised text embeddings from the Hugging Face model ``MODEL``."""
+
+    def __init__(self):
+        self.tok = AutoTokenizer.from_pretrained(MODEL)
+        self.model = AutoModel.from_pretrained(MODEL)
+        self.model.eval()
+
+    @torch.no_grad()
+    def encode(self, texts: list[str]) -> np.ndarray:
+        """Return one unit-norm row per text, batched by ``BATCH``; empty input gives shape ``(0, dim)``."""
+        if not texts:  # width follows the loaded model instead of a hard-coded 384
+            return np.zeros((0, self.model.config.hidden_size))
+        vecs = []
+        for i in range(0, len(texts), BATCH):
+            enc = self.tok(texts[i : i + BATCH], padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt")
+            mask = enc["attention_mask"].unsqueeze(-1).float()  # zero out padding positions
+            pooled = (self.model(**enc).last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
+            emb = pooled.cpu().numpy()
+            vecs.append(emb / (np.linalg.norm(emb, axis=1, keepdims=True) + 1e-9))
+        return np.vstack(vecs)
+
+
+def main():
+    """Embed each selected instance's files, rank them against the problem
+    statement, and print per-task and aggregate retrieval scores."""
+    insts = swe_bench.load_instances()
+    sel = swe_bench.select_structural(insts, n=20, no_leak=True)
+    r = redis.Redis(host="localhost", port=FALKOR_PORT, decode_responses=True)
+    emb = Embedder()
+
+    rows = []
+    for inst in sel:
+        task = inst.instance_id
+        changed = swe_bench.gold_changed_files(inst.patch, source_only=True)
+        gold = [g for g in changed if g.endswith(".py")]
+        try:
+            files, bodytok, symbols = fetch_graph(r, task)
+        except Exception as e:  # noqa: BLE001
+            print(f"!! {task}: {e}", file=sys.stderr)
+            continue
+        text = build_file_text(files, bodytok, symbols)
+        doc_vecs = emb.encode([text[f] for f in files])
+        query_vec = emb.encode([inst.problem_statement])[0]
+        order = np.argsort(-(doc_vecs @ query_vec))  # highest similarity first
+        sc = score([files[i] for i in order], gold)
+        rows.append(sc)
+        print(f"{task:38s} gold={len(gold)} files={len(files):5d} "
+              f"R@5={sc['recall@5']:.2f} hit@5={sc['hit@5']:.0f} "
+              f"rk={sc['gold_best_rank']}")
+
+    def mean(key: str) -> float:
+        return np.mean([x[key] for x in rows])
+
+    print("\n========= EMBEDDING (all-MiniLM-L6-v2) AGGREGATE n={} =========".format(len(rows)))
+    line = "embed       " + "".join(f"  R@{k}={mean(f'recall@{k}'):.3f}" for k in KS)
+    line += "".join(f"  hit@{k}={mean(f'hit@{k}'):.3f}" for k in KS)
+    line += f"  MRR={mean('mrr'):.3f}"
+    print(line)
+    print("\nLexical ref: bm25 R@5=0.279 hit@5=0.500 MRR=0.230 ; "
+          "tfidf R@5=0.312 hit@5=0.500 MRR=0.419 ; name_prefix R@5=0.175 MRR=0.176")
+    print("Agent ref:   no_mcp recall=0.613 MRR=0.875 ; code_graph recall=0.512 MRR=0.800")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/analysis/exposure_adoption.py b/bench/analysis/exposure_adoption.py
new file mode 100644
index 00000000..eefa9e3f
--- /dev/null
+++ b/bench/analysis/exposure_adoption.py
@@ -0,0 +1,375 @@
+"""Retrieval-exposure vs adoption metrics for the code_graph arm.
+
+WHY (per rubber-duck): end-to-end ``file_recall`` conflates two very different
+failure modes:
+  1. RETRIEVAL miss  -- the graph never surfaced the gold file at all.
+  2. ADOPTION miss   -- the graph surfaced the gold file, but the agent dropped
+                        it from its final answer during reasoning.
+Blaming the graph for (2) is unfair: that is an agent-reasoning property, not a
+tool-retrieval property. This module separates them so we can say e.g. "graph
+exposed 7/10 missed gold files; the agent adopted only N of them."
+
+For each code_graph run we parse ``stdout.jsonl`` and, for every ``search_code``
+call, join ``tool.execution_start`` -> ``tool.execution_complete`` by
+``toolCallId`` to recover the UNTRUNCATED result (trace.md/trace.jsonl truncate
+tool output). We collect every file the graph surfaced:
+  * primary hits          (``file`` + ``score`` + rank position)
+  * likely_related_files  (``file`` + ``via`` co_override/shared_method + confidence)
+
+Then, per gold file, we classify exposure:
+  * direct@<rank>     -- surfaced as a primary ranked hit
+  * related:<via>     -- surfaced only as a likely_related sibling
+  * not_surfaced      -- never surfaced by the graph (true retrieval miss)
+and adoption: was the surfaced gold file in the run's final ``pred_files``?
+
+Derived metrics (aggregated over runs):
+  * exposure_recall = surfaced_gold / total_gold
+  * adoption_rate   = adopted_gold / surfaced_gold   (of what the graph surfaced,
+                      how much did the agent keep)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from collections import defaultdict
+from pathlib import Path
+
+
+def _iter_search_results(stdout_path: Path):
+    """Yield each result dict from every ``search_code`` tool call in a run log,
+    pairing start/complete events by ``toolCallId`` to identify the tool."""
+    tool_names: dict[str, str] = {}
+    for raw in stdout_path.read_text().splitlines():
+        if not raw.strip():
+            continue
+        try:
+            event = json.loads(raw)
+        except json.JSONDecodeError:
+            continue
+        kind = event.get("type")
+        data = event.get("data", {}) or {}
+        if kind == "tool.execution_start":
+            tool_names[data.get("toolCallId")] = data.get("toolName") or data.get("mcpToolName") or ""
+            continue
+        if kind != "tool.execution_complete":
+            continue
+        if "search_code" not in (tool_names.get(data.get("toolCallId")) or ""):
+            continue
+        result = data.get("result") or {}
+        # Prefer ``contents`` (one JSON object per text entry). ``content`` is the
+        # same payload concatenated into one string -- not valid JSON when there
+        # is more than one result -- so it is only a fallback.
+        entries = result.get("contents")
+        if not isinstance(entries, list):
+            fallback = result.get("content")
+            entries = fallback if isinstance(fallback, list) else [fallback]
+        for entry in entries:
+            text = entry.get("text") if isinstance(entry, dict) else entry
+            if not text:
+                continue
+            try:
+                parsed = json.loads(text)
+            except (json.JSONDecodeError, TypeError):
+                continue
+            yield from (p for p in (parsed if isinstance(parsed, list) else [parsed]) if isinstance(p, dict))
+
+
+def surfaced_files(stdout_path: Path) -> dict[str, dict]:
+    """Return ``{file: {best_rank, via, confidence}}`` for every file the graph
+    surfaced across all search_code calls in a run.
+
+    ``best_rank`` is the lowest 1-based position among direct hits, counted
+    cumulatively over the whole run (not reset per call), or None when the file
+    was only surfaced as a related sibling.
+    """
+    out: dict[str, dict] = {}
+
+    def note(f: str, rank: int | None, via: str, conf: str | None) -> None:
+        # First sighting fixes via/confidence; only a better direct rank replaces them.
+        rec = out.setdefault(f, {"best_rank": None, "via": via, "confidence": conf})
+        if rank is not None and (rec["best_rank"] is None or rank < rec["best_rank"]):
+            rec.update(best_rank=rank, via=via, confidence=conf)
+
+    rank = 0
+    for prim in _iter_search_results(stdout_path):
+        f = prim.get("file")
+        if f and prim.get("rank_kind") == "related":
+            note(f, None, f"related:{prim.get('via', '?')}", prim.get("confidence"))
+        elif f:
+            rank += 1
+            note(f, rank, "direct", None)
+        for rel in prim.get("likely_related_files", []) or []:
+            # Skip malformed (non-dict) entries instead of raising AttributeError.
+            if isinstance(rel, dict) and rel.get("file"):
+                note(rel["file"], None, f"related:{rel.get('via', '?')}", rel.get("confidence"))
+    return out
+
+
+def classify_run(stdout_path: Path, gold_files: list[str],
+                 pred_files: list[str], edit_critical: list[str] | None = None) -> dict:
+    """Classify exposure and adoption of each gold file for one code_graph run.
+
+    ``edit_critical``, when given, is the subset of gold files that count toward
+    the candidate-level TP/FN (prereg sec7 relabel); by default all of them do.
+
+    Returns gold-level counts, the per-gold exposure/adoption map (``per_gold``)
+    and the candidate-level confusion matrix (``cand``).
+    """
+    surfaced = surfaced_files(stdout_path)
+    kept_files = set(pred_files or [])
+
+    def exposure_of(rec: dict | None) -> str:
+        if rec is None:
+            return "not_surfaced"
+        if rec["best_rank"] is None:
+            return rec["via"]
+        return f"direct@{rec['best_rank']}"
+
+    per_gold = {
+        g: {
+            "exposure": exposure_of(surfaced.get(g)),
+            "surfaced": surfaced.get(g) is not None,
+            "adopted": g in kept_files,
+        }
+        for g in gold_files
+    }
+    # Gold-file level tallies.
+    verdicts = list(per_gold.values())
+    n_surf = sum(1 for v in verdicts if v["surfaced"])
+    n_surf_adopted = sum(1 for v in verdicts if v["surfaced"] and v["adopted"])
+    n_miss_not_surf = sum(1 for v in verdicts if not v["surfaced"])
+
+    # Candidate-level confusion matrix over EVERY surfaced candidate (the agent's
+    # keep/drop DECISION quality, per prereg-adoption-calibration.md). A gold file
+    # that was never surfaced is a RETRIEVAL miss, not a decision, so it is left
+    # out here (it is already counted in n_not_surfaced above).
+    #   TP = surfaced gold kept       FN = surfaced gold dropped
+    #   FP = surfaced non-gold kept   TN = surfaced non-gold dropped
+    # NOTE: every gold file counts as edit-critical until the relabel rubric
+    # (prereg sec7) supplies an ``incidental`` set; see the ``edit_critical`` arg.
+    gold_set = set(gold_files)
+    crit = gold_set if edit_critical is None else set(edit_critical)
+    tally = {"tp": 0, "fp": 0, "fn": 0, "tn": 0}
+    cand_detail = {}
+    for f in surfaced:
+        if f not in gold_set:
+            label = "FP" if f in kept_files else "TN"
+        elif f in crit:
+            label = "TP" if f in kept_files else "FN"
+        else:
+            # Incidental gold stays out of the matrix: keeping or dropping it is not penalized.
+            cand_detail[f] = "incidental_gold"
+            continue
+        tally[label.lower()] += 1
+        cand_detail[f] = label
+
+    return {
+        "n_gold": len(gold_files),
+        "n_surfaced": n_surf,
+        "n_surfaced_adopted": n_surf_adopted,
+        "n_not_surfaced": n_miss_not_surf,
+        "per_gold": per_gold,
+        "cand": {**tally, "detail": cand_detail},
+    }
+
+
+def row_stdout_path(batch_root: Path, model: str, row: dict) -> Path | None:
+    """Locate the stdout.jsonl for a results row by its FULL identity.
+
+    Uses (mode, prompt_mode, config, task_id, run_idx) so runs from different
+    prompt_modes (e.g. ``adopt-ctrl`` vs ``adopt-sem``) are never cross-wired.
+    Tries the run-indexed layout (``<task>/run<idx>/logs``), then the legacy
+    layout (``<task>/logs``), then any stdout.jsonl under the task subtree.
+    Returns None when no log exists or the row has no ``config``/``task_id``
+    (joining a Path with None would otherwise raise TypeError).
+    """
+    track = row.get("config")
+    task = row.get("task_id")
+    if not track or not task:
+        return None
+    mode = row.get("mode", "fix")
+    prompt_mode = row.get("prompt_mode", "neutral")
+    ridx = int(row.get("run_idx", 0) or 0)
+    base = batch_root / "runs" / model / mode / prompt_mode / track / task
+    candidates = (base / f"run{ridx}" / "logs" / "stdout.jsonl",
+                  base / "logs" / "stdout.jsonl")
+    for cand in candidates:
+        if cand.exists():
+            return cand
+    # Last resort: first stdout (sorted) under this exact (mode,prompt_mode,track,task)
+    # subtree. Still identity-scoped, so no cross-prompt-mode leakage.
+    hits = sorted(base.glob("**/stdout.jsonl"))
+    return hits[0] if hits else None
+
+
+def analyze_batch(
+    results_path: Path,
+    *,
+    prompt_mode: str | None = None,
+    mode: str | None = None,
+) -> dict:
+    """Classify every code_graph row of a results.jsonl against its own run log.
+
+    Logs are resolved per row via ``row_stdout_path`` (full row identity), so
+    several prompt-mode arms in one batch never share a log. ``prompt_mode`` and
+    ``mode`` optionally restrict the analysis to one arm. Rows whose log cannot
+    be found are reported with an ``error`` entry instead of a classification.
+    """
+
+    def wanted(row: dict) -> bool:
+        if row.get("config") != "code_graph":
+            return False
+        if mode is not None and row.get("mode") != mode:
+            return False
+        return prompt_mode is None or row.get("prompt_mode") == prompt_mode
+
+    rows = [json.loads(ln) for ln in results_path.read_text().splitlines() if ln.strip()]
+    selected = [row for row in rows if wanted(row)]
+    model = results_path.parent.name
+    batch_root = results_path.parent.parent
+    per_run = []
+    for row in selected:
+        ident = {"task": row.get("task_id"), "run_idx": row.get("run_idx"),
+                 "prompt_mode": row.get("prompt_mode")}
+        log_path = row_stdout_path(batch_root, model, row)
+        if not log_path:
+            per_run.append({**ident, "error": "stdout not found"})
+            continue
+        cls = classify_run(log_path, row.get("gold_files", []), row.get("pred_files", []))
+        cls.update(ident, file_recall=row.get("file_recall"))
+        per_run.append(cls)
+    return {"per_run": per_run}
+
+
+def _prf(tp: int, fp: int, fn: int) -> tuple[float | None, float | None, float | None]:
+    """Return (precision, recall, F1); an entry is None when it is undefined."""
+    predicted, actual = tp + fp, tp + fn
+    prec = tp / predicted if predicted else None
+    rec = tp / actual if actual else None
+    # F1 needs both terms and a non-zero sum (tp == 0 would divide by zero).
+    if prec is None or rec is None or prec + rec == 0:
+        return prec, rec, None
+    return prec, rec, 2 * prec * rec / (prec + rec)
+
+
+def candidate_calibration(runs: list[dict]) -> dict:
+    """Macro (by task) + micro candidate-level precision/recall/F1.
+
+    The TASK is the unit of analysis (files within a task are correlated): each
+    task's candidate counts are summed over its runs, scored, then macro-averaged.
+    Micro pools all candidates. Macro-F1 by task is the prereg PRIMARY.
+    Runs without a ``cand`` entry are ignored.
+    """
+    keys = ("tp", "fp", "fn", "tn")
+    by_task: dict[str, dict[str, int]] = defaultdict(lambda: dict.fromkeys(keys, 0))
+    for run in runs:
+        cand = run.get("cand")
+        if not cand:
+            continue
+        bucket = by_task[run["task"]]
+        for k in keys:
+            bucket[k] += cand[k]
+
+    def _avg(xs):
+        return sum(xs) / len(xs) if xs else None
+
+    # macro_strict: a task whose gold was SURFACED but entirely dropped (tp=0,
+    # fn>0) is a real adoption FAILURE, not a degenerate task -- it scores F1=0
+    # rather than being left out, so a conservative lever cannot inflate macro-F1
+    # by silently removing the tasks it broke. Tasks with no surfaced gold at all
+    # (fn=0 and tp=0) remain genuinely undefined and stay excluded.
+    per_task = {}
+    precs, recs, f1s, f1s_strict = [], [], [], []
+    n_dropped_undefined = 0
+    n_dropped_gold_failures = 0
+    for task, c in by_task.items():
+        p, rc, f1 = _prf(c["tp"], c["fp"], c["fn"])
+        per_task[task] = {**c, "precision": p, "recall": rc, "f1": f1}
+        if p is not None:
+            precs.append(p)
+        if rc is not None:
+            recs.append(rc)
+        if f1 is not None:
+            f1s.append(f1)
+            f1s_strict.append(f1)
+            continue
+        if c["tp"] == 0 and c["fn"] > 0:
+            # surfaced gold, none kept -> adoption failure -> strict F1 = 0
+            f1s_strict.append(0.0)
+            n_dropped_gold_failures += 1
+        else:
+            n_dropped_undefined += 1
+
+    # Micro: pool the counts of every task before scoring.
+    totals = {k: sum(c[k] for c in by_task.values()) for k in keys}
+    mp, mr, mf1 = _prf(totals["tp"], totals["fp"], totals["fn"])
+
+    return {
+        "n_tasks": len(by_task),
+        "n_tasks_scored_f1": len(f1s),
+        "n_tasks_dropped_undefined": n_dropped_undefined,
+        "n_tasks_gold_dropped_failures": n_dropped_gold_failures,
+        "macro": {"precision": _avg(precs), "recall": _avg(recs), "f1": _avg(f1s)},
+        "macro_strict": {"f1": _avg(f1s_strict), "n": len(f1s_strict)},
+        "micro": {**totals,
+                  "precision": mp, "recall": mr, "f1": mf1},
+        "per_task": per_task,
+    }
+
+
+def main() -> None:
+    """Print exposure/adoption and calibration reports for a results.jsonl (zero denominators show as n/a)."""
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("results", type=Path)
+    ap.add_argument("--json", type=Path)
+    args = ap.parse_args()
+
+    out = analyze_batch(args.results)
+    runs = [r for r in out["per_run"] if "error" not in r]
+
+    tot_gold = sum(r["n_gold"] for r in runs)
+    tot_surf = sum(r["n_surfaced"] for r in runs)
+    tot_surf_adopt = sum(r["n_surfaced_adopted"] for r in runs)
+    tot_not_surf = sum(r["n_not_surfaced"] for r in runs)
+
+    def _f(x):
+        return f"{x:.3f}" if x is not None else "  n/a"
+
+    print(f"code_graph runs analyzed: {len(runs)}")
+    print(f"\n{'task':34s} {'idx':3s} {'recall':6s}  exposure -> adoption (per gold)")
+    for r in sorted(runs, key=lambda x: (x["task"], x["run_idx"] or 0)):
+        bits = []
+        for g, v in r["per_gold"].items():
+            mark = "OK" if v["adopted"] else ("DROP" if v["surfaced"] else "MISS")
+            bits.append(f"{Path(g).name}:{v['exposure']}/{mark}")
+        print(f"{r['task']:34s} {str(r['run_idx']):3s} {r['file_recall']!s:6s}  " + "  ".join(bits))
+
+    # Each ratio is guarded: with no analyzable runs (or nothing surfaced) the
+    # denominators are 0 and the unguarded divisions raised ZeroDivisionError.
+    print("\n==== AGGREGATE (gold-file level, over code_graph runs) ====")
+    print(f"total gold files (run x gold)      : {tot_gold}")
+    print(f"surfaced by graph                  : {tot_surf}  "
+          f"(exposure_recall = {_f(tot_surf / tot_gold if tot_gold else None)})")
+    print(f"  of which adopted in final answer : {tot_surf_adopt}  "
+          f"(adoption_rate = {_f(tot_surf_adopt / tot_surf if tot_surf else None)})")
+    print(f"  surfaced-but-DROPPED (adoption gap): {tot_surf - tot_surf_adopt}")
+    print(f"never surfaced (true retrieval miss): {tot_not_surf}  "
+          f"({_f(tot_not_surf / tot_gold if tot_gold else None)})")
+
+    cal = candidate_calibration(runs)
+    out["candidate_calibration"] = cal
+    mac, mic = cal["macro"], cal["micro"]
+    print("\n==== CANDIDATE-LEVEL CALIBRATION (keep/drop over surfaced candidates) ====")
+    print("  (TP surfaced-gold kept | FP non-gold kept | FN surfaced-gold dropped | TN non-gold dropped)")
+    print(f"  micro: TP={mic['tp']} FP={mic['fp']} FN={mic['fn']} TN={mic['tn']}  "
+          f"P={_f(mic['precision'])} R={_f(mic['recall'])} F1={_f(mic['f1'])}")
+    print(f"  MACRO by task (PRIMARY, n_tasks={cal['n_tasks']}): "
+          f"P={_f(mac['precision'])} R={_f(mac['recall'])} F1={_f(mac['f1'])}")
+    if args.json:
+        args.json.write_text(json.dumps(out, indent=2))
+        print(f"\nwrote {args.json}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/analysis/reader_capture.py b/bench/analysis/reader_capture.py
new file mode 100644
index 00000000..4aede3fd
--- /dev/null
+++ b/bench/analysis/reader_capture.py
@@ -0,0 +1,78 @@
+"""Capture verbatim ``search_code`` calls (query + full result objects) per run.
+
+This is the Stage-A "reader experiment" capture layer. Unlike
+``exposure_adoption.surfaced_files`` (which flattens to a ``{file: rank}`` map),
+here we keep the FULL, ORDERED, UNTRUNCATED result objects exactly as the agent
+saw them, grouped per ``search_code`` call, together with the ``query`` argument
+the agent passed. The reader harness re-annotates these captured objects via
+``rel_explain.annotate_results`` (the EXACT production builder) so the offline
+A/B exercises the real intervention, not a re-implementation.
+
+Join rule (same as exposure_adoption): in ``stdout.jsonl`` join
+``tool.execution_start`` -> ``tool.execution_complete`` by ``toolCallId``. The
+untruncated payload is under ``result.contents`` (a list, one clean JSON object
+per text entry); fall back to ``result.content`` only when ``contents`` is
+absent (single-result case).
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+
+def _parse_result_objs(res: dict) -> list[dict]:
+    """Decode the JSON text entries of a tool result into a flat list of dicts.
+
+    ``contents`` is used when it is a list; otherwise ``content`` is (wrapped in
+    a list unless it already is one). Empty or undecodable entries are skipped.
+    """
+    entries = res.get("contents")
+    if not isinstance(entries, list):
+        fallback = res.get("content")
+        entries = fallback if isinstance(fallback, list) else [fallback]
+    def decode(entry):
+        text = entry.get("text") if isinstance(entry, dict) else entry
+        try:
+            parsed = json.loads(text) if text else []
+        except (json.JSONDecodeError, TypeError):
+            return []
+        return parsed if isinstance(parsed, list) else [parsed]
+
+    return [obj for entry in entries for obj in decode(entry) if isinstance(obj, dict)]
+
+
+def capture_search_calls(stdout_path: Path) -> list[dict[str, Any]]:
+    """Return ordered ``[{tool_call_id, query, results}]``, one per search_code call.
+
+    ``results`` are the verbatim primary objects (with their nested
+    ``likely_related_files``) the agent received for that call; ``query`` is the
+    ``query`` argument recorded on the matching start event ("" when absent).
+    """
+    pending: dict[str, dict] = {}
+    captured: list[dict[str, Any]] = []
+    for raw in stdout_path.read_text().splitlines():
+        if not raw.strip():
+            continue
+        try:
+            event = json.loads(raw)
+        except json.JSONDecodeError:
+            continue
+        kind = event.get("type")
+        data = event.get("data", {}) or {}
+        if kind == "tool.execution_start":
+            if "search_code" in (data.get("toolName") or data.get("mcpToolName") or ""):
+                pending[data.get("toolCallId")] = data
+            continue
+        if kind != "tool.execution_complete":
+            continue
+        start = pending.get(data.get("toolCallId"))
+        if start is not None:
+            captured.append({
+                "tool_call_id": data.get("toolCallId"),
+                "query": (start.get("arguments") or {}).get("query") or "",
+                "results": _parse_result_objs(data.get("result") or {}),
+            })
+    return captured
diff --git a/bench/analysis/retrieval_probe.py b/bench/analysis/retrieval_probe.py
new file mode 100644
index 00000000..5ff1c378
--- /dev/null
+++ b/bench/analysis/retrieval_probe.py
@@ -0,0 +1,287 @@
+"""Offline intrinsic-retrieval probe (Phase A fix #1 go/no-go).
+
+Question: is the gold file findable from the graph's CONTENT, and how much
+better is a `description -> file` retriever than today's name-prefix interface?
+
+This isolates RETRIEVAL QUALITY from AGENT BEHAVIOR. No agent, no LLM, no API
+tokens. For each of the 20 no-leak structural-hard SWE-bench instances we take
+the problem statement as the query and rank the repo's files using retrievers
+built directly over the already-indexed FalkorDB graph (port 6380), then score
+recall@k / MRR against the gold (patched) files.
+
+Retriever arms:
+  - name_prefix : emulates current `auto_complete`/`find-symbol` interface --
+                  pull identifier-ish tokens from the issue, prefix-match symbol
+                  names, rank files by # matching symbols. (current floor)
+  - bm25        : Okapi BM25 over per-file text (path + symbol names + bodies).
+  - tfidf       : TF-IDF cosine over the same per-file text.
+
+All retrievers are pure-numpy, deterministic, $0. BM25/TF-IDF are the proxy for
+the candidate `search_semantic` primitive (production could use embeddings for
+additional lift; lexical already establishes the ceiling/floor gap).
+"""
+
+from __future__ import annotations
+
+import math
+import re
+import sys
+from collections import Counter, defaultdict
+
+import numpy as np
+import redis
+
+from bench.datasets import swe_bench
+
+FALKOR_PORT = 6380  # port of the FalkorDB instance holding the indexed graphs
+GRAPH_FMT = "code:{task}__loc:_default"  # graph key template, formatted per task
+KS = (1, 3, 5, 10)  # cutoffs k for recall@k / hit@k
+PER_FILE_BODY_TOKEN_CAP = 4000  # cap body tokens contributed per file
+MIN_PREFIX_LEN = 4              # identifier length floor for name_prefix arm
+
+_word_re = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")  # identifier-like words
+_camel_re = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z0-9]+|[A-Z]+")  # camelCase pieces: acronym runs and (capitalised) lowercase/digit runs
+
+
+def subtokens(ident: str) -> list[str]:
+    """Return the lowercased camelCase/snake_case pieces of ``ident`` followed by
+    the whole lowercased identifier; empty strings are dropped."""
+    pieces = [
+        match.group(0).lower()
+        for chunk in ident.split("_") if chunk
+        for match in _camel_re.finditer(chunk)
+    ]
+    pieces.append(ident.lower())
+    return list(filter(None, pieces))
+
+
+def tokenize(text: str) -> list[str]:
+    """Return the subtokens of every identifier-like word in ``text`` (None or "" gives [])."""
+    return [
+        tok for word in _word_re.finditer(text or "") for tok in subtokens(word.group(0))
+    ]
+
+
+def issue_identifiers(text: str) -> list[str]:
+    """Return, sorted, the code-like names in an issue an agent could prefix-search.
+
+    Any identifier of at least ``MIN_PREFIX_LEN`` chars inside backticks qualifies;
+    elsewhere it must also contain ``_``, a lower->upper case boundary, or start uppercase.
+    """
+    text = text or ""
+    quoted = {w for span in re.findall(r"`([^`]+)`", text) for w in _word_re.findall(span)}
+    shaped = {
+        w for w in _word_re.findall(text)
+        if "_" in w or re.search(r"[a-z][A-Z]", w) or w[0].isupper()
+    }
+
+    return sorted(w for w in quoted | shaped if len(w) >= MIN_PREFIX_LEN)
+
+
+def fetch_graph(r: redis.Redis, task: str):
+    """Load one task's graph and return ``(files, bodytok, symbols)``.
+
+    ``files``: sorted unique repo-relative paths; ``bodytok``: ``{relpath: token
+    list}``; ``symbols``: ``[(name, relpath)]`` for Function/Class nodes."""
+    graph_key = GRAPH_FMT.format(task=task)
+    marker = f"{task}__loc/"
+
+    def rel(p: str) -> str:
+        # Keep only what follows the first "<task>__loc/" marker, if present.
+        return p.split(marker, 1)[-1] if marker in p else p
+
+    def query(cypher: str):
+        return r.execute_command("GRAPH.QUERY", graph_key, cypher)[1]
+
+    bodytok: dict[str, list[str]] = defaultdict(list)
+    symbols: list[tuple[str, str]] = []
+
+    files = [rel(row[0]) for row in query("MATCH (f:File) RETURN f.path")]
+    for rp in files:
+        bodytok[rp].extend(tokenize(rp.replace("/", " ")))
+
+    # Symbols: name + body (doc). Bodies are source; cap per-file contribution.
+    budget: dict[str, int] = defaultdict(lambda: PER_FILE_BODY_TOKEN_CAP)
+    rows = query("MATCH (n) WHERE n:Function OR n:Class RETURN n.name, n.path, n.doc")
+    for name, path, doc in rows:
+        if not path:
+            continue
+        rp = rel(path)
+        if name:
+            symbols.append((name, rp))
+            bodytok[rp].extend(subtokens(name))
+        if doc and budget[rp] > 0:
+            take = tokenize(doc)[: budget[rp]]
+            bodytok[rp].extend(take)
+            budget[rp] -= len(take)
+
+    return sorted(set(files)), bodytok, symbols
+
+
+# ---------- retrievers: return ranked list of relpaths ----------
+
+def rank_name_prefix(query: str, files, bodytok, symbols) -> list[str]:
+    """Rank files by how many (issue identifier, symbol) pairs prefix-match,
+    case-insensitively; files with no match are omitted (emulates auto_complete)."""
+    prefixes = [ident.lower() for ident in issue_identifiers(query)]
+    lowered = [(name.lower(), path) for name, path in symbols]
+    hits: Counter = Counter(
+        path for prefix in prefixes for sym, path in lowered if sym.startswith(prefix)
+    )
+    ranked = hits.most_common()
+    return [path for path, _count in ranked]
+
+
+def _build_index(files, bodytok):
+    """Return each file's token list (in ``files`` order) and a Counter of per-token document frequency."""
+    docs = [bodytok[path] for path in files]
+    df: Counter = Counter()
+    for tokens in docs:
+        df.update(set(tokens))
+    return docs, df
+
+
+def rank_bm25(query, files, bodytok, symbols, k1=1.5, b=0.75) -> list[str]:
+    """Rank files by Okapi BM25 score of the query tokens; zero-score files are omitted."""
+    docs, df = _build_index(files, bodytok)
+    N = len(docs)
+    if N == 0:
+        return []
+    avgdl = sum(len(d) for d in docs) / N or 1.0  # falls back to 1.0 when every document is empty
+    idf = {t: math.log(1 + (N - n + 0.5) / (n + 0.5)) for t, n in df.items()}
+    qtok = set(tokenize(query))
+
+    def bm25(d) -> float:
+        tf = Counter(d)
+        dl = len(d)
+        s = 0.0
+        for t in qtok:
+            f = tf.get(t)
+            if f:
+                s += idf.get(t, 0.0) * (f * (k1 + 1)) / (f + k1 * (1 - b + b * dl / avgdl))
+        return s
+
+    scores = np.array([bm25(d) if d else 0.0 for d in docs])
+    order = np.argsort(-scores)
+    return [files[i] for i in order if scores[i] > 0]
+
+
+def rank_tfidf(query, files, bodytok, symbols) -> list[str]:  # TF-IDF cosine ranking; `symbols` is not used here
+    docs, df = _build_index(files, bodytok)
+    N = len(docs)
+    if N == 0:
+        return []
+    vocab = {t: j for j, t in enumerate(df)}  # token -> column index
+    idf = np.array([math.log((1 + N) / (1 + df[t])) + 1 for t in vocab])  # smoothed idf, aligned with vocab order
+    rows = []
+    for d in docs:
+        v = np.zeros(len(vocab))
+        if d:
+            tf = Counter(d)
+            for t, c in tf.items():
+                j = vocab.get(t)
+                if j is not None:
+                    v[j] = c / len(d)  # term frequency normalised by document length
+        v *= idf
+        nrm = np.linalg.norm(v)
+        rows.append(v / nrm if nrm else v)  # unit-normalise; all-zero rows stay zero
+    mat = np.array(rows)  # dense matrix, one row per document
+    qv = np.zeros(len(vocab))
+    qtf = Counter(tokenize(query))
+    for t, c in qtf.items():
+        j = vocab.get(t)
+        if j is not None:  # query tokens outside the corpus vocabulary are ignored
+            qv[j] = c
+    qv *= idf
+    qn = np.linalg.norm(qv)
+    if qn:
+        qv /= qn
+    scores = mat @ qv  # cosine similarity: both sides are unit-normalised (or zero)
+    order = np.argsort(-scores)
+    return [files[i] for i in order if scores[i] > 0]  # files with a non-positive score are omitted
+
+
+RETRIEVERS = {  # arm name -> ranking function(query, files, bodytok, symbols)
+    "name_prefix": rank_name_prefix,
+    "bm25": rank_bm25,
+    "tfidf": rank_tfidf,
+}
+
+
+def score(ranked: list[str], gold: list[str]):
+    """Score a ranking against gold files: recall@k and hit@k for each k in ``KS``,
+    MRR of the best-ranked gold file, plus bookkeeping counts."""
+    goldset = set(gold)
+    position = {f: i for i, f in enumerate(ranked)}
+    ranks = [position[g] + 1 for g in gold if g in position]
+    best = min(ranks) if ranks else None
+    out = {}
+    for k in KS:
+        n_hit = len(goldset.intersection(ranked[:k]))
+        out[f"recall@{k}"] = n_hit / len(goldset) if goldset else 0.0
+        out[f"hit@{k}"] = 1.0 if n_hit else 0.0
+    out.update(mrr=1.0 / best if ranks else 0.0, gold_best_rank=best,
+               gold_found=len(ranks), n_gold=len(gold), n_files=len(ranked))
+    return out
+
+
+def main():
+    """Run every retriever arm over the selected instances and print the scores.
+
+    Prints one line per task, an aggregate recall@k / hit@k / MRR table per arm,
+    and the gold-file coverage of the graph. Aggregates with nothing to average
+    (no task fetched, no gold files) print ``n/a`` instead of nan or crashing.
+    """
+    insts = swe_bench.load_instances()
+    sel = swe_bench.select_structural(insts, n=20, no_leak=True)
+    r = redis.Redis(host="localhost", port=FALKOR_PORT, decode_responses=True)
+
+    agg: dict[str, list[dict]] = {a: [] for a in RETRIEVERS}
+    per_task = []
+    for inst in sel:
+        task = inst.instance_id
+        gold = [g for g in swe_bench.gold_changed_files(inst.patch, source_only=True)
+                if g.endswith(".py")]
+        try:
+            files, bodytok, symbols = fetch_graph(r, task)
+        except Exception as e:  # noqa: BLE001
+            print(f"!! {task}: graph fetch failed: {e}", file=sys.stderr)
+            continue
+        file_set = set(files)  # built once rather than once per gold file
+        gold_in_graph = sum(1 for g in gold if g in file_set)
+        row = {"task": task, "n_gold": len(gold), "gold_in_graph": gold_in_graph,
+               "n_files": len(files)}
+        for arm, fn in RETRIEVERS.items():
+            row[arm] = sc = score(fn(inst.problem_statement, files, bodytok, symbols), gold)
+            agg[arm].append(sc)
+        per_task.append(row)
+        gp = " ".join(
+            f"{a}:R@5={row[a]['recall@5']:.2f}/rk={row[a]['gold_best_rank']}"
+            for a in RETRIEVERS
+        )
+        print(f"{task:38s} gold={len(gold)} in_graph={gold_in_graph}/{len(gold)} "
+              f"files={len(files):5d} | {gp}")
+
+    print("\n================ AGGREGATE (n={}) ================".format(len(per_task)))
+    hdr = f"{'arm':12s}" + "".join(f"  R@{k:<4}" for k in KS) + "".join(f"  hit@{k:<2}" for k in KS) + "   MRR"
+    print(hdr)
+    for arm, rows in agg.items():
+        if not rows:  # every graph fetch failed: nothing to average
+            print(f"{arm:12s}  n/a")
+            continue
+        line = f"{arm:12s}"
+        for key in [f"recall@{k}" for k in KS] + [f"hit@{k}" for k in KS]:
+            line += f"  {np.mean([x[key] for x in rows]):.3f}"
+        line += f"   {np.mean([x['mrr'] for x in rows]):.3f}"
+        print(line)
+
+    tot_gold = sum(p["n_gold"] for p in per_task)
+    tot_in = sum(p["gold_in_graph"] for p in per_task)
+    coverage = f"{100*tot_in/tot_gold:.1f}%" if tot_gold else "n/a"  # avoid dividing by zero
+    print(f"\ngold-file coverage in graph: {tot_in}/{tot_gold} ({coverage})")
+    print("\nReference (live agent, Phase A Sonnet): no_mcp recall=0.613 "
+          "acc@3=0.95 MRR=0.875 ; code_graph recall=0.512 MRR=0.800")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/analysis/trace.py b/bench/analysis/trace.py
new file mode 100644
index 00000000..ce1cc64b
--- /dev/null
+++ b/bench/analysis/trace.py
@@ -0,0 +1,772 @@
+"""Trajectory trace extractor for Copilot benchmark runs.
+
+Each benchmark run persists the full Copilot event stream to
+``<run_dir>/logs/stdout.jsonl``. The runner itself only derives scalar counts
+from it. This module reconstructs the *decision loop* so we can analyse what
+the agent did rather than guess:
+
+    (tool_name, arguments)  ->  (success, result_content, size, empty?)
+                            ->  (assistant reasoning/message that followed)
+
+It emits, per run:
+  * ``trace.jsonl`` -- one JSON object per tool step (machine-readable)
+  * ``trace.md``    -- a readable timeline (human review)
+  * a ``summary`` dict -- derived behaviour signals + per-file *attribution*
+    (did a structural tool actually surface each correctly-predicted file, or
+    did it come from the prompt / a builtin view-grep / the model's own prior?)
+
+Standalone & post-hoc: it reads an existing ``run_dir`` (and the matching row
+in ``results.jsonl`` for gold/pred), so it works on runs already on disk and
+can also be wired into the runner for future runs.
+
+Usage:
+    python -m bench.analysis.trace <run_dir> [<run_dir> ...]
+    python -m bench.analysis.trace --cache-dir bench/cache/phaseB-levers \
+        --model claude-sonnet-4.6 [--mode localize]
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from pathlib import Path
+from typing import Any, Iterable, Optional
+
+# Tool-name classification --------------------------------------------------
+# MCP code-graph tools surface as ``code-graph-<tool>``; the LSP arm (when
+# built) will surface as ``lsp-<tool>``. Builtin agent tools are everything
+# else the CLI ships (view/str_replace/grep/glob/bash/report_intent/...).
+GRAPH_PREFIX = "code-graph"  # prefix-matched (case-sensitive) by _tool_kind
+LSP_PREFIX = "lsp"  # prefix-matched by _tool_kind, after GRAPH_PREFIX
+BUILTIN_READERS = {"view", "read", "cat", "grep", "glob", "search", "ripgrep"}  # compared to the lowercased last "-" segment
+LOCALIZE_SENTINEL = "FINAL_LOCALIZATION_JSON:"  # NOTE(review): not referenced inside this module; presumably for importers
+
+# Result payloads can be huge (whole file slices). Cap what we inline into the
+# readable/structured trace; keep enough to see what the agent actually saw.
+_RESULT_CHARS_MD = 800  # per tool result and per final block in trace.md
+_RESULT_CHARS_JSONL = 4000  # per step's result_text in trace.jsonl
+_ARGS_CHARS = 600  # JSON-encoded call arguments in trace.md
+_REACTION_CHARS_MD = 600  # thinking/narration shown before each call in trace.md
+
+
+def _tool_kind(name: str) -> str:
+    """Bucket a tool name as graph / lsp / builtin_reader / builtin_other / unknown."""
+    if not name:
+        return "unknown"
+    # Structural prefixes take priority, graph before lsp.
+    for prefix, label in ((GRAPH_PREFIX, "graph"), (LSP_PREFIX, "lsp")):
+        if name.startswith(prefix):
+            return label
+    # Builtins are recognised by their last dash-separated segment, lowercased.
+    suffix = name.rsplit("-", 1)[-1].lower()
+    return "builtin_reader" if suffix in BUILTIN_READERS else "builtin_other"
+
+
+def _est_tokens(text: str) -> int:
+    """Approximate a token count as ceil(len(text) / 4); empty or None text counts as 0."""
+    return -(-len(text or "") // 4)
+
+
+def _load_events(stdout_path: Path) -> list[dict[str, Any]]:
+    """Parse a UTF-8 JSONL event stream; blank, malformed and non-object lines are skipped."""
+    events: list[dict[str, Any]] = []
+    if not stdout_path.exists():
+        return events
+    with stdout_path.open(encoding="utf-8", errors="replace") as f:  # not the locale default
+        for line in f:
+            try:
+                # json.loads tolerates surrounding whitespace; a blank line raises.
+                ev = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            if isinstance(ev, dict):  # every consumer calls ev.get(...)
+                events.append(ev)
+    return events
+
+
+def _result_to_text(result: Any) -> str:
+    """Render a tool.execution_complete ``result`` payload as plain text.
+
+    Dicts prefer a non-blank ``content``/``detailedContent`` string, else a JSON dump.
+    """
+    if result is None:
+        return ""
+    if not isinstance(result, dict):
+        return result if isinstance(result, str) else str(result)
+    texts = (result.get(k) for k in ("content", "detailedContent"))
+    preferred = next((t for t in texts if isinstance(t, str) and t.strip()), None)
+    if preferred is not None:
+        return preferred
+    try:
+        return json.dumps(result, ensure_ascii=False)
+    except (TypeError, ValueError):
+        return str(result)
+
+
+def _is_empty_result(text: str) -> bool:
+    """True when the stripped text is blank or one of the known "no results" JSON spellings."""
+    return text.strip() in {"", "{}", "[]", '{"result":[]}', '{"result": []}', "null"}
+
+
+def build_steps(events: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Reconstruct ordered tool steps, each paired with the agent's reaction.
+
+    A *step* = one ``tool.execution_start`` matched (by toolCallId) to its
+    ``tool.execution_complete``. ``reaction`` is the assistant text between this
+    start and the next tool start (or end of stream). A start with no matching
+    completion yields ``success=None`` and an empty result (``empty=True``).
+    """
+    # Index completions by toolCallId for O(1) pairing.
+    completions: dict[str, dict[str, Any]] = {}
+    for ev in events:
+        if ev.get("type") == "tool.execution_complete":
+            d = ev.get("data", {})
+            cid = d.get("toolCallId")
+            if cid:
+                completions[cid] = d  # a repeated toolCallId keeps the last completion
+
+    steps: list[dict[str, Any]] = []
+    # First pass: collect starts in order with their event index.
+    starts: list[tuple[int, dict[str, Any]]] = []
+    for i, ev in enumerate(events):
+        if ev.get("type") == "tool.execution_start":
+            starts.append((i, ev))
+
+    for step_idx, (ev_idx, ev) in enumerate(starts):
+        d = ev.get("data", {})
+        cid = d.get("toolCallId")
+        name = d.get("toolName") or d.get("name") or "unknown"
+        comp = completions.get(cid, {})  # {} when this call never completed
+        result_text = _result_to_text(comp.get("result"))
+
+        # Thoughts BEFORE this tool call = assistant reasoning/message that
+        # streamed AFTER the previous tool start (or stream start for step 0)
+        # and BEFORE this tool's start. This is the chain-of-thought that led
+        # to this action. Within one assistant turn the model emits one
+        # reasoning + one message block then fires N tool starts, so the
+        # before-block attaches to the FIRST tool of the turn; siblings get
+        # empty before-blocks (honest — the thought happened once). The `turn`
+        # field lets a reader regroup siblings.
+        prev_ev_idx = starts[step_idx - 1][0] if step_idx > 0 else -1
+        thinking_parts: list[str] = []
+        narration_parts: list[str] = []
+        for j in range(prev_ev_idx + 1, ev_idx):
+            ej = events[j]
+            etype = ej.get("type")
+            content = ej.get("data", {}).get("content")
+            if not isinstance(content, str) or not content.strip():
+                continue  # only non-blank string content is kept
+            if etype == "assistant.reasoning":
+                thinking_parts.append(content.strip())
+            elif etype == "assistant.message":
+                narration_parts.append(content.strip())
+        thinking_before = "\n".join(thinking_parts)
+        narration_before = "\n".join(narration_parts)
+
+        # Reaction = assistant message/reasoning text between this step's event
+        # index and the next step's event index (or end of stream). Kept for
+        # backward-compat with programmatic consumers; equals the NEXT step's
+        # before-block, so render_md uses the before-blocks instead.
+        next_ev_idx = starts[step_idx + 1][0] if step_idx + 1 < len(starts) else len(events)
+        reaction_parts: list[str] = []
+        for j in range(ev_idx + 1, next_ev_idx):
+            ej = events[j]
+            if ej.get("type") in ("assistant.message", "assistant.reasoning"):
+                content = ej.get("data", {}).get("content")
+                if isinstance(content, str) and content.strip():
+                    reaction_parts.append(content.strip())
+        reaction = "\n".join(reaction_parts)  # reasoning and messages interleaved in stream order
+
+        steps.append({
+            "step": step_idx,
+            "turn": d.get("turnId"),
+            "tool": name,
+            "kind": _tool_kind(name),
+            "mcp_server": d.get("mcpServerName"),
+            "mcp_tool": d.get("mcpToolName"),
+            "arguments": d.get("arguments"),
+            "success": comp.get("success"),
+            "result_text": result_text,
+            "result_chars": len(result_text),
+            "result_tokens_est": _est_tokens(result_text),
+            "empty": _is_empty_result(result_text),
+            "thinking_before": thinking_before,
+            "narration_before": narration_before,
+            "reaction": reaction,
+        })
+    return steps
+
+
+def final_blocks(events: list[dict[str, Any]]) -> dict[str, str]:
+    """Collect the assistant text that follows the last tool start.
+
+    The closing reasoning and final answer (e.g. the ``FINAL_LOCALIZATION_JSON:``
+    payload) stream after the last tool call, outside every per-step window.
+    Returns ``{"thinking": ..., "narration": ...}``, each joined with newlines.
+    """
+    tool_starts = [
+        idx for idx, event in enumerate(events)
+        if event.get("type") == "tool.execution_start"
+    ]
+    tail = events[tool_starts[-1] + 1:] if tool_starts else events
+    thinking: list[str] = []
+    narration: list[str] = []
+    for event in tail:
+        kind = event.get("type")
+        text = event.get("data", {}).get("content")
+        if not (isinstance(text, str) and text.strip()):
+            continue
+        if kind == "assistant.reasoning":
+            thinking.append(text.strip())
+        elif kind == "assistant.message":
+            narration.append(text.strip())
+    return {
+        "thinking": "\n".join(thinking),
+        "narration": "\n".join(narration),
+    }
+
+
+def _mentions(text: str, path: str) -> bool:
+    """Report whether ``text`` contains ``path`` verbatim or its basename (plain substring test)."""
+    if not (text and path):
+        return False
+    needles = (path, os.path.basename(path))
+    return any(needle and needle in text for needle in needles)
+
+
+def _iter_json_objects(text: str) -> Iterable[dict[str, Any]]:
+    """Yield each JSON object embedded in ``text``, in order of appearance.
+
+    Structural ``search_code`` results arrive as concatenated JSON documents
+    (NDJSON-like) rather than one array, which a plain ``json.loads`` rejects
+    with "Extra data". The text is scanned for ``{`` / ``[`` and decoded in
+    place with ``raw_decode``; anything that does not decode is stepped over
+    one character at a time. A decoded array contributes only its dict
+    elements (one level deep); other element types are dropped.
+    """
+    decoder = json.JSONDecoder()
+    pos, size = 0, len(text or "")
+    while pos < size:
+        if text[pos] not in "{[":
+            # Noise between documents.
+            pos += 1
+            continue
+        try:
+            value, resume_at = decoder.raw_decode(text, pos)
+        except json.JSONDecodeError:
+            # Not a real document start; retry from the next character.
+            pos += 1
+            continue
+        # Decoding began on '{' or '[', so ``value`` is a dict or a list.
+        members = value if isinstance(value, list) else [value]
+        for member in members:
+            if isinstance(member, dict):
+                yield member
+        # Continue right after the decoded document.
+        pos = resume_at
+
+
+def _structural_surface_map(text: str) -> dict[str, str]:
+    """Describe HOW each file in a structural tool result was surfaced.
+
+    Maps a path (and, separately, its basename) to ``via``: ``"direct"`` when a
+    top-level object names the file and carries a numeric ``score`` (a ranked
+    hit); otherwise the edge label (e.g. ``"co_override"``, or ``"edge"`` when
+    unlabelled) of the first ``likely_related_files`` entry naming it.
+    Score-less top-level objects -- the tool's re-emitted edge siblings -- earn
+    no direct credit, and ``"direct"`` always overrides an edge label.
+    """
+
+    # Every file is indexed under both its full path and its basename.
+    def both_keys(file_path: str) -> tuple[str, str]:
+        return file_path, os.path.basename(file_path)
+
+    via_edge: dict[str, str] = {}
+    ranked: set[str] = set()
+    for hit in _iter_json_objects(text):
+        siblings = hit.get("likely_related_files")
+        for sib in siblings if isinstance(siblings, list) else ():
+            sib_path = sib.get("file") if isinstance(sib, dict) else None
+            if isinstance(sib_path, str) and sib_path:
+                label = str(sib.get("via") or "edge")
+                for key in both_keys(sib_path):
+                    via_edge.setdefault(key, label)
+        hit_path = hit.get("file")
+        # Only a numerically scored top-level object counts as a ranked hit.
+        if isinstance(hit_path, str) and hit_path and isinstance(hit.get("score"), (int, float)):
+            ranked.update(both_keys(hit_path))
+
+    surfaces = dict(via_edge)
+    surfaces.update(dict.fromkeys(ranked, "direct"))  # direct ranked hit always wins
+    return surfaces
+
+
+def attribute_files(
+    pred_files: list[str],
+    gold_files: list[str],
+    steps: list[dict[str, Any]],
+    prompt_text: str,
+) -> list[dict[str, Any]]:
+    """Attribute every predicted file to the place it first showed up.
+
+    Sources, in precedence order (the earliest evidence wins):
+      * ``prompt``          -- the problem statement already names it (leak / given)
+      * ``graph`` / ``lsp`` -- first seen in a structural tool's result
+      * ``builtin_reader``  -- first seen in view/grep/glob output
+      * ``builtin_other``   -- first seen in any other tool's result
+      * ``model``           -- in neither the prompt nor any tool result, i.e.
+                               produced from the model's own prior knowledge
+
+    Failed or empty tool results never count as a surface. This is the
+    anti-guessing metric: it shows whether the structural tool *actually
+    contributed* a correct answer or was merely decorative.
+    """
+    gold_lookup = set(gold_files)
+
+    def surfaced_in(step: dict[str, Any], path: str) -> bool:
+        usable = step.get("success") is not False and not step.get("empty")
+        return usable and _mentions(step.get("result_text", ""), path)
+
+    records: list[dict[str, Any]] = []
+    for pred in pred_files:
+        record: dict[str, Any] = {
+            "file": pred,
+            "is_gold_hit": pred in gold_lookup,
+            "source": "model",
+            "source_step": None,
+            "source_tool": None,
+            "via": None,
+        }
+        if _mentions(prompt_text, pred):
+            record["source"] = "prompt"
+        else:
+            # First usable tool result that mentions the file.
+            first = next((s for s in steps if surfaced_in(s, pred)), None)
+            if first is not None:
+                kind = first["kind"]
+                if kind in ("graph", "lsp"):
+                    # Tell a direct ranked hit from an edge-derived one (e.g.
+                    # co_override) so the right structural mechanism is credited.
+                    surfaces = _structural_surface_map(first.get("result_text", ""))
+                    basename_hit = surfaces.get(os.path.basename(pred))
+                    record["via"] = surfaces.get(pred) or basename_hit or "direct"
+                    record["source"] = kind
+                elif kind == "builtin_reader":
+                    record["source"] = "builtin_reader"
+                else:
+                    record["source"] = "builtin_other"
+                record["source_step"] = first["step"]
+                record["source_tool"] = first["tool"]
+        records.append(record)
+    return records
+
+
+def summarize(steps: list[dict[str, Any]], attribution: list[dict[str, Any]]) -> dict[str, Any]:
+    """Roll the reconstructed steps and file attribution up into behaviour signals.
+
+    Counts calls per tool name and per kind, empty results, failed calls and
+    repeated (same tool + same arguments) calls; records the first tool and
+    whether it was structural; tallies gold-hit files per source and surface.
+    """
+
+    def bump(counter: dict[str, int], key: str) -> None:
+        counter[key] = counter.get(key, 0) + 1
+
+    by_name: dict[str, int] = {}
+    by_kind: dict[str, int] = {}
+    signatures: set[str] = set()
+    empty = errors = redundant = 0
+    first_tool: Optional[str] = None
+    structural_first: Optional[bool] = None
+    for s in steps:
+        name = s["tool"]
+        bump(by_name, name)
+        bump(by_kind, s["kind"])
+        empty += 1 if s.get("empty") else 0
+        errors += 1 if s.get("success") is False else 0
+        # A call is redundant when the same tool already ran with the same arguments.
+        sig = f"{name}:{json.dumps(s.get('arguments'), sort_keys=True)}"
+        if sig in signatures:
+            redundant += 1
+        signatures.add(sig)
+        if first_tool is None:
+            first_tool = name
+            structural_first = s["kind"] in ("graph", "lsp")
+
+    structural_calls = by_kind.get("graph", 0) + by_kind.get("lsp", 0)
+
+    # Attribution rollup over correctly-predicted (gold-hit) files only.
+    hit_sources: dict[str, int] = {}
+    hit_via: dict[str, int] = {}
+    for a in (entry for entry in attribution if entry["is_gold_hit"]):
+        bump(hit_sources, a["source"])
+        if a["source"] in ("graph", "lsp"):
+            bump(hit_via, f"{a['source']}:{a.get('via') or 'direct'}")
+
+    return {
+        "tool_calls_total": len(steps),
+        "tool_calls_by_name": by_name,
+        "tool_calls_by_kind": by_kind,
+        "structural_calls": structural_calls,
+        "structural_adopted": structural_calls > 0,
+        "first_tool": first_tool,
+        "structural_first": structural_first,
+        "empty_result_count": empty,
+        "tool_error_count": errors,
+        "redundant_call_count": redundant,
+        "gold_hit_source_counts": hit_sources,
+        "gold_hit_via_counts": hit_via,
+        "cost_without_benefit": _cost_without_benefit(steps, attribution),
+    }
+
+
+# Tool kinds that are "under test": the navigation tools whose value is being
+# measured. Builtin grep/view/glob are the baseline every agent has, so they
+# are never charged as cost-without-benefit.
+_TESTED_KINDS = ("graph", "lsp")
+
+
+def _cost_without_benefit(
+    steps: list[dict[str, Any]],
+    attribution: list[dict[str, Any]],
+) -> dict[str, Any]:
+    """Measure result tokens the tools under test added without surfacing a gold file.
+
+    A structural call is *beneficial* only if ``attribution`` names it as the
+    ``source_step`` of a gold-hit file whose source is a tested kind. Every
+    other structural call (empty, repeated, unused, or surfacing only non-gold
+    files) is charged as wasted context. High ``wasted_tokens`` together with
+    ``benefited=False`` means the tool cost context and contributed nothing.
+
+    Returns overall totals, ``wasted_fraction`` (``None`` when no structural
+    tokens were spent) and the same counters broken down ``by_kind``.
+    """
+    # Steps credited with first surfacing a gold-hit file.
+    credited = set()
+    for a in attribution:
+        if (
+            a.get("is_gold_hit")
+            and a.get("source") in _TESTED_KINDS
+            and a.get("source_step") is not None
+        ):
+            credited.add(a["source_step"])
+
+    # Per-kind counters; the overall totals are derived from them afterwards.
+    per_kind: dict[str, dict[str, int]] = {}
+    for s in steps:
+        kind = s.get("kind")
+        if kind not in _TESTED_KINDS:
+            continue
+        tok = int(s.get("result_tokens_est") or 0)
+        wasted = s.get("step") not in credited
+        slot = per_kind.setdefault(
+            kind, {"calls": 0, "tokens": 0, "wasted_calls": 0, "wasted_tokens": 0}
+        )
+        slot["calls"] += 1
+        slot["tokens"] += tok
+        if wasted:
+            slot["wasted_calls"] += 1
+            slot["wasted_tokens"] += tok
+
+    # Beneficial tokens are whatever was not wasted.
+    total = sum(slot["tokens"] for slot in per_kind.values())
+    wasted_total = sum(slot["wasted_tokens"] for slot in per_kind.values())
+    return {
+        "tested_kinds": [k for k in _TESTED_KINDS if k in per_kind],
+        "structural_result_tokens": total,
+        "beneficial_tokens": total - wasted_total,
+        "wasted_tokens": wasted_total,
+        "wasted_calls": sum(slot["wasted_calls"] for slot in per_kind.values()),
+        "wasted_fraction": round(wasted_total / total, 4) if total else None,
+        "benefited": bool(credited),
+        "by_kind": per_kind,
+    }
+
+
+def _fmt_block(text: str, cap: int) -> str:
+    """Strip and cap ``text`` for display; empty or whitespace-only input gives ``(none)``."""
+    t = (text or "").strip()
+    if not t:
+        return "(none)"
+    over = len(t) - cap
+    return t if over <= 0 else t[:cap] + f"\n… [+{over} chars truncated]"
+
+
+def render_md(meta: dict[str, Any], steps: list[dict[str, Any]],
+              attribution: list[dict[str, Any]], summary: dict[str, Any],
+              final: Optional[dict[str, str]] = None) -> str:  # returns the full trace.md text
+    lines: list[str] = []
+    lines.append(f"# Trace — {meta.get('task_id')} [{meta.get('config')}] ({meta.get('prompt_mode')})")
+    lines.append("")
+    lines.append(f"- model: {meta.get('model')}  mode: {meta.get('mode')}  run_idx: {meta.get('run_idx')}")
+    lines.append(f"- outcome: {meta.get('outcome')}  recall: {meta.get('file_recall')}  "
+                 f"precision: {meta.get('file_precision')}  acc@1: {meta.get('acc_at_1')}  mrr: {meta.get('file_mrr')}")
+    _rt = meta.get("reasoning_tokens")
+    _rt_str = f" (of which reasoning: {_rt})" if _rt not in (None, 0) else ""  # shown only when present and non-zero
+    lines.append(f"- tokens: in={meta.get('input_tokens')} out={meta.get('output_tokens')}{_rt_str} "
+                 f"total={meta.get('total_tokens')}  turns≈{meta.get('usage_blocks')}  wall={meta.get('wall_clock_sec')}s")
+    lines.append(f"- gold: {meta.get('gold_files')}")
+    lines.append(f"- pred: {meta.get('pred_files')}")
+    lines.append("")
+    lines.append("## Behaviour summary")
+    lines.append(f"- tool calls: {summary['tool_calls_total']}  by kind: {summary['tool_calls_by_kind']}")
+    lines.append(f"- structural adopted: {summary['structural_adopted']}  "
+                 f"structural calls: {summary['structural_calls']}  first tool: {summary['first_tool']}")
+    lines.append(f"- empty results: {summary['empty_result_count']}  errors: {summary['tool_error_count']}  "
+                 f"redundant calls: {summary['redundant_call_count']}")
+    lines.append(f"- **gold-hit attribution**: {summary['gold_hit_source_counts'] or '(no gold hits)'}")
+    via_counts = summary.get("gold_hit_via_counts")
+    if via_counts:
+        lines.append(f"- **structural gold-hits by surface**: {via_counts} "
+                     f"(direct = ranked hit; co_override/other = edge-derived)")
+    cwb = summary.get("cost_without_benefit")
+    if cwb and cwb.get("structural_result_tokens"):  # section omitted when no structural tokens were spent
+        frac = cwb.get("wasted_fraction")
+        frac_str = f"{frac:.0%}" if frac is not None else "n/a"
+        benefit_str = "yes" if cwb.get("benefited") else "**NO — tool contributed nothing**"
+        lines.append(
+            f"- **cost without benefit**: wasted ~{cwb['wasted_tokens']} of "
+            f"{cwb['structural_result_tokens']} structural tokens ({frac_str}) "
+            f"across {cwb['wasted_calls']} call(s); benefited: {benefit_str}"
+        )
+        for kind, slot in (cwb.get("by_kind") or {}).items():
+            lines.append(
+                f"    - {kind}: {slot['wasted_calls']}/{slot['calls']} calls wasted, "
+                f"~{slot['wasted_tokens']}/{slot['tokens']} tok wasted"
+            )
+    lines.append("")
+    lines.append("## Predicted-file attribution")
+    for a in attribution:
+        tag = "✓gold" if a["is_gold_hit"] else " miss"
+        where = a["source"]
+        if a["source"] in ("graph", "lsp") and a.get("via"):
+            where += f"/{a['via']}"
+        if a["source_tool"]:  # only tool-derived sources carry a step/tool
+            where += f" (step {a['source_step']} {a['source_tool']})"
+        lines.append(f"- [{tag}] {a['file']}  ←  {where}")
+    lines.append("")
+    lines.append("## Step-by-step trajectory")
+    lines.append("")
+    lines.append("_Each step shows the agent's thinking and narration **before** the "
+                 "tool call (the reasoning that led to the action), then the call and "
+                 "its result._")
+    lines.append("")
+    for s in steps:
+        args = _fmt_block(json.dumps(s.get("arguments"), ensure_ascii=False), _ARGS_CHARS)
+        flags = []
+        if s.get("empty"):
+            flags.append("EMPTY")
+        if s.get("success") is False:
+            flags.append("ERROR")
+        flag_str = (" [" + ",".join(flags) + "]") if flags else ""
+        lines.append(f"### Step {s['step']} · turn {s['turn']} · {s['tool']} ({s['kind']}){flag_str}")
+        thinking = s.get("thinking_before", "")
+        narration = s.get("narration_before", "")
+        if thinking:
+            lines.append(f"**thinking (before call):** {_fmt_block(thinking, _REACTION_CHARS_MD)}")
+        if narration:
+            lines.append(f"**narration (before call):** {_fmt_block(narration, _REACTION_CHARS_MD)}")
+        lines.append(f"**call:** `{args}`")
+        lines.append(f"**tool returned** ({s['result_chars']} chars, ~{s['result_tokens_est']} tok):")
+        lines.append("```")
+        lines.append(_fmt_block(s.get("result_text", ""), _RESULT_CHARS_MD))  # capped for readability
+        lines.append("```")
+        lines.append("")
+    if final and (final.get("thinking") or final.get("narration")):  # omitted when nothing followed the last tool call
+        lines.append("## Final (after last tool call)")
+        if final.get("thinking"):
+            lines.append(f"**thinking:** {_fmt_block(final['thinking'], _RESULT_CHARS_MD)}")
+        if final.get("narration"):
+            lines.append(f"**narration / answer:** {_fmt_block(final['narration'], _RESULT_CHARS_MD)}")
+        lines.append("")
+    return "\n".join(lines)
+
+
+def extract_run(run_dir: Path, row: Optional[dict[str, Any]] = None,
+                write: bool = True) -> dict[str, Any]:
+    """Extract trace + summary for a single run directory.
+
+    ``row`` is the matching results.jsonl record (gold/pred/tokens); without it
+    attribution runs on empty lists but the trajectory and summary still build.
+    With ``write`` set, ``trace.jsonl`` and ``trace.md`` are written as UTF-8.
+    """
+    stdout_path = run_dir / "logs" / "stdout.jsonl"
+    prompt_path = run_dir / "prompt.txt"
+    events = _load_events(stdout_path)
+    steps = build_steps(events)
+    final = final_blocks(events)
+    prompt_text = prompt_path.read_text(encoding="utf-8", errors="replace") if prompt_path.exists() else ""
+
+    row = row or {}
+    pred_files = row.get("pred_files") or []
+    gold_files = row.get("gold_files") or []
+    attribution = attribute_files(pred_files, gold_files, steps, prompt_text)
+    summary = summarize(steps, attribution)
+
+    meta = {
+        "task_id": row.get("task_id"),
+        "config": row.get("config"),
+        "model": row.get("model"),
+        "mode": row.get("mode"),
+        "prompt_mode": row.get("prompt_mode"),
+        "run_idx": row.get("run_idx"),
+        "outcome": row.get("outcome"),
+        "file_recall": row.get("file_recall"),
+        "file_precision": row.get("file_precision"),
+        "acc_at_1": row.get("acc_at_1"),
+        "file_mrr": row.get("file_mrr"),
+        "input_tokens": row.get("input_tokens"),
+        "output_tokens": row.get("output_tokens"),
+        "reasoning_tokens": row.get("reasoning_tokens"),
+        "total_tokens": row.get("total_tokens"),
+        "usage_blocks": row.get("usage_blocks"),
+        "wall_clock_sec": row.get("wall_clock_sec"),
+        "gold_files": gold_files,
+        "pred_files": pred_files,
+    }
+
+    if write:
+        # Explicit UTF-8: step records use ensure_ascii=False and trace.md has non-ASCII glyphs.
+        with (run_dir / "trace.jsonl").open("w", encoding="utf-8") as f:
+            f.write(json.dumps({"_meta": meta, "_summary": summary,
+                                "_attribution": attribution, "_final": final}) + "\n")
+            for s in steps:
+                rec = dict(s)  # copy so the returned steps keep their full result_text
+                rt = rec.get("result_text", "")
+                if len(rt) > _RESULT_CHARS_JSONL:
+                    rec["result_text"] = rt[:_RESULT_CHARS_JSONL]
+                    rec["result_truncated"] = True
+                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+        (run_dir / "trace.md").write_text(render_md(meta, steps, attribution, summary, final), encoding="utf-8")
+
+    return {"meta": meta, "summary": summary, "attribution": attribution,
+            "steps": steps, "final": final, "run_dir": str(run_dir)}
+
+
+# Discovery -----------------------------------------------------------------
+
+
+def _load_rows(results_path: Path) -> dict[tuple, dict[str, Any]]:
+    """Index results.jsonl rows by (task_id, config, prompt_mode, run_idx).
+
+    Skips blank/malformed/non-object lines; a missing or null ``run_idx`` is 0.
+    """
+    rows: dict[tuple, dict[str, Any]] = {}
+    if not results_path.exists():
+        return rows
+    for line in results_path.read_text(encoding="utf-8").splitlines():
+        try:
+            r = json.loads(line)
+            run_idx = int(r.get("run_idx") or 0) if isinstance(r, dict) else None
+        except (ValueError, TypeError):  # JSONDecodeError is a ValueError; also a non-numeric run_idx
+            continue
+        if run_idx is not None:
+            rows[(r.get("task_id"), r.get("config"), r.get("prompt_mode", "neutral"), run_idx)] = r
+    return rows
+
+
+def iter_run_dirs(cache_dir: Path, model: str, mode: Optional[str] = None) -> Iterable[Path]:
+    """Yield run dirs ``<mode>/<prompt_mode>/<track>/<task>`` under ``cache_dir/runs/model``."""
+
+    def subdirs(parent: Path) -> Iterable[Path]:
+        return (child for child in parent.iterdir() if child.is_dir())
+
+    root = cache_dir / "runs" / model
+    if not root.exists():
+        return
+    mode_names = [mode] if mode else [d.name for d in subdirs(root)]
+    for mode_dir in (root / name for name in mode_names):
+        if not mode_dir.is_dir():
+            continue
+        for prompt_mode_dir in subdirs(mode_dir):
+            for track_dir in subdirs(prompt_mode_dir):
+                # A run dir counts only if it holds logs/stdout.jsonl.
+                for run_dir in track_dir.iterdir():
+                    if (run_dir / "logs" / "stdout.jsonl").exists():
+                        yield run_dir
+
+
+def _row_for_run(run_dir: Path, rows: dict[tuple, dict[str, Any]]) -> Optional[dict[str, Any]]:
+    """Find the results row for ``.../<prompt_mode>/<track>/<task_id>``.
+
+    Exact keys ``(task_id, track, prompt_mode, run_idx)`` are tried for run_idx
+    0..7; failing that, the first row matching task_id and config (= track) is
+    used. Returns ``None`` for a too-short path or when nothing matches.
+    """
+    tail = run_dir.parts[-3:]
+    if len(tail) < 3:
+        return None
+    prompt_mode, track, task_id = tail
+    # Prefer an exact key, lowest run_idx first.
+    for candidate in ((task_id, track, prompt_mode, idx) for idx in range(8)):
+        if candidate in rows:
+            return rows[candidate]
+    # Fallback: match on task_id+config only.
+    loose = (row for (t, c, _pm, _ri), row in rows.items() if t == task_id and c == track)
+    return next(loose, None)
+
+
+def _auto_results_path(run_dir: Path) -> Optional[Path]:
+    """Locate the results.jsonl for a positional run dir.
+
+    Layout is ``<cache>/runs/<model>/<mode>/<prompt_mode>/<track>/<task_id>``
+    and results live at ``<cache>/<model>/results.jsonl``. Finds the last
+    ``runs`` component, recovers ``<cache>`` (the current directory for a
+    relative ``runs/...`` path) and ``<model>``, and returns that file if it
+    exists -- otherwise attribution runs blind and reports "contributed nothing".
+    """
+    parts = run_dir.parts
+    if "runs" not in parts:
+        return None
+    ri = len(parts) - 1 - parts[::-1].index("runs")
+    if ri + 1 >= len(parts):
+        return None
+    cache_dir = Path(*parts[:ri])  # Path() is "." when "runs" is the first component
+    model = parts[ri + 1]
+    candidate = cache_dir / model / "results.jsonl"
+    return candidate if candidate.exists() else None
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+    """CLI entry point: extract traces for the selected run dirs; returns the exit status."""
+    p = argparse.ArgumentParser(description="Extract decision-loop traces from benchmark runs.")
+    p.add_argument("run_dirs", nargs="*", help="explicit run dir(s) to extract")
+    p.add_argument("--cache-dir", help="extract every run under this cache dir")
+    p.add_argument("--model", default="claude-sonnet-4.6")
+    p.add_argument("--mode", default=None, help="restrict to one mode (e.g. localize)")
+    p.add_argument("--results", default=None, help="results.jsonl (default: <cache>/<model>/results.jsonl)")
+    args = p.parse_args(argv)
+    # Each results.jsonl is parsed once, however many run dirs share it.
+    rows_cache: dict[Path, dict[tuple, dict[str, Any]]] = {}
+    targets: list[tuple[Path, Optional[dict[str, Any]]]] = []
+    if args.cache_dir:
+        cache_dir = Path(args.cache_dir).resolve()
+        results_path = Path(args.results) if args.results else cache_dir / args.model / "results.jsonl"
+        rows = rows_cache[results_path] = _load_rows(results_path)
+        for rd in iter_run_dirs(cache_dir, args.model, args.mode):
+            targets.append((rd, _row_for_run(rd, rows)))
+    for rd in args.run_dirs:
+        rdp = Path(rd).resolve()
+        results_path = Path(args.results) if args.results else _auto_results_path(rdp)
+        if results_path and results_path not in rows_cache:
+            rows_cache[results_path] = _load_rows(results_path)
+        row = _row_for_run(rdp, rows_cache[results_path]) if results_path else None
+        targets.append((rdp, row))
+    if not targets:
+        print("no run dirs found")
+        return 1
+
+    print(f"extracting {len(targets)} run(s)…")
+    for rd, row in targets:
+        out = extract_run(rd, row=row, write=True)
+        s, m = out["summary"], out["meta"]
+        print(f"  {m.get('task_id')} [{m.get('config')}/{m.get('prompt_mode')}] "
+              f"recall={m.get('file_recall')} tools={s['tool_calls_total']} "
+              f"kinds={s['tool_calls_by_kind']} empty={s['empty_result_count']} "
+              f"err={s['tool_error_count']} hit_src={s['gold_hit_source_counts']} "
+              f"-> {rd}/trace.md")
+    return 0
+
+
+if __name__ == "__main__":  # script / ``python -m`` entry point
+    raise SystemExit(main())  # main()'s int return value becomes the process exit status
diff --git a/bench/cli/cg-mcp b/bench/cli/cg-mcp
new file mode 100755
index 00000000..be6c09bb
--- /dev/null
+++ b/bench/cli/cg-mcp
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+# Bash entry point for the code-graph MCP CLI (runner adds bench/cli to PATH).
+# Mirrors `cg`, but speaks JSON-RPC over stdio to a spawned `cgraph-mcp` server
+# instead of HTTP to the FastAPI service. BENCH_PYTHON overrides the interpreter.
+exec "${BENCH_PYTHON:-python3}" -m bench.cli.cg_mcp "$@"
diff --git a/bench/cli/cg_mcp.py b/bench/cli/cg_mcp.py
new file mode 100644
index 00000000..95c91390
--- /dev/null
+++ b/bench/cli/cg_mcp.py
@@ -0,0 +1,140 @@
+"""`cg-mcp` — bash-callable CLI exposing code-graph's 8 MCP tools.
+
+This is the MCP-transport sibling of `cg`. Where `cg` calls the host
+FastAPI service over HTTP, `cg-mcp` spawns the `cgraph-mcp` stdio
+server (via the official MCP Python SDK) for every invocation and
+dispatches one tool call.
+
+The MCP track is what external agents (Claude Code, Cursor, …) use
+in production; benchmarking through it tells us how the *real-world*
+integration behaves under SWE-bench, not just the in-process FastAPI
+adapter.
+
+Subcommands mirror the MCP tool names:
+
+  cg-mcp index_repo       --path-or-url . [--branch B] [--ignore PAT ...]
+  cg-mcp search_code      --project P --prefix STR [--branch B] [--limit N]
+  cg-mcp get_callers      --project P --symbol-id ID [--branch B] [--limit N]
+  cg-mcp get_callees      --project P --symbol-id ID [--branch B] [--limit N]
+  cg-mcp get_dependencies --project P --symbol-id ID [--branch B] [--limit N]
+  cg-mcp impact_analysis  --project P --symbol-id ID [--direction IN|OUT] [--depth N]
+  cg-mcp find_path        --project P --source-id ID --dest-id ID [--branch B]
+  cg-mcp ask              --project P --question "..." [--branch B]
+
+Output: one JSON document per call on stdout. Errors print to stderr
+and exit non-zero.
+
+Env: FALKORDB_HOST / FALKORDB_PORT are passed through to the spawned
+server. Optionally set CGRAPH_MCP_TIMEOUT_SEC to override the
+default 60s timeout.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from typing import Any
+
+from bench.agents import code_graph_mcp_adapter as cgm
+
+
+def _print(obj: Any) -> None:
+    """Write `obj` to stdout as indented, key-sorted JSON + newline (serialized first: no partial output)."""
+    sys.stdout.write(json.dumps(obj, indent=2, sort_keys=True, default=str) + "\n")
+
+
+def _timeout() -> float:
+    try:  # CGRAPH_MCP_TIMEOUT_SEC, when set, replaces the 60s default
+        return float(os.getenv("CGRAPH_MCP_TIMEOUT_SEC", "60"))
+    except ValueError:  # value is not parseable as a float: use the default
+        return 60.0
+
+
+def _add_project(p: argparse.ArgumentParser) -> None:
+    p.add_argument("--project", required=True)  # mandatory on every subparser set up here
+    p.add_argument("--branch", default=None)  # optional; None when the flag is omitted
+
+
+def _add_symbol(p: argparse.ArgumentParser) -> None:
+    p.add_argument("--symbol-id", type=int, required=True, dest="symbol_id")  # parsed as int
+    p.add_argument("--limit", type=int, default=50)  # result cap; 50 when omitted
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run one `cg-mcp` subcommand; return 0 on success, 1 if the tool call raised."""
+    # Raw formatter: keep the module docstring's command table intact in --help.
+    parser = argparse.ArgumentParser(
+        prog="cg-mcp",
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    sub = parser.add_subparsers(dest="cmd", required=True)
+
+    ir = sub.add_parser("index_repo")
+    ir.add_argument("--path-or-url", required=True, dest="path_or_url")
+    ir.add_argument("--branch", default=None)
+    ir.add_argument("--ignore", nargs="*", default=None)
+
+    sc = sub.add_parser("search_code")
+    _add_project(sc)
+    sc.add_argument("--prefix", required=True)
+    sc.add_argument("--limit", type=int, default=10)
+
+    for name in ("get_callers", "get_callees", "get_dependencies"):
+        p = sub.add_parser(name)
+        _add_project(p)
+        _add_symbol(p)
+
+    ia = sub.add_parser("impact_analysis")
+    _add_project(ia)
+    ia.add_argument("--symbol-id", type=int, required=True, dest="symbol_id")
+    ia.add_argument("--direction", choices=["IN", "OUT"], default="IN")
+    ia.add_argument("--depth", type=int, default=3)
+
+    fp = sub.add_parser("find_path")
+    _add_project(fp)
+    fp.add_argument("--source-id", type=int, required=True, dest="source_id")
+    fp.add_argument("--dest-id", type=int, required=True, dest="dest_id")
+
+    aq = sub.add_parser("ask")
+    _add_project(aq)
+    aq.add_argument("--question", required=True)
+
+    args = parser.parse_args(argv)
+    # Inject timeout for adapter calls; NaN / non-positive overrides use the default.
+    timeout = _timeout()
+    cgm.DEFAULT_TIMEOUT_SEC = timeout if timeout > 0 else 60.0
+
+    try:
+        if args.cmd == "index_repo":
+            _print(cgm.index_repo(args.path_or_url, branch=args.branch, ignore=args.ignore))
+        elif args.cmd == "search_code":
+            _print(cgm.search_code(args.prefix, args.project, branch=args.branch, limit=args.limit))
+        elif args.cmd == "get_callers":
+            _print(cgm.get_callers(args.symbol_id, args.project, branch=args.branch, limit=args.limit))
+        elif args.cmd == "get_callees":
+            _print(cgm.get_callees(args.symbol_id, args.project, branch=args.branch, limit=args.limit))
+        elif args.cmd == "get_dependencies":
+            _print(cgm.get_dependencies(args.symbol_id, args.project, branch=args.branch, limit=args.limit))
+        elif args.cmd == "impact_analysis":
+            _print(cgm.impact_analysis(
+                args.symbol_id, args.project,
+                branch=args.branch, direction=args.direction, depth=args.depth,
+            ))
+        elif args.cmd == "find_path":
+            _print(cgm.find_path(args.source_id, args.dest_id, args.project, branch=args.branch))
+        elif args.cmd == "ask":
+            _print(cgm.ask(args.question, args.project, branch=args.branch))
+        else:  # pragma: no cover — argparse already enforces this
+            parser.error(f"unknown subcommand: {args.cmd}")
+    except Exception as e:  # noqa: BLE001 — surface everything to the agent
+        print(f"cg-mcp error: {e}", file=sys.stderr)
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/bench/datasets/swe_bench.py b/bench/datasets/swe_bench.py
index 6f89373a..a1e3428c 100644
--- a/bench/datasets/swe_bench.py
+++ b/bench/datasets/swe_bench.py
@@ -30,9 +30,13 @@
 
 from __future__ import annotations
 
+import hashlib
+import hmac
 import json
 import os
 import random
+import re
+import secrets
 import shutil
 import subprocess
 from dataclasses import dataclass
@@ -42,6 +46,9 @@
 from bench.runners.mini_runner import Task
 
 DATASET_NAME = "princeton-nlp/SWE-bench_Verified"
+# Loc-Bench (LocAgent, ACL 2025): curated multi-hop code-localization benchmark.
+# Schema-compatible subset; localization-only (no FAIL_TO_PASS / PASS_TO_PASS).
+LOC_BENCH_DATASET = "czlll/Loc-Bench_V1"
 DEFAULT_CACHE_ROOT = Path(__file__).resolve().parents[1] / "cache"
 REPOS_DIR = DEFAULT_CACHE_ROOT / "repos"
 WORKTREES_DIR = DEFAULT_CACHE_ROOT / "worktrees"
@@ -49,6 +56,62 @@
 # Locked-in seed from plan / configs/default.yaml.
 DEFAULT_SEED = 20260526
 
+# ---------------------------------------------------------------------------
+# Answer-leakage hardening (default ON; opt out with BENCH_BLOCK_NETWORK=0)
+# ---------------------------------------------------------------------------
+# The localize worktree was historically named ``{instance_id}__loc``. The
+# instance_id embeds the upstream GitHub PR/issue number, so that name leaked
+# into the prompt cwd, ``--add-dir`` and the code-graph ``project=`` key — the
+# agent could read the PR number off the path and fetch the merged PR's file
+# list (the gold answer), or read the cloned ``.git`` (origin + post-fix
+# default-branch ref) fully offline. When hardening is enabled we (a) name the
+# worktree with an opaque salted HMAC of the instance_id and (b) strip ``.git``.
+#
+# The salt defaults to a per-process random value; pin BENCH_LEAK_SALT only if
+# a stable mapping across processes is needed (it is NOT required for resume,
+# since localize worktrees are rmtree'd per run). The salt must never reach the
+# agent process env (the runner scrubs it from the Copilot child environment).
+_RUN_SALT = os.environ.get("BENCH_LEAK_SALT") or secrets.token_hex(16)
+
+# Env vars scrubbed from the agent's process environment under hardening, so the
+# agent cannot recover the opaque-name salt or use ambient GitHub credentials.
+LEAK_SCRUB_ENV_VARS = (
+    "BENCH_LEAK_SALT",
+    "GITHUB_TOKEN",
+    "GH_TOKEN",
+    "GITHUB_PAT",
+    "GH_PAT",
+)
+
+
+def network_block_enabled() -> bool:
+    """Report whether answer-leakage hardening applies to this run.
+
+    Hardening is ON by default because traces showed the agent pulling the gold
+    file list from GitHub (``gh pr view``, ``web_fetch`` of the issue/PR) or
+    from the cloned ``.git`` post-fix ref, turning localization misses into
+    fake recall=1.0 wins. It is switched off only when ``BENCH_BLOCK_NETWORK``
+    is set to a falsy value: ``0``/``false``/``no``/``off`` or an empty string
+    (compared after stripping whitespace, case-insensitively).
+    """
+    disabling_values = {"0", "false", "no", "off", ""}
+    setting = os.environ.get("BENCH_BLOCK_NETWORK")
+    # An unset variable means "use the default", i.e. hardened.
+    return setting is None or setting.strip().lower() not in disabling_values
+
+
+def opaque_worktree_name(instance_id: str) -> str:
+    """Return a ``loc-`` prefixed worktree dir name derived from a keyed hash.
+
+    The name is the first 16 hex chars of HMAC-SHA256 keyed with the run salt
+    over ``instance_id``. With a stable salt it is deterministic within a
+    process, so one run's index/prompt/query agree on it, while the upstream
+    PR/issue number embedded in ``instance_id`` never appears in the path.
+    """
+    key, message = _RUN_SALT.encode(), instance_id.encode()
+    token = hmac.new(key, message, hashlib.sha256).hexdigest()
+    return "loc-" + token[:16]
+
 # Per-stage sample sizes (locked-in from plan).
 STAGE_SIZES = {"smoke": 3, "calibration": 10, "headline": 37}
 
@@ -66,6 +129,8 @@ class SweBenchInstance:
     pass_to_pass: list[str]
     environment_setup_commit: str
     version: str
+    patch: str = ""  # gold source patch (localization ground truth)
+    category: str = ""  # Loc-Bench issue category (Bug, Feature, Performance, ...)
 
 
 def _git(args: list[str], cwd: Path | None = None, check: bool = True) -> subprocess.CompletedProcess:
@@ -79,11 +144,20 @@ def _git(args: list[str], cwd: Path | None = None, check: bool = True) -> subpro
 
 
 def _parse_list_field(value: Any) -> list[str]:
-    """SWE-bench stores FAIL_TO_PASS / PASS_TO_PASS as JSON strings."""
+    """SWE-bench stores FAIL_TO_PASS / PASS_TO_PASS as JSON strings.
+
+    Localization-only datasets (e.g. Loc-Bench) omit these; ``None`` and
+    empty / whitespace-only strings yield an empty list rather than raising.
+    """
+    if value is None:
+        return []
     if isinstance(value, list):
         return list(value)
     if isinstance(value, str):
-        return list(json.loads(value))
+        s = value.strip()
+        if not s:  # blank string: json.loads would raise on it
+            return []
+        return list(json.loads(s))
     raise TypeError(f"unsupported list field: {type(value)!r}")
 
 
@@ -91,14 +165,21 @@ def load_instances(
     *,
     split: str = "test",
     cache_dir: Path | None = None,
+    dataset_name: str | None = None,
 ) -> list[SweBenchInstance]:
-    """Load all SWE-bench Verified instances from HuggingFace."""
+    """Load SWE-bench instances from HuggingFace.
+
+    Defaults to `princeton-nlp/SWE-bench_Verified`. Pass `dataset_name` (e.g.
+    `SWE-bench-Live/SWE-bench-Live`, which is schema-compatible and exposes a
+    `verified` split) to evaluate a contamination-free / less-pretraining-
+    saturated corpus.
+    """
     from datasets import load_dataset  # local import — heavy
 
     kwargs: dict[str, Any] = {"split": split}
     if cache_dir is not None:
         kwargs["cache_dir"] = str(cache_dir)
-    ds = load_dataset(DATASET_NAME, **kwargs)
+    ds = load_dataset(dataset_name or DATASET_NAME, **kwargs)
 
     out: list[SweBenchInstance] = []
     for row in ds:
@@ -108,11 +189,13 @@ def load_instances(
                 repo=row["repo"],
                 base_commit=row["base_commit"],
                 problem_statement=row["problem_statement"],
-                test_patch=row["test_patch"],
-                fail_to_pass=_parse_list_field(row["FAIL_TO_PASS"]),
-                pass_to_pass=_parse_list_field(row["PASS_TO_PASS"]),
+                test_patch=row.get("test_patch") or "",
+                fail_to_pass=_parse_list_field(row.get("FAIL_TO_PASS")),
+                pass_to_pass=_parse_list_field(row.get("PASS_TO_PASS")),
                 environment_setup_commit=row.get("environment_setup_commit") or "",
                 version=row.get("version") or "",
+                patch=row.get("patch") or "",
+                category=row.get("category") or "",
             )
         )
     return out
@@ -214,6 +297,287 @@ def instance_to_task(inst: SweBenchInstance, repo_path: Path) -> Task:
     )
 
 
+# ---------------------------------------------------------------------------
+# Localization ground truth (LocAgent-style)
+# ---------------------------------------------------------------------------
+
+# Paths we exclude from the "files to modify" gold set: tests, docs, and
+# anything that isn't Python source. Localization asks for the *implementation*
+# files, so an agent that correctly avoids tests shouldn't be penalized.
+_TEST_PATH_RE = re.compile(
+    r"(^|/)(tests?|testing|test)(/|$)"          # tests/ dir
+    r"|(^|/)conftest\.py$"                        # pytest conftest
+    r"|(^|/)test_[^/]*\.py$"                       # test_*.py
+    r"|[^/]*_test\.py$"                            # *_test.py
+)
+_DOC_PATH_RE = re.compile(r"(^|/)docs?(/|$)|\.(rst|md|txt|cfg|ini|toml)$")
+
+
+def is_source_file(path: str) -> bool:
+    """Return True when ``path`` is Python source that is neither test nor doc.
+
+    Anything not ending in ``.py`` is rejected outright; the test/doc path
+    patterns are then applied to what remains.
+    """
+    if path.endswith(".py"):
+        return not (_TEST_PATH_RE.search(path) or _DOC_PATH_RE.search(path))
+    return False
+
+
+def gold_changed_files(patch: str, *, source_only: bool = True) -> list[str]:
+    """Repo-relative files touched by a unified diff, in patch order.
+
+    Reads `+++ b/<path>` headers (skips /dev/null deletions); a `+++ ` line
+    counts only when it directly follows a `--- ` line, so added text that
+    itself starts with `++ ` is not taken for a file. When `source_only`,
+    filters to non-test non-doc Python files.
+    """
+    files: list[str] = []
+    lines = patch.splitlines()
+    for prev, line in zip([""] + lines, lines):
+        if not (line.startswith("+++ ") and prev.startswith("--- ")):
+            continue
+        target = line[4:].strip()
+        # strip the leading "b/" git prefix if present
+        if target.startswith("b/"):
+            target = target[2:]
+        if target == "/dev/null" or target in files:
+            continue
+        if not source_only or is_source_file(target):
+            files.append(target)
+    return files
+
+
+def _patch_hunk_ranges(patch: str) -> dict[str, list[tuple[int, int]]]:
+    """Map each target file -> list of inclusive (start, end) NEW-file line
+    ranges covered by the gold patch's hunks (context lines included). A
+    `+++ ` line is read as a file header only directly after a `--- ` line, so
+    added text that itself starts with `++ ` cannot switch the current file."""
+    ranges: dict[str, list[tuple[int, int]]] = {}
+    cur: str | None = None
+    lines = patch.splitlines()
+    for prev, line in zip([""] + lines, lines):
+        if line.startswith("+++ ") and prev.startswith("--- "):
+            target = line[4:].strip()
+            if target.startswith("b/"):
+                target = target[2:]
+            cur = None if target == "/dev/null" else target
+            if cur is not None:
+                ranges.setdefault(cur, [])
+        elif line.startswith("@@") and cur is not None:
+            m = re.search(r"\+(\d+)(?:,(\d+))?", line)
+            if m:
+                start, count = int(m.group(1)), int(m.group(2) or "1")
+                ranges[cur].append((start, start + max(count - 1, 0)))
+    return ranges
+
+
+def gold_symbols(inst: SweBenchInstance, repo_path: Path) -> dict[str, list[str]]:
+    """Best-effort Python symbol-level gold: for each gold source file, the
+    dotted names of the def/class symbols that the gold patch's hunks touch.
+
+    Each NEW-file hunk range maps to the smallest overlapping
+    ast.FunctionDef/AsyncFunctionDef/ClassDef (first wins on ties) of the file
+    under `repo_path`. Ranges include diff context lines, so a short
+    neighbouring symbol can occasionally win. Missing, unparsable or
+    unmappable files are silently skipped (reported as unmappable upstream).
+    """
+    import ast
+
+    defs = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
+    out: dict[str, list[str]] = {}
+    for rel, rngs in _patch_hunk_ranges(inst.patch).items():
+        fpath = repo_path / rel
+        if not is_source_file(rel) or not fpath.is_file():
+            continue
+        try:
+            # Parse bytes so PEP 263 coding cookies are honoured instead of
+            # decoding with the locale default; ValueError covers NUL bytes.
+            tree = ast.parse(fpath.read_bytes())
+        except (SyntaxError, ValueError):
+            continue
+        # Build (start,end,qualname) for every def/class, in source order.
+        # An explicit stack keeps deeply nested expressions from exhausting
+        # the interpreter's recursion limit.
+        spans: list[tuple[int, int, str]] = []
+        stack: list[tuple[ast.AST, str]] = [(tree, "")]
+        while stack:
+            node, prefix = stack.pop()
+            if isinstance(node, defs):
+                qual = f"{prefix}{node.name}"
+                end = getattr(node, "end_lineno", None) or node.lineno
+                spans.append((node.lineno, end, qual))
+                prefix = qual + "."
+            children = list(ast.iter_child_nodes(node))
+            stack.extend((child, prefix) for child in reversed(children))
+        hit: list[str] = []
+        for hs, he in rngs:
+            # innermost (smallest) symbol overlapping the hunk
+            overlapping = [(e - s, q) for s, e, q in spans if s <= he and hs <= e]
+            if not overlapping:
+                continue
+            name = min(overlapping, key=lambda sq: sq[0])[1]
+            if name not in hit:
+                hit.append(name)
+        if hit:
+            out[rel] = hit
+    return out
+
+
+def leakage_flags(inst: SweBenchInstance, gold_files: list[str]) -> dict[str, bool]:
+    """Flag issue texts that give the gold location away: a gold path or
+    basename quoted verbatim, or a pasted Python traceback."""
+    statement = inst.problem_statement or ""
+    traceback_markers = ("Traceback (most recent call last)", "\n  File \"")
+    return {
+        "mentions_gold_path": any(gold in statement for gold in gold_files),
+        "mentions_gold_basename": any(Path(gold).name in statement for gold in gold_files),
+        "contains_traceback": any(marker in statement for marker in traceback_markers),
+    }
+
+
+def is_structural(inst: SweBenchInstance) -> bool:
+    """True when the gold source patch touches >=2 source files or >=2
+    distinct directories, i.e. the task stresses structural navigation."""
+    touched = gold_changed_files(inst.patch, source_only=True)
+    parents = {str(Path(name).parent) for name in touched}
+    # The directory test is computed from the same source-file list, so it can
+    # only hold when the file-count test already does; kept to mirror the rule.
+    return len(touched) >= 2 or len(parents) >= 2
+
+
+def select_structural(
+    instances: Iterable[SweBenchInstance],
+    *,
+    seed: int = DEFAULT_SEED,
+    n: int | None = None,
+    repos: set[str] | None = None,
+    python_only: bool = False,
+    no_leak: bool = False,
+) -> list[SweBenchInstance]:
+    """Return a seeded, shuffled sample of the structural instances, i.e. those
+    whose gold patch is multi-file/multi-dir (see `is_structural`).
+
+    `seed`: seeds the shuffle, so equal inputs give an equal order.
+    `n`: keep only the first `n` after shuffling; `None` keeps everything.
+    `repos`: if given, keep only instances from these `owner/name` repos (used to
+    target large, less-pretraining-saturated codebases on SWE-bench-Live).
+    `python_only`: require at least one `.py` gold source file (the navigation
+    tools — tree-sitter / jedi — are Python-only).
+    `no_leak`: drop instances whose problem statement names a gold file's path
+    or basename, so solving them takes real multi-hop navigation.
+    """
+    def _has_python_gold(item: SweBenchInstance) -> bool:
+        return any(f.endswith(".py") for f in gold_changed_files(item.patch, source_only=True))
+
+    def _names_gold_file(item: SweBenchInstance) -> bool:
+        flags = leakage_flags(item, gold_changed_files(item.patch, source_only=True))
+        return flags["mentions_gold_path"] or flags["mentions_gold_basename"]
+
+    # Filters are chained lazily, in this order; list() below runs them.
+    candidates: Iterable[SweBenchInstance] = filter(is_structural, instances)
+    if repos is not None:
+        candidates = (item for item in candidates if item.repo in repos)
+    if python_only:
+        candidates = filter(_has_python_gold, candidates)
+    if no_leak:
+        candidates = (item for item in candidates if not _names_gold_file(item))
+    pool = list(candidates)
+    # A dedicated Random instance keeps the shuffle independent of global state.
+    random.Random(seed).shuffle(pool)
+    return pool if n is None else pool[:n]
+
+
+def prepare_localize_worktree(
+    inst: SweBenchInstance,
+    *,
+    repos_dir: Path = REPOS_DIR,
+    worktrees_dir: Path | None = None,
+) -> Path:
+    """Materialize a TEST-FREE worktree under a distinct name and return its path.
+
+    The distinct dirname matters: the code-graph backend keys its index on
+    the worktree dirname, so a fresh name forces a clean re-index that does
+    NOT contain the test_patch files (which would leak the bug location).
+
+    Naming:
+      * unhardened (``BENCH_BLOCK_NETWORK=0``): ``{instance_id}__loc`` (prior-run provenance).
+      * hardened (default): ``loc-<salted HMAC>`` (no upstream PR/issue number in the
+        dirname); ``.git`` is stripped so the post-fix oracle is unreachable offline.
+
+    Raises RuntimeError when both ``git clone`` attempts fail.
+    """
+    hardened = network_block_enabled()
+    wt_dir = worktrees_dir or (DEFAULT_CACHE_ROOT / "worktrees-localize")
+    src = _ensure_repo_clone(inst.repo, repos_dir)
+    name = opaque_worktree_name(inst.instance_id) if hardened else f"{inst.instance_id}__loc"
+    dest = wt_dir / name
+    if dest.exists():
+        shutil.rmtree(dest, ignore_errors=True)
+    if dest.exists():
+        # A locked/partial dir survived rmtree (e.g. an open handle from a
+        # prior interrupted run). Move it aside so the clone can proceed.
+        import time as _t
+        dest.rename(dest.with_name(f"{dest.name}.stale.{int(_t.time())}"))
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    # Clone with a single retry. We have observed a transient `git clone`
+    # exit-128 on the *first* clone of a freshly-cleaned worktree dir (the
+    # next config's clone of the same instance then succeeds). Re-clean and
+    # retry once; surface git's stderr if it still fails so it's diagnosable.
+    last_err = ""
+    for attempt in range(2):
+        if dest.exists():
+            shutil.rmtree(dest, ignore_errors=True)
+        res = _git(["clone", str(src), str(dest)], check=False)
+        if res.returncode == 0:
+            break
+        last_err = (res.stderr or res.stdout or "").strip()
+        print(
+            f"[warn] git clone {dest.name} failed (attempt {attempt + 1}/2): "
+            f"{last_err}",
+            flush=True,
+        )
+    else:
+        raise RuntimeError(f"git clone failed for {dest}: {last_err}")
+    _git(["fetch", "origin", inst.base_commit], cwd=dest, check=False)
+    # The cached clone (origin) only has commits reachable from the default
+    # branch. Loc-Bench base_commits are sometimes unreachable from it (PR
+    # bases, rewritten history). GitHub serves any reachable SHA directly, so
+    # fall back to fetching the commit straight from the upstream URL.
+    if _git(["cat-file", "-e", inst.base_commit], cwd=dest, check=False).returncode != 0:
+        url = f"https://github.com/{inst.repo}.git"
+        _git(["fetch", "--depth", "1", url, inst.base_commit], cwd=dest, check=False)
+    _git(["checkout", "--detach", inst.base_commit], cwd=dest)
+    if hardened:
+        # Strip the offline oracle: the cloned ``.git`` retains ``origin`` plus a
+        # local default-branch ref at the post-fix tip, so ``git log/diff
+        # origin/<branch>`` would reveal the gold change with no network. The
+        # localize path needs no git history (gold comes from the dataset patch;
+        # analyze_folder ignores ``.git``), so removing it is safe.
+        strip_git_oracle(dest)
+    return dest
+
+
+def strip_git_oracle(root: Path) -> None:
+    """Delete every ``.git`` entry (directory or gitdir-pointer file) under ``root``.
+
+    Removing only ``root/.git`` misses submodule checkouts, whose ``.git`` is a
+    *file* with a ``gitdir: ...`` pointer, and nested checkouts. Any survivor
+    lets ``git`` rediscover history from inside the worktree and re-expose the
+    post-fix oracle. Deepest paths go first; deletion failures are ignored.
+    """
+    found = list(root.rglob(".git"))
+    found.sort(key=lambda entry: len(entry.parts), reverse=True)
+    for entry in found:
+        if entry.is_symlink() or not entry.is_dir():
+            try:
+                entry.unlink()
+            except OSError:
+                pass
+        else:
+            shutil.rmtree(entry, ignore_errors=True)
+
+
 # ---------------------------------------------------------------------------
 # Verification (approximate — official harness needs Docker)
 # ---------------------------------------------------------------------------
@@ -239,6 +603,19 @@ def verify_instance(
 
     cmd = [py, "-m", "pytest", "-q", "--no-header", "-p", "no:cacheprovider", *test_ids]
     res = subprocess.run(cmd, cwd=str(repo_path), capture_output=True, text=True)
-    ok = res.returncode == 0
     summary = res.stdout[-500:] + res.stderr[-500:]
+    # Distinguish "tests ran and failed" (authoritative-ish negative) from
+    # "we could not run tests at all" (no pytest in env, collection crash).
+    # The latter must NOT be reported as a real failure — the authoritative
+    # grade comes from the SWE-bench Docker harness (bench.runners.
+    # swebench_verify). pytest uses returncode 2-5 for usage/collection/internal
+    # errors, and 1 for genuine test failures; 0 is pass.
+    could_not_run = (
+        "No module named pytest" in summary
+        or "no tests ran" in summary
+        or res.returncode >= 2
+    )
+    if could_not_run and res.returncode != 1:
+        return False, "UNGRADED: " + summary
+    ok = res.returncode == 0
     return ok, summary
diff --git a/bench/mcp/__init__.py b/bench/mcp/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/bench/mcp/lsp_server.py b/bench/mcp/lsp_server.py
new file mode 100644
index 00000000..08b55cab
--- /dev/null
+++ b/bench/mcp/lsp_server.py
@@ -0,0 +1,120 @@
+"""FastMCP server exposing LSP navigation primitives for the ``lsp`` arm.
+
+This is the symmetric counterpart to the code-graph MCP server: both arms
+surface a small navigation tool set to the Copilot agent over stdio, so the
+4-arm comparison isolates *what each backend can answer*, not the harness.
+
+Backend: multilspy's ``SyncLanguageServer`` (jedi-language-server for Python),
+wrapped by :mod:`bench.agents.lsp_adapter`. A single jedi subprocess is kept
+alive for the whole session (see ``LSPClient.start``) so per-call startup cost
+isn't paid on every tool call — the agent-token comparison stays fair.
+
+Tools (mirroring bench/agents/lsp_adapter):
+  - goto_definition(file, line, col)  -> [{path, line, col}]
+  - find_references(file, line, col)  -> [{path, line, col}]
+  - hover(file, line, col)            -> {text}
+  - document_symbols(file)            -> [{name, kind, path, line, col}]
+
+Positions are 0-based (LSP convention): the first line of a file is line 0,
+the first column is col 0. This matches raw LSP/multilspy semantics.
+
+Required env (set by the benchmark runner):
+  LSP_REPO_ROOT   absolute path to the repo being analyzed (the agent's cwd)
+  LSP_LANGUAGE    optional; defaults to "python"
+  LSP_ENV_PATH    optional; environment_path passed to jedi for import
+                  resolution (defaults to the server interpreter's prefix)
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any
+
+from mcp.server.fastmcp import FastMCP
+
+from bench.agents.lsp_adapter import DEFAULT_SHIM, LSPClient
+
+app: FastMCP = FastMCP("lsp")
+
+# Single persistent client/server for the process lifetime. Started lazily on
+# the first tool call so server construction errors surface as a tool error
+# (visible in the trajectory) rather than killing stdio startup.
+_client: LSPClient | None = None
+
+
+def _get_client() -> LSPClient:
+    """Return the process-wide LSP client, creating and starting it on first use."""
+    global _client
+    if _client is not None:
+        return _client
+    started = LSPClient(
+        repo_root=os.environ.get("LSP_REPO_ROOT") or os.getcwd(),
+        language=os.environ.get("LSP_LANGUAGE", "python"),
+        shim=DEFAULT_SHIM,
+        environment_path=os.environ.get("LSP_ENV_PATH") or None,
+    )
+    # Publish only after start() succeeds, so a failed start is retried next call.
+    started.start()
+    _client = started
+    return started
+
+
+@app.tool(
+    name="goto_definition",
+    description=(
+        "Resolve the symbol at a 0-based (line, col) in `file` to its "
+        "definition site(s); returns [{path, line, col}]. Use after you have "
+        "located a symbol's position (e.g. via grep) to jump to where it is "
+        "defined. line/col are 0-based: the first line is 0."
+    ),
+)
+async def goto_definition(file: str, line: int, col: int) -> list[dict[str, Any]]:
+    return _get_client().goto_definition(file, int(line), int(col))  # delegates to the shared client (started on first use)
+
+
+@app.tool(
+    name="find_references",
+    description=(
+        "Find all references to the symbol at a 0-based (line, col) in `file`; "
+        "returns [{path, line, col}] across the repo (capped). Use to discover "
+        "which other source files use a symbol. line/col are 0-based."
+    ),
+)
+async def find_references(file: str, line: int, col: int) -> list[dict[str, Any]]:
+    return _get_client().find_references(file, int(line), int(col))  # delegates to the shared client (started on first use)
+
+
+@app.tool(
+    name="hover",
+    description=(
+        "Get the signature + 1-sentence docstring for the symbol at a 0-based "
+        "(line, col) in `file`; returns {text}. line/col are 0-based."
+    ),
+)
+async def hover(file: str, line: int, col: int) -> dict[str, Any]:
+    return _get_client().hover(file, int(line), int(col))  # delegates to the shared client (started on first use)
+
+
+@app.tool(
+    name="document_symbols",
+    description=(
+        "List the symbols (functions, classes, methods) defined in `file` with "
+        "their 0-based positions; returns [{name, kind, path, line, col}]. Use "
+        "to map a file's structure without reading the whole file."
+    ),
+)
+async def document_symbols(file: str) -> list[dict[str, Any]]:
+    return _get_client().document_symbols(file)  # delegates to the shared client (started on first use)
+
+
+def main() -> None:
+    """Run the LSP MCP server over stdio."""
+    try:
+        app.run(transport="stdio")
+    finally:  # runs whether app.run() returns or raises
+        if _client is not None:  # still None when no tool call ever created the client
+            _client.stop()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/mcp/noisy_inject.py b/bench/mcp/noisy_inject.py
new file mode 100644
index 00000000..5170f180
--- /dev/null
+++ b/bench/mcp/noisy_inject.py
@@ -0,0 +1,150 @@
+"""Runtime NOISY negative-control injection for ``search_code`` results.
+
+This module is the SINGLE source of truth for the adoption-calibration pilot's
+NOISY arm (prereg §6): after the real graph result, append K deterministic
+"distractor" files (verified-non-gold siblings, pre-computed offline into a
+manifest) so the experiment can measure whether the agent KEEPS a plausible but
+wrong graph-surfaced candidate (a false positive) or correctly DROPS it (a true
+negative).
+
+Design constraints (mirrors ``rel_explain`` and validated with rubber-duck):
+  * The injection lives INSIDE the registered tool, before FastMCP serializes
+    the return value, so distractors flow through the exact same schema and
+    output path as real results (no JSON-RPC proxy, no registry surgery).
+  * It is ENV-GATED and DEFAULT-OFF: with ``BENCH_NOISY_MANIFEST`` /
+    ``BENCH_NOISY_TASK`` unset, ``maybe_inject`` is a no-op and ``search_code``
+    output is byte-identical to production. Only the NOISY arm sets the env.
+  * PURE core (``inject``) so the no-LLM dry-run / unit tests can exercise the
+    real intervention against a canned result list with no FalkorDB, no agent.
+  * Distractors are appended AFTER the real result and never duplicate a file
+    already present, so they cannot displace a genuine hit.
+
+Env contract (set by the bench runner only for the NOISY condition):
+  * ``BENCH_NOISY_MANIFEST`` -- path to the JSON manifest produced by
+    ``bench.analysis.adopt_controls.build_noisy_manifest`` (top-level
+    ``{"k", "seed", "coverage", "manifest": {task -> {... "distractors": [...]}}}``).
+  * ``BENCH_NOISY_TASK``     -- the task id key for THIS run's instance.
+  * ``BENCH_NOISY_K``        -- optional override of how many distractors to
+    append (default: the manifest's ``k``, else ``DEFAULT_K``).
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+
+log = logging.getLogger(__name__)
+
+ENV_MANIFEST = "BENCH_NOISY_MANIFEST"  # env var: path to the manifest JSON
+ENV_TASK = "BENCH_NOISY_TASK"  # env var: task id key into the manifest
+ENV_K = "BENCH_NOISY_K"  # env var: optional override of the distractor count
+
+DEFAULT_K = 2  # used when neither BENCH_NOISY_K nor the manifest "k" gives a count
+
+# Provenance marker stamped on every injected record so the offline diagnostic
+# can distinguish a NOISY distractor from a genuine co-override sibling.
+VIA_NOISY = "noisy_inject"
+
+
+def build_distractor_record(file: str) -> dict[str, Any]:
+    """Build one injected distractor entry in the flat ``rank_kind:"related"`` shape.
+
+    Uses the same keys ``search_code`` emits for flat-appended siblings. Only
+    ``file`` varies; ``via`` is stamped with ``VIA_NOISY`` so the entry can be
+    told apart from a genuine sibling, and the remaining fields are placeholders.
+    """
+    return dict(
+        file=file,
+        file_id=None,
+        score=None,
+        name=None,
+        line=None,
+        label="File",
+        rank_kind="related",
+        confidence="medium",
+        via=VIA_NOISY,
+        related_to=None,
+        shared_methods=[],
+    )
+
+
+def inject(
+    out: list[dict[str, Any]],
+    distractors: list[dict[str, Any]],
+    k: int,
+) -> list[dict[str, Any]]:
+    """Extend ``out`` in place with at most ``k`` distractor records and return it.
+
+    Candidates are taken in the given order. One is skipped when its ``file`` is
+    empty/missing or already listed in ``out`` (including earlier injections),
+    so existing entries are never duplicated, reordered, or removed.
+    """
+    if k <= 0 or not distractors:
+        return out
+    seen = {row.get("file") for row in out}
+    budget = k
+    for candidate in distractors:
+        if budget <= 0:
+            break
+        path = candidate.get("file")
+        if path and path not in seen:
+            seen.add(path)
+            out.append(build_distractor_record(path))
+            budget -= 1
+    return out
+
+
+@lru_cache(maxsize=8)
+def _load_manifest(path: str) -> dict[str, Any]:
+    """Load the manifest JSON at ``path`` (decoded as UTF-8), cached per path string."""
+    return json.loads(Path(path).read_text(encoding="utf-8"))
+
+
+def distractors_for_task(manifest: dict[str, Any], task: str) -> list[dict[str, Any]]:
+    """Dict items of ``task``'s ``distractors`` list, or ``[]`` if absent or the manifest is malformed."""
+    tasks = manifest.get("manifest") if isinstance(manifest, dict) else None
+    entry = tasks.get(task) if isinstance(tasks, dict) else None
+    listed = entry.get("distractors") if isinstance(entry, dict) else None
+    return [d for d in listed if isinstance(d, dict)] if isinstance(listed, list) else []
+
+
+def _resolve_k(manifest: dict[str, Any]) -> int:
+    """Pick K: a parseable ``BENCH_NOISY_K`` (floored at 0), else manifest ``k``, else ``DEFAULT_K``."""
+    override = os.getenv(ENV_K)
+    if (override or "").strip():
+        try:
+            return max(0, int(override))
+        except ValueError:
+            log.warning("noisy_inject: bad %s=%r, falling back to manifest k", ENV_K, override)
+    manifest_k = manifest.get("k")
+    usable = isinstance(manifest_k, int) and manifest_k >= 0
+    return manifest_k if usable else DEFAULT_K
+
+
+def maybe_inject(out: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Append NOISY distractors to ``out`` when the bench env asks for them.
+
+    Called by ``search_code`` just before it returns. With either of
+    ``BENCH_NOISY_MANIFEST`` / ``BENCH_NOISY_TASK`` unset or empty, ``out`` is
+    returned untouched. An unreadable/unparseable manifest or a task with no
+    distractors logs a warning and also returns ``out`` unchanged.
+    """
+    manifest_path = os.getenv(ENV_MANIFEST)
+    task = os.getenv(ENV_TASK)
+    if not (manifest_path and task):
+        return out
+    try:
+        manifest = _load_manifest(manifest_path)
+    except (OSError, ValueError) as e:
+        log.warning("noisy_inject: cannot load manifest %s: %s", manifest_path, e)
+        return out
+    candidates = distractors_for_task(manifest, task)
+    if candidates:
+        k = _resolve_k(manifest)
+        return inject(out, candidates, k)
+    log.warning("noisy_inject: no distractors for task %r in %s", task, manifest_path)
+    return out
diff --git a/bench/mcp/rel_explain.py b/bench/mcp/rel_explain.py
new file mode 100644
index 00000000..25c6b794
--- /dev/null
+++ b/bench/mcp/rel_explain.py
@@ -0,0 +1,226 @@
+"""Factual relationship/provenance explanations for ``search_code`` results.
+
+This module is the SINGLE source of truth for the optional, env-gated
+``relationship_explanation`` / ``match_provenance`` strings attached to
+``search_code`` output. It is deliberately PURE (no FastMCP, no graph, no I/O)
+and dependency-free so that:
+
+  * the production tool (``structural.py``) can call it at query time, and
+  * the offline A/B "reader" harness (bench tree, a *different* worktree/venv)
+    can import it and re-annotate captured ``search_code`` outputs with the
+    EXACT same logic — guaranteeing the offline mechanism test exercises the
+    real intervention, not a re-implementation that could drift.
+
+Design constraints (validated with rubber-duck):
+  * FACTUAL, not directive. Strings describe the STRUCTURAL relationship or the
+    matched query provenance. They never tell the agent what to answer
+    ("you should include this file"), which would overfit/game the benchmark.
+  * Derived only from data already present in the result entry (``via``,
+    ``shared_methods``, ``related_to``) or trivially verifiable against the
+    query (token overlap with ``name``/``file``).
+  * A length-matched, semantically EMPTY ``placebo`` is provided so the A/B can
+    isolate "explanation content" from "extra prose / salience".
+
+Modes (string, case-insensitive):
+  * ``"off"``      -- no annotation (control arm; current production default).
+  * ``"explain"``  -- attach the real factual explanation/provenance.
+  * ``"placebo"``  -- attach a length-matched neutral filler (salience control).
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Optional
+
+OFF = "off"  # no annotation
+EXPLAIN = "explain"  # attach the real explanation/provenance text
+PLACEBO = "placebo"  # attach length-matched neutral filler instead
+VALID_MODES = (OFF, EXPLAIN, PLACEBO)
+
+# Field names attached to result entries.
+RELATED_FIELD = "relationship_explanation"  # set on related entries
+DIRECT_FIELD = "match_provenance"  # set on direct (non-related) hits
+
+_CAMEL_RE = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z0-9]+|[A-Z]+")  # pieces of a camelCase identifier
+_WORD_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")  # identifier-like words in a query
+_MIN_TOK = 4  # ignore short/common tokens when reporting query provenance
+
+
+def normalize_mode(value: Optional[str]) -> str:
+    """Map an env-style value onto a valid mode, defaulting to ``off``.
+
+    Truthy spellings (1/true/yes/on) select ``explain``; anything empty, falsy
+    (0/false/no) or not in ``VALID_MODES`` yields ``off``.
+    """
+    text = str(value).strip().lower() if value else ""
+    if text in ("1", "true", "yes", "on"):
+        return EXPLAIN
+    return text if text in VALID_MODES else OFF
+
+
+def _subtokens(ident: str) -> set[str]:
+    """Lower-cased camelCase pieces of ``ident`` after splitting on ``_``/whitespace."""
+    # Split on underscores/whitespace first; each chunk is then cut by _CAMEL_RE.
+    chunks = re.split(r"[_\s]+", ident or "")
+    return {
+        piece.lower() for chunk in chunks for piece in _CAMEL_RE.findall(chunk) if piece
+    }
+
+
+def _query_tokens(query: str) -> set[str]:
+    """Lower-cased words and sub-tokens of ``query`` that are at least ``_MIN_TOK`` long."""
+    found: set[str] = set()
+    for word in _WORD_RE.findall(query or ""):
+        found.update(_subtokens(word), {word.lower()})
+    return {tok for tok in found if len(tok) >= _MIN_TOK}
+
+
+def _fmt_methods(methods: Any) -> str:
+    """Render names as a back-ticked English list: one, "x and y", or "x, y, and z".
+
+    A bare string counts as a single name; falsy input or items yield ``""``.
+    """
+    if not methods:
+        return ""
+    names = [methods] if isinstance(methods, str) else methods
+    quoted = [f"`{name}`" for name in names if name]
+    if len(quoted) <= 2:
+        return " and ".join(quoted)
+    *head, tail = quoted
+    return f"{', '.join(head)}, and {tail}"
+
+
+def related_explanation(entry: dict[str, Any], related_to: Optional[str]) -> Optional[str]:
+    """Describe, factually, how a related file is coupled to its seed file.
+
+    ``entry`` supplies ``via`` and ``shared_methods`` and, as a fallback seed,
+    its own ``related_to``; the ``related_to`` argument wins when it is truthy.
+    The ``co_override`` and ``shared_method`` channels each get a dedicated
+    sentence that names the shared methods when there are any. Any other
+    ``via`` value yields a generic sentence when a seed is known, and ``None``
+    when it is not.
+    """
+    channel = entry.get("via")
+    names = _fmt_methods(entry.get("shared_methods"))
+    seed = related_to or entry.get("related_to")
+    seed_s = f"`{seed}`" if seed else "a top-ranked file"
+    if channel == "co_override":
+        if names:
+            lead = f"Overrides the same base method {names} as {seed_s}"
+        else:
+            lead = f"Overrides the same base method as {seed_s}"
+        return (
+            f"{lead} (co-override sibling). Files that override a shared base "
+            f"method are frequent co-edit candidates that a textual search misses."
+        )
+    if channel == "shared_method":
+        if names:
+            lead = f"Defines the same method name {names} as {seed_s}"
+        else:
+            lead = f"Defines a same-named method as {seed_s}"
+        return (
+            f"{lead} (shared-method sibling, often a co-change companion). "
+            f"Not linked by a resolved inheritance edge, so a name lookup would "
+            f"not connect them."
+        )
+    # Unrecognised channel: say only what is known, or nothing at all.
+    if seed:
+        return f"Structurally coupled to {seed_s} in the code graph."
+    return None
+
+
+def direct_provenance(entry: dict[str, Any], query: str) -> Optional[str]:
+    """State which query terms literally occur in a direct hit's name or path.
+
+    Query tokens are intersected with the sub-tokens of ``entry["name"]`` and of
+    ``entry["file"]`` (with ``/`` and ``.`` treated as separators). A name match
+    is reported in preference to a path match. With no overlap at all, a fixed
+    sentence says so rather than inventing a match. Returns ``None`` when the
+    query yields no usable tokens.
+    """
+    name = entry.get("name") or ""
+    path = entry.get("file") or ""
+    wanted = _query_tokens(query)
+    if not wanted:
+        return None
+    name_hits = sorted(wanted & _subtokens(name))
+    path_hits = sorted(wanted & _subtokens(path.replace("/", " ").replace(".", " ")))
+    if name_hits:
+        where = f"symbol `{name}`" if name else "a symbol in this file"
+        return f"Query term {_fmt_methods(name_hits)} appears in {where}."
+    if path_hits:
+        return f"Query term {_fmt_methods(path_hits)} appears in the file path."
+    # No literal overlap: report that instead of claiming a match.
+    return (
+        "Ranked by symbol-name/docstring relevance to the query "
+        "(no exact query term in the file path or representative symbol)."
+    )
+
+
+# ---------------------------------------------------------------------------
+# Length-matched placebo (salience control)
+# ---------------------------------------------------------------------------
+
+# A neutral vocabulary with NO file names, symbol names, or structural-coupling
+# terms. Used to build filler of comparable length to a real explanation so the
+# A/B can attribute any adoption change to explanation CONTENT, not to the mere
+# presence of extra prose near the entry.
+_PLACEBO_WORDS = (
+    "this entry is part of the indexed repository and was returned by the "
+    "search operation along with other candidate entries for your review at "
+    "this time as additional general information about the result listing here"
+).split()
+
+
+def placebo_for(real_text: Optional[str]) -> Optional[str]:
+    """Neutral filler roughly as long as ``real_text`` (``None`` if it is empty).
+
+    Cycles ``_PLACEBO_WORDS`` until the running length (words plus one separator
+    each) reaches the target, then cuts to the target and strips trailing space.
+    """
+    if not real_text:
+        return None
+    target = len(real_text)
+    picked: list[str] = []
+    running = 0
+    while running < target:
+        picked.append(_PLACEBO_WORDS[len(picked) % len(_PLACEBO_WORDS)])
+        running += len(picked[-1]) + 1
+    return " ".join(picked)[:target].rstrip()
+
+
+# ---------------------------------------------------------------------------
+# Top-level annotators (operate IN PLACE on a list of search_code result objs)
+# ---------------------------------------------------------------------------
+
+
+def annotate_results(results: list[dict[str, Any]], query: str, mode: str) -> list[dict[str, Any]]:
+    """Annotate search_code result objects in place and return the same list.
+
+    ``off`` leaves ``results`` untouched. Otherwise related entries (flat ones and
+    each primary hit's ``likely_related_files``) always get ``RELATED_FIELD``;
+    primary hits get ``DIRECT_FIELD`` only when a provenance string exists.
+    ``placebo`` swaps each text for length-matched filler on the same entries.
+    """
+    mode = normalize_mode(mode)
+    if mode == OFF:
+        return results
+
+    def render(text: Optional[str]) -> Optional[str]:
+        # Real text in explain mode, neutral filler in placebo mode.
+        return text if mode == EXPLAIN else placebo_for(text)
+
+    for item in results:
+        if not isinstance(item, dict):
+            continue
+        if item.get("rank_kind") == "related":
+            item[RELATED_FIELD] = render(related_explanation(item, item.get("related_to")))
+            continue
+        provenance = direct_provenance(item, query)
+        if provenance is not None:
+            item[DIRECT_FIELD] = render(provenance)
+        for sibling in item.get("likely_related_files", []) or []:
+            if isinstance(sibling, dict):
+                sibling[RELATED_FIELD] = render(related_explanation(sibling, item.get("file")))
+    return results
diff --git a/bench/runners/compare_models.py b/bench/runners/compare_models.py
new file mode 100644
index 00000000..b7c1563e
--- /dev/null
+++ b/bench/runners/compare_models.py
@@ -0,0 +1,118 @@
+"""Compare two model runs of the SWE-bench fix benchmark (e.g. Opus vs Sonnet).
+
+Reads two results.jsonl files (same instance set, same 4 configs) and reports,
+per config: resolved accuracy, summed input/output tokens, and estimated USD
+cost under list pricing. Prints a paired Opus-vs-Sonnet table and the overall
+price delta.
+
+Token accounting note: `input_tokens` here is the cumulative count the agent
+loop sends across all steps (history is re-sent each turn), so it is "tokens
+processed", not unique context. We price it as-is and apply the SAME accounting
+to both models, so the *ratio* is the honest comparison; absolute dollars are
+an upper bound for a no-prompt-caching setup.
+
+Usage:
+    python -m bench.runners.compare_models \
+        --a bench/cache/opus/results.jsonl --a-name opus --a-model opus \
+        --b bench/cache/sonnet-n40/results.jsonl --b-name sonnet --b-model sonnet
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+# List price per 1M tokens (USD), input / output.
+PRICING = {
+    "opus": (15.0, 75.0),      # Claude Opus 4.x
+    "sonnet": (3.0, 15.0),     # Claude Sonnet 4.5
+    "haiku": (0.80, 4.0),
+}
+
+CONFIG_ORDER = ["baseline", "lsp", "code_graph", "code_graph_mcp"]  # report order; main() skips configs not listed here
+
+
+def load(path: Path) -> dict[str, list[dict[str, Any]]]:
+    """Group the rows of a results.jsonl file by their ``config`` value (``"?"`` when absent)."""
+    by: dict[str, list[dict[str, Any]]] = defaultdict(list)
+    # Read as UTF-8 and split on real newlines only (splitlines() also breaks on U+2028 etc.).
+    with path.open(encoding="utf-8") as fh:
+        for r in (json.loads(line) for line in fh if line.strip()):
+            by[r.get("config", "?")].append(r)
+    return by
+
+
+def cost_usd(in_tok: int, out_tok: int, model: str) -> float:
+    pin, pout = PRICING[model]  # (input, output) USD per 1M tokens; KeyError if model is not priced
+    return in_tok / 1e6 * pin + out_tok / 1e6 * pout  # scale each count to millions, then price it
+
+
+def config_stats(rows: list[dict[str, Any]], model: str) -> dict[str, Any]:
+    """Summarise one config's rows: count, token sums, resolved count, accuracy %, USD cost."""
+    total = len(rows)
+    tokens_in = sum(row.get("input_tokens", 0) for row in rows)
+    tokens_out = sum(row.get("output_tokens", 0) for row in rows)
+    solved = sum(row.get("outcome") == "resolved" for row in rows)
+    stats: dict[str, Any] = {"n": total, "in_sum": tokens_in, "out_sum": tokens_out}
+    stats["resolved"] = solved
+    # Accuracy is a percentage rounded to 0.1; an empty config reports 0.0.
+    stats["acc"] = round(100 * solved / total, 1) if total else 0.0
+    # Cost is priced from the summed tokens and rounded to cents.
+    stats["usd"] = round(cost_usd(tokens_in, tokens_out, model), 2)
+    return stats
+
+
+def main(argv: list[str] | None = None) -> int:  # CLI entry point; always returns 0
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("--a", type=Path, required=True)  # results.jsonl of run A
+    p.add_argument("--a-name", default="A")  # column label for run A
+    p.add_argument("--a-model", default="opus", choices=list(PRICING))  # price row used for A
+    p.add_argument("--b", type=Path, required=True)  # results.jsonl of run B
+    p.add_argument("--b-name", default="B")  # column label for run B
+    p.add_argument("--b-model", default="sonnet", choices=list(PRICING))  # price row used for B
+    p.add_argument("--json-out", type=Path, default=None)  # optional JSON report path
+    args = p.parse_args(argv)
+
+    a = load(args.a)
+    b = load(args.b)
+    configs = [c for c in CONFIG_ORDER if c in a or c in b]  # known configs present in either run
+
+    report: dict[str, Any] = {"a": args.a_name, "b": args.b_name, "configs": {}}
+    print(f"\n{'config':>16} | {args.a_name:>26} | {args.b_name:>26} | price")
+    print(f"{'':>16} | {'acc   in_tok   out_tok    $':>26} | "
+          f"{'acc   in_tok   out_tok    $':>26} | A/B$")
+    print("-" * 92)
+    tot_a_usd = tot_b_usd = 0.0
+    for c in configs:
+        sa = config_stats(a.get(c, []), args.a_model)  # a missing config counts as zero rows
+        sb = config_stats(b.get(c, []), args.b_model)
+        tot_a_usd += sa["usd"]  # totals add the per-config costs already rounded to cents
+        tot_b_usd += sb["usd"]
+        ratio = round(sa["usd"] / sb["usd"], 2) if sb["usd"] else None  # None when B's cost is 0
+        report["configs"][c] = {"a": sa, "b": sb, "price_ratio_a_over_b": ratio}
+        print(f"{c:>16} | {sa['acc']:>4}% {sa['in_sum']:>9} {sa['out_sum']:>7} "
+              f"${sa['usd']:>7} | {sb['acc']:>4}% {sb['in_sum']:>9} "
+              f"{sb['out_sum']:>7} ${sb['usd']:>7} | {ratio}x")
+
+    print("-" * 92)
+    print(f"{'TOTAL $':>16} | {'':>19}${tot_a_usd:>7.2f} | "
+          f"{'':>19}${tot_b_usd:>7.2f} | "
+          f"{round(tot_a_usd / tot_b_usd, 2) if tot_b_usd else None}x")
+    report["total"] = {
+        "a_usd": round(tot_a_usd, 2),
+        "b_usd": round(tot_b_usd, 2),
+        "a_over_b": round(tot_a_usd / tot_b_usd, 2) if tot_b_usd else None,
+    }
+
+    if args.json_out:
+        args.json_out.parent.mkdir(parents=True, exist_ok=True)  # create the output directory if needed
+        args.json_out.write_text(json.dumps(report, indent=2))
+        print(f"\nwrote {args.json_out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/bench/runners/copilot_runner.py b/bench/runners/copilot_runner.py
new file mode 100644
index 00000000..5a52ce7b
--- /dev/null
+++ b/bench/runners/copilot_runner.py
@@ -0,0 +1,1868 @@
+"""Benchmark harness driving the **real GitHub Copilot CLI** over SWE-bench.
+
+Unlike `mini_runner` (a scripted ReAct loop with a hard step cap), this runner
+invokes the production Copilot CLI headlessly so the measured token / accuracy
+numbers reflect how people actually use the agent. It compares tracks that
+differ ONLY in their MCP wiring:
+
+    * ``copilot_no_mcp``  -- Copilot's native tools, no extra MCP servers.
+    * ``code_graph``      -- same, plus our ``cgraph-mcp`` stdio server.
+    * ``lsp``             -- (reserved) same, plus an LSP-backed MCP server.
+
+For each ``(instance, track)`` it:
+    1. prepares a fresh worktree at the instance base commit,
+    2. (code_graph only) deletes any stale FalkorDB graph and re-indexes,
+    3. builds a neutral prompt from the SWE-bench problem statement,
+    4. runs ``copilot`` headless with a wall-clock timeout,
+    5. parses tokens (summed from the debug process logs), premium requests and
+       tool calls,
+    6. extracts the patch via ``git diff <base_commit>`` (junk-excluded),
+    7. writes a results row in the shared ``mini_runner`` schema so the existing
+       Docker grader (``swebench_verify.py``) works unchanged.
+
+Grading is intentionally deferred: run this to generate patches + token rows,
+then grade with the official SWE-bench Docker harness.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import shutil
+import signal
+import subprocess
+import sys
+import time
+import traceback
+import uuid
+from pathlib import Path
+from typing import Any
+
+from bench.datasets import swe_bench
+
+RUNNER_VERSION = "copilot-runner/2"
+
+# Marks the measurement epoch for answer-leakage hardening + thinking-on +
+# full-trace capture. Recorded on every row so rows from different harness
+# generations are never silently pooled.
+#
+# harden/2: closed the git-walk-up leak. Stripping the worktree's own ``.git``
+# did NOT stop ``git`` (run by the agent or by the indexer's branch detection)
+# from traversing UP to the enclosing harness repo, which leaked its branch name
+# and commit messages (revealing the benchmark intent) and mis-keyed the index
+# under the parent branch. Fixed by recursively stripping ``.git``, pinning the
+# index to the ``_default`` branch, scrubbing inherited ``GIT_*`` vars, and
+# fencing the agent's git with ``GIT_CEILING_DIRECTORIES``. harden/1 rows where
+# the agent ran git are suspect and must not be pooled with harden/2.
+HARNESS_HARDENING_VERSION = "harden/2"
+
+# Reasoning effort for scored runs. Thinking is now ENABLED by default so the
+# agent's deliberation is captured in the trace; the reasoning-token cost is
+# accounted SEPARATELY (see parse_tokens_from_logs -> reasoning_tokens) so the
+# base token comparison across arms stays interpretable. All arms in an epoch
+# share one effort level. Override with COPILOT_REASONING_EFFORT ("off" omits the flag); the default comes from BENCH_REASONING_EFFORT.
+DEFAULT_REASONING_EFFORT = os.environ.get("BENCH_REASONING_EFFORT", "medium")
+
+
+def _resolve_reasoning_effort() -> str | None:
+    """Return the effort to pass to copilot; ``None`` (omit the flag) if empty or "off"."""
+    # COPILOT_REASONING_EFFORT wins when set (even to ""); else the module default applies.
+    chosen = os.environ.get("COPILOT_REASONING_EFFORT", DEFAULT_REASONING_EFFORT)
+    disabled = not chosen or chosen.lower() == "off"
+    return None if disabled else chosen
+
+
+# Tracks that only need different Copilot MCP wiring.
+NO_MCP = "copilot_no_mcp"
+CODE_GRAPH = "code_graph"
+LSP = "lsp"
+VALID_TRACKS = (NO_MCP, CODE_GRAPH, LSP)
+
+DEFAULT_CACHE = Path(__file__).resolve().parents[1] / "cache" / "copilot"
+
+# Dirs that must never end up in an extracted patch even if Copilot or a tool
+# left them untracked in the worktree.
+_PATCH_EXCLUDES = (
+    "__pycache__",
+    ".pytest_cache",
+    ".mypy_cache",
+    ".ruff_cache",
+    ".tox",
+    ".eggs",
+    "node_modules",
+    ".git",
+    ".venv",
+    "venv",
+    "build",
+    "dist",
+)
+
+# The code-graph MCP server lives in a sibling worktree and is launched via a
+# wrapper that fixes PYTHONPATH (see _write_mcp_config).
+DEFAULT_MCP_SERVER_ROOT = Path(
+    os.environ.get(
+        "CGRAPH_MCP_SERVER_ROOT",
+        "/Users/dvirdukhan/Code/code-graph/.worktrees/mcp-smoke",  # NOTE(review): machine-specific fallback; set CGRAPH_MCP_SERVER_ROOT on other hosts
+    )
+)
+
+# The LSP-backed MCP server (bench/mcp/lsp_server.py) lives in THIS bench tree
+# (mcp-t17), but must be launched with the mcp-smoke venv python because that is
+# the only environment with BOTH `mcp`/FastMCP AND `multilspy`. Its wrapper also
+# prepends the mcp-smoke venv `bin/` to PATH so the `jedi-language-server`
+# console script multilspy launches by bare name is found.
+LSP_BENCH_ROOT = Path(
+    os.environ.get("LSP_BENCH_ROOT", str(Path(__file__).resolve().parents[2]))
+)
+DEFAULT_LSP_SERVER_PYTHON_ROOT = Path(
+    os.environ.get("LSP_SERVER_PYTHON_ROOT", str(DEFAULT_MCP_SERVER_ROOT))
+)
+
+
+# ---------------------------------------------------------------------------
+# Prompt assembly (symmetric across tracks; only the capability note differs)
+# ---------------------------------------------------------------------------
+
+FIX = "fix"
+LOCALIZE = "localize"
+VALID_MODES = (FIX, LOCALIZE)
+
+# The strict line the localization agent must end on. Re-used by the parser.
+LOCALIZE_SENTINEL = "FINAL_LOCALIZATION_JSON:"
+
+_BASE_PROMPT = """\
+You are fixing a bug in the Python repository checked out at {cwd}.
+
+{problem}
+
+Inspect the repository to understand the relevant code before editing, then
+make the minimal source change that fixes the issue. Do not modify test files.
+{capability}
+When you are done, stop and give a one-line summary of what you changed."""
+
+_LOCALIZE_PROMPT = """\
+You are localizing (not fixing) a bug in the Python repository checked out at {cwd}.
+
+{problem}
+
+Investigate the repository to determine which SOURCE files must be edited to fix
+this issue. Do NOT modify any files. Do NOT run or edit tests.
+{capability}
+When you are confident, finish your FINAL assistant message with a single line in
+EXACTLY this format (most-likely file first, repo-root-relative paths, Python
+source files only, no test or doc files):
+
+{sentinel} ["pkg/module_a.py", "pkg/module_b.py"]
+
+Write that line as plain text in your own final message. Do NOT emit it through a
+shell command, `echo`, a file write, or any tool call."""
+
+# Lane 1 adoption-calibration frozen text (prereg §5). Do NOT edit without
+# amending the pre-registration; the experiment's validity depends on the exact
+# wording (negative-control / non-overfitting requirement).
+#
+# SEM (lever a): edge-semantics clause appended verbatim to the code_graph
+# capability preamble. NO frequency/benchmark prior (the rejected wording "the
+# edit site is often a caller or a sibling, not the matched symbol" is forbidden).
+_ADOPT_SEM_CLAUSE = (
+    "Graph edges (calls, imports, inheritance, overrides, definitions) are "
+    "evidence that code is RELATED — not evidence that a connected file is the "
+    "location you must change. Treat every graph result as a hypothesis. Keep a "
+    "candidate in your final answer only when the code you have read supports "
+    "that the file participates directly in the behavior the task asks you to "
+    "change; drop it otherwise. Relatedness alone is not a reason to keep or to "
+    "drop."
+)
+# RAT (lever b): mandatory keep/drop-with-reason step injected into the localize
+# prompt body, BEFORE the FINAL sentinel instruction.
+_ADOPT_RAT_STEP = (
+    "Before your final answer, list every file the graph surfaced and, for each, "
+    "write one line: `KEEP <file> — <reason from code you read>` or "
+    "`DROP <file> — <reason>`. Your final answer must be consistent with these "
+    "decisions. You may add files the graph did not surface."
+)
+# RAT localize variant: identical to _LOCALIZE_PROMPT but with _ADOPT_RAT_STEP
+# inserted after the capability note and before the FINAL sentinel instruction.
+_LOCALIZE_PROMPT_RAT = """\
+You are localizing (not fixing) a bug in the Python repository checked out at {cwd}.
+
+{problem}
+
+Investigate the repository to determine which SOURCE files must be edited to fix
+this issue. Do NOT modify any files. Do NOT run or edit tests.
+{capability}
+{rat_step}
+When you are confident, finish your FINAL assistant message with a single line in
+EXACTLY this format (most-likely file first, repo-root-relative paths, Python
+source files only, no test or doc files):
+
+{sentinel} ["pkg/module_a.py", "pkg/module_b.py"]
+
+Write that line as plain text in your own final message. Do NOT emit it through a
+shell command, `echo`, a file write, or any tool call."""
+
+# Valid Lane 1 arm names. CTRL == canonical nudge base (prereg §2 amended: a
+# neutral preamble yields ~0% spontaneous adoption on strong models, leaving
+# nothing to calibrate, so CTRL is pinned to _CAP_CODE_GRAPH_NUDGE).
+ADOPT_ARMS = ("ctrl", "sem", "rat")
+
+_CAP_NO_MCP = (
+    "No external MCP tools are available; use Copilot's built-in file, search "
+    "and edit tools."
+)
+# Matched no-MCP nudge: parallels the code_graph search-first mandate without
+# naming any specific tool, so the comparison isolates the graph, not the
+# "search before grep" instruction.
+_CAP_NO_MCP_NUDGE = (
+    "No external MCP tools are available. Before resorting to plain text search "
+    "(grep/rg), begin by broadly mapping the repository structure to locate the "
+    "relevant symbols and how they relate; use Copilot's built-in file, search "
+    "and edit tools."
+)
+_CAP_CODE_GRAPH = (
+    "A code-graph MCP server is available exposing code-navigation tools "
+    "(search_code, get_callers, get_callees, get_dependencies, impact_analysis, "
+    "find_path). The repository has ALREADY been indexed under project=\"{project}\" "
+    "and is ready to query immediately — do NOT call index_repo; call the "
+    "navigation tools directly with project=\"{project}\". Prefer precise "
+    "code-navigation tools over plain text search when they help. Do not use the "
+    "`ask` tool."
+)
+# Nudged code_graph: mandate an initial search_code call to measure the tool's
+# value when the model is forced to engage it (the neutral prompt yields ~0%
+# spontaneous adoption on strong models).
+_CAP_CODE_GRAPH_NUDGE = (
+    "A code-graph MCP server is available exposing code-navigation tools "
+    "(search_code, get_callers, get_callees, get_dependencies, impact_analysis, "
+    "find_path). The repository has ALREADY been indexed under project=\"{project}\" "
+    "and is ready to query — do NOT call index_repo. You MUST begin by calling "
+    "search_code(project=\"{project}\") to "
+    "locate the relevant symbols BEFORE any plain text search, and prefer these "
+    "graph tools over grep throughout your investigation. Do not use the `ask` tool."
+)
+
+
+# Traversal-mandate variant: gated by CGRAPH_TRAVERSE_NUDGE=1 + --nudge. Forces the
+# model to actually traverse (get_callers/get_callees/find_path) from candidate
+# symbols, isolating whether traversal — not just search-first — helps localization.
+_CAP_CODE_GRAPH_TRAVERSE = (
+    "A code-graph MCP server is available exposing code-navigation tools "
+    "(search_code, get_callers, get_callees, get_dependencies, impact_analysis, "
+    "find_path). The repository has ALREADY been indexed under project=\"{project}\" "
+    "— do NOT call index_repo. You MUST follow this workflow: (1) call search_code(project=\"{project}\") "
+    "to locate candidate symbols; (2) for your top candidate symbol(s) you MUST call "
+    "get_callers AND get_callees (and find_path between candidates when relevant), and "
+    "inspect the files those calls surface, BEFORE finalizing your answer; (3) prefer "
+    "these graph tools over grep throughout. Do not use the `ask` tool."
+)
+
+
+# Spike variant (Spike 1a: IMPORTS + OVERRIDES edges): gated by CGRAPH_SPIKE_NUDGE=1
+# + --nudge (the SUBST gate wins if both are set). Forces the model to exercise the NEW
+# edge types — get_importers (file<-file IMPORTS) and get_overrides (subclass.method->
+# ancestor.method) — which can bridge to gold files the CALLS/DEFINES/EXTENDS call-graph never reached.
+_CAP_CODE_GRAPH_SPIKE = (
+    "A code-graph MCP server is available exposing code-navigation tools "
+    "(search_code, get_callers, get_callees, get_dependencies, impact_analysis, "
+    "find_path, get_importers, get_overrides). The repository has ALREADY been "
+    "indexed under project=\"{project}\" — do NOT call index_repo. You MUST follow this workflow: "
+    "(1) call search_code(project=\"{project}\") to locate candidate symbols and "
+    "their files; (2) for your top candidate file(s) you MUST call "
+    "get_importers (to find which other source files import them) AND, for any "
+    "candidate class/method, get_overrides (to find ancestor or subclass methods "
+    "that share its behavior); inspect the files those calls surface BEFORE "
+    "finalizing your answer; (3) prefer these graph tools over grep throughout. "
+    "Do not use the `ask` tool."
+)
+
+
+# Substitution+stop variant: gated by CGRAPH_SUBST_NUDGE=1 + --nudge (checked before
+# the SPIKE and TRAVERSE gates). Targets the observed thrash failure mode (agent
+# ignores a correct high-confidence rank-1 hit, chases a wrong hypothesis with broad
+# grep sweeps, and never stops). Instructs the agent to TRUST the ranked search_code
+# output (the top hits and their likely_related_files) as the candidate answer set, confirm
+# with at most 1-2 file views, then STOP — substituting the graph for grep rather than running both.
+_CAP_CODE_GRAPH_SUBST = (
+    "A code-graph MCP server is available exposing code-navigation tools "
+    "(search_code, get_callers, get_callees, get_dependencies, impact_analysis, "
+    "find_path). The repository has ALREADY been indexed under project=\"{project}\" "
+    "— do NOT call index_repo. You MUST follow this workflow: (1) call "
+    "search_code(project=\"{project}\") with a CONCEPTUAL free-text query describing "
+    "the buggy behavior and area; (2) TRUST the ranked results — the top-ranked files "
+    "and the likely_related_files attached to them ARE your candidate answer set. "
+    "Confirm with AT MOST 1-2 targeted file views; (3) do NOT run broad grep/find "
+    "sweeps to second-guess a confident high-ranked hit, and do NOT keep searching "
+    "once the ranked results plus a quick view agree — STOP and answer. Substitute "
+    "the graph for grep; do not run both. Do not use the `ask` tool."
+)
+
+
+# LSP capability note (used verbatim — it has no {project} placeholder). The LSP MCP server
+# exposes jedi-backed navigation tools (goto_definition, find_references, hover,
+# document_symbols). Positions are 0-based (LSP convention) while grep/view are 1-based — the agent must adjust.
+_CAP_LSP = (
+    "An LSP MCP server is available exposing jedi-backed Python navigation tools "
+    "(goto_definition, find_references, hover, document_symbols). Paths are "
+    "repo-root-relative; line/character positions are 0-based (subtract 1 from "
+    "the 1-based line numbers grep/view report). Prefer these precise "
+    "navigation tools over plain text search when they help."
+)
+# Nudged LSP (also used verbatim): mandate an initial navigation call to measure
+# the tool's value when the model is forced to engage it.
+_CAP_LSP_NUDGE = (
+    "An LSP MCP server is available exposing jedi-backed Python navigation tools "
+    "(goto_definition, find_references, hover, document_symbols). Paths are "
+    "repo-root-relative; line/character positions are 0-based (subtract 1 from "
+    "the 1-based line numbers grep/view report). You MUST begin by calling "
+    "document_symbols on a likely-relevant file (or goto_definition on a symbol "
+    "from the problem statement) BEFORE any plain text search, and prefer these "
+    "LSP tools over grep throughout your investigation."
+)
+
+
+# Appended to the capability text by build_prompt when hardening is on (default ON;
+# BENCH_BLOCK_NETWORK=0 to opt out). Tells the agent to derive the answer from the code only —
+# not from the network, GitHub, the issue/PR number, git remotes, or the harness's own files.
+_HARDEN_PROMPT_LINE = (
+    "IMPORTANT: Determine the answer ONLY from the source code in the working "
+    "directory. Do NOT access the network or fetch any URL; do NOT consult "
+    "GitHub, pull requests, commits, patches, or diffs; do NOT read or infer "
+    "anything from a git remote, `origin`, the issue/PR number, or files "
+    "outside the working directory. Any attempt to look up the fix externally "
+    "invalidates the result."
+)
+
+
+def _capability(track: str, project: str, *, nudge: bool, adopt_arm: str | None = None) -> str:
+    """Return the capability paragraph for ``track``; code-graph templates get ``project`` filled in."""
+    if track != CODE_GRAPH:
+        if track == LSP:
+            return _CAP_LSP_NUDGE if nudge else _CAP_LSP
+        return _CAP_NO_MCP_NUDGE if nudge else _CAP_NO_MCP
+    if adopt_arm is not None:
+        # Lane 1 arms bypass the env-gated nudge variants entirely. CTRL, SEM
+        # and RAT all share the canonical nudge base (prereg §2 amended); SEM
+        # additionally appends the frozen edge-semantics clause.
+        base = _CAP_CODE_GRAPH_NUDGE.format(project=project)
+        return f"{base} {_ADOPT_SEM_CLAUSE}" if adopt_arm == "sem" else base
+    if not nudge:
+        return _CAP_CODE_GRAPH.format(project=project)
+    # Env gates are checked in this order; the first one set to "1" wins.
+    gated = (
+        ("CGRAPH_SUBST_NUDGE", _CAP_CODE_GRAPH_SUBST),
+        ("CGRAPH_SPIKE_NUDGE", _CAP_CODE_GRAPH_SPIKE),
+        ("CGRAPH_TRAVERSE_NUDGE", _CAP_CODE_GRAPH_TRAVERSE),
+    )
+    for env_var, tmpl in gated:
+        if os.environ.get(env_var) == "1":
+            return tmpl.format(project=project)
+    return _CAP_CODE_GRAPH_NUDGE.format(project=project)
+
+
+def build_prompt(
+    track: str,
+    cwd: Path,
+    problem: str,
+    project: str,
+    *,
+    nudge: bool = False,
+    mode: str = FIX,
+    adopt_arm: str | None = None,
+) -> str:
+    """Render the agent prompt: the capability note (plus hardening line) around the problem.
+
+    Raises ``ValueError`` if ``adopt_arm`` is set outside code-graph localize runs or is unknown.
+    """
+    if adopt_arm is not None:
+        if track != CODE_GRAPH or mode != LOCALIZE:
+            raise ValueError(
+                f"adopt_arm={adopt_arm!r} requires track={CODE_GRAPH} and mode={LOCALIZE}; "
+                f"got track={track!r} mode={mode!r}"
+            )
+        if adopt_arm not in ADOPT_ARMS:
+            raise ValueError(f"unknown adopt_arm={adopt_arm!r}; expected one of {ADOPT_ARMS}")
+    note = _capability(track, project, nudge=nudge, adopt_arm=adopt_arm)
+    if swe_bench.network_block_enabled():
+        note = f"{note}\n{_HARDEN_PROMPT_LINE}"
+    fields = {
+        "cwd": cwd,
+        "problem": problem.strip(),
+        "capability": note,
+    }
+    if mode != LOCALIZE:
+        return _BASE_PROMPT.format(**fields)
+    fields["sentinel"] = LOCALIZE_SENTINEL
+    if adopt_arm == "rat":
+        return _LOCALIZE_PROMPT_RAT.format(rat_step=_ADOPT_RAT_STEP, **fields)
+    return _LOCALIZE_PROMPT.format(**fields)
+
+
+# ---------------------------------------------------------------------------
+# code-graph MCP wiring
+# ---------------------------------------------------------------------------
+
+
+def _write_mcp_wrapper(run_dir: Path, server_root: Path) -> Path:
+    """Write the stdio launcher for cgraph-mcp into ``run_dir`` and return its path.
+
+    The server's editable install is only importable with the server worktree on PYTHONPATH, so
+    the wrapper cd's there and sets PYTHONPATH before exec'ing the server's venv python (validated
+    to start in ~1.7s from any cwd). Paths are shell-quoted. Raises FileNotFoundError if it is missing.
+    """
+    import shlex  # local import: quote paths so `$`, backticks and quotes stay literal in bash
+    py = server_root / ".venv" / "bin" / "python"
+    if not py.exists():
+        raise FileNotFoundError(f"cgraph-mcp server python not found: {py}")
+    wrapper = run_dir / "cgraph-mcp-wrapper.sh"
+    wrapper.write_text(
+        f"#!/bin/bash\ncd {shlex.quote(str(server_root))}\n"
+        f'export PYTHONPATH={shlex.quote(str(server_root))}:"$PYTHONPATH"\n'
+        f'exec {shlex.quote(str(py))} -c "from api.mcp.server import main; main()"\n'
+    )
+    wrapper.chmod(0o755)
+    return wrapper
+
+
+def _write_mcp_config(
+    run_dir: Path,
+    wrapper: Path,
+    falkor_host: str,
+    falkor_port: int,
+    extra_env: dict[str, str] | None = None,
+) -> Path:
+    """Write ``cg-mcp-config.json`` into ``run_dir`` and return its path.
+
+    The file registers one MCP server, ``code-graph``, launched by running
+    ``wrapper`` with no arguments. Its environment carries the FalkorDB host
+    and port (the port as a string); ``extra_env`` entries are merged on top
+    and may override those two keys.
+    """
+    server_env = {
+        "FALKORDB_HOST": falkor_host,
+        "FALKORDB_PORT": str(falkor_port),
+        **(extra_env or {}),
+    }
+    server = {
+        "command": str(wrapper),
+        "args": [],
+        "env": server_env,
+    }
+    # Built inside-out; key order matches the nested literal it replaces.
+    payload = {"mcpServers": {"code-graph": server}}
+    config_path = run_dir / "cg-mcp-config.json"
+    config_path.write_text(json.dumps(payload, indent=2))
+    return config_path
+
+
+def _write_lsp_wrapper(run_dir: Path, repo_path: Path) -> Path:
+    """Write the stdio launcher for the LSP MCP server (bench/mcp/lsp_server.py).
+
+    The server module lives in this bench tree (LSP_BENCH_ROOT) but must run on the mcp-smoke venv
+    python (the only env with both `mcp` and `multilspy`). The wrapper also prepends that venv's
+    `bin/` to PATH so multilspy can exec the `jedi-language-server` console script by bare name, and
+    points the adapter at the target repo via LSP_REPO_ROOT. Interpolated paths are shell-quoted.
+    """
+    import shlex  # local import: quote paths so `$`, backticks and quotes stay literal in bash
+    py = DEFAULT_LSP_SERVER_PYTHON_ROOT / ".venv" / "bin" / "python"
+    if not py.exists():
+        raise FileNotFoundError(f"lsp-mcp server python not found: {py}")
+    venv_bin = DEFAULT_LSP_SERVER_PYTHON_ROOT / ".venv" / "bin"
+    wrapper = run_dir / "lsp-mcp-wrapper.sh"
+    wrapper.write_text(
+        "#!/bin/bash\n"
+        f"cd {shlex.quote(str(LSP_BENCH_ROOT))}\n"
+        f'export PATH={shlex.quote(str(venv_bin))}:"$PATH"\n'
+        f'export PYTHONPATH={shlex.quote(str(LSP_BENCH_ROOT))}:"$PYTHONPATH"\n'
+        f"export LSP_REPO_ROOT={shlex.quote(str(repo_path))}\n"
+        'export LSP_LANGUAGE="python"\n'
+        f'exec {shlex.quote(str(py))} -c "from bench.mcp.lsp_server import main; main()"\n'
+    )
+    wrapper.chmod(0o755)
+    return wrapper
+
+
+def _write_lsp_mcp_config(run_dir: Path, wrapper: Path) -> Path:
+    """Write ``lsp-mcp-config.json`` into ``run_dir`` and return its path.
+
+    The file registers a single MCP server named ``lsp``, launched by running
+    ``wrapper`` with no arguments and an empty environment block; any
+    environment the server needs is exported by the wrapper script itself.
+    """
+    server = {
+        "command": str(wrapper),
+        "args": [],
+        "env": {},
+    }
+    payload = {"mcpServers": {"lsp": server}}
+    config_path = run_dir / "lsp-mcp-config.json"
+    # indent=2 keeps the file readable when inspecting a run directory by hand.
+    config_path.write_text(json.dumps(payload, indent=2))
+    return config_path
+
+
+def _falkor_settings() -> tuple[str, int]:
+    """Return the FalkorDB ``(host, port)`` from FALKORDB_HOST / FALKORDB_PORT, with local defaults."""
+    host = os.environ.get("FALKORDB_HOST", "127.0.0.1")
+    port = int(os.environ.get("FALKORDB_PORT", "6379"))
+    return host, port
+
+
+def ensure_indexed(repo_path: Path, *, fresh: bool = True) -> float:
+    """Delete any stale graph for this worktree and (re)index it.
+
+    Returns indexing wall-clock seconds. Indexes via the running code-graph HTTP API
+    (``/api/analyze_folder``); the agent's cgraph-mcp reads the same FalkorDB instance, so the
+    graph ``code:{repo_path.name}:_default`` is what the agent queries with ``project=repo_path.name``.
+
+    ``branch="_default"`` is passed EXPLICITLY so the index lands on the exact
+    key the agent (which omits ``branch``) reads. Without it the API falls back
+    to ``detect_branch(worktree)`` = ``git rev-parse``; when the hardened path
+    has stripped the worktree's ``.git``, git walks UP to the enclosing harness
+    repo and returns ITS branch, so the index would land under that branch key
+    while the agent queries an empty ``_default`` graph.
+
+    Raises ``RuntimeError`` on a FalkorDB port mismatch or a non-200 ``analyze_folder`` reply.
+    """
+    import httpx
+    import redis
+
+    host, port = _falkor_settings()
+    repo_name = repo_path.name
+    graph = f"code:{repo_name}:_default"
+
+    if fresh:
+        try:
+            r = redis.Redis(host=host, port=port, decode_responses=True, socket_timeout=2)
+            if graph in (r.execute_command("GRAPH.LIST") or []):
+                r.execute_command("GRAPH.DELETE", graph)
+                print(f"[index] dropped stale {graph}")
+        except Exception as exc:  # noqa: BLE001
+            print(f"[index] WARN could not drop {graph}: {exc!r}")
+
+    base = os.environ.get("CODEGRAPH_URL", "http://127.0.0.1:5000").rstrip("/")
+    token = os.environ.get("SECRET_TOKEN") or os.environ.get("CODEGRAPH_TOKEN")
+    headers = {"Authorization": f"Bearer {token}"} if token else {}
+    default_ignore = [
+        ".git", "venv", ".venv", "node_modules", "__pycache__",
+        "rubi/rules", "build", "dist", ".tox", ".eggs",
+    ]
+    t0 = time.time()
+    with httpx.Client(timeout=7200.0, headers=headers) as c:
+        # Preflight: confirm the API server points at the same FalkorDB the
+        # agent's MCP server will read, else the agent queries an empty graph.
+        try:
+            h = c.get(f"{base}/api/_health", timeout=10.0)
+            hp = int(h.json().get("falkordb_port", port)) if h.status_code == 200 else port
+        except (httpx.HTTPError, ValueError, TypeError, AttributeError):
+            hp = port  # _health is best-effort: unreachable or malformed => skip the check
+        if hp != port:
+            raise RuntimeError(
+                f"API server FalkorDB port {hp} != runner port {port}; "
+                "agent and indexer would see different graphs."
+            )
+        resp = c.post(
+            f"{base}/api/analyze_folder",
+            json={"path": str(repo_path), "ignore": default_ignore, "branch": "_default"},
+        )
+        if resp.status_code != 200:
+            raise RuntimeError(
+                f"analyze_folder {resp.status_code}: {resp.text[:300]}. "
+                f"Check ALLOWED_ANALYSIS_DIR covers {repo_path}."
+            )
+    dt = time.time() - t0
+    print(f"[index] indexed {repo_name} in {dt:.1f}s")
+    return dt
+
+
+# ---------------------------------------------------------------------------
+# Copilot invocation
+# ---------------------------------------------------------------------------
+
+COPILOT_MAX_ATTEMPTS = 3  # total launches per run_copilot call, first attempt included
+COPILOT_RETRY_BACKOFF_SEC = 15.0  # sleep between a failed startup and the next attempt
+
+# Lowercase substrings, matched against lower-cased stderr, that mark a transient startup/network
+# failure (token validation fetch failed, connection resets) rather than a real model run.
+_TRANSIENT_STARTUP_MARKERS = (
+    "could not be validated",
+    "fetch failed",
+    "econnreset",
+    "etimedout",
+    "enotfound",
+    "socket hang up",
+    "network",
+    "getaddrinfo",
+)
+
+
+def _is_transient_startup_failure(
+    returncode: int | None, stdout: str, stderr: str
+) -> bool:
+    """Report whether Copilot died at startup instead of performing a real run.
+
+    A real run writes at least one JSON line to stdout. A transient auth or
+    network failure exits non-zero, leaves stdout blank, and prints a known
+    error to stderr; such rows must be retried rather than scored as recall=0.
+    """
+    exited_with_error = returncode not in (0, None)
+    produced_output = bool(stdout and stdout.strip())
+    if not exited_with_error or produced_output:
+        return False
+    err_text = (stderr or "").lower()
+    return any(marker in err_text for marker in _TRANSIENT_STARTUP_MARKERS)
+
+
+# ---------------------------------------------------------------------------
+# Answer-leakage hardening (default ON; opt out with BENCH_BLOCK_NETWORK=0)
+# ---------------------------------------------------------------------------
+# Shell commands that can exfiltrate the gold answer from the network or from a
+# git remote. Denied as ``shell(<cmd>:*)`` so the agent's tool layer refuses
+# them outright (deny takes precedence over --allow-all-tools). These are a
+# defense-in-depth layer, NOT a hermetic jail: a determined agent can still
+# reach the network via python/node/etc., which is why detect_network_leak()
+# backstops every run and runs that trip its signals are quarantined from scored numbers.
+_DENY_SHELL_CMDS = (
+    "curl", "wget", "gh", "nc", "ncat", "ssh", "scp", "telnet",
+    "git fetch", "git pull", "git clone", "git remote",
+    "git ls-remote", "git push",
+)
+
+# GitHub domains that serve merged-PR file lists / patches / commits. Denied via
+# --deny-url (precedence over allow). The model endpoint (*.githubcopilot.com)
+# and localhost (code-graph API :5000, FalkorDB) are deliberately NOT blocked.
+_DENY_URLS = (
+    "github.com",
+    "*.github.com",
+    "api.github.com",
+    "raw.githubusercontent.com",
+    "*.githubusercontent.com",
+    "codeload.github.com",
+    "patch-diff.githubusercontent.com",
+    "objects.githubusercontent.com",
+)
+
+
+def _network_deny_flags() -> list[str]:
+    """Build the copilot CLI flags that block network/remote exfiltration of the gold answer.
+
+    Order: the ``web_fetch`` exclusion, one shell deny-glob per command, one ``--deny-url`` per domain.
+    """
+    shell_denies = [f"--deny-tool=shell({cmd}:*)" for cmd in _DENY_SHELL_CMDS]
+    url_denies = [f"--deny-url={url}" for url in _DENY_URLS]
+    return ["--excluded-tools=web_fetch", *shell_denies, *url_denies]
+
+
+def _git_ceiling_dirs(cwd: Path) -> str:
+    """Build the ``GIT_CEILING_DIRECTORIES`` value that fences git inside the worktree.
+
+    The value names the worktree's parent twice over — once symlink-resolved,
+    once as written — so git's upward repository discovery halts there and,
+    from inside the history-free worktree, finds no repository rather than the
+    enclosing harness repo. Duplicates collapse; entries are sorted and joined
+    with ``os.pathsep``.
+    """
+    parents = {str(path.parent) for path in (cwd.resolve(), cwd)}
+    return os.pathsep.join(sorted(parents))
+
+
+def _harden_env(env: dict[str, str]) -> dict[str, str]:
+    """Scrub leak-enabling variables from the agent environment, in place, and return it.
+
+    Removes every name in ``swe_bench.LEAK_SCRUB_ENV_VARS`` (opaque-name salt,
+    GitHub creds) plus the inherited git discovery overrides ``GIT_DIR``,
+    ``GIT_WORK_TREE``, ``GIT_COMMON_DIR`` and ``GIT_CONFIG``, which would otherwise
+    let the agent's git escape the worktree regardless of ``GIT_CEILING_DIRECTORIES``.
+    Sets ``GIT_CONFIG_NOSYSTEM=1`` so host git config can't re-point discovery.
+    The upward fence itself (``GIT_CEILING_DIRECTORIES``) is set in ``run_copilot``,
+    where the worktree path is known.
+    """
+    git_overrides = ("GIT_DIR", "GIT_WORK_TREE", "GIT_COMMON_DIR", "GIT_CONFIG")
+    for name in (*swe_bench.LEAK_SCRUB_ENV_VARS, *git_overrides):
+        env.pop(name, None)
+    env["GIT_CONFIG_NOSYSTEM"] = "1"
+    return env
+
+
+# Lowercase substrings, matched against the lower-cased bash command, that indicate an attempt
+# to reach the gold answer via the network or a git remote / the cloned-repo offline oracle.
+# NOTE: ".git" reads are handled separately (_GIT_EXCLUSION_RE + _GIT_READ_RE, applied in
+# _scan_leak_arguments) because ".git" legitimately appears in benign `find -not -path '*/.git/*'`
+# / grep `--exclude-dir=.git` exclusions, which must NOT be flagged.
+_LEAK_CMD_PATTERNS = (
+    "github.com", "githubusercontent", "/pull/", "pull/", "/commit/",
+    ".patch", ".diff", "curl", "wget", " gh ", "gh pr", "gh api",
+    "git fetch", "git pull", "git ls-remote", "git remote",
+    "log origin", "diff origin", "rev-parse origin", "show origin",
+    # git-escape attempts: explicitly re-pointing git past the GIT_CEILING
+    # fence to reach the enclosing harness repo (branch name + commit messages).
+    "git -c ", "git --git-dir", "--git-dir=", "--work-tree",
+    "env -u git", "git_ceiling", "git_dir=", "git_work_tree",
+    "cache/repos", "urllib", "requests.get", "http.client",
+    "socket.", "urlopen", "httpx", "fetch(",
+)
+# Regexes that strip BENIGN ``.git`` references (path-exclusion filters) from a
+# command before we test for a genuine ``.git`` *read*. Without this, every
+# ``find . -not -path '*/.git/*'`` directory listing trips a false leak.
+_GIT_EXCLUSION_RE = re.compile(
+    r"""(?:!\s*)?-?-?(?:not\s+)?              # optional ! / - / --not
+        (?:-path|-ipath|exclude(?:-dir)?)\s*  # find -path / grep --exclude-dir
+        =?\s*['"]?[^'"\s]*\.git[^'"\s]*['"]?  # a token containing .git
+     """,
+    re.VERBOSE,
+)
+# Verbs/redirections that indicate an actual READ of git internals (the oracle).
+# Deliberately excludes grep/rg/sed/awk: those are directory searchers that take
+# benign ``.git`` exclusion globs (e.g. ``rg --glob '!**/.git/**'``); a genuine
+# git-internal read through them is still caught by the specific-file alternative
+# below (``.git/HEAD`` etc.).
+_GIT_READ_RE = re.compile(
+    r"(?:cat|less|more|head|tail|strings|xxd|od|"
+    r"open\(|cp|rsync)\b[^|;&]*\.git/"
+    r"|<\s*[^|;&]*\.git/"          # input redirection from a .git file
+    r"|\.git/(?:HEAD|refs|logs|objects|COMMIT_EDITMSG|ORIG_HEAD|packed-refs)"
+)
+# Lowercase path substrings (matched against the lower-cased path) whose READ would leak the answer or harness state.
+_LEAK_PATH_PATTERNS = (
+    "cache/repos", "/.git/", "results.jsonl", "gold", "mapping",
+    "trace.jsonl", "trace.md",
+)
+
+
+def _scan_leak_arguments(name: str, args: dict[str, Any]) -> list[str]:
+    """Collect the leak signals raised by one tool-execution-start event.
+
+    A fetch-style tool (``web_fetch``, ``fetch``, ``*-fetch``) is flagged outright
+    with its URL truncated to 120 characters. For any other tool the shell-command
+    argument and the file-path argument, when present as non-empty strings, are
+    each matched against their own pattern table.
+    """
+    tool = (name or "").lower()
+    if tool in ("web_fetch", "fetch") or tool.endswith("-fetch"):
+        target = str(args.get("url") or args.get("uri") or "")
+        return [f"{name}:url={target[:120]}"]
+    found: list[str] = []
+    # Shell / bash: inspect the command string.
+    command = args.get("command") or args.get("cmd") or args.get("script")
+    if isinstance(command, str) and command:
+        lowered = command.lower()
+        found += [f"bash:{pat.strip()}" for pat in _LEAK_CMD_PATTERNS if pat in lowered]
+        # ".git" needs context: drop benign path-exclusion filters
+        # (find -not -path '*/.git/*', grep --exclude-dir=.git) first, then flag
+        # only a genuine READ of git internals (the offline gold oracle).
+        if ".git" in lowered and _GIT_READ_RE.search(_GIT_EXCLUSION_RE.sub(" ", lowered)):
+            found.append("bash:.git-read")
+    # File-reader tools: inspect the path. The "/.git/" pattern needs both
+    # slashes, so benign .github / .gitignore paths do not match it.
+    file_path = args.get("path") or args.get("file") or args.get("filename")
+    if isinstance(file_path, str) and file_path:
+        lowered = file_path.lower()
+        found += [f"path:{pat.strip()}" for pat in _LEAK_PATH_PATTERNS if pat in lowered]
+    return found
+
+
+def detect_network_leak(stdout: str) -> dict[str, Any]:
+    """Scan the event stream for attempts to reach the gold answer off-task.
+
+    Inspects every ``tool.execution_start`` event (both ``data.*`` and flat
+    top-level shapes; ``arguments`` is a dict); lines that are not JSON objects
+    are skipped. Flags web_fetch, GitHub/PR/commit/patch URLs, network shell
+    commands, git-remote / origin reads, and reads of the cloned ``.git`` oracle, the shared
+    repos cache, or the harness's own results/gold/trace files. Returns a bool + de-duplicated
+    signal list recorded on the row so tripped runs can be quarantined.
+    """
+    signals: list[str] = []
+    for line in stdout.splitlines():
+        try:
+            ev = json.loads(line.strip())  # blank and non-JSON lines raise and are skipped
+        except json.JSONDecodeError:
+            continue
+        # A line can be valid JSON yet not an event object (array, string, number, null).
+        if not isinstance(ev, dict):
+            continue
+        if not str(ev.get("type", "")).startswith("tool.execution_start"):
+            continue
+        data = ev.get("data") if isinstance(ev.get("data"), dict) else {}
+        name = data.get("name") or data.get("toolName") or ev.get("toolName") or ""
+        args = data.get("arguments")
+        if not isinstance(args, dict):
+            top = ev.get("arguments")
+            args = top if isinstance(top, dict) else {}
+        signals.extend(_scan_leak_arguments(name, args))
+    deduped = sorted(set(signals))
+    return {"network_leak": bool(deduped), "leak_signals": deduped}
+
+
+def hardening_meta(repo_path: Path, stdout: str, reasoning_tokens: int) -> dict[str, Any]:
+    """Build the leak-hardening + thinking provenance fields stored on every result row.
+
+    The fields name the harness generation that produced the row (so generations are
+    never pooled), whether the network/opaque-path/.git defenses were active, the
+    reasoning effort with its separately-accounted thinking tokens, and whatever leak
+    signals the detector tripped (so contaminated runs can be quarantined).
+    """
+    block_on = swe_bench.network_block_enabled()
+    scan = detect_network_leak(stdout)
+    meta: dict[str, Any] = {
+        "harness_hardening_version": HARNESS_HARDENING_VERSION,
+        "network_block_mode": block_on,
+        "opaque_path_mode": block_on,
+        "git_sanitized": block_on and not (repo_path / ".git").exists(),
+        "git_walk_up_blocked": block_on,
+        "reasoning_effort": _resolve_reasoning_effort(),
+        "reasoning_tokens": int(reasoning_tokens or 0),
+    }
+    meta.update(network_leak=scan["network_leak"], leak_signals=scan["leak_signals"])
+    return meta
+
+
+def run_copilot(
+    *,
+    prompt: str,
+    model: str,
+    cwd: Path,
+    log_dir: Path,
+    mcp_config: Path | None,
+    wall_time: float,
+) -> dict[str, Any]:
+    """Invoke Copilot headless; returns {stdout, stderr, returncode, timed_out, startup_failed, wall}."""
+    # Copilot runs with cwd=worktree and resolves a relative --log-dir against
+    # THAT cwd, which would scatter process logs under the worktree. Force
+    # absolute so logs land where the parser reads them.
+    log_dir = log_dir.resolve()
+    log_dir.mkdir(parents=True, exist_ok=True)
+    env = dict(os.environ)
+    hardened = swe_bench.network_block_enabled()
+    if hardened:
+        # Remove the opaque-name salt and any GitHub credentials so the agent
+        # process cannot recover them.
+        env = _harden_env(env)
+        # Fence the agent's git: with the worktree's own .git stripped, a bare
+        # `git log`/`git status` would otherwise walk UP to the enclosing harness
+        # repo and leak its branch name + commit messages (which reveal the
+        # benchmark intent). GIT_CEILING_DIRECTORIES stops the upward search at
+        # the worktree's parent. Listed dirs are NOT crossed, so git sees no
+        # repository from inside the (history-free) worktree. Both the resolved
+        # and lexical parent are listed to defeat symlinked paths.
+        env["GIT_CEILING_DIRECTORIES"] = _git_ceiling_dirs(cwd)
+    t0 = time.time()
+    timed_out = False
+    stdout, stderr, returncode = "", "", None
+    # Transient startup failures (OAuth token validation hitting a network blip,
+    # connection resets) make Copilot exit in ~1s with empty stdout. Those rows
+    # would otherwise be scored as recall=0 false negatives, so retry them.
+    for attempt in range(1, COPILOT_MAX_ATTEMPTS + 1):
+        session_id = str(uuid.uuid4())
+        cmd = [
+            "copilot", "-p", prompt,
+            "--model", model,
+            "--output-format", "json",
+            "--no-remote",
+            "--disable-builtin-mcps",
+            "--allow-all-tools",
+        ]
+        # Under hardening, confine the `view` file tool to the worktree (via
+        # --add-dir alone) instead of --allow-all-paths, so it cannot read the
+        # sibling cloned-repo `.git` oracle or the harness's own results/gold
+        # files. Shell reads are backstopped by deny-globs + the leak detector.
+        if not hardened:
+            cmd.append("--allow-all-paths")
+        cmd += [
+            "--add-dir", str(cwd),
+            "--log-level", "debug",
+            "--log-dir", str(log_dir),
+            "--session-id", session_id,
+        ]
+        # Thinking is ENABLED for scored runs so the agent's tool-choice
+        # deliberation is captured in the trace. The reasoning-token cost is
+        # accounted separately (parse_tokens_from_logs -> reasoning_tokens) so
+        # the base token comparison across arms stays interpretable. Set
+        # COPILOT_REASONING_EFFORT=off to disable.
+        _effort = _resolve_reasoning_effort()
+        if _effort:
+            cmd += ["--effort", _effort]
+        # Network/remote exfiltration block (defense-in-depth; detector backstops).
+        if hardened:
+            cmd += _network_deny_flags()
+        if mcp_config is not None:
+            cmd += ["--additional-mcp-config", f"@{mcp_config}"]
+
+        timed_out = False
+        # start_new_session=True puts Copilot + its children (MCP server, shells)
+        # in a fresh process group we can signal as a unit on timeout.
+        proc = subprocess.Popen(
+            cmd,
+            cwd=str(cwd),
+            stdin=subprocess.DEVNULL,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            env=env,
+            start_new_session=True,
+        )
+        try:
+            stdout, stderr = proc.communicate(timeout=wall_time)
+        except subprocess.TimeoutExpired:
+            timed_out = True
+            _kill_group(proc.pid)
+            try:
+                stdout, stderr = proc.communicate(timeout=30)
+            except subprocess.TimeoutExpired:
+                stdout, stderr = "", ""
+        returncode = proc.returncode
+
+        if timed_out or not _is_transient_startup_failure(returncode, stdout, stderr):
+            break
+        if attempt < COPILOT_MAX_ATTEMPTS:
+            print(
+                f"[retry] copilot startup failure (rc={returncode}, attempt "
+                f"{attempt}/{COPILOT_MAX_ATTEMPTS}); backing off "
+                f"{COPILOT_RETRY_BACKOFF_SEC}s. stderr={stderr.strip()[:160]!r}"
+            )
+            time.sleep(COPILOT_RETRY_BACKOFF_SEC)
+
+    wall = time.time() - t0
+    (log_dir / "stdout.jsonl").write_text(stdout or "")
+    (log_dir / "stderr.txt").write_text(stderr or "")
+    startup_failed = _is_transient_startup_failure(returncode, stdout, stderr) and not timed_out
+    return {
+        "stdout": stdout or "",
+        "stderr": stderr or "",
+        "returncode": returncode,
+        "timed_out": timed_out,
+        "startup_failed": startup_failed,
+        "wall": wall,
+    }
+
+
+def _kill_group(pid: int) -> None:
+    """Terminate a process and its process group, best-effort: SIGTERM first, then SIGKILL.
+
+    On macOS ``os.killpg`` can raise ``PermissionError`` (EPERM) when a child has changed
+    session/owner or is mid-reap. A recoverable timeout must never become a fatal exception, so
+    every signalling error is swallowed and delivery falls back to the direct pid.
+    """
+    try:
+        group = os.getpgid(pid)
+    except OSError:  # also covers ProcessLookupError / PermissionError (OSError subclasses)
+        group = None
+    for sig in (signal.SIGTERM, signal.SIGKILL):
+        delivered = False
+        if group is not None:
+            try:
+                os.killpg(group, sig)
+            except ProcessLookupError:
+                return  # nothing left in the group to signal
+            except OSError:
+                group = None  # stop using the group; signal the pid directly from now on
+            else:
+                delivered = True
+        if not delivered:
+            try:
+                os.kill(pid, sig)
+            except ProcessLookupError:
+                return
+            except OSError:
+                pass
+        time.sleep(2)
+
+
+# ---------------------------------------------------------------------------
+# Parsing: tokens (debug logs), premium / files (result event), tool calls
+# ---------------------------------------------------------------------------
+
+# Keys of a genuine Copilot model-response usage block. parse_tokens_from_logs
+# requires all of them so stray JSON (e.g. an MCP tool result or the server's own
+# stderr) can't be mis-counted as token usage.
+_USAGE_REQUIRED = ("prompt_tokens", "completion_tokens", "total_tokens", "prompt_tokens_details")
+
+
+def parse_tokens_from_logs(log_dir: Path) -> dict[str, int]:
+    """Sum token usage across every model-response block in this run's logs.
+
+    Copilot fans out multiple requests per turn; each writes a pretty-printed
+    ``"usage": { ... }`` block to ``process-*.log``. We sum them all (a null count
+    counts as 0). The log dir is per-run, so there is no cross-run contamination.
+    """
+    totals = {
+        "input_tokens": 0,
+        "output_tokens": 0,
+        "total_tokens": 0,
+        "cached_input_tokens": 0,
+        "cache_creation_tokens": 0,
+        "reasoning_tokens": 0,
+        "usage_blocks": 0,
+    }
+    for log in sorted(log_dir.glob("process-*.log")):
+        text = log.read_text(errors="replace")
+        for block in _iter_usage_blocks(text):
+            if not all(k in block for k in _USAGE_REQUIRED):
+                continue
+            totals["input_tokens"] += int(block.get("prompt_tokens") or 0)
+            totals["output_tokens"] += int(block.get("completion_tokens") or 0)
+            totals["total_tokens"] += int(block.get("total_tokens") or 0)
+            details = block.get("prompt_tokens_details") or {}
+            totals["cached_input_tokens"] += int(details.get("cached_tokens") or 0)
+            totals["cache_creation_tokens"] += int(details.get("cache_creation_tokens") or 0)
+            # Thinking tokens are a subset of completion_tokens; surfaced
+            # separately so the base (non-reasoning) output is comparable across
+            # arms even with thinking enabled.
+            cdetails = block.get("completion_tokens_details") or {}
+            totals["reasoning_tokens"] += int(cdetails.get("reasoning_tokens") or 0)
+            totals["usage_blocks"] += 1
+    return totals
+
+
+def _iter_usage_blocks(text: str):
+    """Yield parsed JSON objects for each ``"usage": {...}`` in the log text.
+
+    Each object is decoded in place, starting at its opening ``{``, with
+    ``json.JSONDecoder.raw_decode``. Unlike counting ``{``/``}`` characters,
+    the decoder understands string literals, so a brace inside a string value
+    cannot truncate or over-extend the block, and multi-line pretty-printed
+    blocks parse correctly.
+
+    Blocks that do not decode (for example a log line cut off mid-write) are
+    skipped silently; a ``"usage"`` key nested inside another usage block is
+    still yielded as its own object.
+    """
+    decoder = json.JSONDecoder()
+    for m in re.finditer(r'"usage"\s*:\s*\{', text):
+        start = m.end() - 1  # position of the opening brace
+        try:
+            block, _end = decoder.raw_decode(text, start)
+        except json.JSONDecodeError:
+            # Truncated or malformed object: nothing trustworthy to count.
+            continue
+        yield block
+
+
+def parse_result_event(stdout: str) -> dict[str, Any]:
+    """Extract premium-request count + files modified from the result event."""
+    out = {"premium_requests": 0, "files_modified": [], "is_error": None, "num_turns": None}
+    for line in stdout.splitlines():
+        try:
+            ev = json.loads(line.strip())  # blank lines fail to decode too
+        except json.JSONDecodeError:
+            continue
+        # Valid JSON that is not an object (number, string, list) is no event.
+        if not isinstance(ev, dict) or ev.get("type") != "result":
+            continue
+        data = ev.get("data", ev)
+        if not isinstance(data, dict):  # e.g. ``"data": null``
+            continue
+        usage = data.get("usage") or {}
+        out["premium_requests"] = int(usage.get("premiumRequests", 0) or 0)
+        code_changes = usage.get("codeChanges") or data.get("codeChanges") or {}
+        out["files_modified"] = list(code_changes.get("filesModified", []) or [])
+        out["is_error"] = data.get("isError")
+        out["num_turns"] = data.get("numTurns")
+    return out
+
+
+def parse_tool_calls(stdout: str) -> tuple[int, dict[str, int]]:
+    """Count tool invocations by name from execution-start events.
+
+    Returns ``(total, by_name)``. Lines that are not JSON objects are ignored;
+    an event without a ``name``/``toolName`` is counted under ``"unknown"``.
+    """
+    by_name: dict[str, int] = {}
+    for line in stdout.splitlines():
+        try:
+            ev = json.loads(line.strip())  # blank lines fail to decode too
+        except json.JSONDecodeError:
+            continue
+        if not isinstance(ev, dict):
+            continue  # valid JSON, but not an event object
+        # ``type`` may be absent or null; neither is an execution start.
+        if not str(ev.get("type") or "").startswith("tool.execution_start"):
+            continue
+        data = ev.get("data")
+        data = data if isinstance(data, dict) else {}
+        name = data.get("name") or data.get("toolName") or "unknown"
+        by_name[name] = by_name.get(name, 0) + 1
+    return sum(by_name.values()), by_name
+
+
+# A code-graph MCP tool call shows up with this prefix in the tool name
+# (e.g. ``code-graph-search_code``). Used for nudge-compliance metrics.
+_GRAPH_TOOL_PREFIX = "code-graph"
+
+
+def parse_tool_sequence(stdout: str) -> list[str]:
+    """Return tool names in invocation order (for first-tool / compliance).
+
+    Non-object JSON lines are ignored; a nameless event yields ``"unknown"``.
+    """
+    seq: list[str] = []
+    for line in stdout.splitlines():
+        try:
+            ev = json.loads(line.strip())  # blank lines fail to decode too
+        except json.JSONDecodeError:
+            continue
+        etype = ev.get("type") if isinstance(ev, dict) else None
+        if not str(etype or "").startswith("tool.execution_start"):
+            continue
+        data = ev.get("data") if isinstance(ev.get("data"), dict) else {}
+        seq.append(data.get("name") or data.get("toolName") or "unknown")
+    return seq
+
+
+def _tool_prefix_for_track(track: str) -> str:
+    """Map a track to the tool-name prefix of its MCP navigation server."""
+    # Only the LSP track has its own prefix; any other track gets the
+    # code-graph prefix.
+    return "lsp" if track == LSP else _GRAPH_TOOL_PREFIX
+
+
+def _is_graph_tool(name: str, prefix: str = _GRAPH_TOOL_PREFIX) -> bool:
+    return name.startswith(prefix) if name else False  # empty/None never matches
+
+
+def nudge_compliance(stdout: str, track: str = CODE_GRAPH) -> dict[str, Any]:
+    """Summarize how the agent used the track's MCP navigation tools.
+
+    Reports the first tool called, whether it was a nav tool, and the nav-call count.
+    """
+    nav_prefix = _tool_prefix_for_track(track)
+    names = parse_tool_sequence(stdout)
+    nav_flags = [_is_graph_tool(tool, nav_prefix) for tool in names]
+    opener = names[0] if names else None
+    opener_is_nav = bool(opener) and nav_flags[0]
+    return {"first_tool": opener, "first_is_graph": opener_is_nav, "graph_calls": sum(nav_flags)}
+
+
+# ---------------------------------------------------------------------------
+# Localization (LocAgent-style): extract the agent's predicted files
+# ---------------------------------------------------------------------------
+
+
+def extract_agent_text(stdout: str) -> str:
+    """Concatenate the agent's own message text (not tool output) in order.
+
+    Scans both ``assistant.message`` (finalized) and ``assistant.message_delta``
+    (streaming) so the sentinel is recoverable across CLI versions. Finalized
+    messages stream after their deltas, so the last sentinel occurrence (which
+    the parser keys on) lands in a complete message.
+    """
+    parts: list[str] = []
+    for line in stdout.splitlines():
+        try:
+            ev = json.loads(line.strip())  # blank lines fail to decode too
+        except json.JSONDecodeError:
+            continue
+        if not isinstance(ev, dict):
+            continue  # valid JSON, but not an event object
+        if ev.get("type") in ("assistant.message", "assistant.message_delta"):
+            data = ev.get("data")
+            content = data.get("content") if isinstance(data, dict) else None
+            if isinstance(content, str) and content.strip():
+                parts.append(content)
+    return "\n".join(parts)
+
+
+def _norm_path(path: str) -> str:
+    """Return *path* as a repo-root-relative posix path (quotes/prefixes dropped)."""
+    cleaned = path.strip().strip("'\"").strip().replace("\\", "/")
+    while cleaned[:2] == "./":
+        cleaned = cleaned[2:]
+    # Diff-style prefixes are removed in this order: ``a/`` first, then ``b/``.
+    for marker in ("a/", "b/"):
+        if cleaned[:2] == marker:
+            cleaned = cleaned[2:]
+    return cleaned.lstrip("/")
+
+
+def parse_localization(text: str) -> tuple[list[str], str | None, bool]:
+    """Parse the predicted file list from the agent's final message.
+
+    Returns ``(pred_files, parse_error, fallback)``. The strict path looks for
+    the ``FINAL_LOCALIZATION_JSON:`` sentinel followed by a JSON array. If the
+    sentinel is missing/malformed, ``fallback`` is True and ``parse_error``
+    carries the reason (headline numbers should drop / stratify these).
+    """
+    idx = text.rfind(LOCALIZE_SENTINEL)  # last occurrence of the sentinel wins
+    if idx == -1:
+        return [], "sentinel_missing", True
+    tail = text[idx + len(LOCALIZE_SENTINEL):]
+    start = tail.find("[")  # first "[" after the sentinel opens the array
+    if start == -1:
+        return [], "no_array", True
+    depth = 0  # bracket nesting level; brackets inside JSON strings count too
+    end = -1  # index of the matching "]"; stays -1 if it is never found
+    for i in range(start, len(tail)):
+        c = tail[i]
+        if c == "[":
+            depth += 1
+        elif c == "]":
+            depth -= 1
+            if depth == 0:
+                end = i
+                break
+    if end == -1:
+        return [], "unbalanced_array", True
+    blob = tail[start:end + 1]
+    try:
+        arr = json.loads(blob)
+    except json.JSONDecodeError as exc:
+        return [], f"json_error:{exc.msg}", True
+    if not isinstance(arr, list):
+        return [], "not_a_list", True
+    pred: list[str] = []
+    for item in arr:
+        if not isinstance(item, str):
+            continue  # non-string entries are dropped silently
+        norm = _norm_path(item)
+        if norm and norm not in pred:  # skip empties; de-duplicate, keep first-seen order
+            pred.append(norm)
+    return pred, None, False
+
+
+def score_localization(pred: list[str], gold: list[str]) -> dict[str, Any]:
+    """Compare ranked predictions against the gold files after normalization.
+
+    Recall, precision and ``file_all_found`` are set-based; acc@k and MRR
+    depend on the order of ``pred``. With no gold files every score is 0
+    and ``file_all_found`` is False.
+    """
+    ranked = [_norm_path(p) for p in pred]
+    wanted = {_norm_path(g) for g in gold}
+    unique_pred = set(ranked)
+    matched = wanted & unique_pred
+    recall = len(matched) / len(wanted) if wanted else 0.0
+    precision = len(matched) / len(unique_pred) if unique_pred else 0.0
+
+    def acc_at(k: int) -> float:
+        return 1.0 if wanted and not wanted.isdisjoint(ranked[:k]) else 0.0
+
+    # 1-based rank of the first gold hit, or None when nothing matches.
+    first_hit = next((i for i, p in enumerate(ranked, start=1) if p in wanted), None)
+    return {
+        "gold_files": sorted(wanted),
+        "pred_files": ranked,
+        "file_recall": round(recall, 4),
+        "file_precision": round(precision, 4),
+        "file_all_found": bool(wanted) and wanted <= unique_pred,
+        "acc_at_1": acc_at(1),
+        "acc_at_3": acc_at(3),
+        "acc_at_5": acc_at(5),
+        "file_mrr": round(1.0 / first_hit if first_hit else 0.0, 4),
+    }
+
+
+# ---------------------------------------------------------------------------
+# Patch extraction
+# ---------------------------------------------------------------------------
+
+
+def extract_patch(repo_path: Path, base_commit: str) -> dict[str, Any]:
+    """Capture all changes vs base as a single unified diff (junk-excluded).
+
+    ``git add -A`` then ``git diff --cached <base>`` captures committed, staged,
+    unstaged and untracked changes regardless of how Copilot left the tree.
+    Build/cache dirs are excluded via pathspec.
+    """
+    excludes = [f":(exclude){d}" for d in _PATCH_EXCLUDES]  # each excluded name at the repo root
+    excludes += [f":(exclude)*/{d}/*" for d in _PATCH_EXCLUDES]  # ...and below other directories
+    swe_bench._git(["add", "-A"], cwd=repo_path, check=False)  # stage everything, untracked included
+    res = swe_bench._git(
+        ["diff", "--cached", base_commit, "--", ".", *excludes],
+        cwd=repo_path,
+        check=False,  # NOTE(review): presumably tolerates a non-zero git exit -- confirm in swe_bench._git
+    )
+    patch = res.stdout  # the diff text as printed by git
+    files = _patched_files(patch)
+    touched_tests = any(not swe_bench.is_source_file(f) and f.endswith(".py") for f in files) or any(
+        swe_bench._TEST_PATH_RE.search(f) for f in files
+    )  # True for a non-source ``.py`` file or any path matching the test-path regex
+    return {"patch": patch, "patched_files": files, "touched_tests": touched_tests}
+
+
+def _patched_files(patch: str) -> list[str]:
+    """List the post-image paths named by ``+++ b/<path>`` lines, in order.
+
+    Lines such as ``+++ /dev/null`` (no ``b/`` prefix) contribute nothing.
+    """
+    return [row[len("+++ b/"):] for row in patch.splitlines() if row.startswith("+++ b/")]
+
+
+# ---------------------------------------------------------------------------
+# Per-instance driver
+# ---------------------------------------------------------------------------
+
+
+def _resolve_run_dir(
+    cache_dir: Path,
+    *,
+    model: str,
+    mode: str,
+    prompt_mode: str,
+    track: str,
+    instance_id: str,
+    run_idx: int,
+) -> Path:
+    """Return the directory that holds one trajectory's artifacts.
+
+    Layout is ``<cache>/runs/<model>/<mode>/<prompt_mode>/<track>/<instance>``.
+    Repeats (run_idx>0) get an extra ``run<idx>`` level so their logs are not
+    overwritten; run_idx==0 keeps the bare layout for backwards-compat with
+    existing single-run caches.
+    """
+    segments = ["runs", model, mode, prompt_mode, track, instance_id]
+    if run_idx > 0:
+        segments.append(f"run{run_idx}")
+    return cache_dir.joinpath(*segments)
+
+
+def _compute_prompt_mode(
+    *, adopt_arm: str | None, nudge: bool, inject_label: str | None = None
+) -> str:
+    """Derive the prompt_mode label shared by main() and run_one().
+
+    The base is ``adopt-<arm>`` when an adoption arm is set, otherwise
+    ``nudged``/``neutral`` according to ``nudge``. The NOISY/GRAPH-WRONG
+    distractor condition is orthogonal to the prompt arm, so a non-empty
+    ``inject_label`` is appended as a suffix (e.g. ``adopt-sem-noisy``); CLEAN
+    runs carry no suffix and keep the plain arm prompt_mode.
+    """
+    if adopt_arm is None:
+        base = "nudged" if nudge else "neutral"
+    else:
+        base = f"adopt-{adopt_arm}"
+    return f"{base}-{inject_label}" if inject_label else base
+
+
+def _inject_env(
+    inst: swe_bench.SweBenchInstance,
+    *,
+    inject_manifest: Path | None,
+    inject_k: int | None,
+) -> dict[str, str] | None:
+    """Return the environment overrides that switch on NOISY injection.
+
+    ``BENCH_NOISY_MANIFEST`` is the manifest path and ``BENCH_NOISY_TASK`` pins
+    this instance's id; ``BENCH_NOISY_K`` is set only when ``inject_k`` is
+    given. Returns None when no manifest is supplied (injection off).
+    """
+    if inject_manifest is None:
+        return None
+    overrides = dict(
+        BENCH_NOISY_MANIFEST=str(inject_manifest),
+        BENCH_NOISY_TASK=inst.instance_id,
+    )
+    if inject_k is not None:
+        overrides["BENCH_NOISY_K"] = str(inject_k)
+    return overrides
+
+
+def run_one(
+    inst: swe_bench.SweBenchInstance,
+    *,
+    track: str,
+    model: str,
+    cache_dir: Path,
+    wall_time: float,
+    server_root: Path,
+    run_idx: int = 0,
+    nudge: bool = False,
+    mode: str = FIX,
+    adopt_arm: str | None = None,
+    inject_manifest: Path | None = None,
+    inject_label: str | None = None,
+    inject_k: int | None = None,
+) -> dict[str, Any]:
+    if adopt_arm is not None and (track != CODE_GRAPH or mode != LOCALIZE):
+        raise ValueError(
+            f"adopt_arm={adopt_arm!r} requires track={CODE_GRAPH} and mode={LOCALIZE}; "
+            f"got track={track!r} mode={mode!r}"
+        )
+    if adopt_arm is not None and adopt_arm not in ADOPT_ARMS:
+        raise ValueError(f"unknown adopt_arm={adopt_arm!r}; expected one of {ADOPT_ARMS}")
+    if inject_manifest is not None:
+        if track != CODE_GRAPH or mode != LOCALIZE:
+            raise ValueError(
+                f"inject_manifest requires track={CODE_GRAPH} and mode={LOCALIZE}; "
+                f"got track={track!r} mode={mode!r}"
+            )
+        if not inject_label:
+            raise ValueError("inject_manifest requires a non-empty inject_label")
+    prompt_mode = _compute_prompt_mode(
+        adopt_arm=adopt_arm, nudge=nudge, inject_label=inject_label
+    )
+    work_root = cache_dir / "worktrees" / track
+    work_root.mkdir(parents=True, exist_ok=True)
+    run_dir = _resolve_run_dir(
+        cache_dir,
+        model=model,
+        mode=mode,
+        prompt_mode=prompt_mode,
+        track=track,
+        instance_id=inst.instance_id,
+        run_idx=run_idx,
+    )
+    if run_dir.exists():
+        shutil.rmtree(run_dir)
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"\n=== {inst.instance_id} [{track}] model={model} mode={mode} prompt={prompt_mode} ===")
+
+    # Common base row fields (identity).
+    base_row = {
+        "benchmark": "swe_bench_verified",
+        "task_id": inst.instance_id,
+        "config": track,
+        "model": model,
+        "mode": mode,
+        "prompt_mode": prompt_mode,
+        "run_idx": run_idx,
+        "runner": RUNNER_VERSION,
+    }
+
+    if mode == LOCALIZE:
+        return _run_localize(
+            inst, track=track, model=model, run_dir=run_dir, work_root=work_root,
+            wall_time=wall_time, server_root=server_root, nudge=nudge, base_row=base_row,
+            adopt_arm=adopt_arm, inject_manifest=inject_manifest, inject_k=inject_k,
+        )
+
+    repo_path = swe_bench.prepare_worktree(
+        inst, worktrees_dir=work_root.resolve(), apply_test_patch=True
+    )
+
+    index_sec = None
+    mcp_config = None
+    if track == CODE_GRAPH:
+        index_sec = ensure_indexed(repo_path, fresh=True)
+        host, port = _falkor_settings()
+        wrapper = _write_mcp_wrapper(run_dir, server_root)
+        mcp_config = _write_mcp_config(run_dir, wrapper, host, port)
+    elif track == LSP:
+        wrapper = _write_lsp_wrapper(run_dir, repo_path)
+        mcp_config = _write_lsp_mcp_config(run_dir, wrapper)
+
+    prompt = build_prompt(
+        track, repo_path, inst.problem_statement, repo_path.name, nudge=nudge, mode=mode
+    )
+    (run_dir / "prompt.txt").write_text(prompt)
+
+    result = run_copilot(
+        prompt=prompt,
+        model=model,
+        cwd=repo_path,
+        log_dir=run_dir / "logs",
+        mcp_config=mcp_config,
+        wall_time=wall_time,
+    )
+
+    tokens = parse_tokens_from_logs(run_dir / "logs")
+    result_ev = parse_result_event(result["stdout"])
+    tool_total, tool_by_name = parse_tool_calls(result["stdout"])
+    compliance = nudge_compliance(result["stdout"], track)
+    patch_info = extract_patch(repo_path, inst.base_commit)
+
+    if result.get("startup_failed"):
+        print(
+            f"[error] {inst.instance_id} [{track}] copilot startup failed after "
+            f"{COPILOT_MAX_ATTEMPTS} attempts (rc={result['returncode']}); "
+            f"marking incomplete for re-run"
+        )
+        return {
+            **base_row,
+            "index_sec": index_sec,
+            "timed_out": result["timed_out"],
+            "returncode": result["returncode"],
+            "outcome": "error",
+            "error": f"copilot_startup_failed: {result.get('stderr', '').strip()[:200]}",
+            "wall_clock_sec": round(result["wall"], 2),
+            "completed": False,
+        }
+
+    row = {
+        **base_row,
+        "input_tokens": tokens["input_tokens"],
+        "output_tokens": tokens["output_tokens"],
+        "total_tokens": tokens["total_tokens"],
+        "cached_input_tokens": tokens["cached_input_tokens"],
+        "cache_creation_tokens": tokens["cache_creation_tokens"],
+        "usage_blocks": tokens["usage_blocks"],
+        "premium_requests": result_ev["premium_requests"],
+        "tool_calls_total": tool_total,
+        "tool_calls_by_name": tool_by_name,
+        "first_tool": compliance["first_tool"],
+        "first_is_graph": compliance["first_is_graph"],
+        "graph_calls": compliance["graph_calls"],
+        "files_modified": result_ev["files_modified"],
+        "touched_tests": patch_info["touched_tests"],
+        "index_sec": index_sec,
+        "timed_out": result["timed_out"],
+        "returncode": result["returncode"],
+        "outcome": "ungraded",
+        "patch": patch_info["patch"],
+        "wall_clock_sec": round(result["wall"], 2),
+        "completed": True,
+        **hardening_meta(repo_path, result["stdout"], tokens["reasoning_tokens"]),
+    }
+    _maybe_write_trace(run_dir, row)
+    print(
+        f"[done] {inst.instance_id} [{track}] in={row['input_tokens']} "
+        f"out={row['output_tokens']} premium={row['premium_requests']} "
+        f"tools={tool_total} graph={compliance['graph_calls']} "
+        f"patch_files={len(patch_info['patched_files'])} "
+        f"timed_out={result['timed_out']} wall={row['wall_clock_sec']}s"
+    )
+    return row
+
+
+def _run_localize(
+    inst: swe_bench.SweBenchInstance,
+    *,
+    track: str,
+    model: str,
+    run_dir: Path,
+    work_root: Path,
+    wall_time: float,
+    server_root: Path,
+    nudge: bool,
+    base_row: dict[str, Any],
+    adopt_arm: str | None = None,
+    inject_manifest: Path | None = None,
+    inject_k: int | None = None,
+) -> dict[str, Any]:
+    """Localization driver: no edits, no Docker; score predicted files vs gold."""
+    gold = swe_bench.gold_changed_files(inst.patch, source_only=True)
+    if not gold:
+        print(f"[skip] {inst.instance_id} [{track}] no source-only gold files")
+        return {
+            **base_row,
+            "outcome": "skipped_no_gold",
+            "completed": True,
+            "gold_files": [],
+        }
+
+    # Distinct, test-free worktree forces a clean re-index with no test_patch
+    # leakage into the graph.
+    repo_path = swe_bench.prepare_localize_worktree(
+        inst, worktrees_dir=work_root.resolve()
+    )
+
+    index_sec = None
+    mcp_config = None
+    if track == CODE_GRAPH:
+        index_sec = ensure_indexed(repo_path, fresh=True)
+        host, port = _falkor_settings()
+        wrapper = _write_mcp_wrapper(run_dir, server_root)
+        extra_env = _inject_env(inst, inject_manifest=inject_manifest, inject_k=inject_k)
+        mcp_config = _write_mcp_config(run_dir, wrapper, host, port, extra_env=extra_env)
+    elif track == LSP:
+        wrapper = _write_lsp_wrapper(run_dir, repo_path)
+        mcp_config = _write_lsp_mcp_config(run_dir, wrapper)
+
+    prompt = build_prompt(
+        track, repo_path, inst.problem_statement, repo_path.name,
+        nudge=nudge, mode=LOCALIZE, adopt_arm=adopt_arm,
+    )
+    (run_dir / "prompt.txt").write_text(prompt)
+
+    result = run_copilot(
+        prompt=prompt,
+        model=model,
+        cwd=repo_path,
+        log_dir=run_dir / "logs",
+        mcp_config=mcp_config,
+        wall_time=wall_time,
+    )
+
+    tokens = parse_tokens_from_logs(run_dir / "logs")
+    result_ev = parse_result_event(result["stdout"])
+    tool_total, tool_by_name = parse_tool_calls(result["stdout"])
+    compliance = nudge_compliance(result["stdout"], track)
+
+    agent_text = extract_agent_text(result["stdout"])
+    (run_dir / "agent_text.txt").write_text(agent_text)
+    pred, parse_error, fallback = parse_localization(agent_text)
+    scores = score_localization(pred, gold)
+    leak = swe_bench.leakage_flags(inst, gold)
+
+    # A transient startup/network failure produces no model output; record it as
+    # an error (completed=False) so it is re-run rather than scored as recall=0.
+    if result.get("startup_failed"):
+        print(
+            f"[error] {inst.instance_id} [{track}] copilot startup failed after "
+            f"{COPILOT_MAX_ATTEMPTS} attempts (rc={result['returncode']}); "
+            f"marking incomplete for re-run"
+        )
+        return {
+            **base_row,
+            "index_sec": index_sec,
+            "index_fresh": track == CODE_GRAPH,
+            "timed_out": result["timed_out"],
+            "returncode": result["returncode"],
+            "outcome": "error",
+            "error": f"copilot_startup_failed: {result.get('stderr', '').strip()[:200]}",
+            "wall_clock_sec": round(result["wall"], 2),
+            "completed": False,
+        }
+
+    row = {
+        **base_row,
+        "input_tokens": tokens["input_tokens"],
+        "output_tokens": tokens["output_tokens"],
+        "total_tokens": tokens["total_tokens"],
+        "cached_input_tokens": tokens["cached_input_tokens"],
+        "cache_creation_tokens": tokens["cache_creation_tokens"],
+        "usage_blocks": tokens["usage_blocks"],
+        "premium_requests": result_ev["premium_requests"],
+        "tool_calls_total": tool_total,
+        "tool_calls_by_name": tool_by_name,
+        "first_tool": compliance["first_tool"],
+        "first_is_graph": compliance["first_is_graph"],
+        "graph_calls": compliance["graph_calls"],
+        "index_sec": index_sec,
+        "index_fresh": track == CODE_GRAPH,
+        "timed_out": result["timed_out"],
+        "returncode": result["returncode"],
+        "parse_error": parse_error,
+        "parse_fallback": fallback,
+        "is_structural": swe_bench.is_structural(inst),
+        "mentions_gold_path": leak.get("mentions_gold_path"),
+        "mentions_gold_basename": leak.get("mentions_gold_basename"),
+        "contains_traceback": leak.get("contains_traceback"),
+        "outcome": "localized",
+        "wall_clock_sec": round(result["wall"], 2),
+        "completed": True,
+        **scores,
+        **hardening_meta(repo_path, result["stdout"], tokens["reasoning_tokens"]),
+    }
+    _maybe_write_trace(run_dir, row)
+    print(
+        f"[loc] {inst.instance_id} [{track}] recall={scores['file_recall']} "
+        f"acc@1={scores['acc_at_1']} mrr={scores['file_mrr']} "
+        f"pred={len(pred)} gold={len(scores['gold_files'])} "
+        f"graph={compliance['graph_calls']} parse_err={parse_error} "
+        f"in={row['input_tokens']} wall={row['wall_clock_sec']}s"
+    )
+    return row
+
+
+def _maybe_write_trace(run_dir: Path, row: dict[str, Any]) -> None:
+    """Best-effort decision-loop trace extraction; never break a run on error."""
+    try:
+        from bench.analysis.trace import extract_run  # imported inside the try so an import failure is swallowed too
+
+        extract_run(run_dir, row=row, write=True)
+    except Exception as exc:  # noqa: BLE001 - trace is diagnostic, not critical
+        print(f"[trace] extraction failed for {run_dir.name}: {exc}")  # report and carry on
+
+
+# ---------------------------------------------------------------------------
+# Resume / IO
+# ---------------------------------------------------------------------------
+
+
+def _load_done(results_path: Path) -> set[tuple]:
+    done: set[tuple] = set()  # resume keys of rows that are already complete
+    if not results_path.exists():
+        return done  # no results file yet: nothing to skip
+    for line in results_path.read_text().splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            r = json.loads(line)
+        except json.JSONDecodeError:
+            continue  # unparseable line: ignore it
+        if r.get("completed") and r.get("runner") == RUNNER_VERSION:  # completed rows from this runner version only
+            done.add((  # same field order as the key built in main()
+                r["task_id"],
+                r["config"],
+                r.get("model", ""),
+                r.get("mode", FIX),
+                r.get("prompt_mode", "neutral"),
+                int(r.get("run_idx", 0)),
+            ))
+    return done
+
+
+def _append_row(results_path: Path, row: dict[str, Any]) -> None:
+    results_path.parent.mkdir(parents=True, exist_ok=True)  # the directory may not exist on the first write
+    with results_path.open("a") as f:  # append mode: existing rows are left in place
+        f.write(json.dumps(row) + "\n")  # one JSON object per line (JSONL)
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def _load_instance_ids(args) -> list[str]:
+    """Resolve explicit instance ids from the CLI args (file wins over --instance).
+
+    Blank lines and ``#`` comment lines (even indented ones) in the file are skipped.
+    """
+    if args.instances_file:
+        lines = (ln.strip() for ln in Path(args.instances_file).read_text().splitlines())
+        return [ln for ln in lines if ln and not ln.startswith("#")]
+    if args.instance:
+        return list(args.instance)
+    if args.select_structural:
+        return []  # resolved later against the loaded dataset
+    raise SystemExit(
+        "provide --instance ID [ID ...], --instances-file FILE, or --select-structural N"
+    )
+
+
+def main(argv: list[str] | None = None) -> int:
+    p = argparse.ArgumentParser(description="Drive Copilot CLI over SWE-bench Verified.")
+    p.add_argument("--instance", nargs="*", help="explicit instance id(s)")
+    p.add_argument("--instances-file", help="file with one instance id per line")
+    p.add_argument(
+        "--select-structural", type=int, default=0,
+        help="auto-select N structural instances (>=2 source files/dirs) for localization",
+    )
+    p.add_argument(
+        "--track", action="append", choices=VALID_TRACKS, default=None,
+        help="track(s) to run (default: both)",
+    )
+    p.add_argument("--model", default="claude-opus-4.8")
+    p.add_argument("--mode", choices=VALID_MODES, default=FIX, help="fix or localize")
+    p.add_argument(
+        "--nudge", action="store_true",
+        help="use the nudged prompt variant (forces structured search-first)",
+    )
+    p.add_argument(
+        "--adopt-arm", choices=ADOPT_ARMS, default=None,
+        help="Lane 1 adoption-calibration arm (code_graph + localize only): "
+             "ctrl (=nudge base), sem (edge-semantics clause), rat (keep/drop step)",
+    )
+    p.add_argument(
+        "--inject-manifest", default=None,
+        help="path to a NOISY/GRAPH-WRONG distractor manifest (code_graph + localize only); "
+             "enables server-side injection of verified non-gold candidates",
+    )
+    p.add_argument(
+        "--inject-label", default=None,
+        help="condition label suffixed onto prompt_mode when injecting (e.g. 'noisy', 'gwrong')",
+    )
+    p.add_argument(
+        "--inject-k", type=int, default=None,
+        help="override number of distractors to inject (default: manifest k)",
+    )
+    p.add_argument("--cache-dir", default=str(DEFAULT_CACHE))
+    p.add_argument("--results", default=None, help="results jsonl (default: <cache>/<model>/results.jsonl)")
+    p.add_argument("--wall-time", type=float, default=1200.0, help="per-run wall-clock seconds")
+    p.add_argument("--server-root", default=str(DEFAULT_MCP_SERVER_ROOT))
+    p.add_argument("--run-idx", type=int, default=0)
+    p.add_argument("--seed", type=int, default=swe_bench.DEFAULT_SEED, help="seed for --select-structural")
+    p.add_argument(
+        "--dataset", default=None,
+        help="HuggingFace dataset name (default: SWE-bench_Verified). "
+             "Use 'loc-bench' shorthand or a full id like czlll/Loc-Bench_V1.",
+    )
+    p.add_argument(
+        "--no-leak", action="store_true",
+        help="with --select-structural: drop instances whose problem statement names a gold file (structural-hard gate)",
+    )
+    args = p.parse_args(argv)
+
+    tracks = args.track or list(VALID_TRACKS)
+    if args.adopt_arm is not None:
+        # Lane 1 arms are code_graph + localize only; pin the track/mode so the
+        # dedup key, run_dir and prompt all agree with run_one's guard.
+        if args.mode != LOCALIZE:
+            raise SystemExit(f"--adopt-arm requires --mode {LOCALIZE}")
+        if tracks != [CODE_GRAPH]:
+            raise SystemExit(f"--adopt-arm requires --track {CODE_GRAPH} (only)")
+    cache_dir = Path(args.cache_dir).resolve()
+    inject_manifest: Path | None = None
+    if args.inject_manifest is not None:
+        if args.mode != LOCALIZE:
+            raise SystemExit(f"--inject-manifest requires --mode {LOCALIZE}")
+        if tracks != [CODE_GRAPH]:
+            raise SystemExit(f"--inject-manifest requires --track {CODE_GRAPH} (only)")
+        if not args.inject_label:
+            raise SystemExit("--inject-manifest requires --inject-label")
+        inject_manifest = Path(args.inject_manifest).resolve()
+        if not inject_manifest.is_file():
+            raise SystemExit(f"--inject-manifest not found: {inject_manifest}")
+    results_path = (
+        Path(args.results)
+        if args.results
+        else cache_dir / args.model / "results.jsonl"
+    )
+    server_root = Path(args.server_root)
+    # Only suffix prompt_mode when injection is actually active.
+    effective_inject_label = args.inject_label if inject_manifest is not None else None
+    prompt_mode = _compute_prompt_mode(
+        adopt_arm=args.adopt_arm, nudge=args.nudge, inject_label=effective_inject_label,
+    )
+
+    ids = _load_instance_ids(args)
+    dataset_name = args.dataset
+    if dataset_name and dataset_name.lower() in ("loc-bench", "locbench"):
+        dataset_name = swe_bench.LOC_BENCH_DATASET
+    all_insts = {i.instance_id: i for i in swe_bench.load_instances(dataset_name=dataset_name)}
+    if ids:
+        missing = [i for i in ids if i not in all_insts]
+        if missing:
+            raise SystemExit(f"unknown instance ids: {missing}")
+        insts = [all_insts[i] for i in ids]
+    else:
+        insts = swe_bench.select_structural(
+            list(all_insts.values()), seed=args.seed, n=args.select_structural,
+            python_only=True, no_leak=args.no_leak,
+        )
+        print(f"[plan] selected {len(insts)} structural instances: "
+              f"{[i.instance_id for i in insts]}")
+
+    done = _load_done(results_path)
+    print(f"[plan] {len(insts)} instances x {len(tracks)} tracks; mode={args.mode} "
+          f"prompt={prompt_mode}; {len(done)} rows already complete; results -> {results_path}")
+
+    for inst in insts:
+        for track in tracks:
+            key = (inst.instance_id, track, args.model, args.mode, prompt_mode, args.run_idx)
+            if key in done:
+                print(f"[skip] {inst.instance_id} [{track}] already complete")
+                continue
+            try:
+                row = run_one(
+                    inst,
+                    track=track,
+                    model=args.model,
+                    cache_dir=cache_dir,
+                    wall_time=args.wall_time,
+                    server_root=server_root,
+                    run_idx=args.run_idx,
+                    nudge=args.nudge,
+                    mode=args.mode,
+                    adopt_arm=args.adopt_arm,
+                    inject_manifest=inject_manifest,
+                    inject_label=effective_inject_label,
+                    inject_k=args.inject_k,
+                )
+            except Exception as exc:  # noqa: BLE001
+                print(f"[error] {inst.instance_id} [{track}]: {exc!r}", file=sys.stderr)
+                traceback.print_exc()
+                row = {
+                    "benchmark": "swe_bench_verified",
+                    "task_id": inst.instance_id,
+                    "config": track,
+                    "model": args.model,
+                    "mode": args.mode,
+                    "prompt_mode": prompt_mode,
+                    "run_idx": args.run_idx,
+                    "runner": RUNNER_VERSION,
+                    "outcome": "error",
+                    "error": repr(exc),
+                    "patch": "",
+                    "completed": False,
+                }
+            _append_row(results_path, row)
+
+    print("[plan] done")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/bench/runners/copilot_tco.py b/bench/runners/copilot_tco.py
new file mode 100644
index 00000000..628f0cdd
--- /dev/null
+++ b/bench/runners/copilot_tco.py
@@ -0,0 +1,196 @@
+"""Total-cost-of-ownership (TCO) accounting for the Copilot benchmark tracks.
+
+Copilot's own token accounting captures only the **agent model** spend. For the
+``code_graph`` track the *true* cost has three more components that the agent's
+token count never sees:
+
+    1. **Indexing** -- one-time CPU to build the FalkorDB graph. With the
+       tree-sitter resolver (``CODE_GRAPH_PY_RESOLVER=tree_sitter``) this is
+       **LLM-free**, so it is pure compute that amortizes across every later
+       query of the same repo. We report it as wall-seconds (and an optional
+       compute-$ estimate), never blended into the per-task model cost.
+    2. **FalkorDB hosting** -- a standing graph DB. Amortizable infra, reported
+       as a flat note, not a per-task charge.
+    3. **GraphRAG ``ask`` side-LLM** -- the ONLY code-graph tool that calls an
+       LLM (NL->Cypher via ``MODEL_NAME``, default gemini-flash-lite). The
+       headline tracks exclude ``ask`` so this is normally **$0**; if a run does
+       call it, ``graphrag_*`` fields on the row meter it and it is added here.
+
+So the headline takeaway: with ``ask`` excluded and tree-sitter indexing, the
+code-graph track adds **zero per-task side-LLM cost** over the no-MCP control --
+its only delta is amortizable infra. This module makes that explicit and prices
+any ``ask`` usage when present.
+
+Usage:
+    uv run python -m bench.runners.copilot_tco --results <results.jsonl>
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+# List price per 1M tokens (USD): input / output. Illustrative (non-billing).
+AGENT_PRICING = {
+    "opus": (15.0, 75.0),
+    "sonnet": (3.0, 15.0),
+    "haiku": (0.80, 4.0),
+}
+
+# GraphRAG `ask` underlying model price per 1M tokens (input/output).
+# Default MODEL_NAME is gemini/gemini-flash-lite-latest. Illustrative.
+GRAPHRAG_PRICING = {
+    "gemini-flash-lite": (0.075, 0.30),
+    "gemini-flash": (0.15, 0.60),
+}
+DEFAULT_GRAPHRAG_MODEL = "gemini-flash-lite"
+
+# Rough on-demand compute price for indexing wall-time (1 vCPU-hour). Illustrative.
+INDEX_CPU_USD_PER_HOUR = 0.05
+
+
+def agent_key(model: str) -> str:
+    """Resolve a Copilot model id (or shorthand) to its AGENT_PRICING tier.
+
+    Matching is a case-insensitive substring test tried in the order
+    opus, sonnet, haiku. An id that names none of the tiers is priced as
+    ``"sonnet"``.
+    """
+    lowered = model.lower()
+    for tier in ("opus", "sonnet", "haiku"):
+        if tier in lowered:
+            return tier
+    return "sonnet"
+
+
+def agent_cost_usd(in_tok: int, out_tok: int, model: str) -> float:
+    """Price agent-model token usage in USD.
+
+    ``AGENT_PRICING`` holds ``(input, output)`` USD rates per 1M tokens; the
+    tier is selected with ``agent_key(model)``.
+    """
+    input_rate, output_rate = AGENT_PRICING[agent_key(model)]
+    input_usd = in_tok / 1e6 * input_rate
+    output_usd = out_tok / 1e6 * output_rate
+    return input_usd + output_usd
+
+
+def graphrag_cost_usd(in_tok: int, out_tok: int, model: str = DEFAULT_GRAPHRAG_MODEL) -> float:
+    """Price GraphRAG ``ask`` side-LLM token usage in USD.
+
+    ``GRAPHRAG_PRICING`` holds ``(input, output)`` USD rates per 1M tokens.
+    A ``model`` that is not in the table is priced at the default model's
+    rates.
+    """
+    rates = GRAPHRAG_PRICING.get(model)
+    if rates is None:
+        rates = GRAPHRAG_PRICING[DEFAULT_GRAPHRAG_MODEL]
+    input_rate, output_rate = rates
+    return in_tok / 1e6 * input_rate + out_tok / 1e6 * output_rate
+
+
+def index_cost_usd(index_sec: float | None) -> float:
+    """Estimate the USD compute cost of ``index_sec`` seconds of indexing.
+
+    A missing (``None``) or zero duration costs ``0.0``; otherwise the
+    seconds are converted to hours and priced at ``INDEX_CPU_USD_PER_HOUR``.
+    """
+    if index_sec:
+        return index_sec / 3600.0 * INDEX_CPU_USD_PER_HOUR
+    return 0.0
+
+
+def _row_costs(row: dict[str, Any]) -> dict[str, Any]:
+    """Parse one result row into token counts and *unrounded* USD components.
+
+    Shared by ``row_tco`` (which rounds for display) and ``aggregate`` (which
+    must sum at full precision). Missing or null numeric fields count as 0;
+    a missing or null ``model`` falls back to ``"claude-sonnet-4.6"``.
+    """
+    model = row.get("model")
+    if model is None:
+        model = "claude-sonnet-4.6"
+    in_tok = int(row.get("input_tokens", 0) or 0)
+    out_tok = int(row.get("output_tokens", 0) or 0)
+    g_in = int(row.get("graphrag_input_tokens", 0) or 0)
+    g_out = int(row.get("graphrag_output_tokens", 0) or 0)
+    return {
+        "model": model,
+        "in_tok": in_tok,
+        "out_tok": out_tok,
+        "g_in": g_in,
+        "g_out": g_out,
+        "g_calls": int(row.get("graphrag_ask_calls", 0) or 0),
+        "premium": int(row.get("premium_requests", 0) or 0),
+        "agent_usd": agent_cost_usd(in_tok, out_tok, model),
+        # No per-row GraphRAG model id is read here, so `ask` usage is
+        # priced at graphrag_cost_usd's default model.
+        "graphrag_usd": graphrag_cost_usd(g_in, g_out) if (g_in or g_out) else 0.0,
+        "index_usd": index_cost_usd(row.get("index_sec")),
+    }
+
+
+def row_tco(row: dict[str, Any]) -> dict[str, Any]:
+    """Full TCO breakdown for a single result row.
+
+    Returns a flat dict of token counts, call counts and USD figures. USD
+    values are rounded to 4 decimals for display; ``per_task_tco_usd`` is
+    rounded once from the unrounded agent + ``ask`` sum.
+    """
+    c = _row_costs(row)
+    return {
+        "task_id": row.get("task_id"),
+        "config": row.get("config"),
+        "model": c["model"],
+        "agent_input_tokens": c["in_tok"],
+        "agent_output_tokens": c["out_tok"],
+        "agent_usd": round(c["agent_usd"], 4),
+        "premium_requests": c["premium"],
+        "graphrag_ask_calls": c["g_calls"],
+        "graphrag_tokens": c["g_in"] + c["g_out"],
+        "graphrag_usd": round(c["graphrag_usd"], 4),
+        "index_sec": row.get("index_sec"),
+        "index_usd_amortized_once": round(c["index_usd"], 4),
+        # Per-task TCO = agent model + any ask side-LLM. Indexing is reported
+        # separately because it amortizes across all queries of the repo.
+        "per_task_tco_usd": round(c["agent_usd"] + c["graphrag_usd"], 4),
+    }
+
+
+def aggregate(rows: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
+    """Summarize completed rows per ``config`` (track).
+
+    Rows whose ``completed`` field is falsy are ignored. Rows with a missing
+    or empty ``config`` are grouped under ``"?"`` so the result keys are
+    always strings (sortable and formattable by the table printer).
+
+    Totals are summed from unrounded per-row costs and rounded only once at
+    the end: summing values already rounded to 4 decimals would drop every
+    component below $0.00005 per row (typical for ``ask`` and indexing) and
+    accumulate rounding error across rows.
+    """
+    by: dict[str, list[dict[str, Any]]] = defaultdict(list)
+    for r in rows:
+        if not r.get("completed"):
+            continue
+        by[r.get("config") or "?"].append(r)
+
+    out: dict[str, dict[str, Any]] = {}
+    for cfg, crows in by.items():
+        # Every group holds at least one row, so the mean below is safe.
+        n = len(crows)
+        costs = [_row_costs(r) for r in crows]
+        agent = sum(c["agent_usd"] for c in costs)
+        graphrag = sum(c["graphrag_usd"] for c in costs)
+        index = sum(c["index_usd"] for c in costs)
+        per_task = agent + graphrag
+        out[cfg] = {
+            "n": n,
+            "resolved": sum(1 for r in crows if r.get("outcome") == "resolved"),
+            "agent_usd": round(agent, 2),
+            "graphrag_ask_calls": sum(c["g_calls"] for c in costs),
+            "graphrag_usd": round(graphrag, 4),
+            "index_usd_one_time": round(index, 4),
+            "premium_requests": sum(c["premium"] for c in costs),
+            "per_task_tco_usd_sum": round(per_task, 2),
+            "per_task_tco_usd_mean": round(per_task / n, 4),
+        }
+    return out
+
+
+def _load(path: Path) -> list[dict[str, Any]]:
+    """Read a JSONL results file into a list of rows, ignoring blank lines."""
+    stripped = (raw.strip() for raw in path.read_text().splitlines())
+    return [json.loads(text) for text in stripped if text]
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: print the per-track TCO summary of a results file.
+
+    Loads the JSONL file given by ``--results``, aggregates it per track and
+    prints either the raw aggregate as JSON (``--json``) or a fixed-width
+    table followed by explanatory notes. Returns the process exit code 0.
+    """
+    parser = argparse.ArgumentParser(description="TCO accounting for Copilot benchmark runs.")
+    parser.add_argument("--results", required=True, help="results jsonl")
+    parser.add_argument("--json", action="store_true", help="emit JSON instead of a table")
+    opts = parser.parse_args(argv)
+
+    results_path = Path(opts.results)
+    summary = aggregate(_load(results_path))
+
+    if opts.json:
+        print(json.dumps(summary, indent=2))
+        return 0
+
+    print(f"\nTCO by track  ({results_path.name})\n")
+    # Header and rows are built cell-by-cell so the column widths line up.
+    header = " | ".join(
+        (
+            f"{'track':>16}",
+            f"{'n':>3}",
+            f"{'resolved':>8}",
+            f"{'agent $':>9}",
+            f"{'ask calls':>9}",
+            f"{'ask $':>7}",
+            f"{'index $ (1x)':>12}",
+            f"{'premium':>7}",
+            f"{'TCO $/task':>10}",
+        )
+    )
+    print(header)
+    print("-" * len(header))
+    for track in sorted(summary):
+        stats = summary[track]
+        cells = (
+            f"{track:>16}",
+            f"{stats['n']:>3}",
+            f"{stats['resolved']:>8}",
+            f"{stats['agent_usd']:>9.2f}",
+            f"{stats['graphrag_ask_calls']:>9}",
+            f"{stats['graphrag_usd']:>7.4f}",
+            f"{stats['index_usd_one_time']:>12.4f}",
+            f"{stats['premium_requests']:>7}",
+            f"{stats['per_task_tco_usd_mean']:>10.4f}",
+        )
+        print(" | ".join(cells))
+    notes = (
+        "\nNotes: agent $ = Copilot model tokens (list price). ask $ = GraphRAG "
+        "side-LLM (0 when `ask` excluded). index $ = one-time tree-sitter "
+        "indexing CPU, amortizes across all queries (LLM-free). FalkorDB hosting "
+        "is standing infra, not charged per task. Premium requests are Copilot's "
+        "real billing unit -- reported separately, never blended into $."
+    )
+    print(notes)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/bench/runners/localize_runner.py b/bench/runners/localize_runner.py
new file mode 100644
index 00000000..fc37f9d3
--- /dev/null
+++ b/bench/runners/localize_runner.py
@@ -0,0 +1,576 @@
+"""LocAgent-style code-localization benchmark.
+
+Where the full SWE-bench *fix* task ties all configs on accuracy (because
+fixes are localized and grep suffices), this benchmark isolates the
+**navigation** problem: given only the issue text, the agent must name the
+source file(s) that need to change — without editing anything. We then score
+file-level localization (recall / precision / Acc@k / MRR) and the token /
+command cost each tool incurs to get there.
+
+Design (see plan.md 2026-05-30 23:30):
+  * Test-free worktree under a distinct name `{id}__loc` -> a FRESH FalkorDB
+    index that does NOT contain the test_patch (which would leak the answer).
+  * One shared, free-form instance template for every config (tools are
+    advertised by the per-config preamble; no forced first command).
+  * Strict `FINAL_LOCALIZATION_JSON:` sentinel parsing with an explicit
+    `parse_error` flag; a regex fallback is recorded for diagnostics only and
+    never feeds the headline metric.
+  * Gold = non-test, non-doc Python files from the gold patch.
+
+Run:
+  uv run python -m bench.runners.localize_runner --set structural \
+      --config baseline --config lsp --config code_graph --config code_graph_mcp \
+      --limit 30 --model anthropic/claude-opus-4-... \
+      --results bench/cache/opus-localize/results.jsonl
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import signal
+import subprocess
+import time
+from pathlib import Path
+from typing import Any
+
+from bench.runners.mini_runner import (
+    DEFAULT_CACHE_DIR,
+    VALID_CONFIGS,
+    _ensure_indexed,
+    _ensure_indexed_mcp,
+    config_env,
+    load_preamble,
+)
+
+LOCALIZE_RESULTS = DEFAULT_CACHE_DIR / "opus-localize" / "results.jsonl"
+LOCALIZE_TRAJECTORIES = DEFAULT_CACHE_DIR / "opus-localize" / "trajectories"
+
+SENTINEL = "FINAL_LOCALIZATION_JSON:"
+
+from minisweagent.environments.local import LocalEnvironment  # noqa: E402
+
+
+class SafeLocalEnvironment(LocalEnvironment):
+    """LocalEnvironment whose timeout reliably reaps the whole process tree.
+
+    The stock implementation runs ``subprocess.run(shell=True, timeout=...)``.
+    When a command spawns a grandchild that inherits the stdout pipe (e.g. a
+    jedi/multilspy language server that hangs while indexing a large repo such
+    as Django), the timeout kills only the shell and ``communicate()`` then
+    blocks *forever* waiting for the inherited pipe to close — wedging the whole
+    agent. We launch each command in its own session and ``SIGKILL`` the entire
+    process group on timeout, which closes the pipe and unblocks the read.
+
+    Everything else (the ``COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT`` completion
+    check, template vars, serialization, pydantic config) is inherited.
+    """
+
+    def execute(self, action: dict, cwd: str = "", *, timeout: int | None = None) -> dict[str, Any]:
+        """Run ``action["command"]`` in a shell and return its captured result.
+
+        Args:
+            action: mapping whose ``"command"`` entry is the shell command
+                (a missing entry runs the empty command).
+            cwd: working directory; falls back to ``self.config.cwd`` and then
+                to the current directory.
+            timeout: per-command limit in seconds; a falsy value falls back to
+                ``self.config.timeout``.
+
+        Returns:
+            A dict with ``output`` (stdout and stderr merged), ``returncode``
+            and ``exception_info``. On timeout ``returncode`` is ``-1``, a
+            marker line is appended to ``output`` and an ``extra`` entry
+            describes the timeout.
+        """
+        command = action.get("command", "")
+        run_cwd = cwd or self.config.cwd or os.getcwd()
+        tmo = timeout or self.config.timeout
+        proc = subprocess.Popen(
+            command,
+            shell=True,
+            text=True,
+            cwd=run_cwd,
+            env=os.environ | self.config.env,
+            encoding="utf-8",
+            errors="replace",
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            # Own session => own process group, so killpg below reaches every
+            # descendant that did not deliberately leave the group.
+            start_new_session=True,
+        )
+        try:
+            out, _ = proc.communicate(timeout=tmo)
+            output = {"output": out, "returncode": proc.returncode, "exception_info": ""}
+        except subprocess.TimeoutExpired:
+            try:
+                os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
+            except (ProcessLookupError, PermissionError):
+                pass
+            try:
+                # Drain whatever was written before the kill.
+                out, _ = proc.communicate(timeout=10)
+            except subprocess.TimeoutExpired:
+                # Something still holds the pipe open (e.g. a descendant that
+                # left the process group, or the group kill was refused).
+                # Stop waiting for EOF, and release the pipe and the child so
+                # neither a file descriptor nor an unreaped process leaks.
+                out = ""
+                proc.kill()
+                if proc.stdout is not None:
+                    proc.stdout.close()
+                try:
+                    proc.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    pass
+            output = {
+                "output": (out or "") + f"\n[command timed out after {tmo}s; process group killed]",
+                "returncode": -1,
+                "exception_info": f"TimeoutExpired after {tmo}s",
+                "extra": {"exception_type": "TimeoutExpired", "exception": "timeout"},
+            }
+        # Inherited hook; per the class docstring it performs the
+        # COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT completion check.
+        self._check_finished(output)
+        return output
+
+
+class TimeoutRetryModel:
+    """Wrap a minisweagent model so each API call is bounded by a hard timeout.
+
+    litellm's own ``timeout`` does not reliably interrupt the Azure Anthropic
+    passthrough — we have observed an ESTABLISHED socket stall with the Python
+    process blocked in a C-level read for 20+ min, CPU frozen, never returning.
+    ``SIGALRM`` interrupts even a blocked syscall (PEP 475 re-raises from the
+    handler), so we arm it around each ``query`` and retry on stall. The agent's
+    own between-step wall-time check then actually becomes reachable.
+
+    All other attributes/methods (cost, n_calls, serialize, format_message, …)
+    are delegated to the wrapped model.
+
+    Note: ``signal.signal`` may only be called from the main thread, so
+    ``query`` must run there.
+    """
+
+    def __init__(self, inner: Any, *, per_call_timeout: int = 180, retries: int = 3):
+        self._inner = inner
+        self._per_call_timeout = per_call_timeout
+        self._retries = retries
+
+    def query(self, messages: list[dict[str, str]], **kwargs) -> dict:
+        """Call the wrapped model's ``query`` under a per-call alarm.
+
+        Makes up to ``retries + 1`` attempts. An attempt that raises
+        ``TimeoutError`` (from the alarm or from the wrapped call itself) is
+        retried; any other exception propagates immediately.
+
+        Raises:
+            TimeoutError: the last stall, once every attempt has timed out.
+            RuntimeError: if no attempt was made (``retries < 0``).
+        """
+        # The handler does not depend on the attempt, so define it once.
+        def _on_alarm(signum, frame):  # noqa: ARG001
+            raise TimeoutError(
+                f"model.query stalled > {self._per_call_timeout}s"
+            )
+
+        last_exc: Exception | None = None
+        for attempt in range(self._retries + 1):
+            prev = signal.signal(signal.SIGALRM, _on_alarm)
+            signal.alarm(self._per_call_timeout)
+            try:
+                return self._inner.query(messages, **kwargs)
+            except TimeoutError as exc:
+                last_exc = exc
+                print(
+                    f"[warn] model stalled (attempt {attempt + 1}/"
+                    f"{self._retries + 1}); retrying",
+                    flush=True,
+                )
+            finally:
+                # Always disarm and restore the previous handler, whether the
+                # call returned, stalled, or raised something else.
+                signal.alarm(0)
+                signal.signal(signal.SIGALRM, prev)
+        raise last_exc if last_exc else RuntimeError("model.query failed")
+
+    def __getattr__(self, name: str) -> Any:
+        # Only reached when normal lookup fails. If `_inner` itself is missing
+        # (instance built via __new__, e.g. while copy/pickle reconstructs it
+        # and probes for __setstate__), reading `self._inner` below would
+        # re-enter __getattr__ forever; fail fast with AttributeError instead.
+        if name == "_inner":
+            raise AttributeError(name)
+        # Delegate everything we don't override (cost, n_calls, serialize, …).
+        return getattr(self._inner, name)
+
+
+# One template for ALL configs. The per-config preamble already advertises the
+# available navigation tool (cg / lsp / none); we deliberately do NOT force a
+# first command here so the comparison measures *natural* tool usage.
+LOCALIZE_INSTANCE_TEMPLATE = f"""\
+You are working in the repository at {{{{cwd}}}}.
+
+You are doing CODE LOCALIZATION ONLY. Read the issue below and determine
+which source file(s) must be modified to resolve it. **Do NOT edit, create,
+or patch any file.** Investigate the codebase with the tools available to
+you, then report your answer PROMPTLY — do not over-explore. As soon as you
+are reasonably confident of the file(s), submit.
+
+The issue:
+
+{{{{task}}}}
+
+To submit, run a single bash command whose stdout is exactly these two lines
+(this is how you end the task):
+
+    echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT
+    echo '{SENTINEL} ["pkg/module/foo.py", "pkg/other.py"]'
+
+Replace the array with the real repo-relative source file paths you believe
+must change, most-likely first. List only implementation files (exclude
+tests). The text after `{SENTINEL}` MUST be a valid JSON array of strings.
+"""
+
+_PY_PATH_RE = re.compile(r"[A-Za-z0-9_./-]+\.py")
+
+# Optional forced-workflow ablation. The free-form primary measures *natural*
+# tool adoption (which on this model is near-zero — the agent defaults to
+# grep/find). To measure the tool's *intrinsic* value when adoption is
+# guaranteed, prepend a per-config mandate to invoke the navigation tool first.
+_FORCE_TOOL_SNIPPET = {
+    "lsp": (
+        "MANDATORY WORKFLOW: Before running any grep/find/cat, you MUST use the "
+        "`lsp` tool at least once to locate a relevant symbol's definition or "
+        "references (e.g. `lsp goto-definition <file> <line> <col>` or "
+        "`lsp find-references ...`). Prefer `lsp` over text search throughout.\n\n"
+    ),
+    "code_graph": (
+        "MANDATORY WORKFLOW: Before running any grep/find/cat, you MUST use the "
+        "`cg` tool at least once to locate candidate symbols "
+        "(`cg search_code --prefix <name>`) and trace cross-file structure "
+        "(`cg get-callers` / `cg get-dependencies` / `cg impact-analysis`). "
+        "Prefer `cg` over text search throughout.\n\n"
+    ),
+    "code_graph_mcp": (
+        "MANDATORY WORKFLOW: Before running any grep/find/cat, you MUST use the "
+        "`cg-mcp` tool at least once to locate candidate symbols "
+        "(`cg-mcp search_code --prefix <name>`) and trace cross-file structure "
+        "(`cg-mcp get_callers` / `cg-mcp get_dependencies` / "
+        "`cg-mcp impact_analysis`). Prefer `cg-mcp` over text search throughout.\n\n"
+    ),
+}
+
+
+def build_instance_template(config: str, *, force_tool: bool) -> str:
+    """Pick the instance template for ``config``.
+
+    With ``force_tool`` set, the config's mandate from ``_FORCE_TOOL_SNIPPET``
+    is prepended to the shared template (forced-workflow ablation). Without
+    it, or when the config has no mandate (baseline has no tool to force),
+    the shared template is returned unchanged.
+    """
+    mandate = _FORCE_TOOL_SNIPPET.get(config) if force_tool else None
+    if mandate:
+        return mandate + LOCALIZE_INSTANCE_TEMPLATE
+    return LOCALIZE_INSTANCE_TEMPLATE
+
+
+# ---------------------------------------------------------------------------
+# Prediction parsing
+# ---------------------------------------------------------------------------
+
+def _all_text(traj: dict[str, Any]) -> str:
+    """Join the model's own outputs from a trajectory into one string.
+
+    Only messages whose role is ``assistant`` or ``exit`` contribute, so the
+    example sentinel shown in the instance prompt (a system/user message) can
+    never be mistaken for the agent's answer. A string ``info.submission`` is
+    appended last. Pieces are joined with newlines.
+    """
+
+    def _texts(content: Any):
+        # Plain string content is used as-is; list content contributes the
+        # string "text" field of each dict segment; anything else is skipped.
+        if isinstance(content, str):
+            yield content
+        elif isinstance(content, list):
+            for segment in content:
+                if isinstance(segment, dict) and isinstance(segment.get("text"), str):
+                    yield segment["text"]
+
+    chunks: list[str] = []
+    for message in traj.get("messages", []):
+        if message.get("role") in ("assistant", "exit"):
+            chunks.extend(_texts(message.get("content", "")))
+
+    submission = traj.get("info", {}).get("submission")
+    if isinstance(submission, str):
+        chunks.append(submission)
+    return "\n".join(chunks)
+
+
+def _norm_path(p: str) -> str:
+    """Normalize a predicted path for comparison.
+
+    Strips surrounding whitespace, then double quotes, then single quotes;
+    removes one leading ``./``; then removes one leading diff-style ``a/`` or
+    ``b/`` prefix.
+    """
+    cleaned = p.strip().strip('"').strip("'")
+    # Each prefix group is two characters long and is stripped at most once,
+    # in this order.
+    for prefixes in (("./",), ("b/", "a/")):
+        if cleaned.startswith(prefixes):
+            cleaned = cleaned[2:]
+    return cleaned
+
+
+def parse_prediction(traj: dict[str, Any]) -> tuple[list[str], bool, list[str]]:
+    """Return (pred_files, parse_error, fallback_files).
+
+    Primary: the LAST `FINAL_LOCALIZATION_JSON:` sentinel followed by a JSON
+    array. `parse_error` is True when no sentinel+valid-array is found.
+    `fallback_files` is a diagnostic regex scan (NOT used for headline).
+
+    The array is decoded with the JSON parser itself, starting at the first
+    `[` after the sentinel, so brackets that occur inside a quoted path
+    cannot end the array early and trailing text after the array is ignored.
+    Non-string items and paths that normalize to empty are dropped;
+    duplicates keep their first position.
+    """
+    text = _all_text(traj)
+
+    # Diagnostic scan: every distinct *.py-looking token, in first-seen order.
+    fallback: list[str] = []
+    seen: set[str] = set()
+    for m in _PY_PATH_RE.finditer(text):
+        fp = _norm_path(m.group(0))
+        if fp not in seen:
+            seen.add(fp)
+            fallback.append(fp)
+
+    idx = text.rfind(SENTINEL)
+    if idx == -1:
+        return [], True, fallback
+    start = text.find("[", idx + len(SENTINEL))
+    if start == -1:
+        return [], True, fallback
+    try:
+        # raw_decode parses exactly one JSON value at `start` and tolerates
+        # extraneous data after it.
+        arr, _ = json.JSONDecoder().raw_decode(text, start)
+    except json.JSONDecodeError:
+        return [], True, fallback
+    if not isinstance(arr, list):
+        return [], True, fallback
+
+    pred: list[str] = []
+    pseen: set[str] = set()
+    for item in arr:
+        if not isinstance(item, str):
+            continue
+        fp = _norm_path(item)
+        if fp and fp not in pseen:
+            pseen.add(fp)
+            pred.append(fp)
+    return pred, False, fallback
+
+
+# ---------------------------------------------------------------------------
+# Scoring
+# ---------------------------------------------------------------------------
+
+def score_localization(pred: list[str], gold: list[str]) -> dict[str, Any]:
+    """Score a ranked file prediction against the gold file set.
+
+    Returns file-level recall and precision (over distinct files), whether
+    every gold file was predicted, Acc@1/3/5 (all gold files inside the top
+    k predictions) and MRR (reciprocal rank of the first gold hit). With an
+    empty gold set every metric is 0.0 / False.
+    """
+    gold_files = set(gold)
+    pred_files = set(pred)
+    hits = len(gold_files & pred_files)
+
+    def covers_gold(candidates: set[str]) -> bool:
+        # An empty gold set never counts as "found".
+        return bool(gold_files) and gold_files.issubset(candidates)
+
+    first_hit_rank = next(
+        (rank for rank, fp in enumerate(pred, start=1) if fp in gold_files),
+        None,
+    )
+    recall = hits / len(gold_files) if gold_files else 0.0
+    precision = hits / len(pred_files) if pred_files else 0.0
+    mrr = 1.0 / first_hit_rank if first_hit_rank is not None else 0.0
+
+    scores: dict[str, Any] = {
+        "file_recall": round(recall, 4),
+        "file_precision": round(precision, 4),
+        "file_all_found": covers_gold(pred_files),
+    }
+    for k in (1, 3, 5):
+        scores[f"acc_at_{k}"] = covers_gold(set(pred[:k]))
+    scores["file_mrr"] = round(mrr, 4)
+    return scores
+
+
+# ---------------------------------------------------------------------------
+# Single run
+# ---------------------------------------------------------------------------
+
+def run_localize_task(
+    inst: Any,
+    config: str,
+    *,
+    model_name: str,
+    step_limit: int = 30,
+    cost_limit: float = 2.0,
+    wall_time_limit_seconds: int = 900,
+    force_tool: bool = False,
+) -> dict[str, Any]:
+    """Run one localization attempt for ``inst`` under ``config`` and score it.
+
+    Prepares the localize worktree, indexes it for the graph configs, runs a
+    ``DefaultAgent`` on ``inst.problem_statement``, then parses the predicted
+    files out of the serialized trajectory and scores them against the gold
+    source files.
+
+    Args:
+        inst: benchmark instance; this function reads ``instance_id``,
+            ``patch`` and ``problem_statement`` and passes the object to the
+            ``swe_bench`` helpers.
+        config: one of ``VALID_CONFIGS``.
+        model_name: model id handed to ``LitellmModel``.
+        step_limit: forwarded to ``DefaultAgent``.
+        cost_limit: forwarded to ``DefaultAgent``.
+        wall_time_limit_seconds: forwarded to ``DefaultAgent``.
+        force_tool: prepend the per-config tool mandate to the instance
+            template (see ``build_instance_template``).
+
+    Returns:
+        ``{"row": <flat result record>, "trajectory": <agent.serialize()>}``.
+
+    Raises:
+        ValueError: if ``config`` is not in ``VALID_CONFIGS``. Exceptions
+        raised by ``agent.run`` are not propagated; they are recorded in the
+        row's ``exit_status`` as ``error:<ExceptionType>``.
+    """
+    # Imported lazily, inside the function, like the other heavy dependencies below.
+    from bench.datasets import swe_bench as sb
+
+    if config not in VALID_CONFIGS:
+        raise ValueError(f"unknown config {config!r}")
+
+    repo_path = sb.prepare_localize_worktree(inst)
+    gold_files = sb.gold_changed_files(inst.patch, source_only=True)
+    gold_syms = sb.gold_symbols(inst, repo_path)
+    leak = sb.leakage_flags(inst, gold_files)
+
+    # Fresh, test-free index for the graph configs.
+    if config == "code_graph":
+        _ensure_indexed(repo_path)
+    elif config == "code_graph_mcp":
+        _ensure_indexed_mcp(repo_path)
+
+    from minisweagent.agents.default import DefaultAgent
+    from minisweagent.models.litellm_model import LitellmModel
+
+    env_vars = config_env(config, repo_path)
+    # Each shell command the agent issues is limited to 120s.
+    env = SafeLocalEnvironment(cwd=str(repo_path), env=env_vars, timeout=120)
+    agent = DefaultAgent(
+        # The litellm timeout and the SIGALRM wrapper are both set to 180s;
+        # the wrapper is the backstop for stalls litellm does not interrupt.
+        TimeoutRetryModel(
+            LitellmModel(
+                model_name=model_name,
+                model_kwargs={"timeout": 180},
+            ),
+            per_call_timeout=180,
+            retries=3,
+        ),
+        env,
+        system_template=load_preamble(config),
+        instance_template=build_instance_template(config, force_tool=force_tool),
+        step_limit=step_limit,
+        cost_limit=cost_limit,
+        wall_time_limit_seconds=wall_time_limit_seconds,
+    )
+
+    started = time.time()
+    exit_status = "ok"
+    try:
+        agent.run(task=inst.problem_statement)
+    except Exception as exc:  # noqa: BLE001
+        # Keep going: whatever trajectory exists is still serialized and
+        # scored below; only the exception type is recorded.
+        exit_status = f"error:{type(exc).__name__}"
+    wall = round(time.time() - started, 3)
+    traj = agent.serialize()
+
+    pred, parse_error, fallback = parse_prediction(traj)
+    sc = score_localization(pred, gold_files)
+
+    from bench.metrics import task_metrics_from_trajectory
+
+    tm = task_metrics_from_trajectory(
+        traj, benchmark="swe_localize", task_id=inst.instance_id,
+        config=config, wall_clock_sec=wall,
+    )
+
+    row = {
+        "benchmark": "swe_localize",
+        "task_id": inst.instance_id,
+        "config": config,
+        "force_tool": force_tool,
+        "input_tokens": tm.input_tokens,
+        "output_tokens": tm.output_tokens,
+        "tool_calls_total": tm.tool_calls_total,
+        "tool_calls_by_name": tm.tool_calls_by_name,
+        "wall_clock_sec": wall,
+        "exit_status": exit_status,
+        "gold_files": gold_files,
+        "gold_files_count": len(gold_files),
+        # Number of distinct parent directories among the gold files.
+        "gold_dirs_count": len({str(Path(f).parent) for f in gold_files}),
+        "gold_symbols": gold_syms,
+        "symbol_mappable": bool(gold_syms),
+        "predicted_files": pred,
+        # Diagnostic only; capped at the first 20 regex hits.
+        "predicted_files_fallback": fallback[:20],
+        "parse_error": parse_error,
+        "is_structural": sb.is_structural(inst),
+        # Leakage flags and score fields are merged in last, so a key they
+        # share with the entries above would override it.
+        **leak,
+        **sc,
+    }
+    return {"row": row, "trajectory": traj}
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def _write_trajectory(task_id: str, config: str, traj: dict[str, Any], d: Path) -> None:
+    """Write ``traj`` as pretty-printed JSON to ``d/{task_id}__{config}.json``.
+
+    ``d`` is created (with parents) if needed. Keys are sorted and values
+    that JSON cannot encode are stringified with ``str``.
+    """
+    d.mkdir(parents=True, exist_ok=True)
+    target = d / f"{task_id}__{config}.json"
+    payload = json.dumps(traj, indent=2, sort_keys=True, default=str)
+    target.write_text(payload)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point for the localization benchmark.
+
+    Selects instances (``--set``), runs every requested config on each one
+    and appends one JSON row per (instance, config) to ``--results``,
+    writing the trajectory alongside. Runs are resumable: a (task_id,
+    config, force_tool) combination already present in the results file is
+    skipped. A run that raises is logged and not recorded, so it is retried
+    on the next invocation. Returns the process exit code 0.
+    """
+    p = argparse.ArgumentParser(description="LocAgent-style localization benchmark")
+    p.add_argument("--config", choices=VALID_CONFIGS, action="append",
+                   help="repeatable; defaults to baseline/lsp/code_graph "
+                        "(code_graph_mcp omitted by default: HTTP/MCP transport "
+                        "parity already established at n=40, and this worktree's "
+                        "venv lacks the mcp module)")
+    p.add_argument("--set", choices=("cached", "structural", "all"), default="structural",
+                   help="cached=prior n=40 ids (pilot); structural=multi-file/dir gold")
+    p.add_argument("--limit", type=int, default=None)
+    p.add_argument("--model", default="anthropic/claude-opus-4-5")
+    p.add_argument("--results", type=Path, default=LOCALIZE_RESULTS)
+    p.add_argument("--trajectories", type=Path, default=LOCALIZE_TRAJECTORIES)
+    p.add_argument("--step-limit", type=int, default=40)
+    p.add_argument("--cost-limit", type=float, default=2.0)
+    p.add_argument("--wall-time", type=int, default=900)
+    p.add_argument("--cached-ids", type=Path, default=None,
+                   help="JSONL/txt of task_ids to use when --set cached")
+    p.add_argument("--force-tool", action="store_true",
+                   help="forced-workflow ablation: prepend a per-config mandate "
+                        "to invoke the navigation tool (cg/lsp) before any "
+                        "grep/find. Measures the tool's intrinsic value when "
+                        "adoption is guaranteed (free-form adoption is ~0).")
+    p.add_argument("--dataset", default=None,
+                   help="HF dataset name (default: princeton-nlp/SWE-bench_Verified). "
+                        "Use SWE-bench-Live/SWE-bench-Live for a contamination-free, "
+                        "less-pretraining-saturated corpus.")
+    p.add_argument("--split", default="test",
+                   help="dataset split (SWE-bench-Live exposes test/lite/verified/full)")
+    p.add_argument("--repos", default=None,
+                   help="comma-separated owner/name allowlist for --set structural "
+                        "(target large, less-saturated repos)")
+    p.add_argument("--python-only", action="store_true",
+                   help="require >=1 .py gold file (tools are Python-only)")
+    args = p.parse_args(argv)
+
+    configs = args.config or ["baseline", "lsp", "code_graph"]
+
+    from bench.datasets import swe_bench as sb
+
+    all_insts = sb.load_instances(split=args.split, dataset_name=args.dataset)
+    by_id = {i.instance_id: i for i in all_insts}
+
+    if args.set == "cached":
+        ids: list[str] = []
+        src = args.cached_ids
+        if src and src.exists():
+            # Each non-blank line is either a JSON object with "task_id" or a
+            # bare task id.
+            for line in src.read_text().splitlines():
+                line = line.strip()
+                if line:
+                    ids.append(json.loads(line)["task_id"] if line.startswith("{") else line)
+        else:
+            # derive from the prior fix-run results file
+            prior = DEFAULT_CACHE_DIR / "opus" / "results.jsonl"
+            seen: set[str] = set()
+            for line in prior.read_text().splitlines():
+                line = line.strip()
+                if not line:
+                    # Blank lines are tolerated here exactly as in the
+                    # --cached-ids reader above.
+                    continue
+                tid = json.loads(line)["task_id"]
+                if tid not in seen:
+                    seen.add(tid)
+                    ids.append(tid)
+        insts = [by_id[i] for i in ids if i in by_id]
+    elif args.set == "structural":
+        repo_allow = (
+            {r.strip() for r in args.repos.split(",") if r.strip()}
+            if args.repos
+            else None
+        )
+        insts = sb.select_structural(
+            all_insts, n=args.limit, repos=repo_allow, python_only=args.python_only
+        )
+    else:
+        insts = all_insts
+
+    if args.limit is not None:
+        insts = insts[: args.limit]
+
+    # Drop instances with no source-file gold (e.g. test/doc-only patches).
+    insts = [i for i in insts if sb.gold_changed_files(i.patch, source_only=True)]
+
+    print(f"[localize] {len(insts)} instances x {len(configs)} configs "
+          f"({args.set} set), model={args.model}")
+    args.results.parent.mkdir(parents=True, exist_ok=True)
+
+    # Resume key includes force_tool: free-form and forced-workflow runs are
+    # different experiments, so a row from one must not cause the other to be
+    # skipped when both share a results file. Rows without the field are
+    # treated as free-form.
+    done: set[tuple[str, str, bool]] = set()
+    if args.results.exists():
+        for line in args.results.read_text().splitlines():
+            if not line.strip():
+                continue
+            r = json.loads(line)
+            done.add((r["task_id"], r["config"], bool(r.get("force_tool", False))))
+
+    with args.results.open("a") as out:
+        for inst in insts:
+            for cfg in configs:
+                if (inst.instance_id, cfg, args.force_tool) in done:
+                    print(f"[resume] {inst.instance_id}/{cfg} exists; skip")
+                    continue
+                print(f"[run] {inst.instance_id}/{cfg} ...", flush=True)
+                try:
+                    res = run_localize_task(
+                        inst, cfg, model_name=args.model,
+                        step_limit=args.step_limit, cost_limit=args.cost_limit,
+                        wall_time_limit_seconds=args.wall_time,
+                        force_tool=args.force_tool,
+                    )
+                except Exception as exc:  # noqa: BLE001
+                    # No row is written, so this pair is retried on resume.
+                    print(f"[error] {inst.instance_id}/{cfg}: {exc!r}", flush=True)
+                    continue
+                # Flush per row so a crash loses at most the in-flight task.
+                out.write(json.dumps(res["row"]) + "\n")
+                out.flush()
+                _write_trajectory(inst.instance_id, cfg, res["trajectory"], args.trajectories)
+                r = res["row"]
+                print(f"[done] {inst.instance_id}/{cfg} "
+                      f"acc@1={r['acc_at_1']} recall={r['file_recall']} "
+                      f"in={r['input_tokens']} parse_err={r['parse_error']}", flush=True)
+    print("[localize] DONE")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/bench/runners/mini_runner.py b/bench/runners/mini_runner.py
index 3689c0aa..49481c07 100644
--- a/bench/runners/mini_runner.py
+++ b/bench/runners/mini_runner.py
@@ -49,7 +49,7 @@
 DEFAULT_CACHE_DIR = BENCH_DIR / "cache"
 DEFAULT_RESULTS = DEFAULT_CACHE_DIR / "results.jsonl"
 
-VALID_CONFIGS = ("baseline", "lsp", "code_graph")
+VALID_CONFIGS = ("baseline", "lsp", "code_graph", "code_graph_mcp")
 
 
 # ---------------------------------------------------------------------------
@@ -155,11 +155,40 @@ class Task:
 """
 
 
+INSTANCE_TEMPLATE_CODE_GRAPH_MCP = """\
+You are working in the repository at {{cwd}}.
+The code-graph MCP server has already indexed this repository under the
+project name `$PROJECT_NAME` on branch `$BRANCH` (use the env vars
+literally).
+
+The task to solve:
+
+{{task}}
+
+**Required workflow.** Before reading or editing any file, your first
+bash command MUST be:
+
+  `cg-mcp search_code --project "$PROJECT_NAME" --branch "$BRANCH" --prefix <a symbol named in the task description>`
+
+Then use `cg-mcp get_callers --project "$PROJECT_NAME" --branch "$BRANCH" --symbol-id <id>`
+to expand relationships before doing any textual search. Use
+`cg-mcp impact_analysis ... --symbol-id <id> --depth 3` before
+non-trivial edits.
+
+When you believe the task is complete, finish your turn with a final
+message that contains a unified diff of your changes inside a fenced
+``` block, then exit. Do not commit; the harness reads the diff via
+`git diff`.
+"""
+
+
 def load_instance_template(config: str) -> str:
     if config == "lsp":
         return INSTANCE_TEMPLATE_LSP
     if config == "code_graph":
         return INSTANCE_TEMPLATE_CODE_GRAPH
+    if config == "code_graph_mcp":
+        return INSTANCE_TEMPLATE_CODE_GRAPH_MCP
     return INSTANCE_TEMPLATE
 
 
@@ -210,6 +239,23 @@ def config_env(config: str, repo_path: Path) -> dict[str, str]:
         # The agent's preamble references $REPO_NAME — set it to the
         # worktree dirname, which is what analyze_folder used as the id.
         env["REPO_NAME"] = repo_path.name
+    elif config == "code_graph_mcp":
+        # MCP transport: agent calls `cg-mcp …` which spawns the
+        # `cgraph-mcp` stdio server per call. FalkorDB coordinates
+        # are passed through verbatim.
+        env.setdefault("FALKORDB_HOST", os.environ.get("FALKORDB_HOST", "127.0.0.1"))
+        env.setdefault("FALKORDB_PORT", os.environ.get("FALKORDB_PORT", "6379"))
+        # `cgraph-mcp` must be on PATH; the runner installs the
+        # falkordb-code-graph package into the same interpreter, so
+        # prepending the venv bin gives us the entry point.
+        venv_bin = str(Path(sys.executable).parent)
+        env["PATH"] = f"{venv_bin}:{env['PATH']}"
+        # The preamble references $PROJECT_NAME and $BRANCH; project
+        # name matches what `index_repo` derives from the folder
+        # (= worktree dirname), and branch is the per-instance tag we
+        # used when indexing.
+        env["PROJECT_NAME"] = repo_path.name
+        env["BRANCH"] = os.environ.get("CGRAPH_MCP_BRANCH", "_default")
     return env
 
 
@@ -235,17 +281,63 @@ def _ensure_indexed(repo_path: Path) -> None:
                 print(f"[index] {repo_name} already indexed; skip")
                 return
         print(f"[index] analyzing {repo_path} ...")
-        with httpx.Client(timeout=600.0, headers=headers) as c:
+        # Default ignore set: auto-generated / vendored / pathological dirs
+        # that either contain no useful symbols or send jedi into a
+        # multi-hour resolve loop (e.g. sympy/integrals/rubi/rules has
+        # 3000-line files with hundreds of unresolvable symbols per line).
+        default_ignore = [
+            ".git", "venv", ".venv", "node_modules", "__pycache__",
+            "rubi/rules",  # sympy: blocks indexing for ~hours otherwise
+            "build", "dist", ".tox", ".eggs",
+        ]
+        with httpx.Client(timeout=7200.0, headers=headers) as c:
             r = c.post(
                 f"{base}/api/analyze_folder",
-                json={"path": str(repo_path), "ignore": []},
+                json={"path": str(repo_path), "ignore": default_ignore},
             )
             if r.status_code != 200:
-                print(f"[index] WARN analyze_folder returned {r.status_code}: {r.text[:200]}")
-            else:
-                print(f"[index] indexed {repo_name}")
+                raise RuntimeError(
+                    f"analyze_folder returned {r.status_code}: {r.text[:300]}. "
+                    f"Check ALLOWED_ANALYSIS_DIR on the API server covers {repo_path}."
+                )
+            print(f"[index] indexed {repo_name}")
+    except Exception as exc:
+        raise RuntimeError(f"failed to index {repo_name} at {repo_path}: {exc}") from exc
+
+
+def _ensure_indexed_mcp(repo_path: Path) -> None:
+    """MCP-track equivalent of _ensure_indexed.
+
+    Drives the `index_repo` MCP tool in-process via the bench adapter
+    (avoids spawning a second cgraph-mcp just to bootstrap; the agent
+    will spawn its own per call). Same skip-if-present optimization
+    as the HTTP path: cheap GRAPH.LIST scan against FalkorDB.
+
+    Environment read here (with defaults): ``CGRAPH_MCP_BRANCH``
+    (``_default``), ``FALKORDB_HOST`` (``127.0.0.1``) and
+    ``FALKORDB_PORT`` (``6379``).
+
+    Args:
+        repo_path: Worktree to index; its directory name is used as the
+            project part of the expected graph name.
+
+    NOTE(review): every failure in this function is only printed as a
+    WARN and the function returns normally, whereas ``_ensure_indexed``
+    raises ``RuntimeError`` when indexing fails. Confirm the best-effort
+    behaviour is intended for the MCP track.
+    """
+    from bench.agents import code_graph_mcp_adapter as cgm
+    import redis
+
+    repo_name = repo_path.name
+    branch = os.environ.get("CGRAPH_MCP_BRANCH", "_default")
+    host = os.environ.get("FALKORDB_HOST", "127.0.0.1")
+    port = int(os.environ.get("FALKORDB_PORT", "6379"))
+    # Graph name this function looks for in GRAPH.LIST before indexing.
+    expected_graph = f"code:{repo_name}:{branch}"
+    try:
+        # Short socket timeout: this is only a cheap existence probe.
+        r = redis.Redis(host=host, port=port, decode_responses=True, socket_timeout=2)
+        if expected_graph in (r.execute_command("GRAPH.LIST") or []):
+            print(f"[index-mcp] {expected_graph} already indexed; skip")
+            return
     except Exception as exc:  # noqa: BLE001
-        print(f"[index] WARN failed to index {repo_name}: {exc!r}")
+        print(f"[index-mcp] WARN list_graphs failed ({exc!r}); will attempt index anyway")
+
+    print(f"[index-mcp] indexing {repo_path} as {expected_graph} ...")
+    try:
+        payload = cgm.index_repo(str(repo_path), branch=branch)
+        # A dict payload with a truthy "error" entry is reported as a failure.
+        if isinstance(payload, dict) and payload.get("error"):
+            print(f"[index-mcp] WARN index_repo error: {payload['error']!r}")
+        else:
+            print(f"[index-mcp] indexed: {payload}")
+    except Exception as exc:  # noqa: BLE001
+        print(f"[index-mcp] WARN failed to index {repo_name}: {exc!r}")
 
 
 # ---------------------------------------------------------------------------
@@ -569,7 +661,7 @@ def main(argv: list[str] | None = None) -> int:
 
     p = argparse.ArgumentParser(description="code-graph benchmark runner")
     p.add_argument("--config", choices=VALID_CONFIGS, action="append",
-                   help="one of baseline / lsp / code_graph; repeatable. "
+                   help="one of baseline / lsp / code_graph / code_graph_mcp; repeatable. "
                         "Default: all three.")
     mode = p.add_mutually_exclusive_group(required=True)
     mode.add_argument("--dry-run", action="store_true",
@@ -600,6 +692,11 @@ def main(argv: list[str] | None = None) -> int:
                         "needs GITHUB_TOKEN with models:read scope); "
                         "'github_copilot/gpt-4o' (uses your Copilot session, "
                         "device-code OAuth on first call).")
+    p.add_argument("--instances-file", type=Path, default=None,
+                   help="Path to a file listing instance_ids to run EXACTLY "
+                        "(one per line, or a results .jsonl with a task_id "
+                        "field). Overrides --stage/--limit sampling so a run "
+                        "can be reproduced against a prior model's exact set.")
     p.add_argument("--step-limit", type=int, default=50)
     p.add_argument("--cost-limit", type=float, default=3.0)
     p.add_argument("--wall-time", type=int, default=1200)
@@ -619,12 +716,40 @@ def main(argv: list[str] | None = None) -> int:
         from bench.metrics import append_jsonl
 
         insts = sample_instances(load_instances(), stage=args.stage)
-        if args.limit is not None:
+        if args.instances_file is not None:
+            wanted: list[str] = []
+            seen: set[str] = set()
+            for line in args.instances_file.read_text().splitlines():
+                line = line.strip()
+                if not line:
+                    continue
+                if line.startswith("{"):
+                    import json as _json
+                    tid = _json.loads(line).get("task_id")
+                else:
+                    tid = line
+                if tid and tid not in seen:
+                    seen.add(tid)
+                    wanted.append(tid)
+            pool = {i.instance_id: i for i in load_instances()}
+            missing = [t for t in wanted if t not in pool]
+            if missing:
+                raise SystemExit(f"instances-file ids not in dataset: {missing[:5]}")
+            insts = [pool[t] for t in wanted]
+            print(f"[swe-bench] instances-file override: {len(insts)} instances")
+        elif args.limit is not None:
             insts = insts[: args.limit]
         print(f"[swe-bench] stage={args.stage} running {len(insts)} instances "
               f"x {len(configs)} configs = {len(insts) * len(configs)} trajectories")
         for inst in insts:
             for cfg in configs:
+                # Resume support: if a trajectory file for this (instance, cfg)
+                # already exists, skip the run entirely. Lets us recover from
+                # crashes / kills without re-spending tokens on completed work.
+                existing_traj = args.trajectories / f"{inst.instance_id}__{cfg}.json"
+                if existing_traj.exists():
+                    print(f"[resume] {inst.instance_id}/{cfg}: trajectory exists, skip")
+                    continue
                 # Fresh worktree per (instance, config) to avoid cross-talk.
                 wt = prepare_worktree(inst)
                 # Rename so each cfg gets a distinct path.
@@ -640,6 +765,8 @@ def main(argv: list[str] | None = None) -> int:
                 # call returns nothing and the agent abandons the tool.
                 if cfg == "code_graph":
                     _ensure_indexed(cfg_wt)
+                elif cfg == "code_graph_mcp":
+                    _ensure_indexed_mcp(cfg_wt)
                 cfg_rows = run_batch(
                     [task],
                     [cfg],
@@ -655,7 +782,15 @@ def main(argv: list[str] | None = None) -> int:
                 )
                 rows.extend(cfg_rows)
                 ok, summary = verify_instance(inst, cfg_wt)
-                cfg_rows[-1]["metrics"].outcome = "resolved" if ok else "failed"
+                # Inline verify is a best-effort signal only; the authoritative
+                # grade comes from the SWE-bench Docker harness (run separately
+                # via bench.runners.swebench_verify against the stored patch).
+                # If pytest couldn't even run here (e.g. missing in the launch
+                # env), record `ungraded` rather than a misleading `failed`.
+                if summary.startswith("UNGRADED:"):
+                    cfg_rows[-1]["metrics"].outcome = "ungraded"
+                else:
+                    cfg_rows[-1]["metrics"].outcome = "resolved" if ok else "failed"
                 if not ok:
                     cfg_rows[-1]["verify_summary"] = summary[-200:]
                 append_jsonl(args.results, cfg_rows[-1]["metrics"])
diff --git a/bench/runners/nav_multihop_agent.py b/bench/runners/nav_multihop_agent.py
new file mode 100644
index 00000000..accf10eb
--- /dev/null
+++ b/bench/runners/nav_multihop_agent.py
@@ -0,0 +1,753 @@
+"""Multi-hop navigation PREMIUM agent arm (Lane 2).
+
+Drives the Copilot CLI over the validated multi-hop nav question set across the
+three arms — ``no_mcp`` (baseline, builtin grep/view only), ``lsp`` (jedi MCP),
+and ``code_graph`` (FalkorDB code-graph MCP) — and scores each answer against the
+jedi oracle gold with set-F1 (file + qualname) and, for path questions, boolean
+reachability correctness. Also records the realized token / tool-call / premium
+cost per arm (prereg H2: median token reduction).
+
+This is the agent counterpart to the FREE ``nav_multihop_gate.py`` answerability
+gate. The gate proved the GRAPH DATA is compact + correct on uxarray; this runner
+measures whether an AGENT wielding each tool actually reaches that answer, and at
+what cost.
+
+Design notes / invariants (see session checkpoint "Multi-hop nav gate"):
+* The code_graph arm queries the PRE-BUILT fixed-resolver graph by project name
+  (default ``mh_uxarray`` on FalkorDB :6380). It does NOT re-index — the running
+  staging API server lacks the resolver fix (commit 8fa2a43), so re-indexing
+  would silently rebuild a BROKEN graph and invalidate the comparison.
+* All three arms run with cwd = the SAME uxarray worktree the graph was indexed
+  from and the oracle gold is relative to, so paths align across arms.
+* The MCP nav tools actually exposed are find_symbol / search_code /
+  get_neighbors / impact_analysis / find_path (NOT get_callers/get_callees —
+  those are internal helpers). The `ask` GraphRAG tool was dropped (it errored
+  100% of the time without a Gemini key). find_symbol bridges a symbol name to
+  its integer node id, which the relationship tools require. The code_graph
+  capability note names the real tools.
+
+Usage:
+    .venv/bin/python -m bench.runners.nav_multihop_agent \
+        --questions /tmp/ux_questions.json \
+        --project mh_uxarray --port 6380 \
+        --model claude-sonnet-4.6 \
+        --arms no_mcp lsp code_graph \
+        --out /tmp/ux_nav_agent.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+from bench.runners.copilot_runner import (
+    CODE_GRAPH,
+    LSP,
+    NO_MCP,
+    DEFAULT_MCP_SERVER_ROOT,
+    RUNNER_VERSION,
+    extract_agent_text,
+    nudge_compliance,
+    parse_result_event,
+    parse_tokens_from_logs,
+    parse_tool_calls,
+    run_copilot,
+    _write_lsp_mcp_config,
+    _write_lsp_wrapper,
+    _write_mcp_config,
+    _write_mcp_wrapper,
+)
+from bench.runners.nav_multihop_gate import _prf
+
+# The benchmark arms, expressed with the track constants imported from
+# copilot_runner.
+ARMS = (NO_MCP, LSP, CODE_GRAPH)
+
+# Question types whose answer is a SET of (path, qualname); the remaining type
+# ("path") is a reachability boolean + an optional example chain.
+SET_TYPES = ("callers", "callees", "blast_radius")
+
+# Marker that prefixes the machine-readable JSON answer in the agent's final
+# message; _extract_json_object parses the object after its LAST occurrence.
+NAV_SENTINEL = "FINAL_NAV_JSON:"
+
+# ---------------------------------------------------------------------------
+# Capability notes — symmetric across arms; each names ONLY its own mechanism.
+# The code_graph note lists the REAL exposed tools and the type->tool mapping.
+# ---------------------------------------------------------------------------
+
+# Note for the no-MCP arm (also the fallback `_capability` returns for any
+# track that is neither code_graph nor lsp).
+_CAP_NO_MCP = (
+    "No external MCP navigation tools are available. Use Copilot's built-in file "
+    "reading and text search (grep/rg) tools to trace the call relationships "
+    "yourself."
+)
+
+# Note for the LSP arm; used verbatim (no placeholders).
+_CAP_LSP = (
+    "An LSP MCP server is available exposing jedi-backed Python navigation tools "
+    "(goto_definition, find_references, hover, document_symbols). Paths are "
+    "repo-root-relative; line/character positions are 0-based (subtract 1 from "
+    "the 1-based line numbers grep/view report). To find CALLERS of a function, "
+    "use find_references on its definition; to find CALLEES, read the function "
+    "body and goto_definition on each name it calls. Prefer these precise "
+    "navigation tools over plain text search when they help."
+)
+
+# Note for the code_graph arm. This is a str.format TEMPLATE: `_capability`
+# fills the `{project}` placeholder, so any literal brace added to this text
+# must be doubled.
+_CAP_CODE_GRAPH = (
+    "A code-graph MCP server is available, already indexed under "
+    'project="{project}" (do NOT call index_repo). Workflow: (1) call '
+    'find_symbol(name, project="{project}", file=<defining file>) to resolve a '
+    "function/method/class to its integer symbol_id. The question gives you the "
+    "exact qualname and file, so pass the leaf name (e.g. the part after the last "
+    "dot) plus that file to disambiguate; the result with file_match=true is the "
+    "one you want. (2) get_neighbors(symbol_id, project, relation=\"CALLS\", "
+    "direction=\"IN\") returns the direct CALLERS, direction=\"OUT\" returns the "
+    "direct CALLEES. (3) impact_analysis(symbol_id, project, direction=\"IN\", "
+    "depth=3) returns the transitive callers (blast radius) up to 3 hops. (4) For "
+    "a reachability question, resolve BOTH endpoints with find_symbol, then "
+    "find_path(source_id, dest_id, project) returns a call chain between them (an "
+    "empty result means unreachable). Each returned node carries its file and "
+    "name, so you can answer directly from the graph without grepping. Prefer "
+    "these precise graph tools over plain text search."
+)
+
+
+def _capability(track: str, project: str) -> str:
+    """Return the capability note that goes into the prompt for ``track``.
+
+    The code_graph note has its ``{project}`` placeholder filled with
+    ``project``; the LSP note is returned unchanged; every other track gets
+    the no-MCP note.
+    """
+    if track == CODE_GRAPH:
+        note = _CAP_CODE_GRAPH.format(project=project)
+    elif track == LSP:
+        note = _CAP_LSP
+    else:
+        note = _CAP_NO_MCP
+    return note
+
+
+# Example answer payloads shown to the agent right after the sentinel.
+# They are handed to ``_PROMPT.format`` as argument VALUES, and str.format
+# inserts argument values verbatim (it never re-scans substituted text for
+# ``{{``/``}}`` escapes). The braces must therefore be SINGLE here: doubled
+# braces would reach the agent literally and model an invalid JSON answer.
+_OUTPUT_SPEC_SET = (
+    '{"items": [{"path": "pkg/module.py", "qualname": "ClassName.method"}, '
+    '{"path": "pkg/other.py", "qualname": "module_level_function"}]}'
+)
+_OUTPUT_SPEC_PATH = (
+    '{"reachable": true, "path": [{"path": "pkg/a.py", "qualname": "A.f"}, '
+    '{"path": "pkg/b.py", "qualname": "B.g"}]}'
+)
+
+# Prompt template rendered by build_nav_prompt via str.format. Placeholders:
+# {cwd}, {question}, {capability}, {sentinel}, {output_spec}, {type_rule}.
+# Any literal brace added to this text must be doubled.
+_PROMPT = """\
+You are answering a CODE NAVIGATION question about the Python repository checked
+out at {cwd}.
+
+QUESTION:
+{question}
+
+Investigate the repository to determine the answer. Do NOT modify any files. Do
+NOT run or edit tests.
+{capability}
+When you are confident, finish your FINAL assistant message with a single line in
+EXACTLY this format:
+
+{sentinel} {output_spec}
+
+Rules for that line:
+- Use repo-root-relative POSIX paths to .py source files.
+- `qualname` is the dotted name of the function/method, e.g. `ClassName.method`
+  or `module_level_function` (no file path, no parentheses, no arguments).
+{type_rule}
+- Write that line as plain text in your OWN final message. Do NOT emit it through
+  a shell command, `echo`, a file write, or any tool call."""
+
+# Extra answer rule substituted for {type_rule} on every non-"path" question.
+_TYPE_RULE_SET = (
+    "- List EVERY matching function. Include both source and test functions."
+)
+# Extra answer rule substituted for {type_rule} on "path" questions.
+_TYPE_RULE_PATH = (
+    "- If a call chain exists, set reachable=true and give ONE such ordered chain "
+    "from source to target in `path`. If NO chain exists, set reachable=false and "
+    "path=[]."
+)
+
+
+def build_nav_prompt(track: str, cwd: Path, q: dict, project: str) -> str:
+    """Render the agent prompt for question ``q`` on arm ``track``.
+
+    ``q["type"] == "path"`` selects the reachability output spec and rule;
+    every other type gets the set-of-items spec and rule. The question text
+    is stripped of surrounding whitespace before insertion.
+    """
+    if q["type"] == "path":
+        output_spec, type_rule = _OUTPUT_SPEC_PATH, _TYPE_RULE_PATH
+    else:
+        output_spec, type_rule = _OUTPUT_SPEC_SET, _TYPE_RULE_SET
+    question_text = q["question"].strip()
+    capability_note = _capability(track, project)
+    return _PROMPT.format(
+        cwd=cwd,
+        question=question_text,
+        capability=capability_note,
+        sentinel=NAV_SENTINEL,
+        output_spec=output_spec,
+        type_rule=type_rule,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Answer parsing + scoring
+# ---------------------------------------------------------------------------
+
+
+def _norm_path(p: str) -> str:
+    """Canonicalize a path string for comparison.
+
+    Trims whitespace and surrounding quotes, converts backslashes to forward
+    slashes, then drops every leading ``./`` and any leading ``/``. A falsy
+    input (``None`` or ``""``) yields ``""``.
+    """
+    cleaned = (p or "").strip().strip("'\"").strip()
+    cleaned = cleaned.replace("\\", "/")
+    while cleaned[:2] == "./":
+        cleaned = cleaned[2:]
+    return cleaned.lstrip("/")
+
+
+def _norm_qual(s: str) -> str:
+    """Trim whitespace and surrounding quote characters from a qualname.
+
+    A falsy input (``None`` or ``""``) yields ``""``.
+    """
+    text = s if s else ""
+    unquoted = text.strip().strip("'\"")
+    return unquoted.strip()
+
+
+def _extract_json_object(text: str) -> tuple[dict | None, str | None]:
+    """Parse the JSON object that follows the last NAV_SENTINEL in ``text``.
+
+    Returns ``(obj, None)`` on success. On failure the first element is
+    ``None`` and the second is an error tag: ``sentinel_missing``,
+    ``no_object``, ``unbalanced_object`` or ``json_error:<msg>``.
+    """
+    marker_at = text.rfind(NAV_SENTINEL)
+    if marker_at < 0:
+        return None, "sentinel_missing"
+    tail = text[marker_at + len(NAV_SENTINEL):]
+    start = tail.find("{")
+    if start < 0:
+        return None, "no_object"
+
+    # Find the brace that closes the first object. Braces inside JSON strings
+    # are ignored and backslash escapes are honoured, so text that trails the
+    # object never reaches json.loads.
+    nesting = 0
+    inside_string = False
+    escaped = False
+    end = -1
+    for pos, ch in enumerate(tail[start:], start):
+        if inside_string:
+            if escaped:
+                escaped = False
+            elif ch == "\\":
+                escaped = True
+            elif ch == '"':
+                inside_string = False
+        elif ch == '"':
+            inside_string = True
+        elif ch == "{":
+            nesting += 1
+        elif ch == "}":
+            nesting -= 1
+            if nesting == 0:
+                end = pos
+                break
+    if end < 0:
+        return None, "unbalanced_object"
+
+    try:
+        parsed = json.loads(tail[start:end + 1])
+    except json.JSONDecodeError as exc:
+        return None, f"json_error:{exc.msg}"
+    return parsed, None
+
+
+def parse_nav_answer(text: str, qtype: str) -> tuple[dict, str | None]:
+    """Parse the agent's final answer into a normalized prediction.
+
+    Args:
+        text: The agent's message text containing the sentinel line.
+        qtype: Question type; ``"path"`` selects the reachability shape, any
+            other value the set-of-items shape.
+
+    Returns:
+        ``(pred, parse_error)``. For ``"path"`` questions ``pred`` is
+        ``{"reachable": bool | None, "path": [(path, qualname), ...]}``;
+        otherwise ``{"items": [(path, qualname), ...]}``. ``parse_error`` is
+        ``None`` on success or a short tag describing what was wrong.
+
+    ``reachable`` stays ``None`` (with ``parse_error="reachable_missing"``)
+    when the field is absent, null, or a string other than "true"/"false".
+    Coercing such answers to ``False`` would hand them credit on every
+    gold-unreachable question; ``score_nav`` scores ``None`` as incorrect.
+    """
+    obj, err = _extract_json_object(text)
+    is_path = qtype == "path"
+    if obj is None:
+        empty = {"reachable": None, "path": []} if is_path else {"items": []}
+        return empty, err
+
+    def _pairs(entries: Any) -> list[tuple[str, str]]:
+        """Normalize a JSON array of {path, qualname} objects to tuples."""
+        if not isinstance(entries, list):
+            return []
+        pairs = []
+        for it in entries:
+            if not isinstance(it, dict):
+                continue
+            path = it.get("path")
+            qual = it.get("qualname")
+            # Non-string fields (numbers, null, nested objects) count as empty
+            # instead of crashing the normalizers.
+            pairs.append((
+                _norm_path(path if isinstance(path, str) else ""),
+                _norm_qual(qual if isinstance(qual, str) else ""),
+            ))
+        return pairs
+
+    if is_path:
+        reachable = obj.get("reachable")
+        if isinstance(reachable, str):
+            reachable = {"true": True, "false": False}.get(reachable.strip().lower())
+        elif reachable is not None:
+            reachable = bool(reachable)
+        error = None if reachable is not None else "reachable_missing"
+        return {"reachable": reachable, "path": _pairs(obj.get("path"))}, error
+
+    raw = obj.get("items")
+    if not isinstance(raw, list):
+        return {"items": []}, "items_not_a_list"
+    return {"items": _pairs(raw)}, None
+
+
+def _leaf(qual: str) -> str:
+    """Return the last dotted component of the normalized qualname."""
+    normalized = _norm_qual(qual)
+    return normalized.rsplit(".", 1)[-1]
+
+
+def _gold_set(q: dict) -> set[tuple[str, str]]:
+    """Return the question's gold entries as normalized (path, qualname) pairs."""
+    pairs = set()
+    for entry in q["gold"]:
+        pairs.add((_norm_path(entry["path"]), _norm_qual(entry["qualname"])))
+    return pairs
+
+
+def _loose_set(items: list[tuple[str, str]]) -> set[tuple[str, str]]:
+    """Reduce (path, qualname) pairs to the lenient (path, leaf-name) identity.
+
+    Only the last dotted component of each qualname is kept, so a bare leaf or
+    a differing qualname prefix still matches. Pairs with an empty path are
+    dropped.
+    """
+    loose = set()
+    for path, qual in items:
+        if path:
+            loose.add((path, _leaf(qual)))
+    return loose
+
+
+def _endpoint_match(item: tuple[str, str], spec: dict) -> bool:
+    """Tell whether predicted ``item`` names the endpoint described by ``spec``.
+
+    The predicted path must equal the normalized ``spec["path"]``; the
+    qualname matches when either the full normalized qualnames or just their
+    last dotted components are equal.
+    """
+    path, qual = item
+    if path != _norm_path(spec["path"]):
+        return False
+    wanted = spec["qualname"]
+    if _norm_qual(qual) == _norm_qual(wanted):
+        return True
+    return _leaf(qual) == _leaf(wanted)
+
+
+def score_nav(q: dict, pred: dict) -> dict[str, Any]:
+    """Score a parsed prediction against the question's gold answer.
+
+    For ``"path"`` questions the result carries the gold and predicted
+    reachability, ``boolean_correct``, ``endpoints_correct``, ``path_correct``
+    and the predicted chain length. For every other type it carries
+    precision/recall/F1 dicts at three granularities (exact qualname, loose
+    leaf-name, file) plus the predicted and gold set sizes.
+    """
+    if q["type"] != "path":
+        gold = _gold_set(q)
+        # Predictions with an empty path are never counted.
+        predicted = {(path, qual) for path, qual in pred.get("items", []) if path}
+        gold_files = {path for path, _ in gold}
+        predicted_files = {path for path, _ in predicted}
+        return {
+            "qual_prf": _prf(predicted, gold),
+            "loose_qual_prf": _prf(_loose_set(pred.get("items", [])), _loose_set(list(gold))),
+            "file_prf": _prf(predicted_files, gold_files),
+            "pred_n": len(predicted),
+            "gold_n": len(gold),
+        }
+
+    gold_reachable = bool(q["gold"]["reachable"])
+    pred_reachable = pred.get("reachable")
+    # A missing (None) prediction is never a correct boolean.
+    boolean_correct = pred_reachable is not None and pred_reachable == gold_reachable
+    chain = pred.get("path", [])
+
+    # A claimed-reachable answer must also give a non-empty chain that starts
+    # at the requested source and ends at the requested target.
+    endpoints_correct = False
+    if pred_reachable and chain:
+        source_spec = q["symbol"]["source"]
+        target_spec = q["symbol"]["target"]
+        endpoints_correct = (
+            _endpoint_match(chain[0], source_spec) and _endpoint_match(chain[-1], target_spec)
+        )
+
+    # Gold-unreachable questions need only the right boolean; gold-reachable
+    # ones additionally need the correct endpoints.
+    path_correct = bool(boolean_correct and (endpoints_correct or not gold_reachable))
+    return {
+        "gold_reachable": gold_reachable,
+        "pred_reachable": pred_reachable,
+        "boolean_correct": bool(boolean_correct),
+        "endpoints_correct": bool(endpoints_correct),
+        "path_correct": path_correct,
+        "pred_path_len": len(chain),
+    }
+
+
+# ---------------------------------------------------------------------------
+# Per-(question, arm) run
+# ---------------------------------------------------------------------------
+
+
+def _nav_calls(tool_by_name: dict[str, int], track: str) -> int:
+    """Sum the call counts of tools whose name starts with the arm's prefix.
+
+    The prefix is ``"lsp"`` for the LSP track and ``"code-graph"`` for every
+    other track.
+    """
+    wanted = "code-graph"
+    if track == LSP:
+        wanted = "lsp"
+    total = 0
+    for tool, count in tool_by_name.items():
+        if tool.startswith(wanted):
+            total += count
+    return total
+
+
+def _gate_caps(gate_path: Path | None) -> dict[str, dict]:
+    """Load per-question graph answerability caps from the agentless gate output.
+
+    The gate output is a JSON document with a ``rows`` list. Each row is
+    reduced to the graph-side figures recorded for that question id:
+    reachability and correctness for ``"path"`` rows, file/qualname F1 and
+    recall figures otherwise. These serve as the ceiling a code_graph agent
+    could reach by transcribing the tool output, which lets a shortfall be
+    attributed to the agent (below cap) or to the data (low cap).
+
+    Returns an empty dict when ``gate_path`` is falsy or does not exist.
+    """
+    if not gate_path or not gate_path.exists():
+        return {}
+
+    document = json.loads(gate_path.read_text())
+    caps: dict[str, dict] = {}
+    for r in document.get("rows", []):
+        if r["type"] == "path":
+            entry = {
+                "graph_reachable": r.get("graph_reachable"),
+                "graph_path_correct": r.get("correct"),
+            }
+        else:
+            entry = {
+                "graph_file_f1": r.get("file_prf", {}).get("f1"),
+                "graph_qual_f1": r.get("qual_prf", {}).get("f1"),
+                "graph_file_recall": r.get("file_prf", {}).get("recall"),
+                "grep_file_recall": r.get("grep_file_recall"),
+            }
+        caps[r["id"]] = entry
+    return caps
+
+
+def run_one_nav(
+    q: dict,
+    *,
+    track: str,
+    model: str,
+    worktree: Path,
+    project: str,
+    port: int,
+    server_root: Path,
+    out_dir: Path,
+    wall_time: float,
+    cap: dict | None = None,
+) -> dict[str, Any]:
+    """Run one question on one arm through the Copilot CLI and return its row.
+
+    Recreates the per-(arm, question) run directory under ``out_dir/runs``,
+    writes the MCP wrapper and config for the code_graph and LSP arms (no MCP
+    config for any other track), builds and saves the prompt, invokes
+    ``run_copilot`` with ``cwd=worktree``, then parses tokens, tool calls and
+    the final answer from the run output and scores the answer.
+
+    Args:
+        q: Question dict; ``id``, ``type`` and ``hop`` are read here, plus
+            whatever ``build_nav_prompt`` and ``score_nav`` read.
+        track: Arm to run (compared against ``CODE_GRAPH`` and ``LSP``).
+        model: Model name passed to ``run_copilot`` and recorded in the row.
+        worktree: Repository checkout used as the agent's cwd.
+        project: Project name interpolated into the code_graph prompt.
+        port: Port passed to the code_graph MCP config (host is 127.0.0.1).
+        server_root: Passed to ``_write_mcp_wrapper`` for the code_graph arm.
+        out_dir: Root directory for per-run artifacts.
+        wall_time: Wall-time limit forwarded to ``run_copilot``.
+        cap: Optional extra fields merged into the row (e.g. gate caps).
+
+    Returns:
+        A result row. When ``result["startup_failed"]`` is truthy the row has
+        ``outcome="error"``, ``completed=False`` and the first 200 characters
+        of stderr; otherwise it holds the scores, token counts, tool-call
+        statistics, timing, ``outcome="answered"`` and ``completed=True``.
+    """
+    # "/" and "::" in the question id are replaced to form the directory name.
+    run_dir = out_dir / "runs" / track / q["id"].replace("/", "_").replace("::", "__")
+    # Start from an empty directory so artifacts of an earlier run are removed.
+    if run_dir.exists():
+        import shutil
+
+        shutil.rmtree(run_dir)
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    # Only the code_graph and LSP arms get an MCP config; others run with None.
+    mcp_config = None
+    if track == CODE_GRAPH:
+        wrapper = _write_mcp_wrapper(run_dir, server_root)
+        mcp_config = _write_mcp_config(run_dir, wrapper, "127.0.0.1", port)
+    elif track == LSP:
+        wrapper = _write_lsp_wrapper(run_dir, worktree)
+        mcp_config = _write_lsp_mcp_config(run_dir, wrapper)
+
+    prompt = build_nav_prompt(track, worktree, q, project)
+    (run_dir / "prompt.txt").write_text(prompt)
+
+    print(f"\n=== {q['id']} [{track}] type={q['type']} model={model} ===")
+    result = run_copilot(
+        prompt=prompt,
+        model=model,
+        cwd=worktree,
+        log_dir=run_dir / "logs",
+        mcp_config=mcp_config,
+        wall_time=wall_time,
+    )
+
+    # Parsing happens before the startup-failure check below, so these helpers
+    # also run on the output of a failed start.
+    tokens = parse_tokens_from_logs(run_dir / "logs")
+    result_ev = parse_result_event(result["stdout"])
+    tool_total, tool_by_name = parse_tool_calls(result["stdout"])
+    compliance = nudge_compliance(result["stdout"], track)
+    agent_text = extract_agent_text(result["stdout"])
+    (run_dir / "agent_text.txt").write_text(agent_text)
+
+    pred, parse_error = parse_nav_answer(agent_text, q["type"])
+
+    # Fields shared by the error row and the answered row.
+    base = {
+        "id": q["id"],
+        "type": q["type"],
+        "hop": q["hop"],
+        "config": track,
+        "model": model,
+        "runner": RUNNER_VERSION,
+        **(cap or {}),
+    }
+
+    if result.get("startup_failed"):
+        print(f"[error] {q['id']} [{track}] copilot startup failed")
+        return {
+            **base,
+            "outcome": "error",
+            "error": (result.get("stderr") or "").strip()[:200],
+            "completed": False,
+        }
+
+    scores = score_nav(q, pred)
+    row = {
+        **base,
+        **scores,
+        "parse_error": parse_error,
+        "input_tokens": tokens["input_tokens"],
+        "output_tokens": tokens["output_tokens"],
+        "total_tokens": tokens["total_tokens"],
+        "premium_requests": result_ev["premium_requests"],
+        "tool_calls_total": tool_total,
+        "tool_calls_by_name": tool_by_name,
+        "nav_tool_calls": _nav_calls(tool_by_name, track),
+        "first_tool": compliance["first_tool"],
+        "timed_out": result["timed_out"],
+        "wall_clock_sec": round(result["wall"], 2),
+        "outcome": "answered",
+        "completed": True,
+    }
+    _print_row(row)
+    return row
+
+
+def _print_row(row: dict) -> None:
+    """Print a one-line ``[nav]`` summary of a completed result row.
+
+    Path rows show predicted vs gold reachability, the endpoint check and an
+    OK/X verdict; all other rows show file/qualname F1, the predicted set size
+    and the graph file-F1 cap. Token, tool-call, parse-error and wall-clock
+    figures follow in both cases.
+    """
+    if row["type"] == "path":
+        verdict = "X"
+        if row.get("path_correct"):
+            verdict = "OK"
+        reach = f"reach pred={row.get('pred_reachable')} gold={row.get('gold_reachable')} "
+        detail = reach + f"ends={row.get('endpoints_correct')} {verdict}"
+    else:
+        file_prf = row.get("file_prf", {})
+        qual_prf = row.get("qual_prf", {})
+        f1_part = f"fileF1={file_prf.get('f1')} qualF1={qual_prf.get('f1')} pred_n={row.get('pred_n')} "
+        detail = f1_part + f"cap(graph_fileF1)={row.get('graph_file_f1')}"
+    header = f"[nav] {row['id']} [{row['config']}] {detail} "
+    tokens = f"in={row['input_tokens']} out={row['output_tokens']} "
+    tools = f"navtools={row['nav_tool_calls']} tools={row['tool_calls_total']} "
+    footer = f"parse_err={row.get('parse_error')} wall={row['wall_clock_sec']}s"
+    print(header + tokens + tools + footer)
+
+
+# ---------------------------------------------------------------------------
+# Aggregate report
+# ---------------------------------------------------------------------------
+
+
+def _mean(xs: list[float]) -> float:
+    """Return the arithmetic mean of ``xs`` rounded to 4 places (0.0 if empty)."""
+    if not xs:
+        return 0.0
+    return round(sum(xs) / len(xs), 4)
+
+
+def _median(xs: list[float]) -> float:
+    """Return the median of ``xs`` rounded to 2 places (0.0 if empty).
+
+    For an even number of values the two middle values are averaged.
+    """
+    if not xs:
+        return 0.0
+    ordered = sorted(xs)
+    half, is_odd = divmod(len(ordered), 2)
+    if is_odd:
+        middle = ordered[half]
+    else:
+        middle = (ordered[half - 1] + ordered[half]) / 2
+    return round(middle, 2)
+
+
+def aggregate(rows: list[dict]) -> dict[str, Any]:
+    """Summarize completed rows per arm and per question type.
+
+    Only rows with a truthy ``completed`` flag are counted. Each arm's entry
+    holds the row count, a ``by_type`` breakdown (accuracy figures for "path"
+    questions, mean F1/recall for the set types, plus median token and
+    nav-call figures), arm-wide median token/premium figures, the mean file F1
+    over multihop set-type questions (``None`` if there are none), and the
+    number of rows with a parse error. The ``_paired`` key holds the paired
+    deltas computed by ``_paired_deltas``.
+    """
+    report: dict[str, Any] = {}
+    for arm in sorted({r["config"] for r in rows if r.get("completed")}):
+        arm_rows = [r for r in rows if r["config"] == arm and r.get("completed")]
+
+        by_type: dict[str, Any] = {}
+        for typ in ("callers", "callees", "blast_radius", "path"):
+            typed = [r for r in arm_rows if r["type"] == typ]
+            if not typed:
+                continue
+            if typ == "path":
+                summary = {
+                    "n": len(typed),
+                    "path_acc": _mean([float(bool(r.get("path_correct"))) for r in typed]),
+                    "boolean_acc": _mean([float(bool(r.get("boolean_correct"))) for r in typed]),
+                }
+            else:
+                summary = {
+                    "n": len(typed),
+                    "file_f1": _mean([r["file_prf"]["f1"] for r in typed]),
+                    "qual_f1": _mean([r["qual_prf"]["f1"] for r in typed]),
+                    "loose_qual_f1": _mean([r["loose_qual_prf"]["f1"] for r in typed]),
+                    "file_recall": _mean([r["file_prf"]["recall"] for r in typed]),
+                }
+            summary["median_total_tokens"] = _median([r["total_tokens"] for r in typed])
+            summary["median_nav_calls"] = _median([r["nav_tool_calls"] for r in typed])
+            if typ != "path":
+                # The graph cap is only reported when at least one row has it.
+                known_caps = [
+                    r["graph_file_f1"] for r in typed if r.get("graph_file_f1") is not None
+                ]
+                if known_caps:
+                    summary["graph_file_f1_cap"] = _mean(known_caps)
+            by_type[typ] = summary
+
+        multihop = [r for r in arm_rows if r["hop"] == "multihop" and r["type"] != "path"]
+        multihop_f1 = None
+        if multihop:
+            multihop_f1 = _mean([r["file_prf"]["f1"] for r in multihop])
+        report[arm] = {
+            "n": len(arm_rows),
+            "by_type": by_type,
+            "median_total_tokens": _median([r["total_tokens"] for r in arm_rows]),
+            "median_input_tokens": _median([r["input_tokens"] for r in arm_rows]),
+            "median_output_tokens": _median([r["output_tokens"] for r in arm_rows]),
+            "median_premium": _median([r["premium_requests"] for r in arm_rows]),
+            "multihop_file_f1": multihop_f1,
+            "parse_errors": len([r for r in arm_rows if r.get("parse_error")]),
+        }
+    report["_paired"] = _paired_deltas(rows)
+    return report
+
+
+def _paired_deltas(rows: list[dict]) -> dict[str, Any]:
+    """Summarise per-question score differences between code_graph and each other arm.
+
+    Only completed rows are used. A row's score is ``path_correct`` (as 0.0/1.0)
+    for ``path`` questions and file-level F1 otherwise. For every other arm the
+    deltas ``code_graph - other`` over the question ids both arms answered are
+    reported as a mean, a win/loss/tie tally and a seeded percentile-bootstrap
+    90% interval (2000 resamples) -- small-n honesty (rubber-duck #5).
+
+    Returns an empty dict when no completed code_graph row exists; arms that
+    share no question id with code_graph are omitted.
+    """
+    import random
+
+    scores: dict[str, dict[str, float]] = {}
+    for rec in rows:
+        if rec.get("completed"):
+            if rec["type"] == "path":
+                score = 1.0 if rec.get("path_correct") else 0.0
+            else:
+                score = rec["file_prf"]["f1"]
+            scores.setdefault(rec["config"], {})[rec["id"]] = score
+
+    summary: dict[str, Any] = {}
+    if CODE_GRAPH not in scores:
+        return summary
+    baseline = scores[CODE_GRAPH]
+    for arm in sorted(scores):
+        if arm == CODE_GRAPH:
+            continue
+        rival = scores[arm]
+        shared_ids = sorted(set(baseline) & set(rival))
+        deltas = [baseline[qid] - rival[qid] for qid in shared_ids]
+        if not deltas:
+            continue
+        n = len(deltas)
+        # The generator is re-seeded for every arm, so each interval is
+        # reproducible independently of which other arms are present.
+        rng = random.Random(13)
+        resampled_means = []
+        for _ in range(2000):
+            draw = [deltas[rng.randrange(n)] for _ in range(n)]
+            resampled_means.append(sum(draw) / n)
+        resampled_means.sort()
+        lower = resampled_means[int(0.05 * len(resampled_means))]
+        upper = resampled_means[int(0.95 * len(resampled_means))]
+        summary[f"code_graph_minus_{arm}"] = {
+            "n_paired": n,
+            "mean_delta": round(sum(deltas) / n, 4),
+            "ci90": [round(lower, 4), round(upper, 4)],
+            "wins": sum(1 for d in deltas if d > 1e-9),
+            "losses": sum(1 for d in deltas if d < -1e-9),
+            "ties": sum(1 for d in deltas if abs(d) <= 1e-9),
+        }
+    return summary
+
+
+def _print_report(report: dict) -> None:
+    """Print the aggregate report to stdout.
+
+    Emits a banner, then one section per arm (every key except ``"_paired"``)
+    with the arm-level medians followed by one line per question type, and
+    finally the paired-delta lines when ``report["_paired"]`` is non-empty.
+    """
+    rule = "=" * 78
+    print("\n" + rule)
+    print("MULTI-HOP NAV — PREMIUM AGENT ARM")
+    print(rule)
+
+    for arm, stats in report.items():
+        if arm == "_paired":
+            continue
+        multihop_f1 = stats["multihop_file_f1"]
+        print(f"\n### {arm}  (n={stats['n']}, parse_errors={stats['parse_errors']})")
+        print(
+            f"  median_tokens total={stats['median_total_tokens']} "
+            f"in={stats['median_input_tokens']} out={stats['median_output_tokens']}  "
+            f"median_premium={stats['median_premium']}  "
+            f"multihop_file_f1={multihop_f1}"
+        )
+        for typ, t in stats["by_type"].items():
+            if typ == "path":
+                line = (
+                    f"    {typ:<13} n={t['n']} path_acc={t['path_acc']} "
+                    f"bool_acc={t['boolean_acc']} med_tok={t['median_total_tokens']} "
+                    f"med_navcalls={t['median_nav_calls']}"
+                )
+            else:
+                # The graph cap is optional; it is appended only when present.
+                cap = t.get("graph_file_f1_cap")
+                suffix = "" if cap is None else f" [graph_cap_fileF1={cap}]"
+                line = (
+                    f"    {typ:<13} n={t['n']} fileF1={t['file_f1']} qualF1={t['qual_f1']} "
+                    f"looseQ={t['loose_qual_f1']} fileRec={t['file_recall']} "
+                    f"med_tok={t['median_total_tokens']} med_navcalls={t['median_nav_calls']}"
+                ) + suffix
+            print(line)
+
+    paired = report.get("_paired") or {}
+    if not paired:
+        return
+    print("\n### paired deltas (code_graph minus other; file-F1 / path_correct)")
+    for label, d in paired.items():
+        print(
+            f"    {label}: mean_delta={d['mean_delta']} ci90={d['ci90']} "
+            f"W/L/T={d['wins']}/{d['losses']}/{d['ties']} (n={d['n_paired']})"
+        )
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def _load_done(results_path: Path) -> set[tuple]:
+    """Return the ``(id, config, model)`` keys already finished in the results jsonl.
+
+    A row counts as finished only when it is marked ``completed`` and was
+    produced by the current ``RUNNER_VERSION``. Blank lines and lines that are
+    not valid JSON are skipped. A missing file yields an empty set.
+    """
+    finished: set[tuple] = set()
+    if results_path.exists():
+        for raw_line in results_path.read_text().splitlines():
+            text = raw_line.strip()
+            if text:
+                try:
+                    rec = json.loads(text)
+                except json.JSONDecodeError:
+                    continue
+                if rec.get("completed") and rec.get("runner") == RUNNER_VERSION:
+                    finished.add((rec["id"], rec["config"], rec.get("model", "")))
+    return finished
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the premium agent arm over every selected (question, arm) pair.
+
+    Parses the CLI options, filters the question list (``--types``, ``--ids``,
+    ``--limit``), runs each pending pair through ``run_one_nav`` in a seeded
+    shuffled order (appending every row to the results jsonl as soon as it is
+    produced), then aggregates the completed rows, prints the report and writes
+    the summary JSON to ``--out``.
+
+    Prior rows from the results jsonl are merged into the report only when
+    resuming, and only when they are completed rows written by the current
+    ``RUNNER_VERSION`` for ``--model``. Anything else is exactly what gets
+    re-run in this invocation, so merging it as well would double-count those
+    questions (and, with ``--no-resume``, every question). When the file holds
+    several accepted rows for one ``(id, config, model)`` key the last one wins.
+
+    Args:
+        argv: argument list; ``None`` lets argparse read ``sys.argv``.
+
+    Returns:
+        0 on success.
+
+    Raises:
+        SystemExit: if the worktree directory does not exist (or on argparse
+            errors).
+    """
+    p = argparse.ArgumentParser(description="Multi-hop nav premium agent arm.")
+    p.add_argument("--questions", required=True)
+    p.add_argument("--worktree", default=None,
+                   help="repo cwd for all arms (default: questions JSON 'worktree')")
+    p.add_argument("--project", default="mh_uxarray",
+                   help="code_graph project name of the PRE-BUILT fixed graph")
+    p.add_argument("--port", type=int, default=6380, help="FalkorDB port")
+    p.add_argument("--model", default="claude-sonnet-4.6")
+    p.add_argument("--arms", nargs="*", default=list(ARMS), choices=list(ARMS))
+    p.add_argument("--types", nargs="*", default=None,
+                   choices=["callers", "callees", "blast_radius", "path"])
+    p.add_argument("--ids", nargs="*", default=None, help="restrict to these question ids")
+    p.add_argument("--limit", type=int, default=None, help="first N questions (post-filter)")
+    p.add_argument("--server-root", default=str(DEFAULT_MCP_SERVER_ROOT))
+    p.add_argument("--gate", default="/tmp/ux_gate.json",
+                   help="agentless gate output for per-question graph caps")
+    p.add_argument("--seed", type=int, default=13, help="run-order shuffle seed")
+    p.add_argument("--wall-time", type=float, default=900.0)
+    p.add_argument("--out", default="/tmp/ux_nav_agent.json")
+    p.add_argument("--results", default=None,
+                   help="append-only jsonl for resume (default: <out>.jsonl)")
+    p.add_argument("--no-resume", action="store_true")
+    args = p.parse_args(argv)
+
+    data = json.loads(Path(args.questions).read_text())
+    worktree = Path(args.worktree or data["worktree"]).resolve()
+    if not worktree.exists():
+        raise SystemExit(f"worktree not found: {worktree}")
+
+    qs = data["questions"]
+    if args.types:
+        qs = [q for q in qs if q["type"] in args.types]
+    if args.ids:
+        idset = set(args.ids)
+        qs = [q for q in qs if q["id"] in idset]
+    if args.limit:
+        qs = qs[: args.limit]
+
+    out_path = Path(args.out)
+    out_dir = out_path.with_suffix("")
+    out_dir.mkdir(parents=True, exist_ok=True)
+    results_path = Path(args.results) if args.results else out_path.with_suffix(".jsonl")
+
+    done = set() if args.no_resume else _load_done(results_path)
+    server_root = Path(args.server_root)
+    caps = _gate_caps(Path(args.gate) if args.gate else None)
+
+    print(
+        f"worktree={worktree}\nproject={args.project} port={args.port} "
+        f"model={args.model}\narms={args.arms} questions={len(qs)} "
+        f"caps_loaded={len(caps)} already_done={len(done)}"
+    )
+
+    # Carry over prior results only when resuming, and only rows this run will
+    # NOT redo: completed, same runner version, same model. Keyed by
+    # (id, config, model) so repeated runs appended to the same jsonl collapse
+    # to their most recent row instead of being counted twice.
+    prior: dict[tuple, dict] = {}
+    if not args.no_resume and results_path.exists():
+        for line in results_path.read_text().splitlines():
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                r = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            if not (r.get("completed") and r.get("runner") == RUNNER_VERSION):
+                continue
+            if r.get("model", "") != args.model:
+                continue
+            prior[(r["id"], r["config"], r.get("model", ""))] = r
+    rows: list[dict] = list(prior.values())
+
+    # Randomize (question, arm) order with a fixed seed so provider-side prompt
+    # caching / drift can't systematically favor whichever arm always runs first
+    # (rubber-duck #7).
+    import random
+
+    worklist = [(q, arm) for q in qs for arm in args.arms]
+    random.Random(args.seed).shuffle(worklist)
+
+    for q, arm in worklist:
+        key = (q["id"], arm, args.model)
+        if key in done:
+            continue
+        row = run_one_nav(
+            q,
+            track=arm,
+            model=args.model,
+            worktree=worktree,
+            project=args.project,
+            port=args.port,
+            server_root=server_root,
+            out_dir=out_dir,
+            wall_time=args.wall_time,
+            cap=caps.get(q["id"]),
+        )
+        rows.append(row)
+        # Append immediately so an interrupted run can be resumed.
+        with results_path.open("a") as f:
+            f.write(json.dumps(row) + "\n")
+
+    completed = [r for r in rows if r.get("completed")]
+    report = aggregate(completed)
+    _print_report(report)
+    out_path.write_text(json.dumps(
+        {
+            "worktree": str(worktree),
+            "project": args.project,
+            "model": args.model,
+            "n_rows": len(completed),
+            "report": report,
+            "rows": rows,
+        },
+        indent=2,
+    ))
+    print(f"\nwrote {out_path}  (jsonl: {results_path})")
+    return 0
+
+
+# Script entry point: exit the process with the status code returned by main().
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/bench/runners/nav_multihop_gate.py b/bench/runners/nav_multihop_gate.py
new file mode 100644
index 00000000..a4fe901f
--- /dev/null
+++ b/bench/runners/nav_multihop_gate.py
@@ -0,0 +1,291 @@
+"""Agentless answerability/compression gate for the multi-hop nav experiment (Lane 2).
+
+This is the FREE gate that runs BEFORE any premium agent spend. For each generated
+question (callers / callees / blast_radius / path) it computes three things:
+
+  (a) ORACLE gold        -- already embedded in the question record (jedi-based,
+                            graph-independent; see nav_multihop_oracle.py).
+  (b) CODE_GRAPH answer  -- a single Cypher query over the fixed-resolver CALLS
+                            graph (1-hop reverse/forward, [:CALLS*1..3] closure, or a
+                            bounded [:CALLS*1..8] reachability match) mapped to the SAME (relpath, qualname)
+                            identity the oracle uses, plus the byte/token size of
+                            that compact structured answer.
+  (c) NO-TOOL grep       -- the raw `grep -rn "<leaf>"` evidence the un-tooled agent
+                            must scan, plus the FILE set grep trivially yields.
+
+We then score set-F1 (qualname-level AND file-level) of the graph answer vs the
+oracle, and compare evidence-token compactness graph-vs-grep. The GATE DECISION:
+if code_graph is not both compact AND correct -- especially on the >=2-hop
+questions (blast_radius, path) where grep is expensive and wrong -- do NOT spend
+premium on the agent arm.
+
+Usage:
+  .venv/bin/python -m bench.runners.nav_multihop_gate \
+      --questions /tmp/ux_questions.json \
+      --graph code:mh_uxarray:_default --worktree <path> [--port 6380] [--out gate.json]
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import statistics as st
+import subprocess
+from pathlib import Path
+from typing import Any, Optional
+
+from bench.runners.struct_query_bench import _graph_query
+from bench.runners.nav_oracle_bench import Scope, build_scopes, enclosing_qualname
+
+
+# ---------------------------------------------------------------------------
+# token estimate (chars/4 heuristic, consistent across graph & grep so the
+# RELATIVE compactness comparison is fair regardless of the absolute tokenizer)
+# ---------------------------------------------------------------------------
+
+def _toks(s: str) -> int:
+    """Estimate the token count of ``s`` as its length divided by 4, rounded up."""
+    n_chars = len(s)
+    return -(-n_chars // 4)
+
+
+# ---------------------------------------------------------------------------
+# graph -> (relpath, qualname) mapping, shared by all query types
+# ---------------------------------------------------------------------------
+
+def _map_nodes(rows: list[list[Any]], worktree: Path,
+               scope_cache: dict[Path, list[Scope]]) -> set[tuple[str, str]]:
+    """Translate graph result rows into ``(relpath, qualname)`` identities.
+
+    Each row is ``[name, path, src_start]``; rows that are too short or have a
+    null name/path are dropped. The path is made relative to ``worktree``
+    (falling back to the bare file name when it is not under it). When a start
+    position is given and the file exists, the enclosing qualname at that
+    position replaces the node name unless it is ``"<module>"``.
+    ``scope_cache`` memoises ``build_scopes`` per file and is updated in place.
+    """
+    mapped: set[tuple[str, str]] = set()
+    for record in rows:
+        if len(record) < 2:
+            continue
+        raw_name, raw_path = record[0], record[1]
+        if raw_name is None or raw_path is None:
+            continue
+        name, path_text = str(raw_name), str(raw_path)
+
+        start = None
+        if len(record) > 2 and record[2] is not None:
+            start = int(record[2])
+
+        file_path = Path(path_text)
+        try:
+            rel = str(file_path.relative_to(worktree))
+        except ValueError:
+            rel = file_path.name
+
+        qualname = name
+        if start is not None and file_path.exists():
+            if file_path not in scope_cache:
+                scope_cache[file_path] = build_scopes(file_path)
+            enclosing = enclosing_qualname(scope_cache[file_path], start)
+            if enclosing != "<module>":
+                qualname = enclosing
+        mapped.add((rel, qualname))
+    return mapped
+
+
+def _subject_match(leaf: str, subj_path: str) -> str:
+    """Cypher predicate selecting the subject node ``s`` by leaf name + def path suffix.
+
+    Both values are embedded as single-quoted Cypher string literals.
+    Backslashes are escaped before quotes: escaping only the quote would let
+    an input that ends in a backslash (``a\\``) swallow the escape and leave
+    the quote unescaped, terminating the literal early.
+
+    Args:
+        leaf: unqualified symbol name to match against ``s.name``.
+        subj_path: path suffix to match against ``s.path`` with ``ENDS WITH``.
+
+    Returns:
+        The predicate text, without a leading ``WHERE``.
+    """
+    def _literal(text: str) -> str:
+        # Order matters: double the backslashes first, then escape quotes.
+        return text.replace("\\", "\\\\").replace("'", "\\'")
+
+    return f"s.name = '{_literal(leaf)}' AND s.path ENDS WITH '{_literal(subj_path)}'"
+
+
+def graph_callers(graph, port, leaf, subj_path, worktree, sc):
+    """Return the ``(relpath, qualname)`` set of nodes with a CALLS edge into the subject."""
+    where = _subject_match(leaf, subj_path)
+    cypher = (
+        f"MATCH (c)-[:CALLS]->(s) WHERE {where} "
+        "RETURN DISTINCT c.name, c.path, c.src_start"
+    )
+    result_rows = _graph_query(graph, cypher, port)
+    return _map_nodes(result_rows, worktree, sc)
+
+
+def graph_callees(graph, port, leaf, subj_path, worktree, sc):
+    """Return the ``(relpath, qualname)`` set of nodes the subject has a CALLS edge to."""
+    where = _subject_match(leaf, subj_path)
+    cypher = (
+        f"MATCH (s)-[:CALLS]->(c) WHERE {where} "
+        "RETURN DISTINCT c.name, c.path, c.src_start"
+    )
+    result_rows = _graph_query(graph, cypher, port)
+    return _map_nodes(result_rows, worktree, sc)
+
+
+def graph_blast(graph, port, leaf, subj_path, worktree, sc, depth=3):
+    """Return the ``(relpath, qualname)`` set of nodes reaching the subject via
+    1..``depth`` CALLS edges."""
+    where = _subject_match(leaf, subj_path)
+    cypher = (
+        f"MATCH (c)-[:CALLS*1..{depth}]->(s) WHERE {where} "
+        "RETURN DISTINCT c.name, c.path, c.src_start"
+    )
+    result_rows = _graph_query(graph, cypher, port)
+    return _map_nodes(result_rows, worktree, sc)
+
+
+def graph_reachable(graph, port, src_leaf, src_path, dst_leaf, dst_path, depth=8):
+    """Ask the graph whether the destination is reachable from the source.
+
+    Source and destination are each selected by exact name plus a path suffix
+    (``ENDS WITH``), then connected by a variable-length ``[:CALLS*1..depth]``
+    match. All four values are escaped before being embedded as single-quoted
+    Cypher literals (backslashes first, then quotes), so a quote or backslash
+    in a name or path cannot terminate the literal early.
+
+    Returns:
+        True when the query reports at least one path, False otherwise
+        (including when the query returns no rows). A string result is
+        compared case-insensitively against ``"true"``.
+    """
+    def _literal(text) -> str:
+        # Order matters: double the backslashes first, then escape quotes.
+        return str(text).replace("\\", "\\\\").replace("'", "\\'")
+
+    cy = (
+        f"MATCH (a) WHERE a.name='{_literal(src_leaf)}' "
+        f"AND a.path ENDS WITH '{_literal(src_path)}' "
+        f"MATCH (b) WHERE b.name='{_literal(dst_leaf)}' "
+        f"AND b.path ENDS WITH '{_literal(dst_path)}' "
+        f"WITH a, b MATCH p = (a)-[:CALLS*1..{depth}]->(b) RETURN count(p) > 0 LIMIT 1"
+    )
+    rows = _graph_query(graph, cy, port)
+    if rows and rows[0]:
+        v = rows[0][0]
+        # The boolean may come back as text; normalise it.
+        if isinstance(v, str):
+            return v.strip().lower() == "true"
+        return bool(v)
+    return False
+
+
+# ---------------------------------------------------------------------------
+# no-tool grep baseline
+# ---------------------------------------------------------------------------
+
+def grep_evidence(leaf: str, worktree: Path) -> tuple[str, set[str]]:
+    """Run a word-bounded recursive grep for ``leaf`` over the worktree's ``*.py`` files.
+
+    Returns ``(raw, files)``: the raw grep stdout (the evidence an un-tooled
+    agent would have to scan) and the set of matching files, relative to
+    ``worktree`` where possible and as a bare file name otherwise. Any failure
+    to run grep (including the 60 s timeout) yields empty output.
+    """
+    try:
+        command = [
+            "grep", "-rn", "--include=*.py",
+            r"\b" + re.escape(leaf) + r"\b",
+            str(worktree),
+        ]
+        raw = subprocess.run(command, capture_output=True, text=True, timeout=60).stdout
+    except Exception:
+        raw = ""
+
+    files: set[str] = set()
+    for hit in raw.splitlines():
+        # grep -rn prints "path:line:text"; keep everything before the first colon.
+        hit_path = Path(hit.partition(":")[0])
+        try:
+            files.add(str(hit_path.relative_to(worktree)))
+        except ValueError:
+            files.add(hit_path.name)
+    return raw, files
+
+
+# ---------------------------------------------------------------------------
+# scoring
+# ---------------------------------------------------------------------------
+
+def _prf(pred: set, gold: set) -> dict[str, float]:
+    """Score a predicted set against a gold set.
+
+    Returns the raw tp/fp/fn counts plus precision, recall and F1 rounded to
+    three decimals. With an empty denominator, precision is 1.0 only if nothing
+    was missed and recall is 1.0 only if nothing spurious was predicted, so two
+    empty sets score a perfect 1.0.
+    """
+    tp = len(pred & gold)
+    fp = len(pred) - tp
+    fn = len(gold) - tp
+
+    if tp + fp:
+        precision = tp / (tp + fp)
+    else:
+        precision = 0.0 if fn else 1.0
+
+    if tp + fn:
+        recall = tp / (tp + fn)
+    else:
+        recall = 0.0 if fp else 1.0
+
+    denom = precision + recall
+    f1 = 2 * precision * recall / denom if denom else 0.0
+    return {
+        "tp": tp,
+        "fp": fp,
+        "fn": fn,
+        "precision": round(precision, 3),
+        "recall": round(recall, 3),
+        "f1": round(f1, 3),
+    }
+
+
+def _gold_set(q: dict) -> set[tuple[str, str]]:
+    """Collect the question's gold entries as ``(path, qualname)`` pairs."""
+    pairs: set[tuple[str, str]] = set()
+    for entry in q["gold"]:
+        pairs.add((entry["path"], entry["qualname"]))
+    return pairs
+
+
+def _files(s: set[tuple[str, str]]) -> set[str]:
+    """Project a set of ``(path, qualname)`` pairs onto its distinct paths."""
+    return set(pair[0] for pair in s if len(pair) == 2 or tuple(pair)[:2][1:])
+
+
+def run(questions_path: Path, graph: str, worktree: Path, port: int) -> dict:
+    """Evaluate every question against the graph and the grep baseline.
+
+    ``path`` questions record the gold vs graph reachability verdict plus the
+    token size of the graph answer and of the two greps (source and target
+    leaf). Set questions (callers / callees / blast_radius) record
+    qualname-level and file-level P/R/F1 of the graph answer, its token size
+    and cardinality, and the grep output size and file recall. Questions of any
+    other type are dropped.
+
+    Returns a dict with the graph name, the worktree, the row count and the
+    per-question rows.
+    """
+    questions = json.loads(questions_path.read_text())["questions"]
+    scope_cache: dict[Path, list[Scope]] = {}
+    results: list[dict] = []
+
+    for q in questions:
+        kind = q["type"]
+        rec: dict[str, Any] = {"id": q["id"], "type": kind, "hop": q["hop"]}
+
+        if kind == "path":
+            src, dst = q["symbol"]["source"], q["symbol"]["target"]
+            expected = q["gold"]["reachable"]
+            answered = graph_reachable(
+                graph, port, src["leaf"], src["path"], dst["leaf"], dst["path"])
+            rec["gold_reachable"] = expected
+            rec["graph_reachable"] = answered
+            rec["correct"] = (answered == expected)
+            # The graph evidence is just the serialized verdict; grep has to be
+            # run once per endpoint.
+            rec["graph_tokens"] = _toks(json.dumps({"reachable": answered}))
+            src_raw, _ = grep_evidence(src["leaf"], worktree)
+            dst_raw, _ = grep_evidence(dst["leaf"], worktree)
+            rec["grep_tokens"] = _toks(src_raw) + _toks(dst_raw)
+            results.append(rec)
+            continue
+
+        leaf = q["symbol"]["leaf"]
+        subj_path = q["symbol"]["path"]
+        gold = _gold_set(q)
+
+        if kind not in ("callers", "callees", "blast_radius"):
+            continue
+        if kind == "blast_radius":
+            pred = graph_blast(graph, port, leaf, subj_path, worktree, scope_cache,
+                               depth=q.get("depth", 3))
+        else:
+            lookup = graph_callers if kind == "callers" else graph_callees
+            pred = lookup(graph, port, leaf, subj_path, worktree, scope_cache)
+
+        gold_files = _files(gold)
+        rec["qual_prf"] = _prf(pred, gold)
+        rec["file_prf"] = _prf(_files(pred), gold_files)
+        # Graph evidence = the compact, deterministically ordered structured answer.
+        answer = [{"path": p, "qualname": qn} for p, qn in sorted(pred)]
+        rec["graph_tokens"] = _toks(json.dumps(answer))
+        rec["graph_n"] = len(pred)
+        rec["gold_n"] = len(gold)
+        # Grep baseline: raw evidence size + the file recall it yields.
+        grep_raw, grep_files = grep_evidence(leaf, worktree)
+        rec["grep_tokens"] = _toks(grep_raw)
+        if gold_files:
+            rec["grep_file_recall"] = round(len(grep_files & gold_files) / len(gold_files), 3)
+        else:
+            rec["grep_file_recall"] = 1.0
+        results.append(rec)
+
+    return {"graph": graph, "worktree": str(worktree),
+            "n": len(results), "rows": results}
+
+
+def _agg(rows, types, key, sub=None):
+    """Mean (rounded to 3 decimals) of a numeric field over rows of the given types.
+
+    ``key`` selects the field; when ``sub`` is given and the field is a dict,
+    ``sub`` selects the entry inside it. Non-numeric values are ignored.
+    Returns None when no numeric value was found.
+    """
+    picked = []
+    for row in rows:
+        if row["type"] in types:
+            value = row.get(key)
+            if sub and isinstance(value, dict):
+                value = value.get(sub)
+            if isinstance(value, (int, float)):
+                picked.append(value)
+    if not picked:
+        return None
+    return round(st.mean(picked), 3)
+
+
+def report(res: dict) -> None:
+    """Print the gate summary for the result dict produced by ``run``.
+
+    Prints one table row per set-question type that has rows, a correctness
+    and token summary for ``path`` questions when any exist, and the
+    blast_radius F1 line when any multihop set question was scored.
+    """
+    rows = res["rows"]
+    print(f"\n=== AGENTLESS MULTI-HOP GATE === graph={res['graph']}  n={res['n']}\n")
+
+    print("SET QUESTIONS (graph CALLS answer vs jedi oracle):")
+    print(f"{'type':<14}{'qF1':>7}{'fileF1':>8}{'qRec':>7}{'qPrec':>7}"
+          f"{'gTok':>8}{'grepTok':>9}{'grepFRec':>9}")
+    for kind in ("callers", "callees", "blast_radius"):
+        matching = [r for r in rows if r["type"] == kind]
+        if matching:
+            only = [kind]
+            print(f"{kind:<14}"
+                  f"{_agg(rows, only, 'qual_prf', 'f1'):>7}"
+                  f"{_agg(rows, only, 'file_prf', 'f1'):>8}"
+                  f"{_agg(rows, only, 'qual_prf', 'recall'):>7}"
+                  f"{_agg(rows, only, 'qual_prf', 'precision'):>7}"
+                  f"{_agg(rows, only, 'graph_tokens'):>8.0f}"
+                  f"{_agg(rows, only, 'grep_tokens'):>9.0f}"
+                  f"{_agg(rows, only, 'grep_file_recall'):>9}")
+
+    path_rows = [r for r in rows if r["type"] == "path"]
+    if path_rows:
+        n_correct = sum(1 for r in path_rows if r["correct"])
+        print("\nPATH QUESTIONS (reachability bool, graph shortestPath vs oracle):")
+        print(f"  correct {n_correct}/{len(path_rows)}  "
+              f"avg graph_tokens={_agg(rows, ['path'], 'graph_tokens'):.0f}  "
+              f"avg grep_tokens={_agg(rows, ['path'], 'grep_tokens'):.0f}")
+
+    # Gate signal: correctness on the >=2-hop set questions.
+    scored_multihop = [r for r in rows if r["hop"] == "multihop" and "qual_prf" in r]
+    if scored_multihop:
+        blast = ["blast_radius"]
+        print(f"\n>=2-HOP (blast_radius) qF1={_agg(rows, blast, 'qual_prf', 'f1')} "
+              f"fileF1={_agg(rows, blast, 'file_prf', 'f1')}")
+    print()
+
+
+def main(argv=None):
+    """CLI entry point for the agentless gate.
+
+    Parses the arguments, runs the gate over the question file, prints the
+    report and, when ``--out`` is given, writes the raw result dict as JSON.
+
+    The worktree is resolved to an absolute path before use. Path mapping
+    relies on ``Path.relative_to(worktree)``, which fails for any absolute
+    path when ``--worktree`` is relative and then degrades to bare file names.
+    NOTE(review): this matters if the graph stores absolute paths -- confirm.
+
+    Args:
+        argv: argument list; ``None`` lets argparse read ``sys.argv``.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--questions", required=True)
+    ap.add_argument("--graph", required=True)
+    ap.add_argument("--worktree", required=True)
+    ap.add_argument("--port", type=int, default=6380)
+    ap.add_argument("--out", default=None)
+    a = ap.parse_args(argv)
+    worktree = Path(a.worktree).resolve()
+    res = run(Path(a.questions), a.graph, worktree, a.port)
+    report(res)
+    if a.out:
+        Path(a.out).write_text(json.dumps(res, indent=1))
+        print(f"wrote {a.out}")
+
+# Script entry point: run the gate CLI when this module is executed directly.
+if __name__ == "__main__":
+    main()
diff --git a/bench/runners/nav_multihop_oracle.py b/bench/runners/nav_multihop_oracle.py
new file mode 100644
index 00000000..35e77e0d
--- /dev/null
+++ b/bench/runners/nav_multihop_oracle.py
@@ -0,0 +1,550 @@
+"""Independent (graph-blind) multi-hop call-graph oracle for Lane 2.
+
+Builds a FORWARD call graph ``caller -> callee`` for a worktree using **jedi**
+(goto on every call site) + ``ast`` (scopes / call-site enumeration), with ZERO
+input from FalkorDB / the tree-sitter analyzer under test. From that one graph we
+derive all four question types' ground truth:
+
+  * callers(S)      = reverse 1-hop  -> {u : u -> S}
+  * callees(S)      = forward 1-hop  -> {v : S -> v}
+  * blast_radius(S) = reverse transitive closure, depth<=D -> who is affected if S changes
+  * path(A, B)      = forward reachability + one valid path (edges all in the oracle)
+
+Independence discipline (prereg-multihop-nav §2): the oracle never reads the
+graph; jedi is a different engine from the tree-sitter resolver we benchmark, so
+grading the graph/agent against it is non-circular. Node identity is
+``(relpath, qualname)`` -- the SAME identity ``nav_oracle_bench`` uses -- so graph
+and agent answers are directly comparable.
+
+The forward graph is expensive (one jedi goto per call site) so it is cached to
+``<worktree>/.nav_oracle_cache.json`` keyed by a digest of the .py file paths,
+mtimes (whole seconds) and sizes; pass ``--rebuild`` to force a rebuild.
+
+Usage:
+  .venv/bin/python -m bench.runners.nav_multihop_oracle --worktree <path> [--rebuild] [--depth 3]
+"""
+from __future__ import annotations
+
+import argparse
+import ast
+import hashlib
+import json
+import time
+from collections import deque
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+from bench.runners.nav_oracle_bench import (
+    Scope,
+    build_scopes,
+    enclosing_qualname,
+    find_definitions,
+    _iter_py,
+    _parse,
+)
+
+# A node in the call graph: (relpath, qualname). qualname uses dotted scope path
+# (e.g. "Grid.calculate_face_areas"); module-level call sites are "<module>".
+Node = tuple[str, str]
+
+
+# ---------------------------------------------------------------------------
+# Call-site enumeration (ast, source-only)
+# ---------------------------------------------------------------------------
+
+@dataclass
+class CallSite:
+    """One call expression found in a source file, positioned for a jedi lookup."""
+
+    # Enclosing function/class qualname, or "<module>".
+    caller_qual: str
+    # The identifier being called (attribute name or bare name).
+    callee_name: str
+    # 1-based line of the callee identifier.
+    line: int
+    # 0-based column of the callee identifier (jedi seed).
+    col: int
+
+
+def _callee_ident(func: ast.AST) -> Optional[tuple[str, int, int]]:
+    """Locate the identifier being called in a ``Call.func`` expression.
+
+    Returns ``(name, line, col0)`` pointing AT the callee identifier, suitable
+    for seeding jedi.goto: the name itself for a bare ``foo(`` call, or the
+    trailing attribute for a method-style ``obj.bar(`` call. Any other
+    expression form (subscripts, calls-of-calls, lambdas) yields None.
+    """
+    if isinstance(func, ast.Attribute):
+        # ast only exposes where the whole ``obj.attr`` expression ends; the
+        # attribute name is its final len(attr) columns.
+        last_line = getattr(func, "end_lineno", func.lineno)
+        stop_col = getattr(func, "end_col_offset", None)
+        if stop_col is not None:
+            return func.attr, last_line, stop_col - len(func.attr)
+        return None
+    if isinstance(func, ast.Name):
+        return func.id, func.lineno, func.col_offset
+    return None
+
+
+def enumerate_call_sites(path: Path, scopes: list[Scope]) -> list[CallSite]:
+    """List every call expression in ``path`` whose callee identifier can be located.
+
+    Each call is attributed to the qualname enclosing the line of its callee
+    expression. Returns an empty list when the file cannot be parsed.
+    """
+    tree = _parse(path)
+    if tree is None:
+        return []
+    sites: list[CallSite] = []
+    calls = (n for n in ast.walk(tree) if isinstance(n, ast.Call))
+    for call in calls:
+        located = _callee_ident(call.func)
+        if located is not None:
+            name, line, col = located
+            owner = enclosing_qualname(scopes, call.func.lineno)
+            sites.append(CallSite(caller_qual=owner, callee_name=name, line=line, col=col))
+    return sites
+
+
+# ---------------------------------------------------------------------------
+# Forward call graph (jedi-resolved)
+# ---------------------------------------------------------------------------
+
+@dataclass
+class CallGraph:
+    """Directed call graph over ``(relpath, qualname)`` nodes, indexed both ways."""
+
+    worktree: str
+    # caller -> {callee}
+    fwd: dict[Node, set[Node]] = field(default_factory=dict)
+    # callee -> {caller}
+    rev: dict[Node, set[Node]] = field(default_factory=dict)
+    nodes: set[Node] = field(default_factory=set)
+    # qualname-leaf -> set of Nodes defining it (for question phrasing)
+    by_name: dict[str, set[Node]] = field(default_factory=dict)
+
+    def add_edge(self, u: Node, v: Node) -> None:
+        """Record the call ``u -> v`` in both indexes; self-calls are ignored."""
+        if u != v:
+            self.fwd.setdefault(u, set()).add(v)
+            self.rev.setdefault(v, set()).add(u)
+            self.nodes.update((u, v))
+
+    def successors(self, s: Node) -> set[Node]:
+        """Return a copy of the direct callees of ``s``."""
+        return set(self.fwd.get(s, ()))
+
+    def predecessors(self, s: Node) -> set[Node]:
+        """Return a copy of the direct callers of ``s``."""
+        return set(self.rev.get(s, ()))
+
+    def reverse_closure(self, s: Node, depth: int) -> set[Node]:
+        """All nodes that transitively reach s within <=depth hops (excl. s)."""
+        reached: set[Node] = set()
+        layer = [s]
+        hops = 0
+        # Expand one hop at a time, so a node is first seen at its shortest distance.
+        while layer and hops < depth:
+            next_layer = []
+            for cur in layer:
+                for caller in self.rev.get(cur, ()):
+                    if caller != s and caller not in reached:
+                        reached.add(caller)
+                        next_layer.append(caller)
+            layer = next_layer
+            hops += 1
+        return reached
+
+    def forward_path(self, a: Node, b: Node, max_depth: int = 8) -> Optional[list[Node]]:
+        """One shortest forward path a->...->b (BFS), or None if unreachable
+        within ``max_depth`` hops."""
+        if a == b:
+            return [a]
+        parent: dict[Node, Node] = {a: a}
+        layer = [a]
+        hops = 0
+        while layer and hops < max_depth:
+            next_layer = []
+            for cur in layer:
+                for callee in self.fwd.get(cur, ()):
+                    if callee in parent:
+                        continue
+                    parent[callee] = cur
+                    if callee == b:
+                        # Walk the parent links back to the start, then flip.
+                        trail = [b]
+                        while trail[-1] != a:
+                            trail.append(parent[trail[-1]])
+                        trail.reverse()
+                        return trail
+                    next_layer.append(callee)
+            layer = next_layer
+            hops += 1
+        return None
+
+
+def _def_node_for(name_obj, worktree: Path, scope_cache: dict[Path, list[Scope]]) -> Optional[Node]:
+    """Convert a jedi goto result into a ``(relpath, qualname)`` node.
+
+    Returns None when the result has no module path, lives outside the
+    worktree (stdlib / site-packages), is not a function or class, or has no
+    line number. ``scope_cache`` memoises ``build_scopes`` per file and is
+    updated in place.
+    """
+    module_path = getattr(name_obj, "module_path", None)
+    if module_path is None:
+        return None
+    src = Path(module_path)
+    try:
+        rel = str(src.relative_to(worktree))
+    except ValueError:
+        return None  # stdlib / site-packages
+    if getattr(name_obj, "type", None) not in ("function", "class"):
+        return None
+    line = getattr(name_obj, "line", None)
+    if line is None:
+        return None
+
+    if src not in scope_cache:
+        scope_cache[src] = build_scopes(src)
+    qualname = enclosing_qualname(scope_cache[src], line)
+    if qualname == "<module>":
+        # A definition should always be enclosed by its own scope, so this is
+        # not expected; fall back to the leaf name as a guard.
+        qualname = getattr(name_obj, "name", "<module>")
+    return (rel, qualname)
+
+
+def build_call_graph(worktree: Path, *, progress: bool = True) -> CallGraph:
+    """Build the forward call graph of every Python file under ``worktree``.
+
+    Every scope found by ``build_scopes`` is registered as a node (and indexed
+    by leaf name) so isolated definitions still exist. Each call site is then
+    resolved with ``jedi.Script.goto``, and every target that maps to a
+    function/class inside the worktree becomes a ``caller -> callee`` edge.
+    Files for which a jedi Script cannot be created, and call sites whose
+    lookup raises, are skipped. With ``progress`` a status line is printed
+    every 20 files and once at the end.
+    """
+    import jedi
+
+    cg = CallGraph(worktree=str(worktree))
+
+    def edge_total() -> int:
+        return sum(len(callees) for callees in cg.fwd.values())
+
+    project = jedi.Project(str(worktree))
+    scope_cache: dict[Path, list[Scope]] = {}
+    files = list(_iter_py(worktree))
+    started = time.time()
+    for seen, path in enumerate(files, start=1):
+        if path not in scope_cache:
+            scope_cache[path] = build_scopes(path)
+        scopes = scope_cache[path]
+        try:
+            rel = str(path.relative_to(worktree))
+        except ValueError:
+            continue
+
+        # Register every defined scope as a node (so isolated defs still exist).
+        for scope in scopes:
+            node = (rel, scope.qualname)
+            cg.nodes.add(node)
+            cg.by_name.setdefault(scope.qualname.rsplit(".", 1)[-1], set()).add(node)
+
+        sites = enumerate_call_sites(path, scopes)
+        try:
+            script = jedi.Script(path=str(path), project=project)
+        except Exception:
+            continue
+        for site in sites:
+            try:
+                targets = script.goto(site.line, site.col, follow_imports=True,
+                                      follow_builtin_imports=False)
+            except Exception:
+                continue
+            caller = (rel, site.caller_qual)
+            for target in targets:
+                callee = _def_node_for(target, worktree, scope_cache)
+                if callee is not None:
+                    cg.add_edge(caller, callee)
+
+        if progress and seen % 20 == 0:
+            print(f"  [oracle] {seen}/{len(files)} files  "
+                  f"edges={edge_total()}  "
+                  f"{time.time() - started:.0f}s", flush=True)
+    if progress:
+        print(f"  [oracle] DONE {len(files)} files  nodes={len(cg.nodes)}  "
+              f"edges={edge_total()}  {time.time() - started:.0f}s",
+              flush=True)
+    return cg
+
+
+# ---------------------------------------------------------------------------
+# Cache
+# ---------------------------------------------------------------------------
+
+def _digest(worktree: Path) -> str:
+    """Fingerprint the worktree's Python files for cache invalidation.
+
+    Hashes, in sorted order, each file's path, whole-second mtime and size,
+    and returns the first 16 hex digits of the SHA-256. Files that cannot be
+    stat'ed are left out.
+    """
+    hasher = hashlib.sha256()
+    for src in sorted(_iter_py(worktree)):
+        try:
+            info = src.stat()
+        except OSError:
+            continue
+        for part in (str(src), str(int(info.st_mtime)), str(info.st_size)):
+            hasher.update(part.encode())
+    return hasher.hexdigest()[:16]
+
+
+def _cache_path(worktree: Path) -> Path:
+    """Return the location of the oracle cache file inside ``worktree``."""
+    cache_name = ".nav_oracle_cache.json"
+    return worktree.joinpath(cache_name)
+
+
+def load_or_build(worktree: Path, *, rebuild: bool = False) -> CallGraph:
+    """Return the oracle call graph, using the on-disk cache when it is valid.
+
+    The cache is used only when it exists, ``rebuild`` is false and its stored
+    digest equals the current ``_digest(worktree)``. An unreadable or
+    malformed cache (bad JSON, wrong top-level type, missing keys, wrongly
+    shaped node/edge entries) is treated as a miss and triggers a rebuild
+    instead of crashing.
+
+    After a rebuild the cache is rewritten on a best-effort basis: a write
+    failure (e.g. a read-only worktree) is reported and the freshly built graph
+    is still returned rather than discarded.
+
+    Args:
+        worktree: repository root to analyse.
+        rebuild: ignore any existing cache and rebuild from source.
+    """
+    cp = _cache_path(worktree)
+    dig = _digest(worktree)
+    if cp.exists() and not rebuild:
+        try:
+            raw = json.loads(cp.read_text())
+            if raw.get("digest") == dig:
+                cg = CallGraph(worktree=str(worktree))
+                for u, v in raw["edges"]:
+                    cg.add_edge(tuple(u), tuple(v))
+                for n in raw.get("nodes", []):
+                    node = tuple(n)
+                    cg.nodes.add(node)
+                    # "<module>" marks module-level call sites, not definitions.
+                    # A fresh build only indexes defined scopes by leaf name, so
+                    # keep the cached index consistent with it.
+                    if node[1] == "<module>":
+                        continue
+                    leaf = node[1].rsplit(".", 1)[-1]
+                    cg.by_name.setdefault(leaf, set()).add(node)
+                print(f"  [oracle] loaded cache {cp.name} "
+                      f"(nodes={len(cg.nodes)} edges={sum(len(v) for v in cg.fwd.values())})")
+                return cg
+        except (OSError, json.JSONDecodeError, KeyError, ValueError,
+                TypeError, IndexError, AttributeError):
+            # Any unreadable or malformed cache falls through to a rebuild.
+            pass
+    cg = build_call_graph(worktree)
+    edges = [[list(u), list(v)] for u, vs in cg.fwd.items() for v in vs]
+    try:
+        cp.write_text(json.dumps({
+            "digest": dig,
+            "nodes": [list(n) for n in sorted(cg.nodes)],
+            "edges": edges,
+        }))
+    except OSError as exc:
+        # The graph is expensive to build; never lose it over a cache write.
+        print(f"  [oracle] could not write cache {cp.name}: {exc}")
+    else:
+        print(f"  [oracle] wrote cache {cp.name} ({len(edges)} edges)")
+    return cg
+
+
+def _distinctive(qual: str, generic: set[str]) -> bool:
+    # True when the last dotted segment is long (>=5), public, and not a generic name.
+    tail = qual.rpartition(".")[2]
+    return not (len(tail) < 5 or tail.startswith("_") or tail in generic or tail.lower() in generic)
+
+
+def generate_questions(
+    cg: CallGraph, worktree: Path, *, seed: int, per_type: int, depth: int = 3,
+) -> list[dict]:
+    """Graph-blind question universe sampled from the INDEPENDENT oracle (never
+    from FalkorDB). Up to `per_type` questions per type, with cardinality bands so
+    we measure navigation, not list-enumeration (prereg + rubber-duck §3).
+
+      callers      1-hop : gold-set size 3..30
+      callees      1-hop : gold-set size 3..30
+      blast_radius >=2hop: reverse closure(<=depth) size 5..50 AND > indeg
+                           (genuine multi-hop, not just the 1-hop caller set)
+      path         >=2hop: B reachable from A with path length >=3 nodes for about
+                           half of `per_type`; the rest are unreachable negatives.
+
+    Subjects skip test/fixture nodes and non-distinctive names; the single-definition
+    gate (find_definitions) keeps the referent unambiguous. Returns question dicts
+    (id, type, hop, symbol, question, gold[, depth]); sampling uses Random(seed)."""
+    import random
+    from bench.runners.struct_query_bench import _GENERIC
+
+    rng = random.Random(seed)
+    # cache single-def leaf names to avoid repeated ast scans
+    def_cache: dict[str, int] = {}
+
+    def single_def(leaf: str) -> bool:
+        if leaf not in def_cache:
+            def_cache[leaf] = len(find_definitions(worktree, leaf))
+        return def_cache[leaf] == 1
+
+    nodes = sorted(cg.nodes)  # fixed starting order so the seeded shuffle is repeatable
+    rng.shuffle(nodes)
+
+    def _is_test(node: Node) -> bool:
+        rel, qual = node
+        leaf = qual.rsplit(".", 1)[-1]
+        return ("/test" in rel or rel.startswith("test")
+                or "/tests/" in rel or leaf.startswith("test_")
+                or "conftest" in rel)
+
+    # subject pool excludes test/fixture nodes — graph value lives in library code
+    nodes = [n for n in nodes if not _is_test(n)]
+
+    def phrase_set(node: Node) -> dict:
+        return {"path": node[0], "qualname": node[1]}
+
+    out: list[dict] = []
+
+    # ---- callers (1-hop reverse) ----
+    picked = 0
+    for n in nodes:
+        if picked >= per_type:
+            break
+        rel, qual = n
+        leaf = qual.rsplit(".", 1)[-1]
+        if not _distinctive(qual, _GENERIC):
+            continue
+        callers = cg.predecessors(n)
+        if not (3 <= len(callers) <= 30):
+            continue
+        if not single_def(leaf):
+            continue
+        out.append({
+            "id": f"callers::{rel}::{qual}",
+            "type": "callers", "hop": "1hop",
+            "symbol": {"path": rel, "qualname": qual, "leaf": leaf},
+            "question": (f"List every function that directly CALLS the function "
+                         f"`{qual}` (defined in `{rel}`). Return the caller functions."),
+            "gold": [phrase_set(c) for c in sorted(callers)],
+        })
+        picked += 1
+
+    # ---- callees (1-hop forward) ----
+    picked = 0
+    for n in nodes:
+        if picked >= per_type:
+            break
+        rel, qual = n
+        leaf = qual.rsplit(".", 1)[-1]
+        if not _distinctive(qual, _GENERIC):
+            continue
+        callees = cg.successors(n)
+        if not (3 <= len(callees) <= 30):
+            continue
+        if not single_def(leaf):
+            continue
+        out.append({
+            "id": f"callees::{rel}::{qual}",
+            "type": "callees", "hop": "1hop",
+            "symbol": {"path": rel, "qualname": qual, "leaf": leaf},
+            "question": (f"List every function that the function `{qual}` "
+                         f"(defined in `{rel}`) directly CALLS. Return the callee functions."),
+            "gold": [phrase_set(c) for c in sorted(callees)],
+        })
+        picked += 1
+
+    # ---- blast_radius (>=2-hop reverse closure) ----
+    picked = 0
+    for n in nodes:
+        if picked >= per_type:
+            break
+        rel, qual = n
+        leaf = qual.rsplit(".", 1)[-1]
+        if not _distinctive(qual, _GENERIC):
+            continue
+        indeg = len(cg.predecessors(n))
+        closure = cg.reverse_closure(n, depth)
+        if not (5 <= len(closure) <= 50):
+            continue
+        if len(closure) <= indeg:  # must extend beyond the 1-hop caller set
+            continue
+        if not single_def(leaf):
+            continue
+        out.append({
+            "id": f"blast::{rel}::{qual}",
+            "type": "blast_radius", "hop": "multihop",
+            "symbol": {"path": rel, "qualname": qual, "leaf": leaf},
+            "question": (f"If the signature/behaviour of `{qual}` (defined in `{rel}`) "
+                         f"changes, which functions are potentially AFFECTED? Include all "
+                         f"functions that reach `{qual}` through up to {depth} levels of "
+                         f"calls (transitive callers)."),
+            "gold": [phrase_set(c) for c in sorted(closure)],
+            "depth": depth,
+        })
+        picked += 1
+
+    # ---- path (>=2-hop forward reachability) — ~half positive, ~half negative ----
+    n_pos = (per_type + 1) // 2
+    n_neg = per_type - n_pos
+    picked = 0
+    src_pool = [n for n in nodes if cg.successors(n) and _distinctive(n[1], _GENERIC)]
+    attempts = 0
+    seen_pairs: set[tuple[Node, Node]] = set()
+    while picked < n_pos and attempts < len(src_pool) * 4 and src_pool:
+        attempts += 1
+        a = rng.choice(src_pool)
+        # forward reachable nodes within a few hops
+        reach: list[Node] = []
+        frontier = deque([(a, 0)])
+        seen = {a}
+        while frontier:
+            cur, d = frontier.popleft()
+            if d >= 5:  # expand at most 5 hops away from `a`
+                continue
+            for v in sorted(cg.fwd.get(cur, set())):  # sorted: raw set order varies per process
+                if v not in seen:
+                    seen.add(v)
+                    if d + 1 >= 2:  # direct (1-hop) callees are not eligible targets
+                        reach.append(v)
+                    frontier.append((v, d + 1))
+        reach = [b for b in reach if _distinctive(b[1], _GENERIC) and not _is_test(b)]
+        if not reach:
+            continue
+        b = rng.choice(reach)
+        if (a, b) in seen_pairs:
+            continue
+        path = cg.forward_path(a, b)
+        if path is None or len(path) < 3:
+            continue
+        if not (single_def(a[1].rsplit(".", 1)[-1]) and single_def(b[1].rsplit(".", 1)[-1])):
+            continue
+        seen_pairs.add((a, b))
+        out.append({
+            "id": f"path::{a[0]}::{a[1]}->{b[0]}::{b[1]}",
+            "type": "path", "hop": "multihop",
+            "symbol": {"source": {"path": a[0], "qualname": a[1], "leaf": a[1].rsplit('.', 1)[-1]},
+                       "target": {"path": b[0], "qualname": b[1], "leaf": b[1].rsplit('.', 1)[-1]}},
+            "question": (f"Is there a chain of function calls starting from `{a[1]}` "
+                         f"(in `{a[0]}`) that eventually reaches `{b[1]}` (in `{b[0]}`)? "
+                         f"If yes, give one such call path."),
+            "gold": {"reachable": True, "path": [phrase_set(p) for p in path]},
+        })
+        picked += 1
+
+    # negatives: A,B both real symbols with NO forward path A->B
+    picked_neg = 0
+    cand = [n for n in nodes if _distinctive(n[1], _GENERIC) and not _is_test(n)]
+    attempts = 0
+    while picked_neg < n_neg and attempts < len(cand) * 8 and len(cand) > 2:
+        attempts += 1
+        a = rng.choice(cand)
+        b = rng.choice(cand)
+        if a == b or (a, b) in seen_pairs:
+            continue
+        # require a has outgoing edges (otherwise trivially unreachable)
+        if not cg.successors(a):
+            continue
+        if cg.forward_path(a, b) is not None:
+            continue
+        if not (single_def(a[1].rsplit(".", 1)[-1]) and single_def(b[1].rsplit(".", 1)[-1])):
+            continue
+        seen_pairs.add((a, b))
+        out.append({
+            "id": f"path::{a[0]}::{a[1]}->{b[0]}::{b[1]}",
+            "type": "path", "hop": "multihop",
+            "symbol": {"source": {"path": a[0], "qualname": a[1], "leaf": a[1].rsplit('.', 1)[-1]},
+                       "target": {"path": b[0], "qualname": b[1], "leaf": b[1].rsplit('.', 1)[-1]}},
+            "question": (f"Is there a chain of function calls starting from `{a[1]}` "
+                         f"(in `{a[0]}`) that eventually reaches `{b[1]}` (in `{b[0]}`)? "
+                         f"If yes, give one such call path."),
+            "gold": {"reachable": False, "path": []},
+        })
+        picked_neg += 1
+
+    return out
+
+
+def _stats(cg: CallGraph, depth: int) -> dict:
+    """Summarise degree statistics of the call graph (`depth` is accepted but unused)."""
+    import statistics as st
+    in_degrees = [len(cg.rev.get(node, set())) for node in cg.nodes]
+    out_degrees = [len(cg.fwd.get(node, set())) for node in cg.nodes]
+    with_callers = [deg for deg in in_degrees if deg]
+    with_callees = [deg for deg in out_degrees if deg]
+    summary: dict = {"nodes": len(cg.nodes), "edges": sum(out_degrees)}
+    summary["nodes_with_callers"] = len(with_callers)
+    summary["nodes_with_callees"] = len(with_callees)
+    # medians cover only non-zero degrees; empty inputs fall back to 0
+    summary["median_indeg_nz"] = st.median(with_callers) if with_callers else 0
+    summary["median_outdeg_nz"] = st.median(with_callees) if with_callees else 0
+    summary["max_indeg"] = max(in_degrees, default=0)
+    summary["max_outdeg"] = max(out_degrees, default=0)
+    return summary
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+    """CLI entry: print oracle graph stats; optionally generate (and save) questions."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--worktree", required=True, type=Path)
+    parser.add_argument("--rebuild", action="store_true")
+    parser.add_argument("--depth", type=int, default=3)
+    parser.add_argument("--questions", type=int, default=0,
+                        help="if >0, generate this many questions PER TYPE and write --out")
+    parser.add_argument("--seed", type=int, default=1234)
+    parser.add_argument("--out")
+    opts = parser.parse_args(argv)
+    root = opts.worktree.resolve()
+    graph = load_or_build(root, rebuild=opts.rebuild)
+    print(json.dumps(_stats(graph, opts.depth), indent=2, default=str))
+    if not opts.questions:
+        return 0
+    questions = generate_questions(graph, root, seed=opts.seed, per_type=opts.questions, depth=opts.depth)
+    from collections import Counter
+    type_counts = dict(Counter(q["type"] for q in questions))
+    hop_counts = dict(Counter(q["hop"] for q in questions))
+    print(f"\ngenerated {len(questions)} questions  by_type={type_counts}  by_hop={hop_counts}")
+    for q in questions:
+        tail = (f"{q['id'][:70]}  gold={'reachable' if q['gold']['reachable'] else 'no-path'}"
+                if q["type"] == "path" else f"{q['symbol']['qualname']:32} gold_n={len(q['gold'])}")
+        print(f"  [{q['type']:12} {q['hop']:8}] {tail}")
+    if opts.out:
+        Path(opts.out).write_text(json.dumps({"worktree": str(root), "questions": questions}, indent=2))
+        print(f"wrote {opts.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/bench/runners/nav_oracle_bench.py b/bench/runners/nav_oracle_bench.py
new file mode 100644
index 00000000..7e2cbe37
--- /dev/null
+++ b/bench/runners/nav_oracle_bench.py
@@ -0,0 +1,340 @@
+"""Deterministic graph-vs-oracle navigation accuracy bench (Lane 2, the FREE half).
+
+Fixes the circular-validation flaw of ``struct_query_bench`` (which grades the graph
+against its own CALLS edges). Here the GROUND TRUTH for "who calls S" comes from an
+INDEPENDENT oracle -- jedi find-references -- run on the same worktree source, never
+from the graph. We then score the graph's CALLS answer against that oracle.
+
+Independence discipline (prereg-multihop-nav §2, §6):
+  * Symbol definitions are located via ``ast`` over the source, NOT via the graph.
+  * Symbols with 0 or >1 definitions in the worktree are DROPPED (oracle-uncertain);
+    we log how many, keeping the gold clean at the stated cost of generalization.
+  * jedi references are filtered to CALL sites and mapped to their enclosing function
+    via ``ast`` (source-only), so the comparison unit -- (relpath, caller_qualname) --
+    is computed without consulting the graph.
+
+Comparison unit: the SET of caller functions, identified by (relpath, qualname).
+Module-level call sites map to qualname ``"<module>"``. We report per-symbol set
+precision / recall / F1 of GRAPH vs ORACLE, macro-averaged, plus the raw disagreement
+lists (graph-only and oracle-only callers) for hand audit.
+
+Usage:
+  .venv/bin/python -m bench.runners.nav_oracle_bench \
+      --graph code:loc-<hash>:_default --worktree <path> [--n 30] [--port 6380] [--json out.json]
+"""
+from __future__ import annotations
+
+import argparse
+import ast
+import json
+import statistics as st
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Optional
+
+from bench.runners.struct_query_bench import _GENERIC, _graph_query
+
+
+# ---------------------------------------------------------------------------
+# Source-side (graph-independent) helpers: definitions + enclosing scopes via ast
+# ---------------------------------------------------------------------------
+
+@dataclass
+class Scope:  # line span and dotted qualified name of one def/class found by ast
+    start: int          # 1-based first line (the def/class line)
+    end: int            # 1-based last line (inclusive)
+    qualname: str       # enclosing def/class names joined by "." (e.g. "Outer.method")
+
+
+def _iter_py(worktree: Path):
+    """Yield every *.py under `worktree` except those inside VCS/vendored/build dirs."""
+    skip = {".git", "node_modules", ".tox", "build", "dist", ".venv"}
+    for p in worktree.rglob("*.py"):
+        # test components BELOW the root only: a checkout under ~/build/ must still scan
+        if skip.isdisjoint(p.relative_to(worktree).parts):
+            yield p
+
+
+def _parse(path: Path) -> Optional[ast.AST]:
+    try:  # best-effort: a file that cannot be read or parsed yields None (callers skip it)
+        return ast.parse(path.read_text(encoding="utf-8", errors="replace"))
+    except (OSError, SyntaxError, ValueError, RecursionError):
+        return None
+
+
+def find_definitions(worktree: Path, name: str) -> list[tuple[Path, int, int]]:
+    """All (path, lineno, name_col) where a function/class `name` is DEFINED.
+
+    Source-only (ast), independent of the graph. name_col is the 0-based column of
+    the identifier itself (not the `def`/`class` keyword), suitable for seeding jedi.
+    It assumes single spaces after the keyword(s), as in conventionally formatted code.
+    """
+    # col_offset points at the first keyword, so skip the full keyword prefix;
+    # `async def` is longer than `def` and must not be measured as plain "def ".
+    prefix_len = {ast.ClassDef: len("class "), ast.FunctionDef: len("def "),
+                  ast.AsyncFunctionDef: len("async def ")}
+    out: list[tuple[Path, int, int]] = []
+    for p in _iter_py(worktree):
+        tree = _parse(p)
+        for node in (ast.walk(tree) if tree is not None else ()):
+            if type(node) in prefix_len and node.name == name:
+                out.append((p, node.lineno, node.col_offset + prefix_len[type(node)]))
+    return out
+
+
+def build_scopes(path: Path) -> list[Scope]:
+    """List every def/class in `path` (pre-order) as a Scope: line span + dotted qualname."""
+    module = _parse(path)
+    if module is None:
+        return []
+    found: list[Scope] = []
+    # Explicit DFS stack of (node, qualname prefix); children are pushed reversed
+    # so they pop in source order, matching a recursive pre-order walk.
+    pending: list[tuple[ast.AST, str]] = [(module, "")]
+    while pending:
+        node, prefix = pending.pop()
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+            qual = prefix + node.name
+            last = getattr(node, "end_lineno", None) or node.lineno
+            found.append(Scope(node.lineno, last, qual))
+            prefix = qual + "."
+        children = list(ast.iter_child_nodes(node))
+        pending.extend((kid, prefix) for kid in reversed(children))
+    return found
+
+
+def enclosing_qualname(scopes: list[Scope], line: int) -> str:
+    """Qualname of the innermost scope spanning `line`; '<module>' when none does.
+
+    "Innermost" is the covering scope with the greatest start line; on a tie the
+    earliest entry in `scopes` wins (max() keeps the first maximal element).
+    """
+    covering = [scope for scope in scopes if scope.start <= line <= scope.end]
+    return max(covering, key=lambda scope: scope.start).qualname if covering else "<module>"
+
+
+# ---------------------------------------------------------------------------
+# Oracle (jedi) caller set
+# ---------------------------------------------------------------------------
+
+def _is_call_site(line_text: str, col: int, name: str) -> bool:
+    """Return True when the text right after `name` at `col` opens a call.
+
+    Heuristic: skipping whitespace, the next character must be '('. This leaves
+    out imports, attribute reads, type hints and decorators used without a call.
+    """
+    return line_text[col + len(name):].lstrip()[:1] == "("
+
+
+def oracle_callers(
+    worktree: Path, defs: list[tuple[Path, int, int]], name: str,
+    scope_cache: dict[Path, list[Scope]],
+) -> Optional[set[tuple[str, str]]]:
+    """jedi find-references -> set of (relpath, caller_qualname) call sites.
+
+    Returns None if jedi can't resolve (oracle failure -> caller drops the symbol).
+    References are unioned across every entry of `defs`; `scope_cache` is filled in
+    place (one `build_scopes` per file), so callers may share it across symbols.
+    """
+    import jedi
+
+    callers: set[tuple[str, str]] = set()
+    lines_by_file: dict[Path, list[str]] = {}  # read each referencing file once, not per reference
+    project = jedi.Project(str(worktree))
+    for dpath, dline, dcol in defs:
+        try:
+            script = jedi.Script(path=str(dpath), project=project)
+            refs = script.get_references(line=dline, column=dcol, include_builtins=False)
+        except Exception:
+            return None
+        for r in refs:
+            if r.is_definition() or r.module_path is None or r.line is None:
+                continue
+            mp = Path(r.module_path)
+            try:
+                rel = str(mp.relative_to(worktree))
+            except ValueError:
+                continue  # reference outside the worktree (stdlib/site-packages)
+            if mp not in lines_by_file:
+                try:
+                    lines_by_file[mp] = mp.read_text(encoding="utf-8", errors="replace").splitlines()
+                except OSError:
+                    lines_by_file[mp] = []  # unreadable: every line lookup below misses
+            lines = lines_by_file[mp]
+            if not 0 < r.line <= len(lines) or not _is_call_site(lines[r.line - 1], r.column, name):
+                continue
+            if mp not in scope_cache:
+                scope_cache[mp] = build_scopes(mp)
+            callers.add((rel, enclosing_qualname(scope_cache[mp], r.line)))
+    return callers
+
+
+# ---------------------------------------------------------------------------
+# Graph caller set
+# ---------------------------------------------------------------------------
+
+def graph_callers(graph: str, port: int, name: str, worktree: Path) -> set[tuple[str, str]]:
+    """Callers of `name` per the graph's CALLS edges, as (relpath, caller_qualname).
+
+    Each caller row (name, path, src_start) is re-keyed to the oracle's identity:
+    path is made worktree-relative (basename if it lies elsewhere) and qualname is
+    re-derived from the source ast at src_start, keeping the bare caller name when
+    that lookup is impossible or lands at module level.
+    NOTE(review): assumes src_start uses the same 1-based lines as ast — confirm.
+    """
+    query = (
+        f"MATCH (c)-[:CALLS]->(s {{name:'{name}'}}) "
+        "RETURN DISTINCT c.name, c.path, c.src_start"
+    )
+    scopes_by_file: dict[Path, list[Scope]] = {}
+    result: set[tuple[str, str]] = set()
+    for record in _graph_query(graph, query, port):
+        if len(record) < 2 or record[0] is None or record[1] is None:
+            continue
+        caller_name, source = str(record[0]), Path(str(record[1]))
+        line = int(record[2]) if len(record) > 2 and record[2] is not None else None
+        # paths outside the worktree cannot be relativised; fall back to the file name
+        try:
+            rel = str(source.relative_to(worktree))
+        except ValueError:
+            rel = source.name
+        qual = caller_name
+        if line is not None and source.exists():
+            if source not in scopes_by_file:
+                scopes_by_file[source] = build_scopes(source)
+            derived = enclosing_qualname(scopes_by_file[source], line)
+            qual = caller_name if derived == "<module>" else derived
+        result.add((rel, qual))
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Sampling + scoring
+# ---------------------------------------------------------------------------
+
+def sample_caller_symbols(
+    graph: str, port: int, *, n: int, seed: int,
+    fanin_lo: int = 3, fanin_hi: int = 80,
+) -> list[dict[str, Any]]:
+    """Pick up to `n` callee names from the graph whose fan-in is within the band.
+
+    Names that are empty, listed in `_GENERIC`, dunder-prefixed, or shorter than
+    four characters are discarded; survivors (already ordered by name) are
+    shuffled with a `seed`-ed RNG and the first `n` returned as
+    {"name": ..., "fanin": ...} dicts.
+    """
+    import random
+
+    def keep(name: Any) -> bool:
+        return bool(name) and name not in _GENERIC and not str(name).startswith("__") and len(str(name)) >= 4
+
+    query = (
+        "MATCH (c)-[:CALLS]->(s) WHERE s:Searchable "
+        "WITH s.name AS name, count(c) AS fanin "
+        f"WHERE fanin >= {fanin_lo} AND fanin <= {fanin_hi} "
+        "RETURN name, fanin ORDER BY name"
+    )
+    found = [(str(r[0]), int(r[1])) for r in _graph_query(graph, query, port) if len(r) >= 2 and keep(r[0])]
+    random.Random(seed).shuffle(found)
+    return [{"name": nm, "fanin": fn} for nm, fn in found[:n]]
+
+
+def _prf(graph_set: set, oracle_set: set) -> dict[str, float]:
+    """Set precision/recall/F1 of graph vs oracle, plus raw tp/fp/fn counts.
+
+    An empty denominator scores 1.0 only when the opposite error count is also zero."""
+    tp, fp, fn = len(graph_set & oracle_set), len(graph_set - oracle_set), len(oracle_set - graph_set)
+    precision = tp / (tp + fp) if tp + fp else float(fn == 0)
+    recall = tp / (tp + fn) if tp + fn else float(fp == 0)
+    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
+    return dict(tp=tp, fp=fp, fn=fn, precision=round(precision, 4), recall=round(recall, 4), f1=round(f1, 4))
+
+
+@dataclass
+class Result:  # accumulated outcome of one bench run
+    rows: list[dict] = field(default_factory=list)  # one scored entry per symbol
+    dropped: list[dict] = field(default_factory=list)  # skipped symbols, each with a reason
+
+
+def run(graph: str, worktree: Path, *, n: int, seed: int, port: int) -> Result:
+    """Score the graph's caller sets against the jedi oracle for `n` sampled symbols.
+
+    A symbol is dropped (with a reason) unless it has exactly one definition in the
+    worktree and jedi resolves it; every other symbol gets one row of counts, P/R/F1
+    and up to ten disagreements per side.
+    """
+    outcome = Result()
+    scopes: dict[Path, list[Scope]] = {}
+
+    def fmt(diff: set) -> list[str]:
+        return sorted(f"{q} @ {p}" for p, q in diff)[:10]
+
+    for sym in sample_caller_symbols(graph, port, n=n, seed=seed):
+        name, fanin = sym["name"], sym["fanin"]
+        defs = find_definitions(worktree, name)
+        oracle = oracle_callers(worktree, defs, name, scopes) if len(defs) == 1 else None
+        if oracle is None:
+            why = "jedi failed" if len(defs) == 1 else f"{len(defs)} defs"
+            outcome.dropped.append({"symbol": name, "reason": why, "fanin": fanin})
+            continue
+        from_graph = graph_callers(graph, port, name, worktree)
+        outcome.rows.append({"symbol": name, "fanin": fanin, "n_graph": len(from_graph),
+                             "n_oracle": len(oracle), **_prf(from_graph, oracle),
+                             "graph_only": fmt(from_graph - oracle), "oracle_only": fmt(oracle - from_graph)})
+    return outcome
+
+
+def summarize(rows: list[dict]) -> dict[str, Any]:
+    """Macro-average the per-symbol scores in `rows` (mean P/R/F1, median F1, exact rate).
+
+    With no rows only the count is reported, under "n_scored" like the non-empty
+    result and under the legacy "n" key kept for existing readers.
+    """
+    if not rows:
+        return {"n": 0, "n_scored": 0}
+    exact = sum(1 for r in rows if r["fp"] == 0 and r["fn"] == 0)
+    summary: dict[str, Any] = {"n_scored": len(rows)}
+    summary.update((f"macro_{k}", round(st.mean(r[k] for r in rows), 4)) for k in ("precision", "recall", "f1"))
+    summary.update(median_f1=round(st.median(r["f1"] for r in rows), 4), exact_match_rate=round(exact / len(rows), 4))
+    return summary
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+    """CLI entry: run the graph-vs-oracle bench, print the report, optionally dump JSON."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--graph", required=True, help="FalkorDB graph key, e.g. code:loc-<hash>:_default")
+    parser.add_argument("--worktree", required=True, type=Path)
+    for flag, default in (("--n", 30), ("--seed", 1234), ("--port", 6380)):
+        parser.add_argument(flag, type=int, default=default)
+    parser.add_argument("--json")
+    opts = parser.parse_args(argv)
+
+    root = opts.worktree.resolve()
+    result = run(opts.graph, root, n=opts.n, seed=opts.seed, port=opts.port)
+    summary = summarize(result.rows)
+
+    print(f"graph={opts.graph}  worktree={root.name}")
+    print(f"sampled n={opts.n}  scored={summary.get('n_scored', 0)}  dropped={len(result.dropped)}")
+    print("  GRAPH-vs-ORACLE (callers, 1-hop reverse CALLS):")
+    for key in ("macro_precision", "macro_recall", "macro_f1", "median_f1", "exact_match_rate"):
+        print(f"    {key:18} = {summary.get(key)}")
+    print("\n  per-symbol (sorted by f1):")
+    for row in sorted(result.rows, key=lambda entry: entry["f1"]):
+        print(f"    {row['symbol']:28} fanin={row['fanin']:>3} "
+              f"P={row['precision']:.2f} R={row['recall']:.2f} F1={row['f1']:.2f} "
+              f"(g={row['n_graph']} o={row['n_oracle']} tp={row['tp']} fp={row['fp']} fn={row['fn']})")
+    if result.dropped:
+        from collections import Counter
+        reasons = Counter(entry["reason"] for entry in result.dropped)
+        print(f"\n  dropped: {dict(reasons)}")
+
+    if opts.json:
+        report = {"graph": opts.graph, "worktree": str(root), "summary": summary,
+                  "rows": result.rows, "dropped": result.dropped}
+        Path(opts.json).write_text(json.dumps(report, indent=2))
+        print(f"\nwrote {opts.json}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/bench/runners/reader_experiment.py b/bench/runners/reader_experiment.py
new file mode 100644
index 00000000..60440eb8
--- /dev/null
+++ b/bench/runners/reader_experiment.py
@@ -0,0 +1,507 @@
+"""Stage A offline "reader experiment" for the relationship_explanation lever.
+
+Background
+----------
+The corrected stratified measurement (files/benchmark-report-corrected-stratified.md)
+found that on the cg-n10-hardened batch the code-graph search tool *surfaces* gold
+files well (exposure 0.824) but the agent *adopts* them poorly (adoption 0.536):
+~70% of the recall loss is surfaced-but-dropped gold, not retrieval misses.
+
+This harness tests ONE cheap, non-overfitting lever before committing to an
+expensive end-to-end factorial: does annotating each search_code result with a
+FACTUAL ``relationship_explanation`` (WHY two files relate, e.g. "both override
+``value_to_string``") increase how often a single-turn reader keeps a surfaced
+gold file?
+
+It is fully OFFLINE w.r.t. the graph: it re-uses the search_code outputs already
+captured in the cg-n10-hardened run dirs, re-annotates them per arm, and presents
+them to an isolated single-turn Copilot agent that has NO tools and NO repository
+access — so the only thing that varies between arms is the annotation schema.
+
+Arms (rubber-duck-mandated)
+---------------------------
+* ``off``     — captured results as-is (no relationship_explanation).
+* ``placebo`` — length-matched neutral filler in the same field (controls for
+                "more text in the result" rather than "the explanation content").
+* ``explain`` — the factual relationship_explanation.
+
+The surfaced set is IDENTICAL across arms ("fixed-opportunity adoption"): every
+arm sees the same files; only the annotation differs.
+
+Primary metric
+--------------
+Paired per-instance file_recall (NOT raw adoption). Secondary: fixed-opportunity
+adoption (of gold files surfaced anywhere in the captures, how many the reader
+predicts), acc@k, MRR, and tokens. Paired bootstrap CIs across the shared task
+set.
+
+Usage
+-----
+    python -m bench.runners.reader_experiment \
+        --run-dir bench/cache/cg-n10-hardened \
+        --model claude-opus-4.8 \
+        --out bench/cache/reader-stageA
+
+Add ``--dry-run`` to render prompts + arms without spending any LLM tokens.
+"""
+
+from __future__ import annotations
+
+import argparse
+import copy
+import json
+import os
+import random
+import subprocess
+import sys
+import time
+import uuid
+from pathlib import Path
+from typing import Any
+
+# --- repo-local imports -----------------------------------------------------
+_THIS = Path(__file__).resolve()
+_BENCH = _THIS.parents[1]            # .../bench
+_REPO = _BENCH.parent               # .../mcp-t17
+if str(_BENCH) not in sys.path:      # lets `runners` and `analysis` below be imported as top-level packages
+    sys.path.insert(0, str(_BENCH))
+
+from runners import copilot_runner as cr  # noqa: E402
+from analysis.reader_capture import capture_search_calls  # noqa: E402
+
+# rel_explain lives in the mcp-smoke worktree (the live server the bench uses); if that directory is missing, the import below only works when rel_explain is importable some other way.
+_REL_EXPLAIN_DIR = (
+    _REPO.parent.parent / ".worktrees" / "mcp-smoke" / "api" / "mcp" / "tools"
+)
+if _REL_EXPLAIN_DIR.is_dir() and str(_REL_EXPLAIN_DIR) not in sys.path:
+    sys.path.insert(0, str(_REL_EXPLAIN_DIR))
+import rel_explain  # noqa: E402
+
+ARMS = ("off", "placebo", "explain")  # default value of --arms; each arm is described in the module docstring
+
+# Boundary markers in the captured prompt.txt (see plan + sample inspection).
+_PROBLEM_END = "\nInvestigate the repository to determine"  # _extract_problem cuts the prompt at the first occurrence
+
+
+# ---------------------------------------------------------------------------
+# Loading captured tasks
+# ---------------------------------------------------------------------------
+def _load_gold(run_dir: Path, model: str) -> dict[str, list[str]]:
+    """Collect gold files per task from ``<run_dir>/<model>/results.jsonl``.
+
+    Only ``config == "code_graph"`` rows count; the first row seen for a task wins.
+    """
+    by_task: dict[str, list[str]] = {}
+    raw_lines = (run_dir / model / "results.jsonl").read_text().splitlines()
+    for record in (json.loads(s) for s in map(str.strip, raw_lines) if s):
+        if record.get("config") != "code_graph":
+            continue
+        task_id = record.get("task_id")
+        # Blank ids are skipped; a repeated id keeps its earliest entry.
+        if task_id and task_id not in by_task:
+            by_task[task_id] = list(record.get("gold_files") or [])
+    return by_task
+
+
+def _extract_problem(prompt_text: str) -> str:
+    """Return the problem statement embedded in a captured localize prompt.
+
+    Drops the first line (when there is one) and everything from the
+    ``_PROBLEM_END`` marker onwards, then strips surrounding whitespace.
+    """
+    _first, newline, rest = prompt_text.partition("\n")
+    remainder = rest if newline else prompt_text
+    return remainder.partition(_PROBLEM_END)[0].strip()
+
+
+def load_tasks(run_dir: Path, model: str) -> list[dict[str, Any]]:
+    """Build one record per captured task directory, in sorted path order.
+
+    A task is kept only when its ``prompt.txt`` and ``logs/stdout.jsonl`` both
+    exist and ``capture_search_calls`` returns at least one call. Each record
+    holds ``task_id``, ``problem``, normalized ``gold_files`` and ``calls``.
+    """
+    gold_by_task = _load_gold(run_dir, model)
+    root = run_dir / "runs" / model / "localize" / "nudged" / "code_graph"
+    records: list[dict[str, Any]] = []
+    for entry in sorted(root.iterdir()):
+        if not entry.is_dir():
+            continue
+        prompt_path, stream_path = entry / "prompt.txt", entry / "logs" / "stdout.jsonl"
+        if not (prompt_path.exists() and stream_path.exists()):
+            continue
+        captured = capture_search_calls(stream_path)
+        if not captured:
+            continue
+        problem = _extract_problem(prompt_path.read_text())
+        golds = [cr._norm_path(g) for g in gold_by_task.get(entry.name, [])]
+        records.append({"task_id": entry.name, "problem": problem, "gold_files": golds, "calls": captured})
+    return records
+
+
+# ---------------------------------------------------------------------------
+# Arm construction + prompt rendering
+# ---------------------------------------------------------------------------
+def annotate_calls(calls: list[dict[str, Any]], arm: str) -> list[dict[str, Any]]:
+    """Build per-arm copies of the captured calls via ``rel_explain.annotate_results``."""
+
+    def _reannotate(captured: dict[str, Any]) -> dict[str, Any]:
+        # The annotator only ever receives a deep copy of the captured results.
+        fresh = rel_explain.annotate_results(copy.deepcopy(captured["results"]), captured["query"], arm)
+        return {"query": captured["query"], "results": fresh}
+    return [_reannotate(captured) for captured in calls]
+
+
+def surfaced_files(calls: list[dict[str, Any]]) -> set[str]:
+    """Collect the normalized path of every file named in the captured results.
+
+    Covers each result's own ``file`` plus the ``file`` of every entry under
+    ``likely_related_files``; empty or missing paths are ignored.
+    """
+    raw_paths: list[Any] = []
+    for call in calls:
+        for hit in call["results"]:
+            raw_paths.append(hit.get("file"))
+            raw_paths.extend(rel.get("file") for rel in hit.get("likely_related_files") or [])
+    # Normalizing inside the set comprehension also de-duplicates the paths.
+    return {cr._norm_path(path) for path in raw_paths if path}
+
+
+_READER_INSTRUCTIONS = (  # fixed preamble that render_prompt places first in every reader prompt
+    "You are localizing (not fixing) a bug in a Python repository. You do NOT "
+    "have access to the repository or any tools. Below is the bug report followed "
+    "by the complete output of a code-navigation search tool that was run against "
+    "the repository during an earlier investigation. Each result lists a file, a "
+    "matched symbol, and (where available) related files with an explanation of "
+    "how they relate.\n\n"
+    "Determine which SOURCE files must be edited to fix the issue, reasoning ONLY "
+    "from the bug report and the search results shown. Prefer files that the "
+    "evidence most directly implicates. List Python source files only (no tests, "
+    "no docs).\n"
+)
+
+_READER_SENTINEL_INSTR = (  # closing instruction render_prompt appends last; embeds cr.LOCALIZE_SENTINEL
+    "\nFinish your final message with a single line in EXACTLY this format "
+    "(most-likely file first, repo-root-relative paths):\n\n"
+    f"{cr.LOCALIZE_SENTINEL} [\"pkg/module_a.py\", \"pkg/module_b.py\"]\n\n"
+    "Write that line as plain text in your own message."
+)
+
+
+def render_prompt(problem: str, calls: list[dict[str, Any]]) -> str:
+    """Assemble the reader prompt: instructions, bug report, each call's results as JSON, sentinel instruction."""
+    header = _READER_INSTRUCTIONS + "\n=== BUG REPORT ===\n" + problem + "\n"
+    sections = [
+        f"\n--- search_code call {idx}: query={call['query']!r} ---\n"
+        + json.dumps(call["results"], indent=1, ensure_ascii=False) + "\n"
+        for idx, call in enumerate(calls, start=1)
+    ]
+    return header + "\n=== CODE-NAVIGATION SEARCH RESULTS ===\n" + "".join(sections) + _READER_SENTINEL_INSTR
+
+
+# ---------------------------------------------------------------------------
+# Isolated single-turn invocation (NO tools, NO repo)
+# ---------------------------------------------------------------------------
+def run_isolated_reader(
+    *, prompt: str, model: str, log_dir: Path, wall_time: float
+) -> dict[str, Any]:
+    """Invoke Copilot single-turn with ZERO tools; answer purely from the prompt.
+
+    ``--available-tools=`` (empty) is the key: the model gets no tools and must
+    answer from the prompt-embedded evidence only. ``--log-level debug`` is
+    required for ``process-*.log`` files (token usage) to be written.
+    """
+    log_dir = log_dir.resolve()
+    log_dir.mkdir(parents=True, exist_ok=True)
+    env = dict(os.environ)  # the child process gets a copy of the current environment
+    stdout, stderr, returncode = "", "", None
+    timed_out = False
+    for attempt in range(1, cr.COPILOT_MAX_ATTEMPTS + 1):  # retry loop; left via the break below
+        session_id = str(uuid.uuid4())  # fresh session id for every attempt
+        cmd = [
+            "copilot", "-p", prompt,  # NOTE(review): the whole prompt is a single argv entry; very large prompts may hit OS argument-size limits (confirm)
+            "--model", model,
+            "--output-format", "json",
+            "--no-remote",
+            "--disable-builtin-mcps",
+            "--available-tools=",
+            "--log-level", "debug",
+            "--log-dir", str(log_dir),
+            "--session-id", session_id,
+        ]
+        _effort = cr._resolve_reasoning_effort()
+        if _effort:  # the flag is only passed when an effort value was resolved
+            cmd += ["--effort", _effort]
+        timed_out = False  # reset so the flag reflects this attempt only
+        proc = subprocess.Popen(
+            cmd,
+            cwd=str(log_dir),  # run from the log directory rather than any repository checkout
+            stdin=subprocess.DEVNULL,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            env=env,
+            start_new_session=True,  # presumably so cr._kill_group(proc.pid) can signal the whole group; verify in copilot_runner
+        )
+        try:
+            stdout, stderr = proc.communicate(timeout=wall_time)
+        except subprocess.TimeoutExpired:
+            timed_out = True
+            cr._kill_group(proc.pid)
+            try:
+                stdout, stderr = proc.communicate(timeout=30)  # collect whatever output was produced before the kill
+            except subprocess.TimeoutExpired:
+                stdout, stderr = "", ""  # still not finished after 30s: give up on its output
+        returncode = proc.returncode  # None if the process has not terminated yet
+        if timed_out or not cr._is_transient_startup_failure(returncode, stdout, stderr):  # timeouts are never retried
+            break
+        if attempt < cr.COPILOT_MAX_ATTEMPTS:  # no backoff sleep after the final attempt
+            time.sleep(cr.COPILOT_RETRY_BACKOFF_SEC)
+    (log_dir / "stdout.jsonl").write_text(stdout or "")  # only the last attempt's streams are persisted
+    (log_dir / "stderr.txt").write_text(stderr or "")
+    return {"stdout": stdout or "", "returncode": returncode, "timed_out": timed_out}  # all three describe the last attempt
+
+
+def _final_text(stdout: str) -> str:
+    """Recover the assistant's visible text from Copilot CLI JSONL output.
+
+    Each parseable line is a JSON object. Text is gathered into three buckets
+    and the first bucket that collected anything is returned, in this order:
+
+    1. whole messages: ``assistant.message`` events (``data.content`` as a
+       string or a list of ``{"text": ...}`` blocks) and legacy ``assistant``
+       events (``message.content`` text blocks), joined with newlines;
+    2. ``assistant.message_delta`` fragments, concatenated;
+    3. the text of the last usable ``result`` event.
+
+    Blank lines, invalid JSON and non-object lines are skipped.
+    """
+    def _first_text(mapping: dict, keys: tuple) -> str | None:
+        # First non-empty string found under ``keys`` (in order), else None.
+        return next((mapping[k] for k in keys if isinstance(mapping.get(k), str) and mapping[k]), None)
+
+    whole: list[str] = []
+    fragments: list[str] = []
+    fallback = ""
+    for raw in stdout.splitlines():
+        raw = raw.strip()
+        if not raw:
+            continue
+        try:
+            event = json.loads(raw)
+        except json.JSONDecodeError:
+            continue
+        if not isinstance(event, dict):
+            continue
+        kind, payload = event.get("type"), event.get("data")
+        payload_is_dict = isinstance(payload, dict)
+        if kind == "assistant.message" and payload_is_dict:
+            content = payload.get("content")
+            if isinstance(content, list):
+                whole.extend(
+                    b["text"] for b in content
+                    if isinstance(b, dict) and isinstance(b.get("text"), str)
+                )
+            elif isinstance(content, str) and content:
+                whole.append(content)
+        elif kind == "assistant.message_delta" and payload_is_dict:
+            piece = _first_text(payload, ("content", "delta", "text"))
+            if piece is not None:
+                fragments.append(piece)
+        elif kind == "result" and isinstance(payload, str):
+            fallback = payload
+        # A dict result with no usable text leaves the previous fallback in place.
+        elif kind == "result" and payload_is_dict:
+            fallback = _first_text(payload, ("content", "text", "result")) or fallback
+        # Legacy single-object shape (older CLI): {type:"assistant", message:{content:[...]}}
+        elif kind == "assistant" and isinstance(event.get("message"), dict):
+            for b in event["message"].get("content") or []:
+                if isinstance(b, dict) and b.get("type") == "text":
+                    whole.append(b.get("text") or "")
+
+    if whole:
+        return "\n".join(whole)
+    return "".join(fragments) if fragments else fallback
+
+
+# ---------------------------------------------------------------------------
+# Scoring + paired bootstrap
+# ---------------------------------------------------------------------------
+def adoption(pred: list[str], surfaced_gold: set[str]) -> float | None:
+    """Share of ``surfaced_gold`` found among the normalized predictions (None if it is empty)."""
+    if surfaced_gold:
+        return len(surfaced_gold & {cr._norm_path(p) for p in pred}) / len(surfaced_gold)
+    return None
+
+
+def paired_bootstrap(
+    deltas: list[float], n: int = 10000, seed: int = 20260604
+) -> dict[str, float]:
+    """Summarize paired deltas: observed mean plus a percentile bootstrap interval.
+
+    Draws ``n`` resamples (with replacement, same size as ``deltas``) from a
+    ``random.Random(seed)`` stream and reports the resample means found at the
+    2.5% and 97.5% positions as ``ci_low``/``ci_high``. Empty input gives zeros.
+    """
+    if not deltas:
+        return {"mean": 0.0, "ci_low": 0.0, "ci_high": 0.0, "n": 0}
+    size = len(deltas)
+    rng = random.Random(seed)
+    resample_means = sorted(
+        sum(deltas[rng.randrange(size)] for _ in range(size)) / size
+        for _ in range(n)
+    )
+    lo_idx, hi_idx = int(0.025 * n), int(0.975 * n)
+    return {"mean": sum(deltas) / size, "ci_low": resample_means[lo_idx], "ci_high": resample_means[hi_idx], "n": size}
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> int:
+    """Run the Stage A reader experiment from the command line.
+
+    For every loaded task and arm: re-annotate the captured calls, render and
+    save the prompt, then (unless ``--dry-run``) query the isolated reader,
+    score the answer and append a JSON row to ``<out>/reader_results.jsonl``.
+    A dry run leaves any existing results file untouched.
+    Returns 2 when no tasks could be loaded, otherwise 0.
+    """
+    from contextlib import nullcontext
+
+    ap = argparse.ArgumentParser(description="Stage A reader experiment")
+    ap.add_argument("--run-dir", required=True, type=Path,
+                    help="cg-n10-hardened cache dir (contains <model>/results.jsonl + runs/)")
+    ap.add_argument("--model", default="claude-opus-4.8")
+    ap.add_argument("--out", required=True, type=Path, help="output cache dir")
+    ap.add_argument("--arms", default=",".join(ARMS),
+                    help="comma-separated subset of off,placebo,explain")
+    ap.add_argument("--wall-time", type=float, default=420.0)
+    ap.add_argument("--limit", type=int, default=0, help="limit #tasks (0=all)")
+    ap.add_argument("--dry-run", action="store_true",
+                    help="render arms+prompts, write them, no LLM calls")
+    args = ap.parse_args()
+
+    arms = [a.strip() for a in args.arms.split(",") if a.strip()]
+    for a in arms:
+        if rel_explain.normalize_mode(a) != a:
+            print(f"WARNING: arm {a!r} normalizes to {rel_explain.normalize_mode(a)!r}")
+
+    tasks = load_tasks(args.run_dir, args.model)
+    if args.limit:
+        tasks = tasks[: args.limit]
+    if not tasks:
+        print("No tasks loaded — check --run-dir/--model.", file=sys.stderr)
+        return 2
+    print(f"Loaded {len(tasks)} tasks; arms={arms}; model={args.model}")
+
+    args.out.mkdir(parents=True, exist_ok=True)
+    results_path = args.out / "reader_results.jsonl"
+    rows: list[dict[str, Any]] = []
+    score_keys = ("file_recall", "file_precision", "file_all_found", "acc_at_1", "acc_at_3", "acc_at_5", "file_mrr")
+    token_keys = ("input_tokens", "output_tokens", "reasoning_tokens", "usage_blocks")
+    # Open (and truncate) the results file only for real runs: a dry run writes
+    # no rows, so it must not wipe the results of an earlier run in the same --out.
+    results_ctx = nullcontext() if args.dry_run else results_path.open("w")
+    with results_ctx as out_f:
+        for ti, task in enumerate(tasks, start=1):
+            tid = task["task_id"]
+            gold = set(task["gold_files"])
+            surf = surfaced_files(task["calls"])
+            surf_gold = surf & gold
+            for arm in arms:
+                acalls = annotate_calls(task["calls"], arm)
+                prompt = render_prompt(task["problem"], acalls)
+                log_dir = args.out / "runs" / tid / arm
+                log_dir.mkdir(parents=True, exist_ok=True)
+                (log_dir / "prompt.txt").write_text(prompt)
+                tag = f"[{ti}/{len(tasks)}] {tid} :: {arm}"
+                if args.dry_run:
+                    print(f"{tag}  (dry-run) prompt={len(prompt)}B "
+                          f"surfaced_gold={len(surf_gold)}/{len(gold)}")
+                    continue
+                t0 = time.time()
+                rr = run_isolated_reader(prompt=prompt, model=args.model, log_dir=log_dir, wall_time=args.wall_time)
+                final = _final_text(rr["stdout"])
+                pred, perr, fallback = cr.parse_localization(final)
+                score = cr.score_localization(pred, sorted(gold))
+                toks = cr.parse_tokens_from_logs(log_dir)
+                row = {
+                    "task_id": tid,
+                    "arm": arm,
+                    "model": args.model,
+                    "gold_files": sorted(gold),
+                    "surfaced_gold": sorted(surf_gold),
+                    "pred_files": pred,
+                    **{k: score[k] for k in score_keys},
+                    "adoption": adoption(pred, surf_gold),
+                    "parse_error": perr,
+                    "parse_fallback": fallback,
+                    **{k: toks[k] for k in token_keys},
+                    "timed_out": rr["timed_out"],
+                    "returncode": rr["returncode"],
+                    "wall_sec": round(time.time() - t0, 1),
+                    "prompt_bytes": len(prompt),
+                }
+                rows.append(row)
+                out_f.write(json.dumps(row) + "\n")
+                out_f.flush()
+                print(f"{tag}  recall={score['file_recall']} "
+                      f"adopt={row['adoption']} in={toks['input_tokens']} "
+                      f"err={perr} wall={row['wall_sec']}s")
+    if args.dry_run or not rows:
+        print(f"\nWrote prompts under {args.out/'runs'}. Done (dry-run={args.dry_run}).")
+        return 0
+    _summarize(rows, arms, args.out)
+    return 0
+
+
+def _summarize(rows: list[dict[str, Any]], arms: list[str], out: Path) -> None:
+    """Print the per-arm table and paired deltas, then write ``summary.json``.
+
+    Rows are indexed by arm and task id (a later duplicate replaces an earlier
+    one). Deltas are taken against ``off`` when present, else the first arm,
+    over tasks both arms share and where the metric is not None in either.
+    """
+    import statistics as st
+
+    per_arm: dict[str, dict[str, dict[str, Any]]] = {arm: {} for arm in arms}
+    for row in rows:
+        per_arm[row["arm"]][row["task_id"]] = row
+
+    def present(arm: str, key: str) -> list[float]:
+        # Values of ``key`` for this arm, skipping rows where it is None or absent.
+        return [rec[key] for rec in per_arm[arm].values() if rec.get(key) is not None]
+
+    print("\n================ STAGE A SUMMARY ================")
+    print(f"{'arm':10s} {'recall':>8s} {'adopt':>8s} {'acc@1':>7s} {'mrr':>7s} {'in_med':>9s}")
+    for arm in arms:
+        adopt_vals, token_vals = present(arm, "adoption"), present(arm, "input_tokens")
+        adopt_mean = st.mean(adopt_vals) if adopt_vals else 0
+        token_median = int(st.median(token_vals)) if token_vals else 0
+        cells = [f"{arm:10s}", f"{st.mean(present(arm, 'file_recall')):8.3f}", f"{adopt_mean:8.3f}",
+                 f"{st.mean(present(arm, 'acc_at_1')):7.3f}", f"{st.mean(present(arm, 'file_mrr')):7.3f}",
+                 f"{token_median:9d}"]
+        print(" ".join(cells))
+
+    # Paired deltas vs 'off' baseline.
+    baseline = "off" if "off" in arms else arms[0]
+    print(f"\nPaired deltas vs {baseline!r} (95% bootstrap CI):")
+    for arm in (a for a in arms if a != baseline):
+        # Only tasks recorded for both this arm and the baseline are compared.
+        shared = sorted(set(per_arm[arm]) & set(per_arm[baseline]))
+        for metric in ("file_recall", "adoption"):
+            pairs = ((per_arm[arm][t].get(metric), per_arm[baseline][t].get(metric)) for t in shared)
+            diffs = [mine - base for mine, base in pairs if mine is not None and base is not None]
+            boot = paired_bootstrap(diffs)
+            print(f"  {arm:8s} Δ{metric:13s} "
+                  f"mean={boot['mean']:+.3f} CI[{boot['ci_low']:+.3f},{boot['ci_high']:+.3f}] n={boot['n']}")
+
+    summary_path = out / "summary.json"
+    payload = {"arms": arms, "n_tasks": len(per_arm[arms[0]]), "rows": rows}
+    summary_path.write_text(json.dumps(payload, indent=1))
+    print(f"\nWrote {summary_path}")
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/bench/runners/struct_query_bench.py b/bench/runners/struct_query_bench.py
new file mode 100644
index 00000000..8a7d3efd
--- /dev/null
+++ b/bench/runners/struct_query_bench.py
@@ -0,0 +1,232 @@
+"""Structural-query token-compression benchmark.
+
+Answers the stakeholder question directly, without agent-in-the-loop variance:
+
+    For graph-shaped questions ("who calls symbol S?", "what does S call?",
+    "where is S defined?"), how many *context tokens* does the code-graph's
+    structured answer consume, versus the raw file/grep evidence an agent
+    would otherwise have to read into its context to derive the same answer?
+
+This isolates the one thing a code graph can mechanically do — return compact,
+precise structural facts — from the confounds that dominate the agent
+benchmarks (the model ignoring the tool, non-convergence on large repos, issue
+text leaking the answer). If the graph cannot beat grep on *evidence
+compression* here, it cannot save tokens anywhere.
+
+Method (deterministic, no LLM):
+  * Connect straight to FalkorDB (the API's graph_entities endpoint caps the
+    returned subgraph; the raw store is complete).
+  * Sample distinctively-named Function/Class symbols with a meaningful but
+    non-pathological caller fan-in.
+  * For each symbol and each query type, compute:
+      graph_tokens  = tiktoken size of the graph's structured answer
+                      (caller/callee/definition list: "name @ relpath:line")
+      raw_tokens    = tiktoken size of the evidence an agent reads WITHOUT the
+                      graph: `rg -nw <symbol>` over the repo's .py files (the
+                      lines it must scan and disambiguate)
+      ratio         = raw_tokens / graph_tokens   (>1 => graph saves tokens)
+  * Aggregate with paired statistics (median ratio, geometric mean, win-rate),
+    never raw sums (one megahub symbol would dominate).
+
+Token counts use tiktoken cl100k as a standard, reproducible proxy for context
+size; absolute Claude token counts differ slightly but ratios are stable.
+
+Usage:
+    python -m bench.runners.struct_query_bench \
+        --repo-graph code:conan-io__conan-16987__loc:_default \
+        --worktree bench/cache/worktrees-localize/conan-io__conan-16987__loc \
+        --sample 40 --out bench/cache/struct-query/conan.jsonl
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import random
+import statistics as st
+import subprocess
+from pathlib import Path
+from typing import Any
+
+DEFAULT_SEED = 20260526  # default for sample_symbols(seed=...) and for the --seed flag
+
+# Generic method names whose name-based CALLS edges are too noisy to be a
+# meaningful "callers of X" answer (every .run()/.get() in the repo links here).
+_GENERIC = {  # sample_symbols rejects any symbol whose name is in this set
+    "run", "get", "set", "__init__", "__call__", "__enter__", "__exit__",
+    "setUp", "tearDown", "update", "add", "remove", "append", "load", "save",
+    "main", "test", "name", "value", "data", "result", "items", "keys",
+    "values", "format", "parse", "build", "make", "create", "close", "open",
+    "read", "write", "start", "stop", "send", "recv", "copy", "clear",
+}
+
+
+def _tok(text: str) -> int:
+    """Return the number of tokens in ``text`` under tiktoken's cl100k_base encoding."""
+    # tiktoken is imported on first use rather than at module import time.
+    from tiktoken import get_encoding
+
+    return len(get_encoding("cl100k_base").encode(text))
+
+
+def _graph_query(graph: str, cypher: str, port: int) -> list[list[Any]]:
+    """Send ``GRAPH.QUERY graph cypher`` to 127.0.0.1:``port`` and return its data rows.
+
+    The reply is expected to be a list whose second element holds the rows
+    ([header, data_rows, stats]); any other shape yields ``[]``. Every row is
+    returned as a list, with a non-sequence row wrapped into a one-item list.
+    """
+    from redis import Redis
+
+    client = Redis(host="127.0.0.1", port=port, decode_responses=True)
+    reply = client.execute_command("GRAPH.QUERY", graph, cypher)
+    if isinstance(reply, list) and len(reply) >= 2:
+        # reply[1] is the data section; normalize each of its rows to a list.
+        return [
+            list(item) if isinstance(item, (list, tuple)) else [item]
+            for item in reply[1]
+        ]
+    return []
+
+
+def _relpath(path: str, worktree: str) -> str:
+    try:  # preferred form: ``path`` expressed relative to ``worktree``
+        return str(Path(path).relative_to(worktree))
+    except ValueError:  # relative_to() refused (path not under worktree): fall back to the bare file name
+        return Path(path).name
+
+
+def sample_symbols(
+    graph: str, port: int, *, n: int, seed: int = DEFAULT_SEED,
+    fanin_lo: int = 3, fanin_hi: int = 80,
+) -> list[dict[str, Any]]:
+    """Pick up to ``n`` callee names whose ``count(c)`` fan-in lies in a band.
+
+    The query groups ``CALLS`` edges into ``Searchable`` targets by target
+    name and keeps names with ``fanin_lo <= fanin <= fanin_hi``. Names are then
+    dropped when they are empty, listed in ``_GENERIC``, start with ``__`` or
+    are shorter than four characters, as are rows whose fan-in is not
+    int-convertible. Survivors are shuffled with ``random.Random(seed)`` and the
+    first ``n`` are returned as ``{"name", "fanin"}`` dicts.
+    """
+    cypher = (
+        "MATCH (c)-[:CALLS]->(s) WHERE s:Searchable WITH s.name AS name, count(c) AS fanin "
+        + f"WHERE fanin >= {fanin_lo} AND fanin <= {fanin_hi} " + "RETURN name, fanin ORDER BY name"
+    )
+
+    def _usable(name: Any) -> bool:
+        # Applies the name rejection rules listed in the docstring to one raw name.
+        if not name or name in _GENERIC:
+            return False
+        return not str(name).startswith("__") and len(str(name)) >= 4
+
+    candidates: list[tuple[str, int]] = []
+    for record in _graph_query(graph, cypher, port):
+        if len(record) < 2 or not _usable(record[0]):
+            continue
+        try:
+            candidates.append((str(record[0]), int(record[1])))
+        except (ValueError, TypeError):
+            continue
+    random.Random(seed).shuffle(candidates)
+    return [{"name": name, "fanin": fanin} for name, fanin in candidates[:n]]
+
+
+def graph_callers_answer(graph: str, port: int, symbol: str, worktree: str) -> str:
+    """Render the graph's answer to "who calls ``symbol``?" as plain text.
+
+    Returns a ``callers of <symbol> (<count>):`` header followed by one
+    ``<caller name> @ <worktree-relative path>`` line per distinct caller
+    (ordered by caller name, at most 200).
+    """
+    # The name is spliced into a single-quoted Cypher literal, so escape
+    # backslashes and quotes: otherwise a name containing ' breaks the query.
+    literal = symbol.replace("\\", "\\\\").replace("'", "\\'")
+    cypher = f"MATCH (c)-[:CALLS]->(s {{name:'{literal}'}}) RETURN DISTINCT c.name, c.path ORDER BY c.name LIMIT 200"
+    rows = _graph_query(graph, cypher, port)
+    lines = [f"{row[0]} @ {_relpath(str(row[1]), worktree)}" for row in rows if len(row) >= 2]
+    return f"callers of {symbol} ({len(lines)}):\n" + "\n".join(lines)
+
+
+def grep_evidence(symbol: str, worktree: str) -> str:
+    """Return ripgrep's hits for ``symbol`` in the worktree's ``*.py`` files.
+
+    Runs ``rg -nw --no-heading -g '*.py'`` (whole-word, with line numbers, 120 s
+    timeout) and removes every occurrence of the ``<worktree>/`` prefix from
+    its stdout, so paths read as repo-relative and long absolute paths do not
+    inflate the token count.
+    """
+    command = ["rg", "-nw", "--no-heading", "-g", "*.py", symbol, worktree]
+    completed = subprocess.run(command, capture_output=True, text=True, timeout=120)
+    # Normalize to exactly one trailing slash before stripping the prefix.
+    absolute_prefix = worktree.rstrip("/") + "/"
+    return completed.stdout.replace(absolute_prefix, "")
+
+
+def run(
+    graph: str, worktree: str, *, port: int, n: int, seed: int,
+) -> list[dict[str, Any]]:
+    """Measure graph-answer vs grep-evidence token counts for sampled symbols.
+
+    Each row carries ``symbol``, ``fanin``, ``graph_tokens``, ``raw_tokens``,
+    ``ratio`` (raw / graph, rounded to 3 places; None if graph_tokens is 0)
+    and ``grep_hits`` (newline count of the grep output).
+    """
+    def _measure(sample: dict[str, Any]) -> dict[str, Any]:
+        symbol = sample["name"]
+        answer = graph_callers_answer(graph, port, symbol, worktree)
+        evidence = grep_evidence(symbol, worktree)
+        graph_tokens, raw_tokens = _tok(answer), _tok(evidence)
+        ratio = round(raw_tokens / graph_tokens, 3) if graph_tokens else None
+        return {"symbol": symbol, "fanin": sample["fanin"], "graph_tokens": graph_tokens,
+                "raw_tokens": raw_tokens, "ratio": ratio, "grep_hits": evidence.count("\n")}
+
+    return [_measure(sample) for sample in sample_symbols(graph, port, n=n, seed=seed)]
+
+
+def summarize(rows: list[dict[str, Any]]) -> dict[str, Any]:
+    """Aggregate per-symbol rows; ratio statistics use only rows with a truthy ``ratio``."""
+    usable = [row["ratio"] for row in rows if row["ratio"]]
+    summary: dict[str, Any] = {"n": len(rows), "median_ratio": None, "geomean_ratio": None}
+    if usable:
+        geomean = st.geometric_mean(usable)
+        summary["median_ratio"] = round(st.median(usable), 3)
+        summary["geomean_ratio"] = round(geomean, 3) if geomean else None
+    # A "win" is a ratio above 1.0, i.e. the graph answer used fewer tokens.
+    summary["win_rate"] = f"{len([x for x in usable if x > 1.0])}/{len(usable)}"
+    for column in ("graph_tokens", "raw_tokens"):
+        values = [row[column] for row in rows]
+        summary[f"median_{column}"] = int(st.median(values)) if rows else 0
+    return summary
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: run the benchmark, print summary plus rows (by descending ratio), optionally write JSONL."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--repo-graph", required=True, help="FalkorDB graph key")
+    parser.add_argument("--worktree", required=True, help="repo worktree path (for grep + relpaths)")
+    parser.add_argument("--port", type=int, default=6380)
+    parser.add_argument("--sample", type=int, default=40)
+    parser.add_argument("--seed", type=int, default=DEFAULT_SEED)
+    parser.add_argument("--out", type=Path, default=None)
+    opts = parser.parse_args(argv)
+
+    worktree = str(Path(opts.worktree).resolve())
+    rows = run(opts.repo_graph, worktree, port=opts.port, n=opts.sample, seed=opts.seed)
+    summary_line = {"summary": summarize(rows)}
+    print(json.dumps(summary_line, indent=2))
+    for row in sorted(rows, key=lambda item: -(item["ratio"] or 0)):
+        print(f"  {row['symbol']:32} fanin={row['fanin']:>4} "
+              f"graph={row['graph_tokens']:>5} raw={row['raw_tokens']:>7} "
+              f"ratio={row['ratio']}")
+    if opts.out:
+        opts.out.parent.mkdir(parents=True, exist_ok=True)
+        with opts.out.open("w") as handle:
+            handle.write(json.dumps(summary_line) + "\n")
+            handle.writelines(json.dumps(row) + "\n" for row in rows)
+        print(f"\nwrote {opts.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/bench/scripts/start-api.sh b/bench/scripts/start-api.sh
new file mode 100755
index 00000000..4e55f673
--- /dev/null
+++ b/bench/scripts/start-api.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Launch the code-graph API server with the fast tree-sitter Python
+# resolver enabled (PR #691 + #692). This is what the bench harness
+# expects to talk to at 127.0.0.1:5000.
+#
+# Usage:
+#   bench/scripts/start-api.sh                  # default port 5000
+#   bench/scripts/start-api.sh --port 5001
+#
+# Prereqs:
+#   - FalkorDB running. For native falkordb on 6380 set
+#     FALKORDB_HOST=127.0.0.1 FALKORDB_PORT=6380 before invoking.
+#   - uv on PATH.
+#   - cwd must be a code-graph worktree containing api/ with PR #691
+#     and PR #692 applied (i.e. the dvirdukhan/query-cache branch tip
+#     or staging once those are merged).
+
+set -euo pipefail
+
+PORT=5000
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --port) PORT="${2:?--port requires a value}"; shift 2 ;;  # explicit error instead of a bare "unbound variable" under set -u
+        *) echo "Unknown arg: $1" >&2; exit 1 ;;
+    esac
+done
+
+# Tree-sitter static resolver — turns Python indexing from minutes to
+# seconds. Default is still jedi, so callers must opt in explicitly.
+export CODE_GRAPH_PY_RESOLVER="${CODE_GRAPH_PY_RESOLVER:-tree_sitter}"
+
+# Allow the bench harness to analyze any folder; the bench worktrees
+# live under bench/cache/worktrees.
+export ALLOWED_ANALYSIS_DIR="${ALLOWED_ANALYSIS_DIR:-/}"
+
+# Public mode: bench harness does not bother with bearer tokens.
+export CODE_GRAPH_PUBLIC="${CODE_GRAPH_PUBLIC:-1}"
+
+echo "[start-api] CODE_GRAPH_PY_RESOLVER=$CODE_GRAPH_PY_RESOLVER"
+echo "[start-api] CODE_GRAPH_PUBLIC=$CODE_GRAPH_PUBLIC"
+echo "[start-api] FALKORDB_HOST=${FALKORDB_HOST:-127.0.0.1} FALKORDB_PORT=${FALKORDB_PORT:-6379}"
+echo "[start-api] Listening on 127.0.0.1:$PORT"
+
+exec uv run uvicorn api.index:app --host 127.0.0.1 --port "$PORT"
diff --git a/bench/tools/code_graph_mcp/system_preamble.md b/bench/tools/code_graph_mcp/system_preamble.md
new file mode 100644
index 00000000..bf5af4a1
--- /dev/null
+++ b/bench/tools/code_graph_mcp/system_preamble.md
@@ -0,0 +1,72 @@
+# code-graph (MCP) preamble
+
+You are an autonomous coding agent solving a software-engineering task.
+Your sole tool is bash: every action you take is a shell command that
+is executed in the repository's working directory.
+
+## Code-navigation workflow — use this BEFORE grep/find
+
+A code-graph **MCP server** (`cgraph-mcp`) is available for this repo;
+you reach it from bash through the `cg-mcp` shim. **Before reading or
+editing code, locate the relevant symbols through `cg-mcp` rather than
+grepping the file tree** — it's faster, returns precise `{id, file, line}`
+records, and reveals caller / callee / impact relationships you would
+otherwise reconstruct by hand. Fall back to `grep`/`find` only when `cg-mcp` cannot answer.
+
+`$PROJECT_NAME` and `$BRANCH` are exported for you (do not guess).
+The graph is already indexed against the current commit.
+
+Typical loop:
+
+1. `cg-mcp search_code --project "$PROJECT_NAME" --prefix <name>` —
+   locate a function/class by name. Pick the `id` of the best hit.
+2. `cg-mcp get_callers --project "$PROJECT_NAME" --symbol-id <id>` —
+   "who calls this?" before refactoring.
+3. `cg-mcp impact_analysis --project "$PROJECT_NAME" --symbol-id <id>
+   --depth 3` — full transitive blast radius. Use this BEFORE
+   non-trivial edits.
+4. Read the implicated file(s) with `sed -n` / `cat`, then edit.
+
+## Available `cg-mcp` sub-commands
+
+- `cg-mcp search_code      --project P --prefix STR [--limit N]` —
+  prefix search; returns `[{id, name, label, file, line}, ...]`.
+- `cg-mcp get_callers      --project P --symbol-id ID [--limit N]` —
+  incoming CALLS edges (who calls X).
+- `cg-mcp get_callees      --project P --symbol-id ID [--limit N]` —
+  outgoing CALLS edges (what X calls).
+- `cg-mcp get_dependencies --project P --symbol-id ID [--limit N]` —
+  all outgoing edges (CALLS + IMPORTS + DEFINES).
+- `cg-mcp impact_analysis  --project P --symbol-id ID
+                          [--direction IN|OUT] [--depth N]` —
+  transitive blast radius (default IN, depth 3).
+- `cg-mcp find_path        --project P --source-id ID --dest-id ID` —
+  the call chain(s) between two symbols.
+- `cg-mcp index_repo       --path-or-url PATH [--branch B]` —
+  (re)index a folder or git URL. Only needed for repos that aren't
+  pre-indexed.
+
+You also have the usual Unix tools (`cat`, `grep`/`rg`, `find`, `sed`)
+for cases the graph can't answer.
+
+## Rules of thumb
+
+1. **Always run `search_code` first** to turn a name into an `id`.
+2. **`impact_analysis` before any non-trivial edit.** Even when you
+   think you know the answer — the transitive closure often surprises
+   you.
+3. **Don't `grep` for callers.** `get_callers` is one cheap Cypher
+   hop; grep over a large repo costs tens of thousands of tokens.
+
+## Submission
+
+When you believe the task is complete, run a bash command whose first
+line of stdout is exactly:
+
+```
+COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT
+```
+
+followed by your final answer or summary on subsequent lines. The
+runner reads the working-tree `git diff` automatically; you do not
+need to commit.
diff --git a/bench/tools/code_graph_mcp/tools.yaml b/bench/tools/code_graph_mcp/tools.yaml
new file mode 100644
index 00000000..3b676977
--- /dev/null
+++ b/bench/tools/code_graph_mcp/tools.yaml
@@ -0,0 +1,39 @@
+# SWE-agent tool bundle: code-graph MCP-transport config.
+#
+# This is the MCP-transport sibling of bench/tools/code_graph/tools.yaml.
+# Same backend graph; different transport. Where `code_graph` calls the
+# host FastAPI service over HTTP, `code_graph_mcp` spawns the
+# `cgraph-mcp` stdio server for each tool call — the exact transport
+# Claude Code / Cursor / Cline use in production.
+#
+# Tool names mirror the 8 MCP tools registered in api/mcp/tools/
+# (search_code, get_callers, get_callees, get_dependencies,
+# impact_analysis, find_path, index_repo, ask). The bash agent calls
+# them through the `cg-mcp <tool> ...` shim (see bench/cli/cg-mcp).
+#
+# IMPORTANT: `ask` (GraphRAG) is intentionally NOT in the tool list.
+# Including it would double-count tokens (nested LLM agent). Same Q2
+# decision as the HTTP code_graph config — we benchmark the *graph*,
+# not GraphRAG.
+
+extends: ../baseline/tools.yaml
+
+tools:
+  - index_repo            # (path_or_url, branch?) -> indexing stats
+  - search_code           # (project, prefix) -> [symbol]
+  - get_callers           # (project, symbol_id) -> [caller]
+  - get_callees           # (project, symbol_id) -> [callee]
+  - get_dependencies      # (project, symbol_id) -> [dep]
+  - impact_analysis       # (project, symbol_id, direction, depth) -> [impacted]
+  - find_path             # (project, source_id, dest_id) -> [path]
+
+backend:
+  transport: mcp_stdio
+  command: cgraph-mcp
+  # Container has cgraph-mcp on PATH via `pip install -e .` against this
+  # repo. FALKORDB_HOST/PORT point the spawned MCP server at the same host
+  # FalkorDB the HTTP config uses; MODEL_NAME is forwarded unchanged too.
+  env_passthrough:
+    - FALKORDB_HOST
+    - FALKORDB_PORT
+    - MODEL_NAME
diff --git a/tests/bench/__init__.py b/tests/bench/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/bench/test_adopt_controls.py b/tests/bench/test_adopt_controls.py
new file mode 100644
index 00000000..9b84c307
--- /dev/null
+++ b/tests/bench/test_adopt_controls.py
@@ -0,0 +1,169 @@
+"""Unit tests for the adoption-calibration controls + candidate metric.
+
+Covers the FREE/offline pieces (no API, no live graph):
+  * candidate-level confusion matrix scores an injected NOISY distractor as FP
+    when the agent keeps it and TN when it drops it (the run-time NOISY arm's
+    correctness hinge);
+  * edit-critical relabel heuristic + manual override precedence;
+  * the degenerate-task convention for macro precision/recall (undefined tasks
+    are dropped from the macro average, not imputed).
+"""
+
+from __future__ import annotations
+
+import json
+
+from bench.analysis import adopt_controls as ac
+from bench.analysis.exposure_adoption import candidate_calibration, classify_run
+
+
+def _write_stdout(path, primaries):
+    """Emit a two-event JSONL log (start + complete of one ``search_code`` call)
+    whose result carries one text chunk per entry of ``primaries``; return ``path``."""
+    chunks = [{"type": "text", "text": json.dumps({"file": hit})} for hit in primaries]
+    start = {"type": "tool.execution_start",
+             "data": {"toolCallId": "c1", "mcpToolName": "search_code"}}
+    done = {"type": "tool.execution_complete",
+            "data": {"toolCallId": "c1", "result": {"contents": chunks}}}
+    # One JSON document per line, no trailing newline.
+    lines = [json.dumps(event) for event in (start, done)]
+    path.write_text("\n".join(lines))
+    return path
+
+
+def test_kept_distractor_scores_fp(tmp_path):
+    """A surfaced distractor that the agent keeps must land in the FP cell."""
+    surfaced = ["pkg/gold.py", "pkg/distractor.py"]
+    log_path = _write_stdout(tmp_path / "s.jsonl", surfaced)
+    matrix = classify_run(log_path, gold_files=["pkg/gold.py"],
+                          pred_files=["pkg/gold.py", "pkg/distractor.py"])["cand"]
+    # gold kept -> TP, distractor kept -> FP, nothing dropped -> no FN/TN
+    assert (matrix["tp"], matrix["fp"], matrix["fn"], matrix["tn"]) == (1, 1, 0, 0)
+
+
+def test_dropped_distractor_scores_tn(tmp_path):
+    """A surfaced distractor that the agent drops must land in the TN cell."""
+    log_path = _write_stdout(tmp_path / "s.jsonl", ["pkg/gold.py", "pkg/distractor.py"])
+    matrix = classify_run(log_path, gold_files=["pkg/gold.py"], pred_files=["pkg/gold.py"])["cand"]
+    # gold kept -> TP; distractor correctly dropped -> TN; no FP/FN
+    assert (matrix["tp"], matrix["tn"], matrix["fp"], matrix["fn"]) == (1, 1, 0, 0)
+
+
+def test_dropped_gold_scores_fn(tmp_path):
+    log_path = _write_stdout(tmp_path / "s.jsonl", ["pkg/gold.py"])
+    matrix = classify_run(log_path, gold_files=["pkg/gold.py"], pred_files=[])["cand"]
+    assert (matrix["fn"], matrix["tp"]) == (1, 0)  # surfaced gold dropped -> FN, never TP
+
+
+def test_incidental_gold_excluded_from_matrix(tmp_path):
+    """Surfaced gold with an empty ``edit_critical`` list is neither TP nor FN
+    when dropped; its per-file detail is tagged ``incidental_gold``."""
+    log_path = _write_stdout(tmp_path / "s.jsonl", ["pkg/gold.py"])
+    matrix = classify_run(log_path, gold_files=["pkg/gold.py"], pred_files=[],
+                          edit_critical=[])["cand"]
+    assert (matrix["tp"], matrix["fn"]) == (0, 0)
+    assert matrix["detail"]["pkg/gold.py"] == "incidental_gold"
+
+
+def test_edit_critical_heuristic_and_override():
+    """Heuristic split first, then a per-task manual override that wins over it."""
+    gold_files = ["pkg/core.py", "tests/test_core.py", "pkg/migrations/0001_init.py"]
+    critical, incidental = ac.edit_critical_split(gold_files)
+    assert critical == ["pkg/core.py"]
+    assert set(incidental) == {"tests/test_core.py", "pkg/migrations/0001_init.py"}
+
+    # The override swaps the labels of core.py and the test file for task t1.
+    manual = {"t1": {"pkg/core.py": "incidental", "tests/test_core.py": "critical"}}
+    critical, incidental = ac.edit_critical_split(gold_files, task="t1", overrides=manual)
+    assert "pkg/core.py" in incidental and "tests/test_core.py" in critical
+
+
+def test_macro_drops_undefined_tasks():
+    """Tasks whose precision and recall are both undefined leave the macro mean.
+
+    Task A (one TP, one TN) scores P = R = 1.0. Task B is all TN, so neither
+    axis is defined for it; it must be dropped, not averaged in as 0."""
+    defined = {"task": "A", "cand": {"tp": 1, "fp": 0, "fn": 0, "tn": 1}}
+    undefined = {"task": "B", "cand": {"tp": 0, "fp": 0, "fn": 0, "tn": 3}}
+    macro = candidate_calibration([defined, undefined])["macro"]
+    # Only task A contributes to either average.
+    assert macro["precision"] == 1.0
+    assert macro["recall"] == 1.0
+
+
+def test_gold_symbols_skip_dunders(tmp_path):
+    """Offline symbol extraction lists the class, method and function but no dunder."""
+    module_src = ("class C:\n    def __init__(self): pass\n    def real_method(self): pass\n"
+                  "def top_fn(): pass\n")
+    (tmp_path / "m.py").write_text(module_src)
+    found = ac.gold_symbols_offline(tmp_path, "m.py")
+    assert "__init__" not in found and set(found) >= {"C", "real_method", "top_fn"}
+
+
+# --- identity-aware log lookup (no cross-wiring across prompt_modes) ----------
+def _write_batch_run(batch_root, model, mode, prompt_mode, task, primaries,
+                     run_idx=0):
+    """Create ``runs/<model>/<mode>/<prompt_mode>/code_graph/<task>/logs/stdout.jsonl``
+    surfacing ``primaries``; a truthy ``run_idx`` nests it under ``run<idx>/``."""
+    run_dir = batch_root / "runs" / model / mode / prompt_mode / "code_graph" / task
+    # run_idx 0 (the default) keeps the un-suffixed layout.
+    run_dir = run_dir / f"run{run_idx}" if run_idx else run_dir
+    log_dir = run_dir / "logs"
+    log_dir.mkdir(parents=True, exist_ok=True)
+    _write_stdout(log_dir / "stdout.jsonl", primaries)
+
+
+def test_row_stdout_path_does_not_cross_wire_prompt_modes(tmp_path):
+    """Two arms of one task must each resolve to their own stdout log."""
+    from bench.analysis.exposure_adoption import row_stdout_path
+    model, batch = "m", tmp_path / "batch"
+    # Same task under two arms, each surfacing a DISTINCT file.
+    _write_batch_run(batch, model, "localize", "adopt-ctrl", "task-1", ["pkg/ctrl.py"])
+    _write_batch_run(batch, model, "localize", "adopt-sem", "task-1", ["pkg/sem.py"])
+    ctrl_row = dict(config="code_graph", mode="localize", prompt_mode="adopt-ctrl",
+                    task_id="task-1", run_idx=0)
+    sem_row = dict(ctrl_row, prompt_mode="adopt-sem")
+    ctrl_text = row_stdout_path(batch, model, ctrl_row).read_text()
+    sem_text = row_stdout_path(batch, model, sem_row).read_text()
+    assert "pkg/ctrl.py" in ctrl_text and "pkg/sem.py" not in ctrl_text
+    assert "pkg/sem.py" in sem_text and "pkg/ctrl.py" not in sem_text
+
+
+def test_row_stdout_path_separates_run_idx(tmp_path):
+    """run_idx 0 and run_idx 1 of the same arm/task resolve to different logs."""
+    from bench.analysis.exposure_adoption import row_stdout_path
+    model, batch = "m", tmp_path / "batch"
+    for idx, surfaced in enumerate((["pkg/r0.py"], ["pkg/r1.py"])):
+        _write_batch_run(batch, model, "localize", "adopt-ctrl", "task-1",
+                         surfaced, run_idx=idx)
+    first = dict(config="code_graph", mode="localize", prompt_mode="adopt-ctrl",
+                 task_id="task-1", run_idx=0)
+    assert "pkg/r0.py" in row_stdout_path(batch, model, first).read_text()
+    assert "pkg/r1.py" in row_stdout_path(batch, model, dict(first, run_idx=1)).read_text()
+
+
+def test_macro_strict_scores_drop_everything_as_zero():
+    """Gold surfaced but entirely dropped is an adoption failure, not "undefined".
+
+    Task A is a clean keep (F1 = 1). Task B has tp = 0 with fn > 0: the lenient
+    macro drops it, while macro_strict must count it as F1 = 0."""
+    clean_keep = {"task": "A", "cand": {"tp": 1, "fp": 0, "fn": 0, "tn": 1}}
+    all_dropped = {"task": "B", "cand": {"tp": 0, "fp": 0, "fn": 2, "tn": 0}}
+    # B's gold was surfaced (fn = 2) but nothing was kept.
+    report = candidate_calibration([clean_keep, all_dropped])
+    assert report["macro"]["f1"] == 1.0           # B dropped from lenient macro
+    assert report["macro_strict"]["f1"] == 0.5    # B counted as F1=0 -> (1+0)/2
+    assert report["n_tasks_gold_dropped_failures"] == 1
+
+
+def test_macro_strict_keeps_dropping_no_surfaced_gold_tasks():
+    """With no surfaced gold (tp = 0, fn = 0) a task is genuinely undefined and
+    stays out of BOTH macros, including the strict one."""
+    clean_keep = {"task": "A", "cand": {"tp": 1, "fp": 0, "fn": 0, "tn": 1}}
+    no_gold = {"task": "B", "cand": {"tp": 0, "fp": 0, "fn": 0, "tn": 3}}
+    report = candidate_calibration([clean_keep, no_gold])
+    strict = report["macro_strict"]
+    # Only task A is averaged, and B is reported as dropped-undefined.
+    assert strict["f1"] == 1.0
+    assert strict["n"] == 1
+    assert report["n_tasks_dropped_undefined"] == 1
diff --git a/tests/bench/test_adopt_diag.py b/tests/bench/test_adopt_diag.py
new file mode 100644
index 00000000..97e62925
--- /dev/null
+++ b/tests/bench/test_adopt_diag.py
@@ -0,0 +1,176 @@
+"""Unit tests for the Lane 1 per-arm diagnostics (``adopt_diag``).
+
+Covers the FREE/offline pieces (no API, no live graph):
+  * RAT keep/drop parsing + consistency audit (compliant, consistent,
+    dropped-but-kept conflict, kept-then-omitted erosion);
+  * end-to-end ``diagnose`` over a tiny CTRL/RAT batch: arm detection, exposure
+    recall, GRAPH-WRONG subset freezing, and token summary.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from bench.analysis import adopt_diag as ad
+
+
+def test_parse_rat_decisions_keep_drop_and_backticks():
+    """Plain, bulleted/backticked and colon-separated verdict lines all parse."""
+    transcript = "".join([
+        "Here are my decisions:\n",
+        "KEEP pkg/a.py — directly implements the save path\n",
+        "- `DROP` `pkg/b.py` — only a caller, not the edit site\n",
+        "DROP pkg/c.py: unrelated sibling\n",
+        "I also note keep is a verb used in prose but not line-initial here.\n",
+    ])
+    assert ad.parse_rat_decisions(transcript) == {"pkg/a.py": "keep", "pkg/b.py": "drop", "pkg/c.py": "drop"}
+
+
+def test_rat_audit_consistent_when_no_dropped_file_kept():
+    """One KEEP that is predicted plus one DROP that is not: compliant + consistent."""
+    audit = ad.rat_audit("KEEP pkg/a.py — yes\nDROP pkg/b.py — no\n", pred_files=["pkg/a.py"])
+    assert audit["compliant"]
+    assert audit["consistent"]
+    assert (audit["n_keep"], audit["n_drop"]) == (1, 1)
+    assert audit["dropped_but_kept"] == [] and audit["kept_omitted"] == []
+
+
+def test_rat_audit_flags_dropped_but_kept_conflict():
+    """A file the agent DROPped but still predicted breaks consistency."""
+    audit = ad.rat_audit("KEEP pkg/a.py — yes\nDROP pkg/b.py — no\n",
+                         pred_files=["pkg/a.py", "pkg/b.py"])
+    assert not audit["consistent"] and audit["dropped_but_kept"] == ["pkg/b.py"]
+
+
+def test_rat_audit_flags_kept_then_omitted():
+    """A KEEP verdict missing from the prediction is reported, not a conflict."""
+    audit = ad.rat_audit("KEEP pkg/a.py — yes\nKEEP pkg/b.py — yes\n", pred_files=["pkg/a.py"])
+    assert audit["consistent"]  # nothing dropped-then-kept
+    assert audit["kept_omitted"] == ["pkg/b.py"]
+
+
+def test_rat_audit_non_compliant_when_no_decisions():
+    audit = ad.rat_audit("I think the answer is pkg/a.py.", pred_files=["pkg/a.py"])
+    # Prose without any KEEP/DROP verdict line counts as non-compliant.
+    assert not audit["compliant"] and (audit["n_keep"], audit["n_drop"]) == (0, 0)
+
+
+# ---------------------------------------------------------------------------
+# End-to-end batch fixture
+# ---------------------------------------------------------------------------
+
+MODEL = "test-model"  # model path segment / row field shared by the fixtures below
+
+
+def _write_run(batch_root: Path, arm: str, task: str, primaries: list[str],
+               agent_text: str) -> None:
+    """Lay out one run directory: ``logs/stdout.jsonl`` with a single
+    ``search_code`` call surfacing ``primaries``, plus ``agent_text.txt``."""
+    run_dir = batch_root / "runs" / MODEL / "localize" / arm / "code_graph" / task
+    log_dir = run_dir / "logs"
+    log_dir.mkdir(parents=True, exist_ok=True)
+    chunks = [{"type": "text", "text": json.dumps({"file": hit})} for hit in primaries]
+    start = {"type": "tool.execution_start",
+             "data": {"toolCallId": "c1", "mcpToolName": "search_code"}}
+    done = {"type": "tool.execution_complete",
+            "data": {"toolCallId": "c1", "result": {"contents": chunks}}}
+    jsonl = "\n".join(json.dumps(event) for event in (start, done))
+    (log_dir / "stdout.jsonl").write_text(jsonl)
+    (run_dir / "agent_text.txt").write_text(agent_text)
+
+
+def _row(arm: str, task: str, gold: list[str], pred: list[str], **extra) -> dict:
+    """Build one results.jsonl row for ``arm``/``task``; ``extra`` overrides defaults."""
+    return {
+        "benchmark": "swe_bench_verified",
+        "task_id": task,
+        "config": "code_graph",
+        "model": MODEL,
+        "mode": "localize",
+        "prompt_mode": arm,
+        "run_idx": 0,
+        "runner": "copilot-runner/2",
+        "completed": True,
+        "gold_files": gold,
+        "pred_files": pred,
+        "total_tokens": 1000,
+        "output_tokens": 400,
+        "reasoning_tokens": 150,
+        "input_tokens": 600,
+        "premium_requests": 1,
+        "num_turns": 5,
+        **extra,
+    }
+
+
+def _build_batch(tmp_path: Path) -> Path:
+    batch_root = tmp_path / "cache"  # holds both runs/... and <MODEL>/results.jsonl
+    results_path = batch_root / MODEL / "results.jsonl"
+    results_path.parent.mkdir(parents=True, exist_ok=True)
+
+    rows = []
+    # task t1: graph surfaces gold at rank-1 (CLEAN, not graph-wrong).
+    # CTRL drops the gold (adoption failure); RAT keeps it.
+    _write_run(batch_root, "adopt-ctrl", "t1", ["pkg/gold.py"], "")
+    _write_run(batch_root, "adopt-rat", "t1", ["pkg/gold.py"],
+               "KEEP pkg/gold.py — implements it\n")
+    rows.append(_row("adopt-ctrl", "t1", ["pkg/gold.py"], []))            # FN
+    rows.append(_row("adopt-rat", "t1", ["pkg/gold.py"], ["pkg/gold.py"]))  # TP
+
+    # task t2: rank-1 hit is non-gold (GRAPH-WRONG). CTRL keeps the wrong rank-1
+    # (FP); RAT drops it and keeps the real gold surfaced at rank-2.
+    _write_run(batch_root, "adopt-ctrl", "t2", ["pkg/wrong.py", "pkg/real.py"], "")
+    _write_run(batch_root, "adopt-rat", "t2", ["pkg/wrong.py", "pkg/real.py"],
+               "DROP pkg/wrong.py — only related\nKEEP pkg/real.py — the edit site\n")
+    rows.append(_row("adopt-ctrl", "t2", ["pkg/real.py"], ["pkg/wrong.py"]))  # wrong kept, gold dropped
+    rows.append(_row("adopt-rat", "t2", ["pkg/real.py"], ["pkg/real.py"]))  # gold kept
+
+    results_path.write_text("\n".join(json.dumps(r) for r in rows))  # JSONL, one row per line
+    return results_path
+
+
+def test_diagnose_detects_arms_and_exposure(tmp_path):
+    report = ad.diagnose(_build_batch(tmp_path), ref_arm="adopt-ctrl")
+    assert report["arms_present"] == ["adopt-ctrl", "adopt-rat"]
+    ctrl_exposure = report["arms"]["adopt-ctrl"]["exposure"]
+    rat_exposure = report["arms"]["adopt-rat"]["exposure"]
+    # Every gold file was surfaced in both arms -> exposure_recall == 1.0.
+    assert ctrl_exposure["exposure_recall"] == 1.0
+    assert rat_exposure["exposure_recall"] == 1.0
+    # CTRL dropped both surfaced golds; RAT kept both.
+    assert (ctrl_exposure["adoption_rate"], rat_exposure["adoption_rate"]) == (0.0, 1.0)
+
+
+def test_diagnose_graph_wrong_subset_frozen_from_ref(tmp_path):
+    """The GRAPH-WRONG section lists only t2 and records the reference arm."""
+    graph_wrong = ad.diagnose(_build_batch(tmp_path), ref_arm="adopt-ctrl")["graph_wrong"]
+    # t2's rank-1 (pkg/wrong.py) is non-gold -> GRAPH-WRONG; t1 is not.
+    assert graph_wrong["tasks"] == ["t2"]
+    assert graph_wrong["ref_arm"] == "adopt-ctrl"
+
+
+def test_diagnose_rat_audit_and_consistency(tmp_path):
+    """Both RAT runs in the fixture emit verdicts and honour them."""
+    report = ad.diagnose(_build_batch(tmp_path), ref_arm="adopt-ctrl")
+    rat = report["arms"]["adopt-rat"]["rat_audit"]
+    assert rat["n"] == 2
+    assert rat["compliance_rate"] == 1.0   # both RAT runs emitted decisions
+    assert rat["consistency_rate"] == 1.0  # no dropped file was kept
+    assert rat["n_dropped_but_kept"] == 0
+
+
+def test_diagnose_rat_calibration_beats_ctrl(tmp_path):
+    arms = ad.diagnose(_build_batch(tmp_path), ref_arm="adopt-ctrl")["arms"]
+    strict_f1 = {
+        arm: arms[arm]["calibration_clean"]["macro_strict"]["f1"] for arm in ("adopt-ctrl", "adopt-rat")
+    }
+    assert strict_f1["adopt-rat"] > strict_f1["adopt-ctrl"]
+
+
+def test_token_summary_visible_output_excludes_reasoning(tmp_path):
+    summary = ad.token_summary([_row("adopt-rat", "t1", ["g.py"], ["g.py"])])
+    # A single row is summarised, so each median equals that row's own value.
+    # output 400 - reasoning 150 = 250 visible
+    assert summary["median_visible_output_tokens"] == 250
+    assert summary["median_total_tokens"] == 1000
diff --git a/tests/bench/test_cg_mcp_adapter.py b/tests/bench/test_cg_mcp_adapter.py
new file mode 100644
index 00000000..7d6f9274
--- /dev/null
+++ b/tests/bench/test_cg_mcp_adapter.py
@@ -0,0 +1,178 @@
+"""Tests for the MCP-transport bench adapter (`cg-mcp`).
+
+Heavy end-to-end test (talks to real cgraph-mcp + FalkorDB) is gated
+behind the same `_falkordb_reachable` check as the existing MCP tests.
+Light tests run unconditionally and validate the argparse surface and
+`_extract` shape handling.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import socket
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+from bench.agents import code_graph_mcp_adapter as cgm
+from bench.cli import cg_mcp
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]  # tests/bench/<this file> -> repo root
+
+
+def _mcp_server_available() -> bool:
+    """Report whether the in-repo `cgraph-mcp` server module can be imported.
+
+    Branches that pre-date the MCP stack (e.g. this branch's base,
+    `fix-find-symbol-nested-name`) have no `api.mcp.server`, so the
+    end-to-end test skips there and runs on staging once the MCP stack
+    lands.
+    """
+    try:
+        import api.mcp.server  # noqa: F401
+    except ImportError:
+        return False
+    return True
+
+
+def _falkordb_reachable() -> bool:
+    """True if FALKORDB_HOST:FALKORDB_PORT accepts TCP within 1s; never raises on a bad port."""
+    host = os.environ.get("FALKORDB_HOST", "127.0.0.1")
+    try:
+        with socket.create_connection((host, int(os.environ.get("FALKORDB_PORT", "6390"))), timeout=1):
+            return True
+    except (OSError, ValueError, OverflowError):  # ValueError: non-numeric FALKORDB_PORT
+        return False
+
+
+# ── light unit tests ──────────────────────────────────────────────────
+
+
+class _FakeChunk:  # stand-in for one result content chunk; only ``.text`` is set
+    def __init__(self, text: str) -> None:
+        self.text = text
+
+
+class _FakeResult:  # stand-in for the result object handed to ``cgm._extract``
+    def __init__(self, content, structured=None, is_error=False):
+        self.content = content
+        self.structuredContent = structured  # stored under camelCase attribute names
+        self.isError = is_error
+
+
+def test_extract_prefers_text_chunk_json():
+    json_chunk = _FakeResult([_FakeChunk('{"id": 7, "name": "foo"}')])
+    assert cgm._extract(json_chunk) == {"id": 7, "name": "foo"}  # chunk text decoded as JSON
+
+
+def test_extract_falls_back_to_structured_result_wrapper():
+    no_chunks = _FakeResult(content=[], structured={"result": [1, 2, 3]})
+    assert cgm._extract(no_chunks) == [1, 2, 3]  # payload unwrapped from "result"
+
+
+def test_extract_returns_raw_text_when_not_json():
+    plain_chunk = _FakeResult([_FakeChunk("not json at all")])
+    assert cgm._extract(plain_chunk) == "not json at all"  # non-JSON text passes through
+
+
+def test_cli_rejects_unknown_subcommand(capsys):
+    with pytest.raises(SystemExit):  # an unknown sub-command must exit, not return
+        cg_mcp.main(["totally_bogus"])
+
+
+def test_cli_index_repo_parses_ignore_list(monkeypatch):
+    """``--ignore`` takes several values and reaches ``index_repo`` as a list."""
+    seen: dict = {}
+
+    def fake_index_repo(path_or_url, branch=None, ignore=None):
+        # Record the parsed arguments instead of indexing anything.
+        seen.update(path_or_url=path_or_url, branch=branch, ignore=ignore)
+        return {"ok": True, **seen}
+
+    monkeypatch.setattr(cgm, "index_repo", fake_index_repo)
+    argv = [
+        "index_repo",
+        "--path-or-url", "/tmp/x",
+        "--branch", "main",
+        "--ignore", ".venv", "node_modules",
+    ]
+    exit_code = cg_mcp.main(argv)
+
+    assert exit_code == 0
+    assert seen == {
+        "path_or_url": "/tmp/x",
+        "branch": "main",
+        "ignore": [".venv", "node_modules"],
+    }
+
+
+# ── heavy end-to-end test ─────────────────────────────────────────────
+
+
+@pytest.mark.skipif(
+    not _mcp_server_available(),
+    reason="api.mcp.server not present — requires MCP stack to be merged",
+)
+@pytest.mark.skipif(not _falkordb_reachable(), reason="FalkorDB unreachable")
+def test_cg_mcp_search_code_end_to_end(tmp_path):
+    """Spawn the actual cg-mcp shim against a freshly-indexed fixture."""
+    fixture = REPO_ROOT / "tests" / "mcp" / "fixtures" / "sample_project"
+    if not fixture.exists():
+        pytest.skip("MCP sample fixture not present")
+
+    env = os.environ.copy()
+    env["FALKORDB_HOST"] = os.environ.get("FALKORDB_HOST", "127.0.0.1")
+    env["FALKORDB_PORT"] = os.environ.get("FALKORDB_PORT", "6390")
+    env["BENCH_PYTHON"] = sys.executable
+    # Put the interpreter's bin dir first on PATH so the subprocess finds cgraph-mcp.
+    venv_bin = str(Path(sys.executable).parent)
+    env["PATH"] = f"{venv_bin}{os.pathsep}{env.get('PATH', '')}"
+
+    # Index the fixture under a fixed project and a per-process (PID-suffixed) branch.
+    project = "sample_project"
+    branch = f"benchmcp-{os.getpid()}"
+    idx = subprocess.run(
+        [
+            str(REPO_ROOT / "bench" / "cli" / "cg-mcp"),
+            "index_repo",
+            "--path-or-url",
+            str(fixture),
+            "--branch",
+            branch,
+        ],
+        env=env,
+        capture_output=True,
+        text=True,
+        timeout=120,
+    )
+    assert idx.returncode == 0, idx.stderr
+    idx_payload = json.loads(idx.stdout)
+    assert "graph_name" in idx_payload
+    assert idx_payload["num_nodes"] > 0
+
+    # Then search for any known symbol from the fixture.
+    sr = subprocess.run(
+        [
+            str(REPO_ROOT / "bench" / "cli" / "cg-mcp"),
+            "search_code",
+            "--project",
+            project,
+            "--branch",
+            branch,
+            "--prefix",
+            "a",  # broad prefix to match something in the fixture
+            "--limit",
+            "3",
+        ],
+        env=env,
+        capture_output=True,
+        text=True,
+        timeout=60,
+    )
+    assert sr.returncode == 0, sr.stderr
+    out = json.loads(sr.stdout)
+    assert out is not None
diff --git a/tests/bench/test_copilot_runner.py b/tests/bench/test_copilot_runner.py
new file mode 100644
index 00000000..73f1f729
--- /dev/null
+++ b/tests/bench/test_copilot_runner.py
@@ -0,0 +1,660 @@
+"""Unit tests for the Copilot benchmark runner parsers + TCO accounting.
+
+These run unconditionally (no FalkorDB / Copilot needed): they exercise the
+log/JSONL parsing and cost math against synthetic inputs plus the captured
+spike fixtures when present.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+from bench.runners import copilot_runner as cr
+from bench.runners import copilot_tco as tco
+
+
+# ---------------------------------------------------------------------------
+# Token-block parsing
+# ---------------------------------------------------------------------------
+
+
+def _write_log(tmp_path: Path, name: str, text: str) -> Path:
+    """Write ``text`` to ``<tmp_path>/logs/<name>``; return the ``logs`` directory."""
+    (log_dir := tmp_path / "logs").mkdir(exist_ok=True)
+    log_dir.joinpath(name).write_text(text)
+    return log_dir
+
+
+_USAGE_BLOCK = """\
+some preamble line
+  "usage": {
+    "prompt_tokens": 1000,
+    "completion_tokens": 50,
+    "total_tokens": 1050,
+    "prompt_tokens_details": {
+      "cached_tokens": 200,
+      "cache_creation_tokens": 800
+    },
+    "completion_tokens_details": { "reasoning_tokens": 0 }
+  }
+trailing
+"""  # one usage block (1000 prompt / 50 completion / 1050 total) between two noise lines
+
+
+def test_parse_tokens_sums_multiple_blocks(tmp_path):
+    """Two identical usage blocks in one log double every counter."""
+    log_dir = _write_log(tmp_path, "process-1.log", _USAGE_BLOCK + "\n" + _USAGE_BLOCK)
+    totals = cr.parse_tokens_from_logs(log_dir)
+    assert totals["input_tokens"] == 2000
+    assert totals["output_tokens"] == 100
+    assert totals["total_tokens"] == 2100
+    assert totals["cached_input_tokens"] == 400
+    assert totals["cache_creation_tokens"] == 1600
+    assert totals["usage_blocks"] == 2
+
+
+def test_parse_tokens_ignores_non_model_usage(tmp_path):
+    """A "usage" object lacking the model-response fields (e.g. from an MCP tool
+    result or stray JSON) must NOT be counted."""
+    non_model = '{ "usage": { "premiumRequests": 15, "totalApiDurationMs": 100 } }'
+    log_dir = _write_log(tmp_path, "process-1.log", _USAGE_BLOCK + "\n" + non_model)
+    totals = cr.parse_tokens_from_logs(log_dir)
+    # Only the genuine block is tallied.
+    assert totals["usage_blocks"] == 1
+    assert totals["input_tokens"] == 1000
+
+
+def test_parse_tokens_multiple_log_files(tmp_path):
+    """Usage blocks from both log files in the directory are added together."""
+    log_dir = _write_log(tmp_path, "process-1.log", _USAGE_BLOCK)
+    log_dir.joinpath("process-2.log").write_text(_USAGE_BLOCK)
+    totals = cr.parse_tokens_from_logs(log_dir)
+    assert (totals["usage_blocks"], totals["input_tokens"]) == (2, 2000)
+
+
+# ---------------------------------------------------------------------------
+# Result-event + tool-call parsing
+# ---------------------------------------------------------------------------
+
+
+def test_parse_result_event():
+    """The ``result`` event's usage, error flag and turn count are extracted."""
+    result_event = {
+        "type": "result",
+        "data": {
+            "usage": {
+                "premiumRequests": 12,
+                "codeChanges": {"filesModified": ["a.py", "b.py"]},
+            },
+            "isError": False,
+            "numTurns": 7,
+        },
+    }
+    # An unrelated assistant event precedes the result line.
+    events = [{"type": "assistant", "data": {}}, result_event]
+    parsed = cr.parse_result_event("\n".join(json.dumps(event) for event in events))
+    assert parsed["premium_requests"] == 12
+    assert parsed["files_modified"] == ["a.py", "b.py"]
+    assert parsed["is_error"] is False
+    assert parsed["num_turns"] == 7
+
+
+def test_parse_tool_calls_counts_mcp_and_shell():
+    """Three start events are tallied per tool name; the completion is not."""
+    started = ["bash", "bash", "code-graph-search_code"]
+    events = [{"type": "tool.execution_start", "data": {"name": tool}} for tool in started]
+    # A completion event must not add to the tally.
+    events.append({"type": "tool.execution_complete", "data": {}})
+    stdout = "\n".join(json.dumps(event) for event in events)
+    total, by_name = cr.parse_tool_calls(stdout)
+    assert total == 3
+    assert by_name == {"bash": 2, "code-graph-search_code": 1}
+
+
+def test_parsers_tolerate_garbage_lines():
+    result_line = json.dumps({"type": "result", "data": {"usage": {"premiumRequests": 1}}})
+    assert cr.parse_result_event("not json\n\n" + result_line)["premium_requests"] == 1
+    assert cr.parse_tool_calls("garbage\n{bad") == (0, {})  # nothing countable in junk input
+
+
+# ---------------------------------------------------------------------------
+# Patch helpers
+# ---------------------------------------------------------------------------
+
+
+def test_patched_files_extraction():
+    diff_text = "".join([
+        "diff --git a/x/y.py b/x/y.py\n",
+        "--- a/x/y.py\n",
+        "+++ b/x/y.py\n",
+        "@@ -1 +1 @@\n-a\n+b\n",
+        "diff --git a/tests/test_z.py b/tests/test_z.py\n",
+        "--- a/tests/test_z.py\n",
+        "+++ b/tests/test_z.py\n",
+        "@@ -1 +1 @@\n-c\n+d\n",
+    ])
+    assert cr._patched_files(diff_text) == ["x/y.py", "tests/test_z.py"]  # one path per section, in order
+
+
+# ---------------------------------------------------------------------------
+# Prompt assembly
+# ---------------------------------------------------------------------------
+
+
+def test_prompt_excludes_ask_for_code_graph(tmp_path):
+    prompt = cr.build_prompt(cr.CODE_GRAPH, tmp_path, "Fix the bug.", "django__django-10973")
+    # The prompt names the project, forbids `ask`, and still mentions search_code.
+    for needle in ("django__django-10973", "Do not use the `ask` tool", "search_code"):
+        assert needle in prompt
+
+
+def test_prompt_no_mcp_has_no_tool_sales():
+    prompt = cr.build_prompt(cr.NO_MCP, Path("/tmp/x"), "Fix it.", "proj")
+    assert "MCP" in prompt  # capability note present
+    assert "search_code" not in prompt  # ...but no graph tool is mentioned
+
+
+# ---------------------------------------------------------------------------
+# TCO accounting
+# ---------------------------------------------------------------------------
+
+
+def test_tco_no_ask_is_agent_only():
+    row = dict(
+        task_id="t1", config="code_graph", model="claude-opus-4.8",
+        input_tokens=1_000_000, output_tokens=100_000,
+        premium_requests=20, index_sec=60.0, completed=True,
+    )
+    costs = tco.row_tco(row)
+    # opus: 1M in * $15 + 0.1M out * $75 = 15 + 7.5 = 22.5
+    assert costs["agent_usd"] == pytest.approx(22.5, abs=0.01)
+    assert costs["graphrag_usd"] == 0.0
+    assert costs["per_task_tco_usd"] == pytest.approx(22.5, abs=0.01)
+    assert costs["index_usd_amortized_once"] > 0
+
+
+def test_tco_meters_ask_when_present():
+    ask_row = {
+        "task_id": "t1", "config": "code_graph_ask", "model": "claude-sonnet-4.6",
+        "input_tokens": 0, "output_tokens": 0,
+        "graphrag_ask_calls": 3, "graphrag_input_tokens": 1_000_000,
+        "graphrag_output_tokens": 100_000, "completed": True,
+    }
+    costs = tco.row_tco(ask_row)
+    # Expected at the gemini-flash-lite rates: 1M * 0.075 + 0.1M * 0.30 = 0.105.
+    assert costs["graphrag_usd"] == pytest.approx(0.105, abs=1e-4)
+    assert costs["graphrag_ask_calls"] == 3
+
+
+def test_tco_aggregate_groups_by_config():
+    ledger = [
+        {"config": "copilot_no_mcp", "model": "claude-opus-4.8", "input_tokens": 100, "output_tokens": 10, "outcome": "resolved", "completed": True},
+        {"config": "code_graph", "model": "claude-opus-4.8", "input_tokens": 100, "output_tokens": 10, "outcome": "failed", "completed": True},
+        {"config": "code_graph", "model": "claude-opus-4.8", "input_tokens": 0, "output_tokens": 0, "outcome": "x", "completed": False},  # incomplete: must be left out
+    ]
+    by_config = tco.aggregate(ledger)
+    assert by_config["copilot_no_mcp"]["n"] == 1
+    assert by_config["copilot_no_mcp"]["resolved"] == 1
+    assert by_config["code_graph"]["n"] == 1  # only the completed code_graph row is counted
+
+
+def test_agent_key_mapping():
+    expected = {"claude-opus-4.8": "opus", "claude-sonnet-4.6": "sonnet", "claude-haiku-4.5": "haiku"}
+    for model_id, family in expected.items():
+        assert tco.agent_key(model_id) == family
+
+
+# ---------------------------------------------------------------------------
+# Prompt assembly: nudge + localize modes
+# ---------------------------------------------------------------------------
+
+
+def test_prompt_nudge_code_graph_mandates_search(tmp_path):
+    nudged_prompt = cr.build_prompt(cr.CODE_GRAPH, tmp_path, "Fix it.", "proj", nudge=True)
+    assert "MUST begin by calling search_code(project=\"proj\")" in nudged_prompt  # search-first mandate
+    assert "Do not use the `ask` tool" in nudged_prompt  # the ask exclusion stays in place
+
+
+def test_prompt_nudge_no_mcp_is_matched_control(tmp_path):
+    # The no-MCP arm gets a matching "search-first" nudge, but without any graph tool names.
+    control_prompt = cr.build_prompt(cr.NO_MCP, tmp_path, "Fix it.", "proj", nudge=True)
+    assert "Before resorting to plain text search" in control_prompt
+    assert "search_code" not in control_prompt
+
+
+def test_prompt_localize_emits_sentinel_contract(tmp_path):
+    localize_prompt = cr.build_prompt(cr.CODE_GRAPH, tmp_path, "Bug.", "proj", mode=cr.LOCALIZE)
+    required = (cr.LOCALIZE_SENTINEL, "Do NOT modify any files", "Do NOT emit it through a")
+    for fragment in required:
+        assert fragment in localize_prompt
+
+
+# ---------------------------------------------------------------------------
+# Lane 1 adoption-calibration arms (CTRL / SEM / RAT)
+# ---------------------------------------------------------------------------
+
+
+def test_adopt_ctrl_capability_equals_canonical_nudge(tmp_path):
+    # Prereg §2 (amended) fixes CTRL == _CAP_CODE_GRAPH_NUDGE: the CTRL arm text must
+    # match the canonical nudge byte for byte, independent of env-gated variants.
+    ctrl_cap = cr._capability(cr.CODE_GRAPH, "proj", nudge=False, adopt_arm="ctrl")
+    assert ctrl_cap == cr._CAP_CODE_GRAPH_NUDGE.format(project="proj")
+
+
+def test_adopt_arms_bypass_env_gates(tmp_path, monkeypatch):
+    # Switch on every env-gated nudge variant; the arm capability must ignore them all.
+    for gate in ("CGRAPH_SUBST_NUDGE", "CGRAPH_SPIKE_NUDGE", "CGRAPH_TRAVERSE_NUDGE"):
+        monkeypatch.setenv(gate, "1")
+    arm_cap = cr._capability(cr.CODE_GRAPH, "proj", nudge=True, adopt_arm="ctrl")
+    canonical = cr._CAP_CODE_GRAPH_NUDGE.format(project="proj")
+    assert arm_cap == canonical
+    assert "TRUST the ranked results" not in arm_cap
+    assert "get_importers" not in arm_cap
+
+
+def test_adopt_sem_appends_frozen_clause_only(monkeypatch, tmp_path):
+    monkeypatch.delenv("BENCH_BLOCK_NETWORK", raising=False)
+    sem_prompt = cr.build_prompt(
+        cr.CODE_GRAPH, tmp_path, "Bug.", "proj", mode=cr.LOCALIZE, adopt_arm="sem"
+    )
+    # SEM is the CTRL base (canonical nudge) plus the frozen edge-semantics clause.
+    assert "MUST begin by calling search_code(project=\"proj\")" in sem_prompt
+    assert "Relatedness alone is not a reason to keep or to drop." in sem_prompt
+    assert "evidence that code is RELATED" in sem_prompt
+    # The rejected wording (benchmark prior) must stay out of the prompt.
+    assert "often a caller or a sibling" not in sem_prompt
+    # The keep/drop step is RAT's lever, so SEM must not carry it.
+    assert "KEEP <file>" not in sem_prompt
+
+
+def test_adopt_rat_injects_keep_drop_step_before_sentinel(tmp_path):
+    rat_prompt = cr.build_prompt(
+        cr.CODE_GRAPH, tmp_path, "Bug.", "proj", mode=cr.LOCALIZE, adopt_arm="rat"
+    )
+    assert all(marker in rat_prompt for marker in ("KEEP <file>", "DROP <file>"))
+    assert "list every file the graph surfaced" in rat_prompt
+    # Ordering check: the keep/drop step comes before the FINAL sentinel instruction.
+    assert rat_prompt.index("list every file the graph surfaced") < rat_prompt.index(cr.LOCALIZE_SENTINEL)
+    # RAT builds on the CTRL base and must not include the SEM clause.
+    assert "MUST begin by calling search_code(project=\"proj\")" in rat_prompt
+    assert "Relatedness alone is not a reason" not in rat_prompt
+
+
+def test_adopt_arm_guard_rejects_non_code_graph(tmp_path):
+    # An adopt arm combined with the NO_MCP track must be refused.
+    bad_call = dict(mode=cr.LOCALIZE, adopt_arm="ctrl")
+    with pytest.raises(ValueError):
+        cr.build_prompt(cr.NO_MCP, tmp_path, "Bug.", "proj", **bad_call)
+
+
+def test_adopt_arm_guard_rejects_non_localize(tmp_path):
+    # An adopt arm combined with FIX mode must be refused.
+    bad_call = dict(mode=cr.FIX, adopt_arm="ctrl")
+    with pytest.raises(ValueError):
+        cr.build_prompt(cr.CODE_GRAPH, tmp_path, "Bug.", "proj", **bad_call)
+
+
+def test_adopt_arm_guard_rejects_unknown_value(tmp_path):
+    # argparse `choices` does not protect programmatic callers, so build_prompt
+    # itself must reject an unknown arm rather than silently behaving like CTRL
+    # while the run is logged as prompt_mode=adopt-bad.
+    unknown_arm = dict(mode=cr.LOCALIZE, adopt_arm="bad")
+    with pytest.raises(ValueError):
+        cr.build_prompt(cr.CODE_GRAPH, tmp_path, "Bug.", "proj", **unknown_arm)
+
+
+# ---------------------------------------------------------------------------
+# NOISY/GRAPH-WRONG distractor injection wiring
+# ---------------------------------------------------------------------------
+
+
+class _Inst:  # test stand-in passed to cr helpers; carries only `instance_id`
+    def __init__(self, instance_id):
+        self.instance_id = instance_id
+
+
+def test_compute_prompt_mode_suffixes_only_when_injecting():
+    # ((adopt_arm, nudge, inject_label), expected prompt_mode label)
+    cases = [
+        (("sem", True, None), "adopt-sem"),
+        (("sem", True, "noisy"), "adopt-sem-noisy"),
+        ((None, True, None), "nudged"),
+        ((None, False, "gwrong"), "neutral-gwrong"),
+    ]
+    for (arm, nudge, label), expected in cases:
+        mode = cr._compute_prompt_mode(adopt_arm=arm, nudge=nudge, inject_label=label)
+        assert mode == expected
+
+
+def test_inject_env_pins_task_and_manifest():
+    expected_env = {
+        "BENCH_NOISY_MANIFEST": "/tmp/m.json",
+        "BENCH_NOISY_TASK": "django__django-1",
+        "BENCH_NOISY_K": "3",
+    }
+    task = _Inst("django__django-1")
+    assert cr._inject_env(task, inject_manifest=Path("/tmp/m.json"), inject_k=3) == expected_env
+
+
+def test_inject_env_omits_k_when_unset():
+    injected = cr._inject_env(_Inst("t1"), inject_manifest=Path("/tmp/m.json"), inject_k=None)
+    assert "BENCH_NOISY_K" not in injected  # no K given, so no K variable
+    assert injected["BENCH_NOISY_TASK"] == "t1"  # the task id is still pinned
+
+
+def test_inject_env_none_when_disabled():  # no manifest given -> None instead of an env mapping
+    assert cr._inject_env(_Inst("t1"), inject_manifest=None, inject_k=None) is None
+
+
+def test_write_mcp_config_clean_is_falkor_only(tmp_path):
+    wrapper = tmp_path / "wrap.sh"
+    wrapper.write_text("#!/bin/bash\n")
+    config_path = cr._write_mcp_config(tmp_path, wrapper, "h", 6379)
+    server_env = json.loads(config_path.read_text())["mcpServers"]["code-graph"]["env"]
+    assert server_env == {"FALKORDB_HOST": "h", "FALKORDB_PORT": "6379"}  # nothing beyond host and port
+
+
+def test_write_mcp_config_threads_extra_env(tmp_path):
+    wrapper = tmp_path / "wrap.sh"
+    wrapper.write_text("#!/bin/bash\n")
+    extra = {"BENCH_NOISY_TASK": "t1"}
+    config_path = cr._write_mcp_config(tmp_path, wrapper, "h", 6379, extra_env=extra)
+    document = json.loads(config_path.read_text())
+    server_env = document["mcpServers"]["code-graph"]["env"]
+    assert server_env["FALKORDB_HOST"] == "h"  # base connection setting is kept
+    assert server_env["BENCH_NOISY_TASK"] == "t1"  # the extra variable is passed through
+
+
+def test_run_one_inject_guard_rejects_non_localize(tmp_path):
+    # Distractor injection together with FIX mode must be refused.
+    run_kwargs = dict(track=cr.CODE_GRAPH, model="m", cache_dir=tmp_path, wall_time=1.0,
+                      server_root=tmp_path, mode=cr.FIX,
+                      inject_manifest=Path("/tmp/m.json"), inject_label="noisy")
+    with pytest.raises(ValueError):
+        cr.run_one(_Inst("x"), **run_kwargs)
+
+
+def test_run_one_inject_guard_requires_label(tmp_path):
+    # A manifest without an inject label must be refused, even in LOCALIZE mode.
+    run_kwargs = dict(track=cr.CODE_GRAPH, model="m", cache_dir=tmp_path, wall_time=1.0,
+                      server_root=tmp_path, mode=cr.LOCALIZE,
+                      inject_manifest=Path("/tmp/m.json"), inject_label=None)
+    with pytest.raises(ValueError):
+        cr.run_one(_Inst("x"), **run_kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Localization extraction + scoring
+# ---------------------------------------------------------------------------
+
+
+def _msg(content):
+    return json.dumps(dict(type="assistant.message", data=dict(content=content)))
+
+
+def test_extract_agent_text_concats_messages():
+    events = [
+        _msg("first thought"),
+        json.dumps({"type": "tool.execution_start", "data": {"name": "bash"}}),
+        json.dumps({"type": "user.message", "data": {"content": "IGNORE ME"}}),
+        _msg(""),  # assistant turn with no text (tool-only)
+        _msg("final answer"),
+    ]
+    agent_text = cr.extract_agent_text("\n".join(events))
+    assert "first thought" in agent_text
+    assert "final answer" in agent_text
+    assert "IGNORE ME" not in agent_text  # user messages are not agent text
+
+
+def test_parse_localization_strict_sentinel():
+    # The `a/` and `./` prefixes in the answer are expected to be normalized away.
+    agent_text = 'Reasoning...\nFINAL_LOCALIZATION_JSON: ["a/pkg/x.py", "./pkg/y.py"]'
+    files, error, used_fallback = cr.parse_localization(agent_text)
+    assert files == ["pkg/x.py", "pkg/y.py"]
+    assert error is None and used_fallback is False
+
+
+def test_parse_localization_uses_last_sentinel():
+    revised_answer = "\n".join([
+        'FINAL_LOCALIZATION_JSON: ["wrong.py"]',
+        "on reflection...",
+        'FINAL_LOCALIZATION_JSON: ["right.py"]',
+    ])
+    files, error, _fallback = cr.parse_localization(revised_answer)
+    assert files == ["right.py"]  # the later sentinel wins
+    assert error is None
+
+
+def test_parse_localization_missing_sentinel_flags_fallback():
+    files, error, used_fallback = cr.parse_localization("no marker here")
+    assert files == []  # nothing to extract without a sentinel
+    assert error == "sentinel_missing"
+    assert used_fallback is True
+
+
+def test_parse_localization_malformed_array():
+    files, error, used_fallback = cr.parse_localization("FINAL_LOCALIZATION_JSON: [oops")
+    assert files == []  # the array never closes, so no paths are accepted
+    assert used_fallback is True
+    assert error == "unbalanced_array"
+
+
+def test_score_localization_recall_and_mrr():
+    predicted, gold_files = ["pkg/b.py", "pkg/a.py"], ["pkg/a.py", "pkg/c.py"]
+    result = cr.score_localization(pred=predicted, gold=gold_files)
+    # One of the two gold files is predicted, and one of the two predictions is gold.
+    assert result["file_recall"] == 0.5
+    assert result["file_precision"] == 0.5
+    assert result["file_all_found"] is False
+    assert result["acc_at_1"] == 0.0  # rank 1 is b.py, which is not gold
+    assert result["acc_at_3"] == 1.0
+    assert result["file_mrr"] == 0.5  # first gold hit (a.py) is at rank 2
+
+
+def test_score_localization_all_found():
+    result = cr.score_localization(pred=["a.py", "b.py"], gold=["a.py", "b.py"])
+    assert result["file_all_found"] is True
+    # Prediction equals gold, so each of these metrics must be 1.0.
+    for metric in ("file_recall", "acc_at_1", "file_mrr"):
+        assert result[metric] == 1.0
+
+
+# ---------------------------------------------------------------------------
+# Nudge compliance metrics
+# ---------------------------------------------------------------------------
+
+
+def test_nudge_compliance_first_is_graph():
+    tool_order = ("code-graph-search_code", "bash", "code-graph-get_callers")
+    stream = "\n".join(
+        json.dumps({"type": "tool.execution_start", "data": {"name": tool}})
+        for tool in tool_order
+    )
+    compliance = cr.nudge_compliance(stream)
+    assert compliance["first_tool"] == "code-graph-search_code"
+    assert compliance["first_is_graph"] is True
+    assert compliance["graph_calls"] == 2  # search_code + get_callers
+
+
+def test_nudge_compliance_no_graph():
+    bash_only = json.dumps({"type": "tool.execution_start", "data": {"name": "bash"}})
+    compliance = cr.nudge_compliance(bash_only)
+    assert compliance["first_tool"] == "bash"
+    assert compliance["first_is_graph"] is False
+    assert compliance["graph_calls"] == 0  # the only call in the stream is bash
+
+
+# ---------------------------------------------------------------------------
+# Resume identity includes mode / prompt_mode / model
+# ---------------------------------------------------------------------------
+
+
+def test_load_done_keys_include_mode_and_prompt(tmp_path):
+    shared = {"task_id": "t1", "config": "code_graph", "model": "claude-opus-4.8"}
+    tail = {"run_idx": 0, "runner": cr.RUNNER_VERSION, "completed": True}
+    # Two completed rows that differ only in mode / prompt_mode.
+    finished = [
+        {**shared, "mode": "fix", "prompt_mode": "neutral", **tail},
+        {**shared, "mode": "localize", "prompt_mode": "nudged", **tail},
+    ]
+    results_file = tmp_path / "results.jsonl"
+    results_file.write_text("\n".join(map(json.dumps, finished)))
+    done_keys = cr._load_done(results_file)
+    prefix = ("t1", "code_graph", "claude-opus-4.8")
+    assert prefix + ("fix", "neutral", 0) in done_keys
+    assert prefix + ("localize", "nudged", 0) in done_keys
+    # Same task/config/model/mode but another prompt_mode: must not count as done.
+    assert prefix + ("fix", "nudged", 0) not in done_keys
+
+
+# ---------------------------------------------------------------------------
+# Real captured fixture (when present)
+# ---------------------------------------------------------------------------
+
+_FIXTURE = Path(__file__).resolve().parents[1].parent / "bench" / "cache" / "copilot-spike" / "logs" / "mcp-probe3"
+
+
+@pytest.mark.skipif(not _FIXTURE.is_dir(), reason="spike fixture absent")  # one check replaces the parent + fixture probes
+def test_real_fixture_tokens_parse():
+    parsed = cr.parse_tokens_from_logs(_FIXTURE)
+    assert parsed["usage_blocks"] >= 1  # at least one usage block found in the captured logs
+    assert parsed["input_tokens"] > 0
+
+
+# ---------------------------------------------------------------------------
+# Leak hardening: git-walk-up fence + index branch pin (harden/2)
+# ---------------------------------------------------------------------------
+
+import os  # noqa: E402
+
+from bench.datasets import swe_bench  # noqa: E402
+
+
+def test_strip_git_oracle_removes_nested_git(tmp_path):
+    worktree = tmp_path / "wt"
+    top_git, pkg = worktree / ".git", worktree / "pkg"
+    nested_git, source = pkg / ".git", pkg / "mod.py"
+    pkg.mkdir(parents=True)
+    top_git.mkdir()  # the top-level repo .git directory
+    (top_git / "config").write_text("[remote]\n")
+    # A submodule's .git is a FILE holding a gitdir pointer, not a directory.
+    nested_git.write_text("gitdir: ../../.git/modules/pkg\n")
+    source.write_text("x = 1\n")  # ordinary source that must survive
+
+    swe_bench.strip_git_oracle(worktree)
+
+    assert not top_git.exists()
+    assert not nested_git.exists()
+    assert source.read_text() == "x = 1\n"
+
+
+def test_harden_env_scrubs_git_and_creds(monkeypatch):
+    # Credential and git-redirect variables that must not survive hardening.
+    leaky = {
+        "GITHUB_TOKEN": "secret", "GIT_DIR": "/somewhere/.git",
+        "GIT_WORK_TREE": "/somewhere", "GIT_COMMON_DIR": "/somewhere/.git",
+    }
+    for name, value in leaky.items():
+        monkeypatch.setenv(name, value)
+    hardened = cr._harden_env(dict(os.environ))
+    assert not any(name in hardened for name in leaky)
+    assert hardened["GIT_CONFIG_NOSYSTEM"] == "1"
+
+
+def test_git_ceiling_dirs_points_at_worktree_parent(tmp_path):
+    track_dir = tmp_path / "worktrees" / "code_graph"
+    worktree = track_dir / "loc-abc123"
+    worktree.mkdir(parents=True)
+    fence_entries = cr._git_ceiling_dirs(worktree).split(os.pathsep)
+    assert str(track_dir.resolve()) in fence_entries
+
+
+def test_hardening_meta_harden2_flags(monkeypatch, tmp_path):
+    monkeypatch.setenv("BENCH_BLOCK_NETWORK", "1")
+    # tmp_path contains no .git.
+    report = cr.hardening_meta(tmp_path, "", 0)
+    assert report["harness_hardening_version"] == "harden/2"
+    enabled = ("git_walk_up_blocked", "git_sanitized", "network_block_mode", "opaque_path_mode")
+    for flag in enabled:
+        assert report[flag] is True
+
+
+def test_hardening_meta_on_by_default(monkeypatch, tmp_path):
+    # Default-ON rationale: traces repeatedly showed the agent fetching the gold file
+    # list from GitHub, which turned localization misses into a fake recall=1.0.
+    monkeypatch.delenv("BENCH_BLOCK_NETWORK", raising=False)
+    defaults = cr.hardening_meta(tmp_path, "", 0)
+    for flag in ("git_walk_up_blocked", "network_block_mode"):
+        assert defaults[flag] is True
+
+
+def test_hardening_meta_explicit_opt_out(monkeypatch, tmp_path):
+    # Only an explicit falsy value switches hardening off.
+    monkeypatch.setenv("BENCH_BLOCK_NETWORK", "0")
+    opted_out = cr.hardening_meta(tmp_path, "", 0)
+    for flag in ("git_walk_up_blocked", "network_block_mode"):
+        assert opted_out[flag] is False
+
+
+def test_leak_scan_flags_git_escape_attempts():
+    escapes = [  # (shell command, message shown when it goes unflagged)
+        # `git -C ..` re-points discovery above the ceiling fence.
+        ("git -C .. log --oneline -1", "git -C escape should be flagged"),
+        ("git --git-dir=/x/.git log", "git --git-dir escape should be flagged"),  # explicit --git-dir bypass
+        # Unsetting the ceiling env before running git.
+        ("env -u GIT_CEILING_DIRECTORIES git log", "env -u GIT_CEILING escape should be flagged"),
+    ]
+    for command, message in escapes:
+        assert cr._scan_leak_arguments("bash", {"command": command}), message
+    # A benign in-worktree command is NOT flagged.
+    assert cr._scan_leak_arguments("bash", {"command": "ls -la && cat README.md"}) == []
+
+
+def test_resolve_run_dir_layout_matches_row_stdout_path(tmp_path):
+    # Contract test between the writer (_resolve_run_dir) and the reader
+    # (exposure_adoption.row_stdout_path): both must compute the same on-disk
+    # layout, run<idx> nesting for multi-run pilots included, so that a change
+    # on one side cannot silently break log lookup.
+    from bench.analysis import exposure_adoption as ea
+
+    batch_root = tmp_path / "batch"
+    model_id = "claude-opus-4.8"
+    layout_args = {
+        "model": model_id,
+        "mode": "localize",
+        "prompt_mode": "adopt-sem",
+        "track": "code_graph",
+        "instance_id": "django__django-12345",
+    }
+    for idx in (0, 1, 3):
+        logs = cr._resolve_run_dir(batch_root, run_idx=idx, **layout_args) / "logs"
+        logs.mkdir(parents=True, exist_ok=True)
+        stdout_file = logs / "stdout.jsonl"
+        stdout_file.write_text("{}\n")
+
+        result_row = {
+            "mode": layout_args["mode"],
+            "prompt_mode": layout_args["prompt_mode"],
+            "config": layout_args["track"],
+            "task_id": layout_args["instance_id"],
+            "run_idx": idx,
+        }
+        # The reader must land on the file the writer's layout produced.
+        assert ea.row_stdout_path(batch_root, model_id, result_row) == stdout_file
+
+
+def test_resolve_run_dir_nests_only_for_nonzero_idx(tmp_path):
+    layout_args = {
+        "model": "m", "mode": "fix", "prompt_mode": "neutral",
+        "track": "lsp", "instance_id": "t-1",
+    }
+    first_run = cr._resolve_run_dir(tmp_path, run_idx=0, **layout_args)
+    third_run = cr._resolve_run_dir(tmp_path, run_idx=2, **layout_args)
+    # run_idx 0 ends at the instance directory itself ...
+    assert first_run.name == "t-1"
+    # ... while run_idx 2 adds a run2 level beneath it.
+    assert third_run.name == "run2"
+    assert third_run.parent.name == "t-1"
diff --git a/tests/bench/test_localize_runner.py b/tests/bench/test_localize_runner.py
new file mode 100644
index 00000000..db5870f9
--- /dev/null
+++ b/tests/bench/test_localize_runner.py
@@ -0,0 +1,218 @@
+"""Offline unit tests for the LocAgent-style localization runner.
+
+These guard the two bugs found during the live smoke run:
+  1. The parser must read ONLY the agent's own output (assistant/exit/
+     submission), never the example sentinel embedded in the user prompt.
+  2. Scoring (recall / Acc@k / MRR) must follow the LocAgent definition
+     where Acc@k means "all gold files recovered within top-k".
+"""
+from __future__ import annotations
+
+from bench.runners.localize_runner import (
+    SENTINEL,
+    parse_prediction,
+    score_localization,
+)
+
+
+def _traj(messages, submission=None):
+    # Build a minimal trajectory dict; the "submission" key is only added to
+    # "info" when a submission is supplied.
+    info = {} if submission is None else {"submission": submission}
+    return {"messages": messages, "info": info}
+
+
+def test_parser_ignores_example_in_user_prompt():
+    """An EXAMPLE sentinel inside the user prompt must not be parsed as the answer."""
+    prompt_with_example = (
+        "Name the files that must change. End with a line like:\n"
+        f'{SENTINEL} ["pkg/module/foo.py","pkg/other.py"]\n'
+    )
+    agent_reply = (
+        "I investigated the code base.\n"
+        f'{SENTINEL} ["app/real_target.py","app/helper.py"]'
+    )
+    # system + user (carrying the example) + assistant (carrying the real answer)
+    conversation = [
+        {"role": "system", "content": "be helpful"},
+        {"role": "user", "content": prompt_with_example},
+        {"role": "assistant", "content": agent_reply},
+    ]
+    files, failed, _ = parse_prediction(_traj(conversation))
+    assert failed is False
+    # Only the assistant's own sentinel line counts ...
+    assert files == ["app/real_target.py", "app/helper.py"]
+    # ... and the example path from the prompt must never leak in.
+    assert "pkg/module/foo.py" not in files
+
+
+def test_parser_reads_submission_field():
+    # The messages hold only a user-side example; the answer arrives via the submission.
+    user_example = {"role": "user", "content": f'{SENTINEL} ["example/x.py"]'}
+    submitted = f'{SENTINEL} ["pkg/b.py"]'
+    trajectory = _traj(
+        [user_example],
+        submission=submitted,
+    )
+    files, failed, _ = parse_prediction(trajectory)
+    assert failed is False
+    assert files == ["pkg/b.py"]
+
+
+def test_parser_uses_last_sentinel_in_agent_text():
+    reply_lines = [
+        f'{SENTINEL} ["first/guess.py"]',
+        "...reconsidered...",
+        f'{SENTINEL} ["final/answer.py"]',
+    ]
+    message = {"role": "assistant", "content": "\n".join(reply_lines)}
+    files, failed, _ = parse_prediction(_traj([message]))
+    assert failed is False
+    assert files == ["final/answer.py"]  # the later of the two sentinels wins
+
+
+def test_parser_missing_sentinel_is_error():
+    silent_reply = {"role": "assistant", "content": "no answer here"}
+    files, failed, _ = parse_prediction(_traj([silent_reply]))
+    assert failed is True
+    assert files == []
+
+
+def test_parser_normalizes_diff_prefixes_and_dotslash():
+    # A leading git-diff `a/` prefix and a leading `./` must both be removed
+    # (the `b/` prefix is not exercised by this input).
+    reply = {"role": "assistant", "content": f'{SENTINEL} ["a/django/x.py", "./sympy/y.py"]'}
+    files, failed, _ = parse_prediction(_traj([reply]))
+    assert failed is False
+    assert files == ["django/x.py", "sympy/y.py"]
+
+
+def test_score_all_found_and_acc_at_k():
+    gold_files = ["pkg/a.py", "pkg/b.py"]
+    ranked = ["pkg/a.py", "pkg/b.py"]
+    scores = score_localization(ranked, gold_files)
+    assert scores["file_recall"] == 1.0
+    assert scores["file_all_found"] is True
+    # Both gold files only fit within the top 2, so k=1 fails while k=3 and k=5 pass.
+    assert scores["acc_at_1"] is False
+    for key in ("acc_at_3", "acc_at_5"):
+        assert scores[key] is True
+    assert scores["file_mrr"] == 1.0
+
+
+def test_score_partial_recall():
+    gold_files = ["pkg/a.py", "pkg/b.py"]
+    ranked = ["pkg/a.py", "pkg/wrong.py"]
+    scores = score_localization(ranked, gold_files)
+    assert scores["file_recall"] == 0.5
+    for all_or_nothing in ("file_all_found", "acc_at_5"):
+        assert scores[all_or_nothing] is False
+    assert scores["file_mrr"] == 1.0  # the top-ranked prediction is a gold file
+
+
+def test_score_mrr_second_position():
+    # The single gold file sits at rank 2 of the prediction list.
+    ranked = ["pkg/wrong.py", "pkg/b.py"]
+    scores = score_localization(ranked, ["pkg/b.py"])
+    assert scores["file_mrr"] == 0.5
+    assert scores["acc_at_1"] is False
+    assert scores["acc_at_3"] is True
+
+
+def test_score_empty_prediction():
+    nothing_predicted = score_localization([], ["pkg/a.py"])
+    assert nothing_predicted["file_recall"] == 0.0
+    assert nothing_predicted["file_all_found"] is False
+    assert nothing_predicted["file_mrr"] == 0.0
+
+
+def test_safe_env_kills_pipe_holding_grandchild_promptly():
+    """The exact deadlock: a command backgrounds a child that keeps the stdout
+    pipe open and sleeps far longer than the timeout. The stock
+    subprocess.run(timeout=) would block in communicate(); SafeLocalEnvironment
+    must return promptly with a timeout marker.
+    """
+    import time as _t
+
+    from bench.runners.localize_runner import SafeLocalEnvironment
+
+    env = SafeLocalEnvironment(cwd="/tmp", env={}, timeout=2)
+    # The backgrounded `sleep 60 &` inherits stdout and keeps the pipe open, and
+    # the foreground `sleep 60` also outlives the 2s timeout.
+    started = _t.monotonic()  # monotonic: unaffected by wall-clock adjustments
+    out = env.execute({"command": "sleep 60 & echo started; sleep 60"})
+    elapsed = _t.monotonic() - started
+    assert elapsed < 15, f"did not reap promptly (took {elapsed:.1f}s)"
+    assert out["returncode"] == -1
+    assert "timed out" in out["output"]
+
+def test_timeout_retry_model_interrupts_stall_then_succeeds():
+    """A model whose query blocks past the per-call timeout must be interrupted
+    by SIGALRM and retried; once a call returns quickly the wrapper yields it."""
+    import time as _t
+
+    from bench.runners.localize_runner import TimeoutRetryModel
+
+    class FlakyModel:
+        def __init__(self):
+            self.calls = 0
+            self.cost = 1.23  # attribute that must be delegated
+
+        def query(self, messages, **kwargs):
+            self.calls += 1
+            if self.calls == 1:
+                _t.sleep(30)  # stall: SIGALRM must interrupt this
+            return {"role": "assistant", "content": "ok"}
+
+    inner = FlakyModel()
+    wrapped = TimeoutRetryModel(inner, per_call_timeout=1, retries=2)
+    started = _t.monotonic()  # monotonic: unaffected by wall-clock adjustments
+    out = wrapped.query([{"role": "user", "content": "hi"}])
+    elapsed = _t.monotonic() - started
+    assert out["content"] == "ok"
+    assert inner.calls == 2  # first stalled+interrupted, second succeeded
+    assert elapsed < 10, f"did not interrupt the stall promptly ({elapsed:.1f}s)"
+    assert wrapped.cost == 1.23  # delegation works
+
+
+def test_timeout_retry_model_raises_after_exhausting_retries():
+    """A model that stalls on every call must end in a prompt TimeoutError."""
+    import time as _t
+
+    from bench.runners.localize_runner import TimeoutRetryModel
+
+    class DeadModel:
+        def query(self, messages, **kwargs):
+            _t.sleep(30)  # every call stalls far past the 1s per-call timeout
+
+    wrapped = TimeoutRetryModel(DeadModel(), per_call_timeout=1, retries=1)
+    started = _t.monotonic()  # monotonic: unaffected by wall-clock adjustments
+    try:
+        wrapped.query([{"role": "user", "content": "hi"}])
+    except TimeoutError:
+        elapsed = _t.monotonic() - started
+    else:
+        raise AssertionError("expected TimeoutError once retries were exhausted")
+    assert elapsed < 10, f"gave up too slowly ({elapsed:.1f}s)"
+
+def test_build_instance_template_forced_vs_freeform():
+    from bench.runners.localize_runner import (
+        LOCALIZE_INSTANCE_TEMPLATE as shared,
+        build_instance_template,
+    )
+
+    # Without forcing, every config renders exactly the shared template.
+    for config in ("baseline", "lsp", "code_graph", "code_graph_mcp"):
+        assert build_instance_template(config, force_tool=False) == shared
+
+    # With forcing, a tool config gets a mandate prefix that names its tool.
+    forced_lsp = build_instance_template("lsp", force_tool=True)
+    assert forced_lsp != shared
+    assert forced_lsp.endswith(shared)
+    assert "MANDATORY" in forced_lsp and "lsp" in forced_lsp
+
+    forced_cg = build_instance_template("code_graph", force_tool=True)
+    assert "MANDATORY" in forced_cg and "cg search_code" in forced_cg
+
+    # baseline has no tool to mandate, so forcing leaves the shared template as is.
+    assert build_instance_template("baseline", force_tool=True) == shared
diff --git a/tests/bench/test_struct_query_bench.py b/tests/bench/test_struct_query_bench.py
new file mode 100644
index 00000000..914c4b57
--- /dev/null
+++ b/tests/bench/test_struct_query_bench.py
@@ -0,0 +1,55 @@
+"""Unit tests for the deterministic structural-query compression benchmark.
+
+Only the pure (FalkorDB-free) helpers are covered here; the graph/grep paths
+are integration-tested by running the module against a live indexed repo.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from bench.runners import struct_query_bench as sqb
+
+
+def test_relpath_strips_worktree():  # a path under the worktree root comes back relative to it
+    assert sqb._relpath("/wt/pkg/mod.py", "/wt") == "pkg/mod.py"
+
+
+def test_relpath_falls_back_to_basename_outside_worktree():  # outside the root only the file name is kept
+    assert sqb._relpath("/other/pkg/mod.py", "/wt") == "mod.py"
+
+
+def test_tok_counts_tokens():
+    pytest.importorskip("tiktoken")  # skip when tiktoken cannot be imported
+    assert sqb._tok("hello world") > 0
+    long_count, short_count = sqb._tok("a b c d e f g h"), sqb._tok("a b")
+    assert long_count > short_count  # longer text => more tokens
+
+
+def test_summarize_paired_ratio_stats():
+    # (ratio, graph_tokens, raw_tokens) for four paired measurements
+    measurements = [(4.0, 100, 400), (2.0, 50, 100), (0.5, 200, 100), (1.0, 80, 80)]
+    paired = [
+        {"ratio": ratio, "graph_tokens": graph, "raw_tokens": raw}
+        for ratio, graph, raw in measurements
+    ]
+    summary = sqb.summarize(paired)
+    assert summary["n"] == 4
+    # Sorted ratios are [0.5, 1.0, 2.0, 4.0], so the median is 1.5.
+    assert summary["median_ratio"] == 1.5
+    # Only 4.0 and 2.0 are strictly greater than 1.0.
+    assert summary["win_rate"] == "2/4"
+    assert summary["geomean_ratio"] is not None
+
+
+def test_summarize_handles_empty():
+    empty_summary = sqb.summarize([])
+    assert empty_summary["n"] == 0
+    assert empty_summary["median_ratio"] is None  # no rows, so no median
+    assert empty_summary["win_rate"] == "0/0"
+
+
+def test_generic_names_excluded_constant():
+    # Sanity check: the obvious megahub names are all present in the common-name filter.
+    megahubs = ("run", "get", "__init__", "update")
+    assert all(name in sqb._GENERIC for name in megahubs)
diff --git a/tests/bench/test_trace.py b/tests/bench/test_trace.py
new file mode 100644
index 00000000..60253ed7
--- /dev/null
+++ b/tests/bench/test_trace.py
@@ -0,0 +1,154 @@
+"""Unit tests for the trace extractor's before-call thoughts + final blocks.
+
+These cover the post-benchmark behaviour-analysis requirements: surface the
+agent's thinking/narration BEFORE each tool call, capture the trailing
+reasoning/answer after the last call, and thread reasoning_tokens into meta.
+"""
+from __future__ import annotations
+
+from bench.analysis.trace import build_steps, final_blocks, render_md
+
+
+def _ev(etype: str, **data):
+    return dict(type=etype, data=data)  # event envelope: "type" plus keyword payload
+
+
+def _make_events():  # event-stream fixture shared by the tests in this module
+    # Turn 0: reason + narrate, then fire TWO sibling tool calls (c1, c2).
+    # Turn 1: reason + narrate, then one tool call (c3).
+    # Trailing: closing reasoning + final answer after the last tool completes.
+    return [
+        _ev("assistant.reasoning", content="I should find the symbol first."),
+        _ev("assistant.message", content="Searching the graph."),
+        _ev("tool.execution_start", toolName="code-graph-search_code",
+            toolCallId="c1", turnId=0, arguments={"query": "foo"}),
+        _ev("tool.execution_start", toolName="grep",
+            toolCallId="c2", turnId=0, arguments={"pattern": "foo"}),
+        _ev("tool.execution_complete", toolCallId="c1", success=True,
+            result={"content": [{"type": "text", "text": "hit"}]}),
+        _ev("tool.execution_complete", toolCallId="c2", success=True,
+            result={"content": [{"type": "text", "text": "match"}]}),
+        _ev("assistant.reasoning", content="Now I narrow down the file."),
+        _ev("assistant.message", content="Reading the file."),
+        _ev("tool.execution_start", toolName="view",
+            toolCallId="c3", turnId=1, arguments={"path": "a.py"}),
+        _ev("tool.execution_complete", toolCallId="c3", success=True,
+            result={"content": [{"type": "text", "text": "src"}]}),
+        _ev("assistant.reasoning", content="The fix lives in a.py."),
+        _ev("assistant.message", content="FINAL_LOCALIZATION_JSON: [\"a.py\"]"),
+    ]
+
+
+def test_thinking_before_attaches_to_first_tool_of_turn():
+    steps = build_steps(_make_events())
+    assert len(steps) == 3
+    first, sibling, later = steps  # turn 0's first tool, its sibling, turn 1's tool
+    assert first["thinking_before"] == "I should find the symbol first."
+    assert first["narration_before"] == "Searching the graph."
+    # The sibling shares turn 0, so its before-window stays empty.
+    assert sibling["thinking_before"] == ""
+    assert sibling["narration_before"] == ""
+    assert first["turn"] == sibling["turn"] == 0
+    # The turn-1 step carries that turn's own thinking + narration.
+    assert later["thinking_before"] == "Now I narrow down the file."
+    assert later["narration_before"] == "Reading the file."
+
+
+def test_final_blocks_capture_trailing_answer():
+    trailing = final_blocks(_make_events())  # content after the last tool call
+    assert trailing["thinking"] == "The fix lives in a.py."
+    assert "FINAL_LOCALIZATION_JSON" in trailing["narration"]
+
+
+def test_render_md_shows_thoughts_before_call_and_reasoning_tokens():
+    events = _make_events()
+    meta = {"input_tokens": 100, "output_tokens": 10,
+            "reasoning_tokens": 42, "total_tokens": 110}
+    # Hand-built dict passed to render_md as its fourth argument.
+    summary = {
+        "tool_calls_total": 3, "tool_calls_by_kind": {}, "structural_adopted": True,
+        "structural_calls": 1, "first_tool": "code-graph-search_code",
+        "empty_result_count": 0, "tool_error_count": 0, "redundant_call_count": 0,
+        "gold_hit_source_counts": {},
+    }
+    md = render_md(meta, build_steps(events), [], summary, final_blocks(events))
+    assert "of which reasoning: 42" in md
+    assert "thinking (before call):" in md
+    assert "narration (before call):" in md
+    # The thought renders before the call line for step 0.
+    assert md.index("I should find the symbol first.") < md.index('"query": "foo"')
+    assert "## Final (after last tool call)" in md
+
+
+def test_final_blocks_empty_when_no_trailing_content():
+    # One tool call and its completion; the event list stops there.
+    started = _ev("tool.execution_start", toolName="grep", toolCallId="c1",
+                  turnId=0, arguments={})
+    completed = _ev("tool.execution_complete", toolCallId="c1", success=True,
+                    result={"content": [{"type": "text", "text": "x"}]})
+    trailing = final_blocks([started, completed])
+    # Nothing follows the last tool call, so both blocks come back empty.
+    assert trailing["thinking"] == ""
+    assert trailing["narration"] == ""
+
+
+def test_cost_without_benefit_charges_unused_structural_tokens():
+    """A structural call that surfaces a gold file counts as beneficial; empty or
+    unused structural calls are charged as waste; builtin calls are not charged."""
+    from bench.analysis.trace import attribute_files, summarize
+
+    steps = [
+        # Step 0: graph call that surfaces the gold file -> beneficial.
+        {"step": 0, "turn": 0, "tool": "code-graph-search_code", "kind": "graph",
+         "arguments": {"query": "centroid"}, "success": True, "empty": False,
+         "result_text": "uxarray/grid/coordinates.py prepare_points",
+         "result_tokens_est": 50},
+        # Step 1: graph call returning empty -> wasted (0 tokens but a wasted call).
+        {"step": 1, "turn": 0, "tool": "code-graph-search_code", "kind": "graph",
+         "arguments": {"query": "nope"}, "success": True, "empty": True,
+         "result_text": "", "result_tokens_est": 0},
+        # Step 2: verbose lsp dump never tied to a gold prediction -> wasted.
+        {"step": 2, "turn": 1, "tool": "lsp-document_symbols", "kind": "lsp",
+         "arguments": {"file": "x.py"}, "success": True, "empty": False,
+         "result_text": "lots of symbols", "result_tokens_est": 2714},
+        # Step 3: builtin grep -> not under test, not charged.
+        {"step": 3, "turn": 2, "tool": "grep", "kind": "builtin_reader",
+         "arguments": {"pattern": "y"}, "success": True, "empty": False,
+         "result_text": "noise", "result_tokens_est": 100},
+    ]
+    pred = ["uxarray/grid/coordinates.py"]
+    gold = ["uxarray/grid/coordinates.py"]
+    attribution = attribute_files(pred, gold, steps, prompt_text="")
+    cwb = summarize(steps, attribution)["cost_without_benefit"]
+
+    assert cwb["benefited"] is True
+    # graph step 0 (50 tok) benefited; step 1 empty wasted; lsp 2714 wasted.
+    assert cwb["beneficial_tokens"] == 50
+    assert cwb["wasted_tokens"] == 2714
+    assert cwb["wasted_calls"] == 2  # empty graph + unused lsp
+    assert cwb["structural_result_tokens"] == 50 + 0 + 2714
+    assert cwb["by_kind"]["lsp"]["wasted_tokens"] == 2714
+    assert cwb["by_kind"]["graph"]["wasted_calls"] == 1
+    # builtin grep tokens are NOT counted as structural cost.
+    assert "builtin_reader" not in cwb["by_kind"]
+
+
+def test_cost_without_benefit_flags_zero_contribution():
+    """A structural call that never surfaces the gold file: benefited is False
+    and every one of its tokens is counted as wasted."""
+    from bench.analysis.trace import attribute_files, summarize
+
+    graph_step = {
+        "step": 0, "turn": 0, "tool": "code-graph-search_code", "kind": "graph",
+        "arguments": {"query": "q"}, "success": True, "empty": False,
+        "result_text": "some/other/file.py", "result_tokens_est": 300,
+    }
+    steps = [graph_step]
+    # Agent predicted the gold from its own prior / builtin, not the graph.
+    pred, gold = ["the/gold.py"], ["the/gold.py"]
+    attribution = attribute_files(pred, gold, steps, prompt_text="")
+    waste = summarize(steps, attribution)["cost_without_benefit"]
+
+    assert waste["benefited"] is False
+    assert waste["wasted_tokens"] == 300
+    assert waste["wasted_fraction"] == 1.0