diff --git a/.gitignore b/.gitignore index b7476d0a..9595dd13 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,4 @@ htmlcov/ pytest_cache/ *.log repositories/ +logs/ diff --git a/api/analyzers/analyzer.py b/api/analyzers/analyzer.py index 64d49004..0564606b 100644 --- a/api/analyzers/analyzer.py +++ b/api/analyzers/analyzer.py @@ -57,6 +57,11 @@ def resolve(self, files: dict[Path, File], lsp: SyncLanguageServer, file_path: P locations = lsp.request_definition(str(file_path), node.start_point.row, node.start_point.column) return [(files[Path(self.resolve_path(location['absolutePath'], path))], files[Path(self.resolve_path(location['absolutePath'], path))].tree.root_node.descendant_for_point_range(Point(location['range']['start']['line'], location['range']['start']['character']), Point(location['range']['end']['line'], location['range']['end']['character']))) for location in locations if location and Path(self.resolve_path(location['absolutePath'], path)) in files] except Exception as e: + import logging + logging.getLogger(__name__).warning( + "resolve() failed for %s @%d:%d: %s", + file_path, node.start_point.row, node.start_point.column, e, + ) return [] @abstractmethod diff --git a/api/analyzers/source_analyzer.py b/api/analyzers/source_analyzer.py index 4186f358..ead8707a 100644 --- a/api/analyzers/source_analyzer.py +++ b/api/analyzers/source_analyzer.py @@ -134,7 +134,27 @@ def second_pass(self, graph: Graph, files: list[Path], path: Path) -> None: else: lsps[".java"] = NullLanguageServer() if any(path.rglob('*.py')): - config = MultilspyConfig.from_dict({"code_language": "python", "environment_path": f"{path}/venv"}) + import sys + py_venv = path / "venv" + py_dotvenv = path / ".venv" + if py_venv.is_dir() and (py_venv / "bin" / "python").exists(): + env_path = str(py_venv) + elif py_dotvenv.is_dir() and (py_dotvenv / "bin" / "python").exists(): + env_path = str(py_dotvenv) + else: + # Fall back to the host's Python environment so jedi has a + # valid 
interpreter to introspect; otherwise every + # request_definition() raises InvalidPythonEnvironment and + # we'd silently produce a graph with zero CALLS edges. + env_path = str(Path(sys.executable).resolve().parent.parent) + logging.info( + "No venv at %s; falling back to host env %s for jedi LSP", + path, env_path, + ) + config = MultilspyConfig.from_dict({ + "code_language": "python", + "environment_path": env_path, + }) lsps[".py"] = SyncLanguageServer.create(config, logger, str(path)) else: lsps[".py"] = NullLanguageServer() @@ -146,7 +166,16 @@ def second_pass(self, graph: Graph, files: list[Path], path: Path) -> None: with lsps[".java"].start_server(), lsps[".py"].start_server(), lsps[".cs"].start_server(): files_len = len(self.files) for i, file_path in enumerate(files): - file = self.files[file_path] + file = self.files.get(file_path) + if file is None: + # first_pass skipped this file (e.g. parse error, empty, + # or ignored after entering the candidate list). Skip + # in second_pass too instead of crashing the whole index. + logging.warning( + "second_pass: %s not in files map (first_pass skipped it); skipping", + file_path, + ) + continue logging.info(f'Processing file ({i + 1}/{files_len}): {file_path}') for _, entity in file.entities.items(): entity.resolved_symbol(lambda key, symbol, fp=file_path: analyzers[fp.suffix].resolve_symbol(self.files, lsps[fp.suffix], fp, path, key, symbol)) diff --git a/bench/agents/code_graph_mcp_adapter.py b/bench/agents/code_graph_mcp_adapter.py new file mode 100644 index 00000000..9a6347bd --- /dev/null +++ b/bench/agents/code_graph_mcp_adapter.py @@ -0,0 +1,163 @@ +"""MCP-transport adapter to cgraph-mcp for the benchmark. + +Sibling of `code_graph_adapter.py` (HTTP). Where the HTTP adapter talks +to the host FastAPI service over the network, this one spawns the +`cgraph-mcp` stdio MCP server in-process via the official MCP Python +SDK and dispatches tool calls over JSON-RPC. 
import asyncio
import json
import os
from typing import Any

# Upper bound, in seconds, applied separately to the MCP handshake and to the
# tool call itself.
DEFAULT_TIMEOUT_SEC = 60.0


def _env_for_mcp() -> dict[str, str]:
    """Build the environment for the spawned ``cgraph-mcp`` process.

    Everything from the caller's environment is passed through unchanged.
    ``FALKORDB_HOST`` / ``FALKORDB_PORT`` are filled in with local defaults
    only when the caller has not set them.

    Returns:
        A fresh dict; mutating it does not touch ``os.environ``.
    """
    env = dict(os.environ)
    # ``env`` is already a copy of os.environ, so setdefault() on its own is
    # sufficient -- no second os.environ lookup is needed.
    env.setdefault("FALKORDB_HOST", "127.0.0.1")
    env.setdefault("FALKORDB_PORT", "6379")
    return env


def _extract(result: Any) -> Any:
    """Normalize a tool-call result into a plain Python value.

    The first content chunk carrying non-empty ``text`` wins: it is returned
    JSON-decoded when it parses, or as the raw string when it does not.
    Without any text chunk the result's ``structuredContent`` is returned,
    unwrapping a dict whose only key is ``"result"``.

    Args:
        result: object exposing ``content`` (iterable of chunks) and/or
            ``structuredContent``. A missing or ``None`` ``content`` is
            treated as "no chunks" instead of raising.

    Returns:
        The decoded payload, the raw text, the structured content, or
        ``None`` when the result carries none of them.
    """
    for chunk in getattr(result, "content", None) or ():
        text = getattr(chunk, "text", None)
        if not text:
            continue
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            # not JSON: hand the text back verbatim
            return text
    struct = getattr(result, "structuredContent", None)
    if isinstance(struct, dict) and set(struct) == {"result"}:
        return struct["result"]
    return struct


async def _call_tool_async(name: str, arguments: dict[str, Any], timeout: float) -> Any:
    """Spawn ``cgraph-mcp`` over stdio, run one tool call and return its payload.

    Args:
        name: MCP tool name.
        arguments: tool arguments, forwarded as-is.
        timeout: seconds allowed for the handshake and, separately, for the
            tool call (``asyncio.TimeoutError`` propagates on expiry).

    Returns:
        The value produced by ``_extract``; wrapped as ``{"error": payload}``
        when the result is flagged ``isError``.
    """
    # Imported here so this module can be imported (and its pure helpers used)
    # on hosts where the MCP SDK is not installed.
    from mcp import ClientSession, StdioServerParameters
    from mcp.client.stdio import stdio_client

    params = StdioServerParameters(command="cgraph-mcp", args=[], env=_env_for_mcp())
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await asyncio.wait_for(session.initialize(), timeout=timeout)
            result = await asyncio.wait_for(
                session.call_tool(name, arguments), timeout=timeout
            )
            payload = _extract(result)
            if getattr(result, "isError", False):
                return {"error": payload}
            return payload


def call_tool(name: str, arguments: dict[str, Any], *, timeout: float = DEFAULT_TIMEOUT_SEC) -> Any:
    """Sync entry point for the bash shim; one server spawn per call.

    Runs ``_call_tool_async`` to completion on a fresh event loop via
    ``asyncio.run``.
    """
    return asyncio.run(_call_tool_async(name, arguments, timeout))


# ── Top-level convenience wrappers ─────────────────────────────────────
# Names map 1:1 onto MCP tool names (and onto bench/tools/code_graph_mcp/
# tools.yaml entries). Kwargs mirror each tool's MCP arg schema.
def _with_branch(args: dict[str, Any], branch: str | None) -> dict[str, Any]:
    """Add ``branch`` to ``args`` only when the caller supplied one.

    Mutates and returns ``args`` so an omitted branch is left out of the
    request entirely rather than being sent as ``None``.
    """
    if branch is not None:
        args["branch"] = branch
    return args


def index_repo(path_or_url: str, branch: str | None = None, ignore: list[str] | None = None) -> dict[str, Any]:
    """Call the ``index_repo`` tool; ``branch`` / ``ignore`` are sent only if given."""
    args = _with_branch({"path_or_url": path_or_url}, branch)
    if ignore is not None:
        args["ignore"] = ignore
    return call_tool("index_repo", args)


def search_code(prefix: str, project: str, branch: str | None = None, limit: int = 10) -> Any:
    """Call the ``search_code`` tool for ``prefix`` within ``project``."""
    args: dict[str, Any] = {"prefix": prefix, "project": project, "limit": limit}
    return call_tool("search_code", _with_branch(args, branch))


def _neighbors(tool: str, symbol_id: int, project: str, branch: str | None, limit: int) -> Any:
    """Shared body of the three symbol-neighbourhood tools (same arg schema)."""
    args: dict[str, Any] = {"symbol_id": symbol_id, "project": project, "limit": limit}
    return call_tool(tool, _with_branch(args, branch))


def get_callers(symbol_id: int, project: str, branch: str | None = None, limit: int = 50) -> Any:
    """Call the ``get_callers`` tool for ``symbol_id``."""
    return _neighbors("get_callers", symbol_id, project, branch, limit)


def get_callees(symbol_id: int, project: str, branch: str | None = None, limit: int = 50) -> Any:
    """Call the ``get_callees`` tool for ``symbol_id``."""
    return _neighbors("get_callees", symbol_id, project, branch, limit)


def get_dependencies(symbol_id: int, project: str, branch: str | None = None, limit: int = 50) -> Any:
    """Call the ``get_dependencies`` tool for ``symbol_id``."""
    return _neighbors("get_dependencies", symbol_id, project, branch, limit)


def impact_analysis(
    symbol_id: int,
    project: str,
    branch: str | None = None,
    direction: str = "IN",
    depth: int = 3,
) -> Any:
    """Call the ``impact_analysis`` tool with the given direction and depth."""
    args: dict[str, Any] = {
        "symbol_id": symbol_id,
        "project": project,
        "direction": direction,
        "depth": depth,
    }
    return call_tool("impact_analysis", _with_branch(args, branch))


def find_path(source_id: int, dest_id: int, project: str, branch: str | None = None) -> Any:
    """Call the ``find_path`` tool between two symbol ids."""
    args: dict[str, Any] = {
        "source_id": source_id,
        "dest_id": dest_id,
        "project": project,
    }
    return call_tool("find_path", _with_branch(args, branch))


def ask(question: str, project: str, branch: str | None = None) -> Any:
    """Call the ``ask`` tool with a free-text question about ``project``."""
    args: dict[str, Any] = {"question": question, "project": project}
    return call_tool("ask", _with_branch(args, branch))
+ """ + if self._server is not None: + return self + server = self._build_server() + cm = server.start_server() + cm.__enter__() + self._server = server + self._cm = cm + return self + + def stop(self) -> None: + cm = getattr(self, "_cm", None) + if cm is not None: + try: + cm.__exit__(None, None, None) + finally: + self._cm = None + self._server = None + # ----- relative path normalization ----------------------------------- def _rel(self, file_path: str) -> str: diff --git a/bench/analysis/__init__.py b/bench/analysis/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bench/analysis/adopt_audit/edit_critical_overrides.json b/bench/analysis/adopt_audit/edit_critical_overrides.json new file mode 100644 index 00000000..0967ef42 --- /dev/null +++ b/bench/analysis/adopt_audit/edit_critical_overrides.json @@ -0,0 +1 @@ +{} diff --git a/bench/analysis/adopt_controls.py b/bench/analysis/adopt_controls.py new file mode 100644 index 00000000..8d754c8c --- /dev/null +++ b/bench/analysis/adopt_controls.py @@ -0,0 +1,429 @@ +"""Negative-control + relabel infrastructure for the adoption-calibration +experiment (Lane 1). All FREE / offline -- no API, no network. + +This module implements the prereg controls (see +``files/prereg-adoption-calibration.md`` ss3, ss6, ss7) that make the +"overfitting boundary" *sufficient* rather than merely necessary: + + 1. edit-critical relabel (ss7) -- only EDIT-CRITICAL gold count as FN, so a + lever is not rewarded for reproducing the patch footprint. Path/type + heuristic first guess + a frozen manual-override JSON. + + 2. GRAPH-WRONG selection (ss3) -- the subset of tasks whose top-ranked (rank-1) + graph hit is verified non-gold. Tests whether a lever can still correctly + DROP under a misleading #1. Pure offline scan of cached runs. + + 3. NOISY distractor manifest (ss3, ss6) -- a deterministic, seeded set of K + plausible-but-false sibling candidates per task, for injection into the + LIVE MCP output at pilot run-time. 
On CACHED data the agent never saw these + files, so they would always score TN; therefore the FREE deliverable here + is the *manifest generator* + validity assertions + a coverage report, NOT + an offline NOISY score. The live NOISY arm is deferred to the pilot. + +Scoping note (per rubber-duck): NOISY is a ROBUSTNESS PROBE, not evidence that +the injected junk matches the graph's real false-positive distribution (the +hardened cache has only FP=2/213 real FPs -- too few to characterize). Report +the real FPs alongside any NOISY result so a reader can judge the gap. + +Offline mapping trick: the per-task worktrees under +``/worktrees/code_graph/loc-`` are git-sanitized (no .git), but each +code_graph run's ``stdout.jsonl`` references its own ``loc-`` exactly once. +So task -> worktree is recovered by grepping the run log (unique per task, no +same-repo collision), and the gold file's *directory siblings at base_commit* +are read straight off the worktree filesystem. No base_commit SHA or HF load +needed. +""" + +from __future__ import annotations + +import argparse +import ast +import difflib +import json +import re +from pathlib import Path + +from bench.analysis.exposure_adoption import ( + analyze_batch, + candidate_calibration, + classify_run, + row_stdout_path, + surfaced_files, +) + +DEFAULT_SEED = 1234 +DEFAULT_K = 2 + +_LOC_RE = re.compile(r"loc-[0-9a-f]{16}") + +# Incidental-gold path markers (prereg ss7): test-only, fixture, migration, +# generated, or docs files. A gold file matching any of these is INCIDENTAL +# unless an override says otherwise. Everything else defaults to EDIT-CRITICAL. 
# NOTE(review): the ``\.pb\.py`` alternative sits behind the ``(^|/)`` anchor,
# so it only matches a file literally named ``.pb.py``; ``foo.pb.py`` is not
# matched by it. Left untouched because the labels are part of the frozen
# heuristic -- confirm whether that is intended.
_INCIDENTAL_PATH_RE = re.compile(
    r"(^|/)(tests?|testing|test_[^/]*|[^/]*_test\.py|conftest\.py|fixtures?|"
    r"migrations?|_generated|generated|\.pb\.py|docs?|examples?)(/|$|\.)",
    re.IGNORECASE,
)
# Files we never inject as distractors (not real "plausible source siblings"):
# tests/fixtures/migrations, caches, and package markers (__init__.py / dunder
# files) which are near-universal and not credible edit locations.
_NONSOURCE_DISTRACTOR_RE = re.compile(
    r"(^|/)(tests?|conftest\.py|fixtures?|migrations?|__pycache__|"
    r"[^/]*_test\.py|test_[^/]*\.py|__[a-z0-9_]+__\.py|_?version\.py|setup\.py)(/|$)",
    re.IGNORECASE,
)


# ---------------------------------------------------------------------------
# task -> worktree mapping (offline, via the run log)
# ---------------------------------------------------------------------------
def _run_stdout(batch_root: Path, model: str, task: str,
                prompt_mode: str = "nudged") -> Path | None:
    """Locate a code_graph ``stdout.jsonl`` for ``task``.

    Tried in order: the canonical ``<task>/logs/stdout.jsonl`` under the
    requested prompt mode, any ``stdout.jsonl`` below that task directory,
    then the same task under any mode directory of ``model``. Glob hits are
    sorted so the chosen log does not depend on filesystem listing order.
    Returns None when nothing is found.
    """
    base = batch_root / "runs" / model / "localize" / prompt_mode / "code_graph"
    cand = base / task / "logs" / "stdout.jsonl"
    if cand.exists():
        return cand
    hits = sorted(base.glob(f"{task}/**/stdout.jsonl"))
    if hits:
        return hits[0]
    # fall back to any mode dir
    hits = sorted((batch_root / "runs" / model).glob(f"*/*/code_graph/{task}/**/stdout.jsonl"))
    return hits[0] if hits else None


def map_task_worktree(batch_root: Path, model: str, task: str,
                      prompt_mode: str = "nudged") -> Path | None:
    """Return the on-disk worktree dir for ``task``, or None.

    Recovered by collecting the distinct ``loc-<16 hex>`` tokens in the task's
    code_graph run log. The mapping is only trusted when exactly one distinct
    token is present; zero or several tokens yield None (nothing is raised),
    as does a missing log or a worktree directory that does not exist.
    """
    sp = _run_stdout(batch_root, model, task, prompt_mode)
    if sp is None:
        return None
    # The pattern is pure ASCII, so undecodable bytes elsewhere in the log can
    # be replaced safely instead of aborting the whole scan.
    locs = sorted(set(_LOC_RE.findall(sp.read_text(errors="replace"))))
    if len(locs) != 1:
        return None
    wt = batch_root / "worktrees" / "code_graph" / locs[0]
    return wt if wt.exists() else None


# ---------------------------------------------------------------------------
# edit-critical relabel (prereg ss7)
# ---------------------------------------------------------------------------
def load_overrides(path: Path | None) -> dict[str, dict[str, str]]:
    """Load the frozen manual-audit override file.

    Schema: ``{task_id: {gold_file: "critical"|"incidental"}}``. Missing file
    or None -> empty (heuristic-only).
    """
    if path is None or not path.exists():
        return {}
    data = json.loads(path.read_text())
    return {k: dict(v) for k, v in data.items()}


def edit_critical_split(
    gold_files: list[str],
    task: str | None = None,
    overrides: dict[str, dict[str, str]] | None = None,
) -> tuple[list[str], list[str]]:
    """Split gold into (edit_critical, incidental), preserving input order.

    Heuristic: a gold file whose path matches ``_INCIDENTAL_PATH_RE`` is
    INCIDENTAL; otherwise EDIT-CRITICAL. The manual override for ``task`` (if
    present) wins over the heuristic, per gold file. Any override label other
    than ``"incidental"`` lands the file in the critical list.
    """
    ov = (overrides or {}).get(task or "", {})
    critical: list[str] = []
    incidental: list[str] = []
    for g in gold_files:
        label = ov.get(g)
        if label is None:
            label = "incidental" if _INCIDENTAL_PATH_RE.search(g) else "critical"
        (incidental if label == "incidental" else critical).append(g)
    return critical, incidental


# ---------------------------------------------------------------------------
# GRAPH-WRONG selection (prereg ss3)
# ---------------------------------------------------------------------------
def select_graph_wrong(runs: list[dict], gold_by_task: dict[str, list[str]],
                       batch_root: Path, model: str,
                       prompt_mode: str = "nudged") -> list[dict]:
    """Runs whose task's rank-1 surfaced file is non-gold.

    ``runs`` are per-run dicts carrying ``task`` (and ``run_idx``). For each
    task the log found by ``_run_stdout`` is parsed with ``surfaced_files`` and
    the file with the smallest non-None ``best_rank`` is taken as rank-1. The
    log lookup depends only on the task, so it is done once per task and
    reused for every run of that task. Tasks without a log, or without any
    ranked file, contribute nothing.

    Returns ``[{task, run_idx, rank1, is_wrong}]``, one entry per run whose
    task's rank-1 file is not in that task's gold set.
    """
    out: list[dict] = []
    rank1_by_task: dict = {}
    for r in runs:
        task = r.get("task")
        if task not in rank1_by_task:
            sp = _run_stdout(batch_root, model, task, prompt_mode)
            if sp is None:
                rank1_by_task[task] = None
            else:
                surf = surfaced_files(sp)
                rank1_by_task[task] = min(
                    (f for f, v in surf.items() if v["best_rank"] is not None),
                    key=lambda f: surf[f]["best_rank"], default=None)
        rank1 = rank1_by_task[task]
        if rank1 is None:
            continue
        if rank1 not in set(gold_by_task.get(task, [])):
            out.append({"task": task, "run_idx": r.get("run_idx"),
                        "rank1": rank1, "is_wrong": True})
    return out


# ---------------------------------------------------------------------------
# NOISY distractor manifest (prereg ss3, ss6)
# ---------------------------------------------------------------------------
def gold_symbols_offline(worktree: Path, gold_file: str) -> list[str]:
    """Function and class names defined anywhere in the gold file on disk.

    The file is parsed with ``ast`` straight off the worktree and every
    ``def`` / ``async def`` / ``class`` reached by ``ast.walk`` is collected,
    whatever its nesting depth; dunder names are skipped. Returns [] for
    non-``.py`` paths and for files that cannot be read or parsed.
    """
    if not gold_file.endswith(".py"):
        return []
    p = worktree / gold_file
    try:
        tree = ast.parse(p.read_text())
    except (OSError, SyntaxError, ValueError):
        # ValueError also covers UnicodeDecodeError raised by read_text()
        return []
    out: list[str] = []
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            name = node.name
            # dunder methods (__init__, __repr__, ...) are near-universal and
            # would spuriously match package files / generic stems -- skip them.
            if name.startswith("__") and name.endswith("__"):
                continue
            out.append(name)
    return out


def _similarity(stem: str, targets: list[str]) -> float:
    """Best case-insensitive difflib ratio of ``stem`` against any non-empty target."""
    stem = stem.lower()
    return max((difflib.SequenceMatcher(None, stem, t.lower()).ratio()
                for t in targets if t), default=0.0)


def sibling_distractors(
    worktree: Path,
    gold_files: list[str],
    incidental: list[str],
    k: int = DEFAULT_K,
    seed: int = DEFAULT_SEED,
) -> list[dict]:
    """Deterministic K plausible-but-false sibling distractors for a task.

    Pool = ``.py`` files sitting directly in any gold file's directory on the
    worktree, excluding every gold + incidental file and anything matching
    ``_NONSOURCE_DISTRACTOR_RE``. Candidates are ranked by their best
    name-similarity to a gold stem or gold AST symbol. Equal scores are
    ordered by a fixed function of ``seed`` and the path (difflib ratio of the
    seed's digits against the path -- deterministic, not random), then by
    path. Relative paths are posix-style regardless of platform.

    Returns the top-k as ``[{file, score, similar_to}]``.
    """
    gold_set = set(gold_files) | set(incidental)
    # similarity targets: gold stems + gold symbols
    targets: list[str] = []
    for g in gold_files:
        targets.append(Path(g).stem)
        targets.extend(gold_symbols_offline(worktree, g))

    pool: dict[str, float] = {}
    for g in gold_files:
        gdir = Path(g).parent
        ddir = worktree / gdir
        if not ddir.is_dir():
            continue
        for child in sorted(ddir.iterdir()):
            if not child.is_file() or not child.name.endswith(".py"):
                continue
            # as_posix() keeps the key comparable with the "/"-separated gold
            # paths and regexes even where the OS separator differs
            rel = (gdir / child.name).as_posix()
            if rel in gold_set:
                continue
            if _NONSOURCE_DISTRACTOR_RE.search(rel):
                continue
            score = _similarity(child.stem, targets)
            # keep the best score if a file is reachable from >1 gold dir
            if rel not in pool or score > pool[rel]:
                pool[rel] = score

    def _tiebreak(item: tuple[str, float]) -> tuple:
        rel, score = item
        # seed-dependent, deterministic, path-stable ordering for equal scores
        h = difflib.SequenceMatcher(None, f"{seed}", rel).ratio()
        return (-score, -h, rel)

    ranked = sorted(pool.items(), key=_tiebreak)
    chosen = ranked[:k]
    return [{"file": rel, "score": round(score, 4),
             "similar_to": _closest_target(Path(rel).stem, targets)}
            for rel, score in chosen]


def _closest_target(stem: str, targets: list[str]) -> str | None:
    """The target with the highest case-insensitive ratio to ``stem`` (None if no targets)."""
    if not targets:
        return None
    return max(targets, key=lambda t: difflib.SequenceMatcher(
        None, stem.lower(), t.lower()).ratio())


def build_noisy_manifest(
    results_path: Path,
    *,
    k: int = DEFAULT_K,
    seed: int = DEFAULT_SEED,
    overrides_path: Path | None = None,
) -> dict:
    """Build the deterministic NOISY injection manifest + coverage report.

    For every distinct code_graph task in ``results_path`` (first row wins),
    emit up to K sibling distractors. ``batch_root`` and ``model`` are derived
    from the results path (``<batch_root>/<model>/<results file>``).

    Raises:
        AssertionError: a distractor is duplicated, is a gold file, or is not
            a file in the worktree. The checks are explicit ``raise``
            statements so they still run under ``python -O``.

    Returns:
        ``{"k", "seed", "coverage", "manifest"}`` where ``manifest`` is keyed
        by task id and ``coverage`` counts full/partial/empty/no-worktree tasks.
    """
    batch_root = results_path.parent.parent
    model = results_path.parent.name
    rows = [json.loads(ln) for ln in results_path.read_text().splitlines() if ln.strip()]
    cg = [r for r in rows if r.get("config") == "code_graph"]
    overrides = load_overrides(overrides_path)

    seen_tasks: set[str] = set()
    manifest: dict[str, dict] = {}
    coverage = {"tasks": 0, "full_k": 0, "partial": 0, "empty": 0, "no_worktree": 0}
    for r in cg:
        task = r["task_id"]
        if task in seen_tasks:
            continue
        seen_tasks.add(task)
        coverage["tasks"] += 1
        gold = r.get("gold_files", [])
        _crit, incidental = edit_critical_split(gold, task, overrides)
        wt = map_task_worktree(batch_root, model, task)
        if wt is None:
            coverage["no_worktree"] += 1
            manifest[task] = {"distractors": [], "note": "no_worktree"}
            continue
        distractors = sibling_distractors(wt, gold, incidental, k=k, seed=seed)
        # validity checks (explicit raises: ``assert`` is stripped under -O)
        gold_set = set(gold)
        files = [d["file"] for d in distractors]
        if len(files) != len(set(files)):
            raise AssertionError(f"dup distractor for {task}")
        if set(files) & gold_set:
            raise AssertionError(f"gold leaked into distractors for {task}")
        for d in distractors:
            if not (wt / d["file"]).is_file():
                raise AssertionError(f"distractor not on disk: {d['file']}")
        n = len(distractors)
        coverage["full_k" if n >= k else ("empty" if n == 0 else "partial")] += 1
        manifest[task] = {"worktree": wt.name, "distractors": distractors,
                          "gold_files": gold, "incidental": incidental}
    return {"k": k, "seed": seed, "coverage": coverage, "manifest": manifest}


# ---------------------------------------------------------------------------
# edit-critical recall sensitivity (heuristic-only vs +overrides)
# ---------------------------------------------------------------------------
def _rescore_with_labels(results_path: Path,
                         overrides: dict[str, dict[str, str]] | None,
                         prompt_mode: str | None = None) -> dict:
    """Re-run the candidate metric, passing per-task edit_critical labels.

    Each code_graph row's log is located with ``row_stdout_path`` (rows with
    no log are skipped), classified with ``classify_run`` using the row's
    edit-critical gold subset, and the per-run results are aggregated by
    ``candidate_calibration``. Pass ``prompt_mode`` to restrict to one arm.
    """
    batch_root = results_path.parent.parent
    model = results_path.parent.name
    rows = [json.loads(ln) for ln in results_path.read_text().splitlines() if ln.strip()]
    cg = [r for r in rows if r.get("config") == "code_graph"]
    if prompt_mode is not None:
        cg = [r for r in cg if r.get("prompt_mode") == prompt_mode]
    per_run = []
    for r in cg:
        task = r.get("task_id")
        stdout = row_stdout_path(batch_root, model, r)
        if not stdout:
            continue
        gold = r.get("gold_files", [])
        crit, _inc = edit_critical_split(gold, task, overrides)
        cls = classify_run(stdout, gold, r.get("pred_files", []), edit_critical=crit)
        cls["task"] = task
        cls["run_idx"] = r.get("run_idx")
        per_run.append(cls)
    return candidate_calibration(per_run)


def recall_sensitivity(results_path: Path, overrides_path: Path | None) -> dict:
    """Macro P/R/F1 under (a) all-gold-critical, (b) heuristic-only,
    (c) heuristic+overrides.

    Surfaces how much the relabel moves recall so a skeptic can see labels
    weren't tuned to taste. ``n_overrides`` is the total number of per-file
    override entries loaded.
    """
    overrides = load_overrides(overrides_path)
    # (a) baseline: every gold critical -> use analyze_batch (edit_critical=None)
    base = candidate_calibration(
        [r for r in analyze_batch(results_path)["per_run"] if "error" not in r])
    heur = _rescore_with_labels(results_path, None)
    audit = _rescore_with_labels(results_path, overrides)
    return {"all_critical": base["macro"], "heuristic_only": heur["macro"],
            "heuristic_plus_audit": audit["macro"],
            "n_overrides": sum(len(v) for v in overrides.values())}


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI: print the relabel sensitivity, GRAPH-WRONG subset and NOISY manifest.

    With ``--json`` the three reports are also written to that path.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("results", type=Path, help="path to a code_graph results.jsonl")
    ap.add_argument("--overrides", type=Path,
                    default=Path(__file__).parent / "adopt_audit" / "edit_critical_overrides.json")
    ap.add_argument("--k", type=int, default=DEFAULT_K)
    ap.add_argument("--seed", type=int, default=DEFAULT_SEED)
    ap.add_argument("--json", type=Path, help="write full manifest+report JSON here")
    args = ap.parse_args()

    out = analyze_batch(args.results)
    runs = [r for r in out["per_run"] if "error" not in r]
    rows = [json.loads(ln) for ln in args.results.read_text().splitlines() if ln.strip()]
    gold_by_task = {r["task_id"]: r.get("gold_files", [])
                    for r in rows if r.get("config") == "code_graph"}

    model = args.results.parent.name
    batch_root = args.results.parent.parent

    print("==== EDIT-CRITICAL RELABEL (recall sensitivity) ====")
    sens = recall_sensitivity(args.results, args.overrides)

    def _fmt(x) -> str:
        # metrics may be None when a denominator is empty
        return f"{x:.3f}" if x is not None else " n/a"

    def _m(d) -> str:
        return f"P={_fmt(d['precision'])} R={_fmt(d['recall'])} F1={_fmt(d['f1'])}"

    print(f" all-gold-critical : {_m(sens['all_critical'])}")
    print(f" heuristic-only : {_m(sens['heuristic_only'])}")
    print(f" heuristic+audit (n={sens['n_overrides']:>2}): {_m(sens['heuristic_plus_audit'])}")

    print("\n==== GRAPH-WRONG SUBSET (rank-1 surfaced file is non-gold) ====")
    gw = select_graph_wrong(runs, gold_by_task, batch_root, model)
    gw_tasks = sorted({g["task"] for g in gw})
    print(f" graph-wrong runs: {len(gw)} | distinct tasks: {len(gw_tasks)}")
    for g in gw:
        print(f" {g['task']:34s} idx={g['run_idx']} rank1={g['rank1']}")

    print("\n==== NOISY DISTRACTOR MANIFEST (deterministic, for run-time injection) ====")
    noisy = build_noisy_manifest(args.results, k=args.k, seed=args.seed,
                                 overrides_path=args.overrides)
    cov = noisy["coverage"]
    print(f" k={noisy['k']} seed={noisy['seed']} tasks={cov['tasks']} "
          f"full_k={cov['full_k']} partial={cov['partial']} "
          f"empty={cov['empty']} no_worktree={cov['no_worktree']}")
    for task, m in noisy["manifest"].items():
        ds = ", ".join(f"{Path(d['file']).name}({d['score']})" for d in m.get("distractors", []))
        print(f" {task:34s} -> {ds or m.get('note', '(none)')}")

    if args.json:
        payload = {"recall_sensitivity": sens, "graph_wrong": gw,
                   "noisy": noisy}
        args.json.write_text(json.dumps(payload, indent=2))
        print(f"\nwrote {args.json}")


if __name__ == "__main__":
    main()
Reports, side by side per arm: + +* candidate-level calibration (macro, **macro_strict** = prereg PRIMARY, micro) + over ALL surfaced candidates, plus the same restricted to the **GRAPH-WRONG** + control subset (rank-1 graph hit verified non-gold); +* exposure / adoption aggregates and **per-arm exposure drift** (the harness is + agent-driven, so SEM/RAT may surface different candidate sets than CTRL -- + prereg amendment "identical candidate sets is measured, not forced"); +* token deltas (median total / output / **visible_output = output - reasoning** + / input / premium_requests / turns) so the RAT thinking-vs-calibration + confound is attributable; +* a **RAT compliance audit**: did the agent emit the mandated ``KEEP``/``DROP`` + lines, and is the final answer consistent with them (no DROP file kept). + +The GRAPH-WRONG subset is selected ONCE from a reference arm (default +``adopt-ctrl``) and the SAME task set is applied to every arm, so the control is +fixed across arms rather than re-derived per arm. + +Run: + uv run python -m bench.analysis.adopt_diag [--json out.json] + [--ref-arm adopt-ctrl] +""" + +from __future__ import annotations + +import argparse +import json +import re +import statistics +from pathlib import Path + +from bench.analysis.adopt_controls import select_graph_wrong +from bench.analysis.exposure_adoption import ( + analyze_batch, + candidate_calibration, + row_stdout_path, +) + +ARM_PROMPT_MODES = ("adopt-ctrl", "adopt-sem", "adopt-rat") + +# A KEEP/DROP decision line from the RAT step. The prompt format is +# ``KEEP `` / ``DROP ``; the file may be +# wrapped in backticks and the dash is an em dash, en dash or hyphen. We only +# need the decision verb and the path token, so we accept any leading list +# marker / bullet and stop at the first whitespace, backtick, em/en dash or colon. 
+_RAT_LINE = re.compile( + r"^\s*[-*>\d.)\]\s]*`?\s*(KEEP|DROP)\b[\s:`]*([^\s`—–:]+)", + re.IGNORECASE, +) + + +def _norm(path: str) -> str: + """Repo-root-relative posix-ish normalization (mirrors copilot_runner).""" + p = path.strip().strip("'\"`").strip().replace("\\", "/") + while p.startswith("./"): + p = p[2:] + for prefix in ("a/", "b/"): + if p.startswith(prefix): + p = p[len(prefix):] + return p.lstrip("/") + + +def parse_rat_decisions(agent_text: str) -> dict[str, str]: + """Map normalized file -> final decision ("keep"/"drop") from RAT lines. + + Last decision for a file wins (the agent may revise). Only lines that match + the KEEP/DROP contract are considered; prose mentioning the words is ignored + because the verb must be line-initial (after optional list markers). + """ + decisions: dict[str, str] = {} + for line in agent_text.splitlines(): + m = _RAT_LINE.match(line) + if not m: + continue + verb = m.group(1).lower() + f = _norm(m.group(2)) + if f: + decisions[f] = "keep" if verb == "keep" else "drop" + return decisions + + +def rat_audit(agent_text: str, pred_files: list[str]) -> dict: + """Did the agent run the keep/drop step, and is the answer consistent? + + * ``compliant`` -- emitted at least one KEEP/DROP decision line. + * ``consistent`` -- no file the agent marked DROP appears in the final + answer (the prereg requires the final answer to honor the decisions). + * ``kept_omitted`` -- files marked KEEP but absent from the final answer + (allowed by the prompt, but tracked: silent erosion after deciding keep). 
def _median(xs: list[float]) -> float | None:
    """Median of the non-None entries of ``xs`` rounded to 1 decimal.

    Returns None when no non-None entry remains.
    """
    present = [x for x in xs if x is not None]
    return round(statistics.median(present), 1) if present else None


def token_summary(rows: list[dict]) -> dict:
    """Median token / step usage across an arm's completed rows.

    ``visible_output`` excludes hidden reasoning tokens so a RAT calibration win
    can be separated from "the model just thought/typed more".

    Rows whose ``output_tokens`` is missing or null are left out of the
    visible-output median. A missing *or null* ``reasoning_tokens`` counts as
    zero reasoning tokens.

    Returns a dict with ``n_rows`` and one ``median_*`` entry per metric; a
    median is None when no row carries that metric.
    """
    def col(key: str) -> list[float]:
        return [r[key] for r in rows if r.get(key) is not None]

    # ``dict.get(key, 0)`` only falls back to 0 when the key is ABSENT; a row
    # that stores ``"reasoning_tokens": null`` would yield None and make the
    # subtraction raise TypeError. ``or 0`` covers both the missing and the
    # null case.
    visible = [
        r["output_tokens"] - (r.get("reasoning_tokens") or 0)
        for r in rows
        if r.get("output_tokens") is not None
    ]
    return {
        "n_rows": len(rows),
        "median_total_tokens": _median(col("total_tokens")),
        "median_output_tokens": _median(col("output_tokens")),
        "median_visible_output_tokens": _median(visible),
        "median_reasoning_tokens": _median(col("reasoning_tokens")),
        "median_input_tokens": _median(col("input_tokens")),
        "median_premium_requests": _median(col("premium_requests")),
        "median_num_turns": _median(col("num_turns")),
    }
def arm_diagnostics(
    results_path: Path,
    arm: str,
    *,
    gold_by_task: dict[str, list[str]],
    graph_wrong_tasks: set[str],
) -> dict:
    """Collect every per-arm diagnostic for a single ``adopt-*`` prompt_mode.

    The report contains exposure/adoption totals, candidate calibration over
    all runs and over the frozen GRAPH-WRONG task subset, a token summary of
    the arm's completed rows, and -- for ``adopt-rat`` only -- the KEEP/DROP
    compliance audit.

    ``gold_by_task`` is accepted for signature parity with the caller; this
    function does not read it.
    """
    records = []
    for raw in results_path.read_text().splitlines():
        if raw.strip():
            records.append(json.loads(raw))
    # Layout: <batch_root>/<model>/results.jsonl
    model = results_path.parent.name
    batch_root = results_path.parent.parent

    def in_arm(rec: dict) -> bool:
        return bool(
            rec.get("config") == "code_graph"
            and rec.get("mode") == "localize"
            and rec.get("prompt_mode") == arm
            and rec.get("completed")
        )

    arm_rows = list(filter(in_arm, records))

    per_run = analyze_batch(results_path, prompt_mode=arm, mode="localize")["per_run"]
    runs = [run for run in per_run if "error" not in run]

    cal_all = candidate_calibration(runs)
    cal_gw = _subset_calibration(runs, graph_wrong_tasks)

    gold_total = sum(run["n_gold"] for run in runs)
    surfaced_total = sum(run["n_surfaced"] for run in runs)
    adopted_total = sum(run["n_surfaced_adopted"] for run in runs)

    def ratio(numerator, denominator):
        # None (not 0) when the denominator is empty, so "no data" stays visible.
        if not denominator:
            return None
        return round(numerator / denominator, 4)

    report: dict = {
        "arm": arm,
        "n_runs": len(runs),
        "exposure": {
            "gold_run_x_gold": gold_total,
            "surfaced": surfaced_total,
            "surfaced_adopted": adopted_total,
            "exposure_recall": ratio(surfaced_total, gold_total),
            "adoption_rate": ratio(adopted_total, surfaced_total),
        },
        "calibration_clean": cal_all,
        "calibration_graph_wrong": cal_gw,
        "tokens": token_summary(arm_rows),
    }

    if arm != "adopt-rat":
        return report

    audits = []
    for rec in arm_rows:
        audit = rat_audit(
            _agent_text_for(batch_root, model, rec), rec.get("pred_files", [])
        )
        audit["task"] = rec.get("task_id")
        audit["run_idx"] = rec.get("run_idx")
        audits.append(audit)
    # Guard the rate denominators against an arm with zero completed rows.
    denom = len(audits) or 1
    report["rat_audit"] = {
        "n": len(audits),
        "compliance_rate": round(sum(a["compliant"] for a in audits) / denom, 4),
        "consistency_rate": round(sum(a["consistent"] for a in audits) / denom, 4),
        "n_dropped_but_kept": sum(len(a["dropped_but_kept"]) for a in audits),
        "n_kept_omitted": sum(len(a["kept_omitted"]) for a in audits),
        "per_run": audits,
    }
    return report
+ sel_arm = ref_arm if ref_arm in present else (present[0] if present else ref_arm) + ref_batch = analyze_batch(results_path, prompt_mode=sel_arm, mode="localize") + ref_runs = [r for r in ref_batch["per_run"] if "error" not in r] + gw = select_graph_wrong( + ref_runs, gold_by_task, batch_root, model, prompt_mode=sel_arm + ) + graph_wrong_tasks = {g["task"] for g in gw} + + arms = { + a: arm_diagnostics( + results_path, a, + gold_by_task=gold_by_task, graph_wrong_tasks=graph_wrong_tasks, + ) + for a in present + } + return { + "results": str(results_path), + "model": model, + "arms_present": present, + "graph_wrong": {"ref_arm": sel_arm, "tasks": sorted(graph_wrong_tasks)}, + "arms": arms, + } + + +def _f(x) -> str: + return f"{x:.3f}" if isinstance(x, (int, float)) else " n/a" + + +def _print_report(rep: dict) -> None: + present = rep["arms_present"] + if not present: + print("no adopt-* arms found in results") + return + gw = rep["graph_wrong"] + print(f"model: {rep['model']} arms: {', '.join(present)}") + print(f"GRAPH-WRONG subset ({len(gw['tasks'])} tasks, ref={gw['ref_arm']}): " + f"{gw['tasks']}") + + hdr = f"\n{'metric':32s} " + " ".join(f"{a.replace('adopt-',''):>10s}" for a in present) + print(hdr) + print("-" * len(hdr)) + + def row(label: str, getter) -> None: + cells = " ".join(f"{getter(rep['arms'][a]):>10s}" for a in present) + print(f"{label:32s} {cells}") + + row("runs", lambda d: str(d["n_runs"])) + row("exposure_recall", lambda d: _f(d["exposure"]["exposure_recall"])) + row("adoption_rate", lambda d: _f(d["exposure"]["adoption_rate"])) + row("CLEAN macro_strict F1 (PRIMARY)", + lambda d: _f(d["calibration_clean"]["macro_strict"]["f1"])) + row("CLEAN macro F1", lambda d: _f(d["calibration_clean"]["macro"]["f1"])) + row("CLEAN macro precision", + lambda d: _f(d["calibration_clean"]["macro"]["precision"])) + row("CLEAN macro recall", + lambda d: _f(d["calibration_clean"]["macro"]["recall"])) + row("GRAPH-WRONG macro precision", + lambda d: 
_f(d["calibration_graph_wrong"]["macro"]["precision"])) + row("GRAPH-WRONG macro_strict F1", + lambda d: _f(d["calibration_graph_wrong"]["macro_strict"]["f1"])) + row("median total tokens", + lambda d: _f(d["tokens"]["median_total_tokens"])) + row("median visible-output tokens", + lambda d: _f(d["tokens"]["median_visible_output_tokens"])) + row("median reasoning tokens", + lambda d: _f(d["tokens"]["median_reasoning_tokens"])) + row("median turns", lambda d: _f(d["tokens"]["median_num_turns"])) + + if "adopt-rat" in present: + ra = rep["arms"]["adopt-rat"].get("rat_audit", {}) + print("\nRAT keep/drop audit:") + print(f" compliance (emitted KEEP/DROP) : {_f(ra.get('compliance_rate'))}" + f" over n={ra.get('n')}") + print(f" consistency (no DROP kept) : {_f(ra.get('consistency_rate'))}") + print(f" dropped-but-kept conflicts : {ra.get('n_dropped_but_kept')}") + print(f" kept-then-omitted (erosion) : {ra.get('n_kept_omitted')}") + + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("results", type=Path) + ap.add_argument("--json", type=Path) + ap.add_argument("--ref-arm", default="adopt-ctrl", + help="arm whose runs fix the GRAPH-WRONG task subset") + args = ap.parse_args() + + rep = diagnose(args.results, ref_arm=args.ref_arm) + _print_report(rep) + if args.json: + args.json.write_text(json.dumps(rep, indent=2)) + print(f"\nwrote {args.json}") + + +if __name__ == "__main__": + main() diff --git a/bench/analysis/aggregate.py b/bench/analysis/aggregate.py new file mode 100644 index 00000000..beef1d6f --- /dev/null +++ b/bench/analysis/aggregate.py @@ -0,0 +1,185 @@ +"""Robust, task-weighted aggregation for the localization benchmark. + +WHY (per rubber-duck): the raw per-row means are confounded and outlier-driven: + + 1. REPLICATE confound -- decision instances are run multiple times (run_idx + 0/1/2) while controls run once, so a naive mean over rows weights some + tasks 3x. 
Fix: average replicates WITHIN each (task, config) first + ("per-task cell"), then take the macro-mean ACROSS tasks. Every task then + carries equal weight regardless of replicate count. + + 2. TOKEN tail -- a single runaway trajectory (>1M input tokens) dominates the + arithmetic mean. Fix: report median + mean + p90 + max + #runaways(>500k) + + a winsorized mean (p90 cap) for SENSITIVITY ONLY (never as the headline, + never silently dropping data). + + 3. CONNECTIVITY stratum -- recall on graph-connected gold is the only stratum + where the graph can mechanically help. Fix: join the connectivity label + (all_connected / partial / unconnected) per task and report recall per + stratum per arm. + +Headline accuracy metric = task-weighted macro-mean of ``file_recall`` (and the +strict ``file_all_found`` set-exact metric) per config. Headline token metric = +per-task median input tokens + the robust tail stats. + +CLI: + python -m bench.analysis.aggregate [--conn conn.json] \ + [--json out.json] [--runaway 500000] +""" + +from __future__ import annotations + +import argparse +import json +import statistics +from collections import defaultdict +from pathlib import Path + + +def _percentile(xs: list[float], q: float) -> float: + """Linear-interpolation percentile (q in [0,1]). 
Empty -> 0.0.""" + if not xs: + return 0.0 + s = sorted(xs) + if len(s) == 1: + return float(s[0]) + pos = q * (len(s) - 1) + lo = int(pos) + frac = pos - lo + if lo + 1 >= len(s): + return float(s[-1]) + return float(s[lo] + (s[lo + 1] - s[lo]) * frac) + + +def _winsorized_mean(xs: list[float], cap_q: float = 0.90) -> float: + """Mean after clamping values above the cap_q percentile down to it.""" + if not xs: + return 0.0 + cap = _percentile(xs, cap_q) + return statistics.fmean(min(x, cap) for x in xs) + + +def _cells(rows: list[dict], field: str) -> dict[tuple[str, str], float]: + """Average ``field`` over replicates within each (config, task) cell.""" + buckets: dict[tuple[str, str], list[float]] = defaultdict(list) + for r in rows: + v = r.get(field) + if v is None: + continue + buckets[(r["config"], r["task_id"])].append(float(v)) + return {k: statistics.fmean(v) for k, v in buckets.items() if v} + + +def _per_config(cells: dict[tuple[str, str], float]) -> dict[str, list[float]]: + out: dict[str, list[float]] = defaultdict(list) + for (config, _task), v in cells.items(): + out[config].append(v) + return out + + +def aggregate(results_path: Path, conn_path: Path | None, + runaway: float = 500_000) -> dict: + rows = [json.loads(l) for l in results_path.read_text().splitlines() if l.strip()] + configs = sorted({r["config"] for r in rows}) + + conn_label: dict[str, str] = {} + if conn_path and conn_path.exists(): + for c in json.loads(conn_path.read_text()): + conn_label[c["task"]] = c.get("label", "unknown") + + # --- task-weighted accuracy (recall + strict all-found) --- + recall_cells = _cells(rows, "file_recall") + allfound_cells = _cells(rows, "file_all_found") + recall_by_cfg = _per_config(recall_cells) + allfound_by_cfg = _per_config(allfound_cells) + + # --- token cell means (per task) for robust stats --- + intok_cells = _cells(rows, "input_tokens") + intok_by_cfg = _per_config(intok_cells) + + # raw per-row input tokens (for tail stats that should see 
every runaway) + intok_rows_by_cfg: dict[str, list[float]] = defaultdict(list) + for r in rows: + v = r.get("input_tokens") + if v is not None: + intok_rows_by_cfg[r["config"]].append(float(v)) + + summary = {} + for cfg in configs: + rec = recall_by_cfg.get(cfg, []) + allf = allfound_by_cfg.get(cfg, []) + intask = intok_by_cfg.get(cfg, []) + inrows = intok_rows_by_cfg.get(cfg, []) + summary[cfg] = { + "n_tasks": len(rec), + "n_rows": sum(1 for r in rows if r["config"] == cfg), + "recall_task_weighted": round(statistics.fmean(rec), 4) if rec else None, + "all_found_task_weighted": round(statistics.fmean(allf), 4) if allf else None, + "tokens": { + "median_per_task": round(statistics.median(intask)) if intask else None, + "mean_per_task": round(statistics.fmean(intask)) if intask else None, + "p90_per_task": round(_percentile(intask, 0.90)) if intask else None, + "max_row": round(max(inrows)) if inrows else None, + "n_runaways": sum(1 for x in inrows if x > runaway), + "winsorized_mean_per_task": round(_winsorized_mean(intask)) if intask else None, + }, + } + + # --- recall per connectivity stratum per config --- + strata: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list)) + for (cfg, task), v in recall_cells.items(): + strata[conn_label.get(task, "unknown")][cfg].append(v) + by_stratum = { + stratum: { + cfg: {"n": len(vals), "recall": round(statistics.fmean(vals), 4)} + for cfg, vals in cfgmap.items() + } + for stratum, cfgmap in strata.items() + } + + return { + "configs": configs, + "runaway_threshold": runaway, + "summary": summary, + "by_connectivity_stratum": by_stratum, + "connectivity_labels": conn_label, + } + + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("results", type=Path) + ap.add_argument("--conn", type=Path, help="connectivity.py json output") + ap.add_argument("--json", type=Path) + ap.add_argument("--runaway", type=float, default=500_000) + args = ap.parse_args() + + out = 
aggregate(args.results, args.conn, args.runaway) + + print("=== TASK-WEIGHTED ACCURACY + ROBUST TOKENS (per config) ===") + hdr = (f"{'config':16s} {'tasks':5s} {'rows':4s} {'recall':7s} {'allfnd':7s} " + f"{'tok_med':9s} {'tok_mean':9s} {'tok_p90':9s} {'tok_max':10s} {'runaway':7s}") + print(hdr) + for cfg in out["configs"]: + s = out["summary"][cfg] + t = s["tokens"] + print(f"{cfg:16s} {s['n_tasks']!s:5s} {s['n_rows']!s:4s} " + f"{s['recall_task_weighted']!s:7s} {s['all_found_task_weighted']!s:7s} " + f"{t['median_per_task']!s:9s} {t['mean_per_task']!s:9s} " + f"{t['p90_per_task']!s:9s} {t['max_row']!s:10s} {t['n_runaways']!s:7s}") + + print("\n=== RECALL BY CONNECTIVITY STRATUM ===") + for stratum, cfgmap in sorted(out["by_connectivity_stratum"].items()): + print(f"\n[{stratum}]") + for cfg in out["configs"]: + d = cfgmap.get(cfg) + if d: + print(f" {cfg:16s} n={d['n']:<3d} recall={d['recall']}") + + if args.json: + args.json.write_text(json.dumps(out, indent=2)) + print(f"\nwrote {args.json}") + + +if __name__ == "__main__": + main() diff --git a/bench/analysis/cg_report.py b/bench/analysis/cg_report.py new file mode 100644 index 00000000..c62cb465 --- /dev/null +++ b/bench/analysis/cg_report.py @@ -0,0 +1,97 @@ +"""Per-instance code_graph-vs-reference reporter for the cg-n5 micro-cycle. 
+ +Given the code_graph cache dir and a task_id, prints: + * the code_graph result row (recall / acc@1 / tokens / tool usage) + * the FROZEN reference rows (copilot_no_mcp + lsp) for the same task + * a compact agent trace (tool steps) reconstructed via bench.analysis.trace + +Usage: + python -m bench.analysis.cg_report + python -m bench.analysis.cg_report --list # show which tasks have results +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +CG_RESULTS = Path("bench/cache/cg-n5-cooverride/claude-opus-4.8/results.jsonl") +REF_RESULTS = Path("bench/cache/ref-n5-baseline-lsp/claude-opus-4.8/results.jsonl") +CG_RUNS = Path("bench/cache/cg-n5-cooverride/runs/claude-opus-4.8/localize/nudged/code_graph") + + +def _load(path: Path) -> list[dict]: + if not path.exists(): + return [] + return [json.loads(line) for line in path.read_text().splitlines() if line.strip()] + + +def _fmt_row(r: dict) -> str: + tools = r.get("tool_calls_by_name") or {} + tool_str = ", ".join(f"{k}={v}" for k, v in sorted(tools.items())) or "(none)" + return ( + f" config={r.get('config'):<16} recall={r.get('file_recall')!s:<5} " + f"acc@1={r.get('acc_at_1')!s:<5} acc@5={r.get('acc_at_5')!s:<5} " + f"in_tok={r.get('input_tokens'):<8} out_tok={r.get('output_tokens'):<6} " + f"premium={r.get('premium_requests')!s:<4} wall={r.get('wall_clock_sec')}s\n" + f" tools: {tool_str} first={r.get('first_tool')} " + f"graph_calls={r.get('graph_calls')} " + f"outcome={r.get('outcome')} timed_out={r.get('timed_out')} " + f"leak={r.get('network_leak')}" + ) + + +def _recall(r: dict) -> str: + v = r.get("file_recall") + return "?" 
def list_done() -> None:
    """Print one line per code_graph result row recorded so far.

    Each line shows the task id, its recall (``?`` when absent) and the run
    outcome. The task id is converted with ``!s`` before padding so a row
    without ``task_id`` prints ``None`` instead of raising ``TypeError`` on the
    ``:<40`` alignment spec.
    """
    cg = _load(CG_RESULTS)
    print(f"code_graph results so far: {len(cg)}")
    for r in cg:
        print(f" {r.get('task_id')!s:<40} recall={_recall(r)} outcome={r.get('outcome')}")
So an instance like ``jupyterhub__oauthenticator-764`` +(gold = ``oauthenticator/google.py`` + ``setup.py``) counts as "structural" even +though ``setup.py`` has no code edges to the auth module — code_graph cannot +surface it via structure by construction. Evaluating a graph-traversal tool on +such instances dilutes signal and makes false negatives uninterpretable. + +This module assigns each instance a ``graph_connected_gold`` label computed from +the STATIC graph edges (independent of ``search_code`` ranking, to avoid +circularity). The label answers: "does the graph even contain a structural path +between the gold files?" — i.e. is there structural signal available for the +tool to exploit, separate from whether the agent adopts it. + +PRE-REGISTERED DEFINITION (fixed before reading results): + * Edge set for file<->file adjacency (undirected): + - direct: File -[IMPORTS]- File + - symbol-bridge: File -[DEFINES]-> sym -[CALLS|EXTENDS|OVERRIDES]- sym <-[DEFINES]- File + * Max depth: D = 2 file-hops. + * Labels: + - gold_missing : >=1 gold file is not present as a File node in the graph + - all_connected : all present gold files fall in ONE connected component + (reachable within <=D hops) -- full structural signal + - partial_connected : >=1 gold-gold pair connected, but not all -- partial signal + - unconnected : >=2 gold files present, no gold-gold pair connected -- NO signal + - single : only 1 gold file (no multi-file structure to traverse) + +Framing (per rubber-duck): the label means "graph has structural signal +available", NOT "task is inherently structural". Using the same graph that is +under test is acceptable under that framing. + +Offline: reads gold files from the benchmark results.jsonl; queries the already +-indexed FalkorDB graphs on port 6380. No HuggingFace reload, no LLM, $0. 
+""" + +from __future__ import annotations + +import argparse +import json +from collections import defaultdict +from pathlib import Path + +import redis + +FALKOR_HOST = "localhost" +FALKOR_PORT = 6380 +GRAPH_FMT = "code:{task}__loc:_default" + +# Pre-registered traversal parameters. +MAX_DEPTH = 2 # file-level hops +NEIGHBOR_CAP = 500 # per-query fanout cap (guards hub files); flagged if hit +VISIT_CAP = 8000 # total BFS frontier cap per source file + +_IMPORTS_Q = ( + "MATCH (a:File)-[:IMPORTS]-(b:File) WHERE a.path = $p AND b.path <> $p " + "RETURN DISTINCT b.path LIMIT $cap" +) +_BRIDGE_Q = ( + "MATCH (a:File)-[:DEFINES]->(s)-[:CALLS|EXTENDS|OVERRIDES]-(t)" + "<-[:DEFINES]-(b:File) WHERE a.path = $p AND b.path <> a.path " + "RETURN DISTINCT b.path LIMIT $cap" +) + + +def _graph_query(r: redis.Redis, graph: str, cypher: str, params: dict): + """Run a parameterized GRAPH.QUERY (FalkorDB ``CYPHER k=v`` prefix).""" + parts = [] + for k, v in params.items(): + if isinstance(v, str): + esc = v.replace("\\", "\\\\").replace('"', '\\"') + parts.append(f'{k}="{esc}"') + else: + parts.append(f"{k}={v}") + prefix = ("CYPHER " + " ".join(parts) + " ") if parts else "" + return r.execute_command("GRAPH.QUERY", graph, prefix + cypher) + + +def _all_file_paths(r: redis.Redis, graph: str) -> list[str]: + res = _graph_query(r, graph, "MATCH (f:File) RETURN f.path", {}) + return [row[0] for row in res[1]] + + +def _resolve_gold(gold_files: list[str], file_paths: list[str]) -> dict[str, str | None]: + """Map each repo-relative gold path to its absolute File-node path. + + File nodes store absolute paths ending in ``.../__loc/``; we + match by suffix ``/``. If multiple nodes match (shouldn't for a + full relpath), prefer the shortest. Returns ``{gold: node_path | None}``. 
+ """ + out: dict[str, str | None] = {} + for g in gold_files: + suffix = "/" + g.lstrip("/") + cands = [p for p in file_paths if p.endswith(suffix)] + out[g] = min(cands, key=len) if cands else None + return out + + +def _file_neighbors(r: redis.Redis, graph: str, fpath: str, + cap: int = NEIGHBOR_CAP) -> tuple[set[str], bool]: + """1-hop file neighbors of ``fpath`` (IMPORTS + symbol-bridge). Returns + ``(neighbors, capped)`` where ``capped`` flags a fanout-limit hit.""" + out: set[str] = set() + capped = False + for cypher in (_IMPORTS_Q, _BRIDGE_Q): + res = _graph_query(r, graph, cypher, {"p": fpath, "cap": cap}) + rows = res[1] + if len(rows) >= cap: + capped = True + out.update(row[0] for row in rows) + out.discard(fpath) + return out, capped + + +def _reachable(r: redis.Redis, graph: str, src: str, targets: set[str], + depth: int = MAX_DEPTH) -> set[str]: + """BFS up to ``depth`` file-hops from ``src``; return the subset of + ``targets`` reached. Early-exits once all targets are found.""" + found: set[str] = set() + visited = {src} + frontier = {src} + for _ in range(depth): + nxt: set[str] = set() + for node in frontier: + neigh, _capped = _file_neighbors(r, graph, node) + for n in neigh: + if n in targets: + found.add(n) + if n not in visited: + visited.add(n) + nxt.add(n) + if len(visited) > VISIT_CAP: + break + if found >= targets or len(visited) > VISIT_CAP: + break + frontier = nxt + return found + + +class _UF: + def __init__(self, items): + self.p = {i: i for i in items} + + def find(self, x): + while self.p[x] != x: + self.p[x] = self.p[self.p[x]] + x = self.p[x] + return x + + def union(self, a, b): + self.p[self.find(a)] = self.find(b) + + def groups(self): + g = defaultdict(list) + for i in self.p: + g[self.find(i)].append(i) + return list(g.values()) + + +def classify_instance(r: redis.Redis, task: str, gold_files: list[str]) -> dict: + """Compute the connectivity stratum for one instance.""" + graph = GRAPH_FMT.format(task=task) + py_gold = [g 
def load_gold_from_results(results_path: Path) -> dict[str, list[str]]:
    """Build ``{task_id: gold_files}`` from a benchmark results.jsonl file.

    Blank lines are skipped. The task key is ``task_id``, falling back to
    ``instance_id``. Rows without a task key or with empty/missing
    ``gold_files`` are ignored; when a task appears more than once the last
    row wins.
    """
    gold_by_task: dict[str, list[str]] = {}
    for raw in results_path.read_text().splitlines():
        if raw.strip():
            record = json.loads(raw)
            task = record.get("task_id") or record.get("instance_id")
            if task and record.get("gold_files"):
                gold_by_task[task] = record["gold_files"]
    return gold_by_task
def build_file_text(files, bodytok, symbols) -> dict[str, str]:
    """Compose one compact, natural-language-ish description per file.

    For each path in ``files`` the text is the path split into words (``/``
    and ``_`` become spaces, ``.py`` is removed), then up to 80 symbol names
    defined in that file, then up to 300 body tokens, joined with `` . `` and
    truncated to ``PER_FILE_CHARS`` characters.

    ``symbols`` is iterated as ``(name, file)`` pairs and ``bodytok`` is looked
    up by file path.
    """
    names_by_file: dict[str, list[str]] = {}
    for sym_name, owner in symbols:
        if owner not in names_by_file:
            names_by_file[owner] = []
        names_by_file[owner].append(sym_name)

    def describe(path: str) -> str:
        words = path.replace("/", " ").replace("_", " ").replace(".py", "")
        sym_part = " ".join(names_by_file.get(path, [])[:80])
        body_part = " ".join(bodytok.get(path, [])[:300])
        return " . ".join((words, sym_part, body_part))[:PER_FILE_CHARS]

    return {path: describe(path) for path in files}
{task}: {e}", file=sys.stderr) + continue + text = build_file_text(files, bodytok, symbols) + doc_vecs = emb.encode([text[f] for f in files]) + qv = emb.encode([inst.problem_statement])[0] + scores = doc_vecs @ qv + order = np.argsort(-scores) + ranked = [files[i] for i in order] + sc = score(ranked, gold) + rows.append(sc) + print(f"{task:38s} gold={len(gold)} files={len(files):5d} " + f"R@5={sc['recall@5']:.2f} hit@5={sc['hit@5']:.0f} " + f"rk={sc['gold_best_rank']}") + + print("\n========= EMBEDDING (all-MiniLM-L6-v2) AGGREGATE n={} =========".format(len(rows))) + line = "embed " + for k in KS: + line += f" R@{k}={np.mean([x[f'recall@{k}'] for x in rows]):.3f}" + for k in KS: + line += f" hit@{k}={np.mean([x[f'hit@{k}'] for x in rows]):.3f}" + line += f" MRR={np.mean([x['mrr'] for x in rows]):.3f}" + print(line) + print("\nLexical ref: bm25 R@5=0.279 hit@5=0.500 MRR=0.230 ; " + "tfidf R@5=0.312 hit@5=0.500 MRR=0.419 ; name_prefix R@5=0.175 MRR=0.176") + print("Agent ref: no_mcp recall=0.613 MRR=0.875 ; code_graph recall=0.512 MRR=0.800") + + +if __name__ == "__main__": + main() diff --git a/bench/analysis/exposure_adoption.py b/bench/analysis/exposure_adoption.py new file mode 100644 index 00000000..eefa9e3f --- /dev/null +++ b/bench/analysis/exposure_adoption.py @@ -0,0 +1,375 @@ +"""Retrieval-exposure vs adoption metrics for the code_graph arm. + +WHY (per rubber-duck): end-to-end ``file_recall`` conflates two very different +failure modes: + 1. RETRIEVAL miss -- the graph never surfaced the gold file at all. + 2. ADOPTION miss -- the graph surfaced the gold file, but the agent dropped + it from its final answer during reasoning. +Blaming the graph for (2) is unfair: that is an agent-reasoning property, not a +tool-retrieval property. This module separates them so we can say e.g. "graph +exposed 7/10 missed gold files; the agent adopted only N of them." 
+ +For each code_graph run we parse ``stdout.jsonl`` and, for every ``search_code`` +call, join ``tool.execution_start`` -> ``tool.execution_complete`` by +``toolCallId`` to recover the UNTRUNCATED result (trace.md/trace.jsonl truncate +tool output). We collect every file the graph surfaced: + * primary hits (``file`` + ``score`` + rank position) + * likely_related_files (``file`` + ``via`` co_override/shared_method + confidence) + +Then, per gold file, we classify exposure: + * direct@ -- surfaced as a primary ranked hit + * related: -- surfaced only as a likely_related sibling + * not_surfaced -- never surfaced by the graph (true retrieval miss) +and adoption: was the surfaced gold file in the run's final ``pred_files``? + +Derived metrics (aggregated over runs): + * exposure_recall = surfaced_gold / total_gold + * adoption_rate = adopted_gold / surfaced_gold (of what the graph surfaced, + how much did the agent keep) +""" + +from __future__ import annotations + +import argparse +import json +from collections import defaultdict +from pathlib import Path + + +def _iter_search_results(stdout_path: Path): + """Yield parsed search_code result objects (one per primary) for a run.""" + names: dict[str, str] = {} + for line in stdout_path.read_text().splitlines(): + if not line.strip(): + continue + try: + ev = json.loads(line) + except json.JSONDecodeError: + continue + t = ev.get("type") + d = ev.get("data", {}) or {} + if t == "tool.execution_start": + names[d.get("toolCallId")] = d.get("toolName") or d.get("mcpToolName") or "" + elif t == "tool.execution_complete": + if "search_code" not in (names.get(d.get("toolCallId")) or ""): + continue + res = d.get("result") or {} + # ``contents`` is the clean per-item list (one JSON object per text + # entry). ``content`` is the same payload concatenated into a single + # string and is NOT valid JSON when there is >1 result -- do not use + # it. Fall back to ``content`` only when ``contents`` is absent. 
+ items = res.get("contents") + if not isinstance(items, list): + c = res.get("content") + items = c if isinstance(c, list) else [c] + for it in items: + txt = it.get("text") if isinstance(it, dict) else it + if not txt: + continue + try: + obj = json.loads(txt) + except (json.JSONDecodeError, TypeError): + continue + for prim in (obj if isinstance(obj, list) else [obj]): + if isinstance(prim, dict): + yield prim + + +def surfaced_files(stdout_path: Path) -> dict[str, dict]: + """Return ``{file: {best_rank, via, confidence}}`` for every file the graph + surfaced across all search_code calls in a run. ``best_rank`` is the best + primary rank (1-based) or None if only surfaced as a related sibling.""" + out: dict[str, dict] = {} + + def note(f: str, rank: int | None, via: str, conf: str | None): + rec = out.setdefault(f, {"best_rank": None, "via": via, "confidence": conf}) + if rank is not None and (rec["best_rank"] is None or rank < rec["best_rank"]): + rec["best_rank"] = rank + rec["via"] = via + rec["confidence"] = conf + elif rec["best_rank"] is None and via == "direct": + rec["via"] = "direct" + + rank = 0 + for prim in _iter_search_results(stdout_path): + f = prim.get("file") + if f: + if prim.get("rank_kind") == "related": + note(f, None, f"related:{prim.get('via', '?')}", prim.get("confidence")) + else: + rank += 1 + note(f, rank, "direct", None) + for rel in prim.get("likely_related_files", []) or []: + rf = rel.get("file") + if rf: + note(rf, None, f"related:{rel.get('via', '?')}", rel.get("confidence")) + return out + + +def classify_run(stdout_path: Path, gold_files: list[str], + pred_files: list[str], edit_critical: list[str] | None = None) -> dict: + """Exposure + adoption classification for a single code_graph run. + + ``edit_critical`` optionally restricts which gold files count toward the + candidate-level TP/FN (prereg sec7 relabel). Defaults to all gold files. 
+ """ + surf = surfaced_files(stdout_path) + pred = set(pred_files or []) + per_gold = {} + for g in gold_files: + rec = surf.get(g) + if rec is None: + exposure = "not_surfaced" + elif rec["best_rank"] is not None: + exposure = f"direct@{rec['best_rank']}" + else: + exposure = rec["via"] + per_gold[g] = { + "exposure": exposure, + "surfaced": rec is not None, + "adopted": g in pred, + } + n_gold = len(gold_files) + n_surf = sum(1 for v in per_gold.values() if v["surfaced"]) + n_surf_adopted = sum(1 for v in per_gold.values() if v["surfaced"] and v["adopted"]) + n_miss_not_surf = sum(1 for v in per_gold.values() if not v["surfaced"]) + + # Candidate-level confusion matrix over EVERY surfaced candidate (the agent's + # keep/drop DECISION quality, per prereg-adoption-calibration.md). A + # not-surfaced gold file is a RETRIEVAL miss, not a decision, so it is + # excluded here (it is already counted in n_not_surfaced above). + # TP = surfaced gold kept FN = surfaced gold dropped + # FP = surfaced non-gold kept TN = surfaced non-gold dropped + # NOTE: every gold file is treated as edit-critical until the relabel rubric + # (prereg sec7) supplies an ``incidental`` set; see ``edit_critical`` arg. + gold_set = set(gold_files) + crit = set(edit_critical) if edit_critical is not None else gold_set + tp = fp = fn = tn = 0 + cand_detail = {} + for f, rec in surf.items(): + kept = f in pred + is_gold = f in gold_set + # Only edit-critical gold counts toward TP/FN; incidental gold is excluded + # from the decision matrix (keeping or dropping it is not penalized). 
+ if is_gold and f not in crit: + cand_detail[f] = "incidental_gold" + continue + if is_gold: + label = "TP" if kept else "FN" + tp += kept + fn += not kept + else: + label = "FP" if kept else "TN" + fp += kept + tn += not kept + cand_detail[f] = label + + return { + "n_gold": n_gold, + "n_surfaced": n_surf, + "n_surfaced_adopted": n_surf_adopted, + "n_not_surfaced": n_miss_not_surf, + "per_gold": per_gold, + "cand": {"tp": tp, "fp": fp, "fn": fn, "tn": tn, "detail": cand_detail}, + } + + +def row_stdout_path(batch_root: Path, model: str, row: dict) -> Path | None: + """Locate the stdout.jsonl for a results row by its FULL identity. + + Uses (mode, prompt_mode, config, task_id, run_idx) so runs from different + prompt_modes (e.g. ``adopt-ctrl`` vs ``adopt-sem``) are never cross-wired -- + the previous glob-first-match over ``runs//*/*/code_graph`` could + classify a SEM row against a CTRL log when both coexist in one batch. + + Supports both the legacy layout (``/logs``) and the run-indexed layout + (``/run/logs``) introduced for multi-run pilots. + """ + mode = row.get("mode", "fix") + prompt_mode = row.get("prompt_mode", "neutral") + track = row.get("config") + task = row.get("task_id") + ridx = int(row.get("run_idx", 0) or 0) + base = batch_root / "runs" / model / mode / prompt_mode / track / task + for cand in ( + base / f"run{ridx}" / "logs" / "stdout.jsonl", + base / "logs" / "stdout.jsonl", + ): + if cand.exists(): + return cand + # Last resort: a single stdout under this exact (mode,prompt_mode,track,task) + # subtree. Still identity-scoped, so no cross-prompt-mode leakage. + hits = sorted(base.glob("**/stdout.jsonl")) + return hits[0] if hits else None + + +def analyze_batch( + results_path: Path, + *, + prompt_mode: str | None = None, + mode: str | None = None, +) -> dict: + """Analyze code_graph runs referenced by a results.jsonl. 
+ + Locates each run's log by FULL row identity (see ``row_stdout_path``) rather + than a first-match glob, so multiple prompt-mode arms in one batch are scored + against their OWN logs. Pass ``prompt_mode``/``mode`` to restrict to a single + arm (e.g. ``prompt_mode="adopt-sem"``). + """ + rows = [json.loads(ln) for ln in results_path.read_text().splitlines() if ln.strip()] + cg = [r for r in rows if r.get("config") == "code_graph"] + if mode is not None: + cg = [r for r in cg if r.get("mode") == mode] + if prompt_mode is not None: + cg = [r for r in cg if r.get("prompt_mode") == prompt_mode] + model = results_path.parent.name + batch_root = results_path.parent.parent + per_run = [] + for r in cg: + task = r.get("task_id") + stdout = row_stdout_path(batch_root, model, r) + if not stdout: + per_run.append({"task": task, "run_idx": r.get("run_idx"), + "prompt_mode": r.get("prompt_mode"), + "error": "stdout not found"}) + continue + cls = classify_run(stdout, r.get("gold_files", []), r.get("pred_files", [])) + cls["task"] = task + cls["run_idx"] = r.get("run_idx") + cls["prompt_mode"] = r.get("prompt_mode") + cls["file_recall"] = r.get("file_recall") + per_run.append(cls) + return {"per_run": per_run} + + +def _prf(tp: int, fp: int, fn: int) -> tuple[float | None, float | None, float | None]: + """Precision, recall, F1 from counts; None when the denominator is 0.""" + prec = tp / (tp + fp) if (tp + fp) else None + rec = tp / (tp + fn) if (tp + fn) else None + if prec is None or rec is None or (prec + rec) == 0: + f1 = None + else: + f1 = 2 * prec * rec / (prec + rec) + return prec, rec, f1 + + +def candidate_calibration(runs: list[dict]) -> dict: + """Macro (by task) + micro candidate-level precision/recall/F1. + + The unit of analysis is the TASK (files within a task are correlated). We sum + a task's candidate counts across its runs, compute per-task P/R/F1, then macro + average. Micro pools all candidates. Macro-F1 by task is the prereg PRIMARY. 
+ """ + by_task: dict[str, dict[str, int]] = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0, "tn": 0}) + for r in runs: + c = r.get("cand") + if not c: + continue + t = by_task[r["task"]] + for k in ("tp", "fp", "fn", "tn"): + t[k] += c[k] + + per_task = {} + precs, recs, f1s = [], [], [] + # macro_strict: a task where gold was SURFACED but ALL of it was dropped + # (tp=0, fn>0) is a real adoption FAILURE, not a degenerate task -- score it + # F1=0 instead of dropping it, so a conservative lever cannot inflate macro-F1 + # by silently removing the tasks it broke. Tasks with no surfaced gold at all + # (fn=0 and tp=0) remain genuinely undefined and stay dropped. + f1s_strict: list[float] = [] + n_dropped_undefined = 0 + n_dropped_gold_failures = 0 + for task, c in by_task.items(): + p, rc, f1 = _prf(c["tp"], c["fp"], c["fn"]) + per_task[task] = {**c, "precision": p, "recall": rc, "f1": f1} + if p is not None: + precs.append(p) + if rc is not None: + recs.append(rc) + if f1 is not None: + f1s.append(f1) + f1s_strict.append(f1) + elif c["tp"] == 0 and c["fn"] > 0: + # surfaced gold, none kept -> adoption failure -> strict F1 = 0 + f1s_strict.append(0.0) + n_dropped_gold_failures += 1 + else: + n_dropped_undefined += 1 + + tp = sum(c["tp"] for c in by_task.values()) + fp = sum(c["fp"] for c in by_task.values()) + fn = sum(c["fn"] for c in by_task.values()) + tn = sum(c["tn"] for c in by_task.values()) + mp, mr, mf1 = _prf(tp, fp, fn) + + def _avg(xs): + return sum(xs) / len(xs) if xs else None + + return { + "n_tasks": len(by_task), + "n_tasks_scored_f1": len(f1s), + "n_tasks_dropped_undefined": n_dropped_undefined, + "n_tasks_gold_dropped_failures": n_dropped_gold_failures, + "macro": {"precision": _avg(precs), "recall": _avg(recs), "f1": _avg(f1s)}, + "macro_strict": {"f1": _avg(f1s_strict), "n": len(f1s_strict)}, + "micro": {"tp": tp, "fp": fp, "fn": fn, "tn": tn, + "precision": mp, "recall": mr, "f1": mf1}, + "per_task": per_task, + } + + +def main() -> None: + ap 
= argparse.ArgumentParser(description=__doc__) + ap.add_argument("results", type=Path) + ap.add_argument("--json", type=Path) + args = ap.parse_args() + + out = analyze_batch(args.results) + runs = [r for r in out["per_run"] if "error" not in r] + + tot_gold = sum(r["n_gold"] for r in runs) + tot_surf = sum(r["n_surfaced"] for r in runs) + tot_surf_adopt = sum(r["n_surfaced_adopted"] for r in runs) + tot_not_surf = sum(r["n_not_surfaced"] for r in runs) + + print(f"code_graph runs analyzed: {len(runs)}") + print(f"\n{'task':34s} {'idx':3s} {'recall':6s} exposure -> adoption (per gold)") + for r in sorted(runs, key=lambda x: (x["task"], x["run_idx"] or 0)): + bits = [] + for g, v in r["per_gold"].items(): + tag = v["exposure"] + mark = "OK" if v["adopted"] else ("DROP" if v["surfaced"] else "MISS") + bits.append(f"{Path(g).name}:{tag}/{mark}") + print(f"{r['task']:34s} {str(r['run_idx']):3s} {r['file_recall']!s:6s} " + " ".join(bits)) + + print("\n==== AGGREGATE (gold-file level, over code_graph runs) ====") + print(f"total gold files (run x gold) : {tot_gold}") + print(f"surfaced by graph : {tot_surf} " + f"(exposure_recall = {tot_surf/tot_gold:.3f})") + print(f" of which adopted in final answer : {tot_surf_adopt} " + f"(adoption_rate = {tot_surf_adopt/tot_surf:.3f})" if tot_surf else "") + print(f" surfaced-but-DROPPED (adoption gap): {tot_surf - tot_surf_adopt}") + print(f"never surfaced (true retrieval miss): {tot_not_surf} " + f"({tot_not_surf/tot_gold:.3f})") + + cal = candidate_calibration(runs) + out["candidate_calibration"] = cal + mac, mic = cal["macro"], cal["micro"] + + def _f(x): + return f"{x:.3f}" if x is not None else " n/a" + + print("\n==== CANDIDATE-LEVEL CALIBRATION (keep/drop over surfaced candidates) ====") + print(" (TP surfaced-gold kept | FP non-gold kept | FN surfaced-gold dropped | TN non-gold dropped)") + print(f" micro: TP={mic['tp']} FP={mic['fp']} FN={mic['fn']} TN={mic['tn']} " + f"P={_f(mic['precision'])} R={_f(mic['recall'])} 
F1={_f(mic['f1'])}") + print(f" MACRO by task (PRIMARY, n_tasks={cal['n_tasks']}): " + f"P={_f(mac['precision'])} R={_f(mac['recall'])} F1={_f(mac['f1'])}") + + if args.json: + args.json.write_text(json.dumps(out, indent=2)) + print(f"\nwrote {args.json}") + + +if __name__ == "__main__": + main() diff --git a/bench/analysis/reader_capture.py b/bench/analysis/reader_capture.py new file mode 100644 index 00000000..4aede3fd --- /dev/null +++ b/bench/analysis/reader_capture.py @@ -0,0 +1,78 @@ +"""Capture verbatim ``search_code`` calls (query + full result objects) per run. + +This is the Stage-A "reader experiment" capture layer. Unlike +``exposure_adoption.surfaced_files`` (which flattens to a ``{file: rank}`` map), +here we keep the FULL, ORDERED, UNTRUNCATED result objects exactly as the agent +saw them, grouped per ``search_code`` call, together with the ``query`` argument +the agent passed. The reader harness re-annotates these captured objects via +``rel_explain.annotate_results`` (the EXACT production builder) so the offline +A/B exercises the real intervention, not a re-implementation. + +Join rule (same as exposure_adoption): in ``stdout.jsonl`` join +``tool.execution_start`` -> ``tool.execution_complete`` by ``toolCallId``. The +untruncated payload is under ``result.contents`` (a list, one clean JSON object +per text entry); fall back to ``result.content`` only when ``contents`` is +absent (single-result case). 
+""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + + +def _parse_result_objs(res: dict) -> list[dict]: + """Parse the list of primary result objects from a tool result payload.""" + items = res.get("contents") + if not isinstance(items, list): + c = res.get("content") + items = c if isinstance(c, list) else [c] + out: list[dict] = [] + for it in items: + txt = it.get("text") if isinstance(it, dict) else it + if not txt: + continue + try: + obj = json.loads(txt) + except (json.JSONDecodeError, TypeError): + continue + for prim in (obj if isinstance(obj, list) else [obj]): + if isinstance(prim, dict): + out.append(prim) + return out + + +def capture_search_calls(stdout_path: Path) -> list[dict[str, Any]]: + """Return ordered ``[{query, results:[...]}]`` for every search_code call. + + ``results`` are the verbatim primary objects (with their nested + ``likely_related_files``) the agent received for that call. + """ + starts: dict[str, dict] = {} + calls: list[dict[str, Any]] = [] + for line in stdout_path.read_text().splitlines(): + if not line.strip(): + continue + try: + ev = json.loads(line) + except json.JSONDecodeError: + continue + t = ev.get("type") + d = ev.get("data", {}) or {} + if t == "tool.execution_start": + name = d.get("toolName") or d.get("mcpToolName") or "" + if "search_code" in name: + starts[d.get("toolCallId")] = d + elif t == "tool.execution_complete": + start = starts.get(d.get("toolCallId")) + if start is None: + continue + query = (start.get("arguments") or {}).get("query") or "" + results = _parse_result_objs(d.get("result") or {}) + calls.append({ + "tool_call_id": d.get("toolCallId"), + "query": query, + "results": results, + }) + return calls diff --git a/bench/analysis/retrieval_probe.py b/bench/analysis/retrieval_probe.py new file mode 100644 index 00000000..5ff1c378 --- /dev/null +++ b/bench/analysis/retrieval_probe.py @@ -0,0 +1,287 @@ +"""Offline intrinsic-retrieval probe 
(Phase A fix #1 go/no-go). + +Question: is the gold file findable from the graph's CONTENT, and how much +better is a `description -> file` retriever than today's name-prefix interface? + +This isolates RETRIEVAL QUALITY from AGENT BEHAVIOR. No agent, no LLM, no API +tokens. For each of the 20 no-leak structural-hard SWE-bench instances we take +the problem statement as the query and rank the repo's files using retrievers +built directly over the already-indexed FalkorDB graph (port 6380), then score +recall@k / MRR against the gold (patched) files. + +Retriever arms: + - name_prefix : emulates current `auto_complete`/`find-symbol` interface -- + pull identifier-ish tokens from the issue, prefix-match symbol + names, rank files by # matching symbols. (current floor) + - bm25 : Okapi BM25 over per-file text (path + symbol names + bodies). + - tfidf : TF-IDF cosine over the same per-file text. + +All retrievers are pure-numpy, deterministic, $0. BM25/TF-IDF are the proxy for +the candidate `search_semantic` primitive (production could use embeddings for +additional lift; lexical already establishes the ceiling/floor gap). 
+""" + +from __future__ import annotations + +import math +import re +import sys +from collections import Counter, defaultdict + +import numpy as np +import redis + +from bench.datasets import swe_bench + +FALKOR_PORT = 6380 +GRAPH_FMT = "code:{task}__loc:_default" +KS = (1, 3, 5, 10) +PER_FILE_BODY_TOKEN_CAP = 4000 # cap body tokens contributed per file +MIN_PREFIX_LEN = 4 # identifier length floor for name_prefix arm + +_word_re = re.compile(r"[A-Za-z_][A-Za-z0-9_]*") +_camel_re = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z0-9]+|[A-Z]+") + + +def subtokens(ident: str) -> list[str]: + """Split an identifier into lowercased camelCase/snake_case subtokens + AND keep the whole lowercased identifier.""" + out: list[str] = [] + for part in ident.split("_"): + if not part: + continue + out.extend(m.group(0).lower() for m in _camel_re.finditer(part)) + out.append(ident.lower()) + return [t for t in out if t] + + +def tokenize(text: str) -> list[str]: + toks: list[str] = [] + for m in _word_re.finditer(text or ""): + toks.extend(subtokens(m.group(0))) + return toks + + +def issue_identifiers(text: str) -> list[str]: + """Candidate code symbols an agent would prefix-search: backticked names, + dotted paths, and CamelCase / snake_case identifiers in the issue.""" + cands: set[str] = set() + for m in re.finditer(r"`([^`]+)`", text or ""): + for ident in _word_re.findall(m.group(1)): + if len(ident) >= MIN_PREFIX_LEN: + cands.add(ident) + for ident in _word_re.findall(text or ""): + if len(ident) >= MIN_PREFIX_LEN and ( + "_" in ident or re.search(r"[a-z][A-Z]", ident) or ident[0].isupper() + ): + cands.add(ident) + return sorted(cands) + + +def fetch_graph(r: redis.Redis, task: str): + """Return (files: list[relpath], file_text: {relpath: token list}, + symbols: list[(name, relpath)]). 
relpath is repo-relative.""" + g = GRAPH_FMT.format(task=task) + split_key = f"{task}__loc/" + + def rel(p: str) -> str: + return p.split(split_key, 1)[-1] if split_key in p else p + + files: list[str] = [] + bodytok: dict[str, list[str]] = defaultdict(list) + symbols: list[tuple[str, str]] = [] + + res = r.execute_command("GRAPH.QUERY", g, "MATCH (f:File) RETURN f.path") + for row in res[1]: + rp = rel(row[0]) + files.append(rp) + bodytok[rp].extend(tokenize(rp.replace("/", " "))) + + # Symbols: name + body (doc). Bodies are source; cap per-file contribution. + q = "MATCH (n) WHERE n:Function OR n:Class RETURN n.name, n.path, n.doc" + res = r.execute_command("GRAPH.QUERY", g, q) + bodycount: Counter = Counter() + for name, path, doc in res[1]: + if not path: + continue + rp = rel(path) + if name: + symbols.append((name, rp)) + bodytok[rp].extend(subtokens(name)) + if doc and bodycount[rp] < PER_FILE_BODY_TOKEN_CAP: + toks = tokenize(doc) + take = toks[: PER_FILE_BODY_TOKEN_CAP - bodycount[rp]] + bodytok[rp].extend(take) + bodycount[rp] += len(take) + + files = sorted(set(files)) + return files, bodytok, symbols + + +# ---------- retrievers: return ranked list of relpaths ---------- + +def rank_name_prefix(query: str, files, bodytok, symbols) -> list[str]: + cands = [c.lower() for c in issue_identifiers(query)] + by_file: Counter = Counter() + # prefix match against symbol names (emulates auto_complete prefix search) + names = [(n.lower(), f) for n, f in symbols] + for c in cands: + for nl, f in names: + if nl.startswith(c): + by_file[f] += 1 + return [f for f, _ in by_file.most_common()] + + +def _build_index(files, bodytok): + docs = [bodytok[f] for f in files] + df: Counter = Counter() + for d in docs: + for t in set(d): + df[t] += 1 + return docs, df + + +def rank_bm25(query, files, bodytok, symbols, k1=1.5, b=0.75) -> list[str]: + docs, df = _build_index(files, bodytok) + N = len(docs) + if N == 0: + return [] + avgdl = sum(len(d) for d in docs) / N or 1.0 + 
idf = {t: math.log(1 + (N - n + 0.5) / (n + 0.5)) for t, n in df.items()} + qtok = set(tokenize(query)) + scores = np.zeros(N) + for i, d in enumerate(docs): + if not d: + continue + tf = Counter(d) + dl = len(d) + s = 0.0 + for t in qtok: + f = tf.get(t) + if not f: + continue + s += idf.get(t, 0.0) * (f * (k1 + 1)) / (f + k1 * (1 - b + b * dl / avgdl)) + scores[i] = s + order = np.argsort(-scores) + return [files[i] for i in order if scores[i] > 0] + + +def rank_tfidf(query, files, bodytok, symbols) -> list[str]: + docs, df = _build_index(files, bodytok) + N = len(docs) + if N == 0: + return [] + vocab = {t: j for j, t in enumerate(df)} + idf = np.array([math.log((1 + N) / (1 + df[t])) + 1 for t in vocab]) + rows = [] + for d in docs: + v = np.zeros(len(vocab)) + if d: + tf = Counter(d) + for t, c in tf.items(): + j = vocab.get(t) + if j is not None: + v[j] = c / len(d) + v *= idf + nrm = np.linalg.norm(v) + rows.append(v / nrm if nrm else v) + mat = np.array(rows) + qv = np.zeros(len(vocab)) + qtf = Counter(tokenize(query)) + for t, c in qtf.items(): + j = vocab.get(t) + if j is not None: + qv[j] = c + qv *= idf + qn = np.linalg.norm(qv) + if qn: + qv /= qn + scores = mat @ qv + order = np.argsort(-scores) + return [files[i] for i in order if scores[i] > 0] + + +RETRIEVERS = { + "name_prefix": rank_name_prefix, + "bm25": rank_bm25, + "tfidf": rank_tfidf, +} + + +def score(ranked: list[str], gold: list[str]): + goldset = set(gold) + pos = {f: i for i, f in enumerate(ranked)} + ranks = [pos[g] + 1 for g in gold if g in pos] + out = {} + for k in KS: + topk = set(ranked[:k]) + out[f"recall@{k}"] = len(topk & goldset) / len(goldset) if goldset else 0.0 + out[f"hit@{k}"] = 1.0 if (topk & goldset) else 0.0 + out["mrr"] = 1.0 / min(ranks) if ranks else 0.0 + out["gold_best_rank"] = min(ranks) if ranks else None + out["gold_found"] = sum(1 for g in gold if g in pos) + out["n_gold"] = len(gold) + out["n_files"] = len(ranked) + return out + + +def main(): + insts = 
swe_bench.load_instances() + sel = swe_bench.select_structural(insts, n=20, no_leak=True) + r = redis.Redis(host="localhost", port=FALKOR_PORT, decode_responses=True) + + agg: dict[str, list[dict]] = {a: [] for a in RETRIEVERS} + per_task = [] + for inst in sel: + task = inst.instance_id + gold = swe_bench.gold_changed_files(inst.patch, source_only=True) + gold = [g for g in gold if g.endswith(".py")] + try: + files, bodytok, symbols = fetch_graph(r, task) + except Exception as e: # noqa: BLE001 + print(f"!! {task}: graph fetch failed: {e}", file=sys.stderr) + continue + gold_in_graph = sum(1 for g in gold if g in set(files)) + row = {"task": task, "n_gold": len(gold), "gold_in_graph": gold_in_graph, + "n_files": len(files)} + for arm, fn in RETRIEVERS.items(): + ranked = fn(inst.problem_statement, files, bodytok, symbols) + sc = score(ranked, gold) + agg[arm].append(sc) + row[arm] = sc + per_task.append(row) + gp = " ".join( + f"{a}:R@5={row[a]['recall@5']:.2f}/rk={row[a]['gold_best_rank']}" + for a in RETRIEVERS + ) + print(f"{task:38s} gold={len(gold)} in_graph={gold_in_graph}/{len(gold)} " + f"files={len(files):5d} | {gp}") + + print("\n================ AGGREGATE (n={}) ================".format(len(per_task))) + hdr = f"{'arm':12s}" + for k in KS: + hdr += f" R@{k:<4}" + for k in KS: + hdr += f" hit@{k:<2}" + hdr += " MRR" + print(hdr) + for arm in RETRIEVERS: + rows = agg[arm] + line = f"{arm:12s}" + for k in KS: + line += f" {np.mean([x[f'recall@{k}'] for x in rows]):.3f}" + for k in KS: + line += f" {np.mean([x[f'hit@{k}'] for x in rows]):.3f}" + line += f" {np.mean([x['mrr'] for x in rows]):.3f}" + print(line) + + tot_gold = sum(p["n_gold"] for p in per_task) + tot_in = sum(p["gold_in_graph"] for p in per_task) + print(f"\ngold-file coverage in graph: {tot_in}/{tot_gold} " + f"({100*tot_in/tot_gold:.1f}%)") + print("\nReference (live agent, Phase A Sonnet): no_mcp recall=0.613 " + "acc@3=0.95 MRR=0.875 ; code_graph recall=0.512 MRR=0.800") + + +if __name__ 
== "__main__": + main() diff --git a/bench/analysis/trace.py b/bench/analysis/trace.py new file mode 100644 index 00000000..ce1cc64b --- /dev/null +++ b/bench/analysis/trace.py @@ -0,0 +1,772 @@ +"""Trajectory trace extractor for Copilot benchmark runs. + +Each benchmark run persists the full Copilot event stream to +``/logs/stdout.jsonl``. The runner itself only derives scalar counts +from it. This module reconstructs the *decision loop* so we can analyse what +the agent did rather than guess: + + (tool_name, arguments) -> (success, result_content, size, empty?) + -> (assistant reasoning/message that followed) + +It emits, per run: + * ``trace.jsonl`` -- one JSON object per tool step (machine-readable) + * ``trace.md`` -- a readable timeline (human review) + * a ``summary`` dict -- derived behaviour signals + per-file *attribution* + (did a structural tool actually surface each correctly-predicted file, or + did it come from the prompt / a builtin view-grep / the model's own prior?) + +Standalone & post-hoc: it reads an existing ``run_dir`` (and the matching row +in ``results.jsonl`` for gold/pred), so it works on runs already on disk and +can also be wired into the runner for future runs. + +Usage: + python -m bench.analysis.trace [ ...] + python -m bench.analysis.trace --cache-dir bench/cache/phaseB-levers \ + --model claude-sonnet-4.6 [--mode localize] +""" + +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path +from typing import Any, Iterable, Optional + +# Tool-name classification -------------------------------------------------- +# MCP code-graph tools surface as ``code-graph-``; the LSP arm (when +# built) will surface as ``lsp-``. Builtin agent tools are everything +# else the CLI ships (view/str_replace/grep/glob/bash/report_intent/...). 
+GRAPH_PREFIX = "code-graph" +LSP_PREFIX = "lsp" +BUILTIN_READERS = {"view", "read", "cat", "grep", "glob", "search", "ripgrep"} +LOCALIZE_SENTINEL = "FINAL_LOCALIZATION_JSON:" + +# Result payloads can be huge (whole file slices). Cap what we inline into the +# readable/structured trace; keep enough to see what the agent actually saw. +_RESULT_CHARS_MD = 800 +_RESULT_CHARS_JSONL = 4000 +_ARGS_CHARS = 600 +_REACTION_CHARS_MD = 600 + + +def _tool_kind(name: str) -> str: + if not name: + return "unknown" + if name.startswith(GRAPH_PREFIX): + return "graph" + if name.startswith(LSP_PREFIX): + return "lsp" + base = name.split("-")[-1].lower() + if base in BUILTIN_READERS: + return "builtin_reader" + return "builtin_other" + + +def _est_tokens(text: str) -> int: + """Cheap token estimate (~4 chars/token) for result-size accounting.""" + return (len(text) + 3) // 4 if text else 0 + + +def _load_events(stdout_path: Path) -> list[dict[str, Any]]: + events: list[dict[str, Any]] = [] + if not stdout_path.exists(): + return events + with stdout_path.open() as f: + for line in f: + line = line.strip() + if not line: + continue + try: + events.append(json.loads(line)) + except json.JSONDecodeError: + continue + return events + + +def _result_to_text(result: Any) -> str: + """Flatten a tool.execution_complete ``result`` into displayable text.""" + if result is None: + return "" + if isinstance(result, str): + return result + if isinstance(result, dict): + for key in ("content", "detailedContent"): + val = result.get(key) + if isinstance(val, str) and val.strip(): + return val + # Fall back to a compact JSON dump of the whole result object. 
+ try: + return json.dumps(result, ensure_ascii=False) + except (TypeError, ValueError): + return str(result) + return str(result) + + +def _is_empty_result(text: str) -> bool: + t = text.strip() + return t in ("", "{}", "[]", '{"result":[]}', '{"result": []}', "null") + + +def build_steps(events: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Reconstruct ordered tool steps, each paired with the agent's reaction. + + A *step* = one ``tool.execution_start`` matched (by toolCallId) to its + ``tool.execution_complete``, annotated with the assistant reasoning/message + text that streamed *after* that completion and *before* the next tool + started (the agent's reaction to the tool's output). + """ + # Index completions by toolCallId for O(1) pairing. + completions: dict[str, dict[str, Any]] = {} + for ev in events: + if ev.get("type") == "tool.execution_complete": + d = ev.get("data", {}) + cid = d.get("toolCallId") + if cid: + completions[cid] = d + + steps: list[dict[str, Any]] = [] + # First pass: collect starts in order with their event index. + starts: list[tuple[int, dict[str, Any]]] = [] + for i, ev in enumerate(events): + if ev.get("type") == "tool.execution_start": + starts.append((i, ev)) + + for step_idx, (ev_idx, ev) in enumerate(starts): + d = ev.get("data", {}) + cid = d.get("toolCallId") + name = d.get("toolName") or d.get("name") or "unknown" + comp = completions.get(cid, {}) + result_text = _result_to_text(comp.get("result")) + + # Thoughts BEFORE this tool call = assistant reasoning/message that + # streamed AFTER the previous tool start (or stream start for step 0) + # and BEFORE this tool's start. This is the chain-of-thought that led + # to this action. Within one assistant turn the model emits one + # reasoning + one message block then fires N tool starts, so the + # before-block attaches to the FIRST tool of the turn; siblings get + # empty before-blocks (honest — the thought happened once). 
def final_blocks(events: list[dict[str, Any]]) -> dict[str, str]:
    """Collect the assistant's thinking and narration after the last tool call.

    This is the agent's closing reasoning and final answer (e.g. the
    ``FINAL_LOCALIZATION_JSON:`` payload), which streams after the last
    ``tool.execution_start`` event. When the stream contains no tool call at
    all, the whole stream is scanned.

    Args:
        events: The ordered event stream of one run.

    Returns:
        ``{"thinking": ..., "narration": ...}`` where each value is the
        newline-joined, stripped ``content`` of the trailing
        ``assistant.reasoning`` / ``assistant.message`` events. Either value
        is ``""`` when there is nothing to report.
    """
    last_start = -1
    for idx, event in enumerate(events):
        if event.get("type") == "tool.execution_start":
            last_start = idx

    thinking: list[str] = []
    narration: list[str] = []
    for event in events[last_start + 1:]:
        kind = event.get("type")
        if kind == "assistant.reasoning":
            sink = thinking
        elif kind == "assistant.message":
            sink = narration
        else:
            continue
        # ``data`` may be missing or an explicit JSON null; neither may crash
        # the extraction.
        data = event.get("data")
        content = data.get("content") if isinstance(data, dict) else None
        if isinstance(content, str) and content.strip():
            sink.append(content.strip())

    return {
        "thinking": "\n".join(thinking),
        "narration": "\n".join(narration),
    }
def _structural_surface_map(text: str) -> dict[str, str]:
    """Describe how each file in a structural result came to be listed.

    The returned mapping is keyed by both the full path and its basename, so
    a caller can look a file up either way. The value is ``"direct"`` when
    the file is a ranked hit (a top-level object whose ``score`` is numeric).
    Otherwise it is the ``via`` label of the first ``likely_related_files``
    entry that named the file, or ``"edge"`` if that entry has no label.
    Top-level objects without a numeric score are not ranked hits, and a
    ranked hit overrides any edge label recorded for the same key.
    """
    edge_via: dict[str, str] = {}
    ranked: set[str] = set()

    for entry in _iter_json_objects(text):
        siblings = entry.get("likely_related_files")
        for sibling in siblings if isinstance(siblings, list) else ():
            if not isinstance(sibling, dict):
                continue
            sibling_path = sibling.get("file")
            if not (isinstance(sibling_path, str) and sibling_path):
                continue
            label = str(sibling.get("via") or "edge")
            # First label seen for a key is the one that sticks.
            edge_via.setdefault(sibling_path, label)
            edge_via.setdefault(os.path.basename(sibling_path), label)

        own_path = entry.get("file")
        if not (isinstance(own_path, str) and own_path):
            continue
        # Score-less top-level entries are re-emitted edge siblings, not hits.
        if isinstance(entry.get("score"), (int, float)):
            ranked.update((own_path, os.path.basename(own_path)))

    surfaces: dict[str, str] = dict(edge_via)
    surfaces.update(dict.fromkeys(ranked, "direct"))
    return surfaces
def summarize(steps: list[dict[str, Any]], attribution: list[dict[str, Any]]) -> dict[str, Any]:
    """Roll the reconstructed steps and file attribution up into run signals.

    Counts tool calls per tool name and per kind, empty results, failed calls
    (``success is False``) and redundant calls (same tool with the same
    arguments as an earlier step). Records the first tool used and whether it
    was structural (``graph``/``lsp``). Tallies where gold-hit files
    surfaced, and attaches the cost-without-benefit report.
    """
    structural_kinds = ("graph", "lsp")
    calls_per_tool: dict[str, int] = {}
    calls_per_kind: dict[str, int] = {}
    signatures: set[str] = set()
    n_empty = n_errors = n_redundant = 0
    first_tool: Optional[str] = None
    structural_first: Optional[bool] = None

    for step in steps:
        tool = step["tool"]
        calls_per_tool[tool] = calls_per_tool.get(tool, 0) + 1
        kind = step["kind"]
        calls_per_kind[kind] = calls_per_kind.get(kind, 0) + 1
        if step.get("empty"):
            n_empty += 1
        if step.get("success") is False:
            n_errors += 1
        # Two calls are redundant when tool and (key-sorted) arguments match.
        signature = f"{tool}:{json.dumps(step.get('arguments'), sort_keys=True)}"
        if signature in signatures:
            n_redundant += 1
        else:
            signatures.add(signature)
        if first_tool is None:
            first_tool, structural_first = tool, kind in structural_kinds

    n_structural = sum(calls_per_kind.get(kind, 0) for kind in structural_kinds)

    # Only correctly predicted (gold-hit) files take part in the rollup.
    gold_sources: dict[str, int] = {}
    gold_via: dict[str, int] = {}
    for entry in attribution:
        if not entry["is_gold_hit"]:
            continue
        origin = entry["source"]
        gold_sources[origin] = gold_sources.get(origin, 0) + 1
        if origin in structural_kinds:
            label = f"{origin}:{entry.get('via') or 'direct'}"
            gold_via[label] = gold_via.get(label, 0) + 1

    wasted = _cost_without_benefit(steps, attribution)

    return {
        "tool_calls_total": len(steps),
        "tool_calls_by_name": calls_per_tool,
        "tool_calls_by_kind": calls_per_kind,
        "structural_calls": n_structural,
        "structural_adopted": n_structural > 0,
        "first_tool": first_tool,
        "structural_first": structural_first,
        "empty_result_count": n_empty,
        "tool_error_count": n_errors,
        "redundant_call_count": n_redundant,
        "gold_hit_source_counts": gold_sources,
        "gold_hit_via_counts": gold_via,
        "cost_without_benefit": wasted,
    }
+ """ + beneficial_steps = { + a["source_step"] + for a in attribution + if a.get("is_gold_hit") + and a.get("source") in _TESTED_KINDS + and a.get("source_step") is not None + } + by_kind: dict[str, dict[str, int]] = {} + total_tokens = 0 + beneficial_tokens = 0 + wasted_tokens = 0 + wasted_calls = 0 + for s in steps: + kind = s.get("kind") + if kind not in _TESTED_KINDS: + continue + tok = int(s.get("result_tokens_est") or 0) + slot = by_kind.setdefault( + kind, {"calls": 0, "tokens": 0, "wasted_calls": 0, "wasted_tokens": 0} + ) + slot["calls"] += 1 + slot["tokens"] += tok + total_tokens += tok + if s.get("step") in beneficial_steps: + beneficial_tokens += tok + else: + wasted_tokens += tok + wasted_calls += 1 + slot["wasted_calls"] += 1 + slot["wasted_tokens"] += tok + return { + "tested_kinds": [k for k in _TESTED_KINDS if k in by_kind], + "structural_result_tokens": total_tokens, + "beneficial_tokens": beneficial_tokens, + "wasted_tokens": wasted_tokens, + "wasted_calls": wasted_calls, + "wasted_fraction": round(wasted_tokens / total_tokens, 4) if total_tokens else None, + "benefited": bool(beneficial_steps), + "by_kind": by_kind, + } + + +def _fmt_block(text: str, cap: int) -> str: + if not text: + return "(none)" + t = text.strip() + if len(t) > cap: + t = t[:cap] + f"\n… [+{len(t) - cap} chars truncated]" + return t + + +def render_md(meta: dict[str, Any], steps: list[dict[str, Any]], + attribution: list[dict[str, Any]], summary: dict[str, Any], + final: Optional[dict[str, str]] = None) -> str: + lines: list[str] = [] + lines.append(f"# Trace — {meta.get('task_id')} [{meta.get('config')}] ({meta.get('prompt_mode')})") + lines.append("") + lines.append(f"- model: {meta.get('model')} mode: {meta.get('mode')} run_idx: {meta.get('run_idx')}") + lines.append(f"- outcome: {meta.get('outcome')} recall: {meta.get('file_recall')} " + f"precision: {meta.get('file_precision')} acc@1: {meta.get('acc_at_1')} mrr: {meta.get('file_mrr')}") + _rt = 
meta.get("reasoning_tokens") + _rt_str = f" (of which reasoning: {_rt})" if _rt not in (None, 0) else "" + lines.append(f"- tokens: in={meta.get('input_tokens')} out={meta.get('output_tokens')}{_rt_str} " + f"total={meta.get('total_tokens')} turns≈{meta.get('usage_blocks')} wall={meta.get('wall_clock_sec')}s") + lines.append(f"- gold: {meta.get('gold_files')}") + lines.append(f"- pred: {meta.get('pred_files')}") + lines.append("") + lines.append("## Behaviour summary") + lines.append(f"- tool calls: {summary['tool_calls_total']} by kind: {summary['tool_calls_by_kind']}") + lines.append(f"- structural adopted: {summary['structural_adopted']} " + f"structural calls: {summary['structural_calls']} first tool: {summary['first_tool']}") + lines.append(f"- empty results: {summary['empty_result_count']} errors: {summary['tool_error_count']} " + f"redundant calls: {summary['redundant_call_count']}") + lines.append(f"- **gold-hit attribution**: {summary['gold_hit_source_counts'] or '(no gold hits)'}") + via_counts = summary.get("gold_hit_via_counts") + if via_counts: + lines.append(f"- **structural gold-hits by surface**: {via_counts} " + f"(direct = ranked hit; co_override/other = edge-derived)") + cwb = summary.get("cost_without_benefit") + if cwb and cwb.get("structural_result_tokens"): + frac = cwb.get("wasted_fraction") + frac_str = f"{frac:.0%}" if frac is not None else "n/a" + benefit_str = "yes" if cwb.get("benefited") else "**NO — tool contributed nothing**" + lines.append( + f"- **cost without benefit**: wasted ~{cwb['wasted_tokens']} of " + f"{cwb['structural_result_tokens']} structural tokens ({frac_str}) " + f"across {cwb['wasted_calls']} call(s); benefited: {benefit_str}" + ) + for kind, slot in (cwb.get("by_kind") or {}).items(): + lines.append( + f" - {kind}: {slot['wasted_calls']}/{slot['calls']} calls wasted, " + f"~{slot['wasted_tokens']}/{slot['tokens']} tok wasted" + ) + lines.append("") + lines.append("## Predicted-file attribution") + for a in 
attribution: + tag = "✓gold" if a["is_gold_hit"] else " miss" + where = a["source"] + if a["source"] in ("graph", "lsp") and a.get("via"): + where += f"/{a['via']}" + if a["source_tool"]: + where += f" (step {a['source_step']} {a['source_tool']})" + lines.append(f"- [{tag}] {a['file']} ← {where}") + lines.append("") + lines.append("## Step-by-step trajectory") + lines.append("") + lines.append("_Each step shows the agent's thinking and narration **before** the " + "tool call (the reasoning that led to the action), then the call and " + "its result._") + lines.append("") + for s in steps: + args = _fmt_block(json.dumps(s.get("arguments"), ensure_ascii=False), _ARGS_CHARS) + flags = [] + if s.get("empty"): + flags.append("EMPTY") + if s.get("success") is False: + flags.append("ERROR") + flag_str = (" [" + ",".join(flags) + "]") if flags else "" + lines.append(f"### Step {s['step']} · turn {s['turn']} · {s['tool']} ({s['kind']}){flag_str}") + thinking = s.get("thinking_before", "") + narration = s.get("narration_before", "") + if thinking: + lines.append(f"**thinking (before call):** {_fmt_block(thinking, _REACTION_CHARS_MD)}") + if narration: + lines.append(f"**narration (before call):** {_fmt_block(narration, _REACTION_CHARS_MD)}") + lines.append(f"**call:** `{args}`") + lines.append(f"**tool returned** ({s['result_chars']} chars, ~{s['result_tokens_est']} tok):") + lines.append("```") + lines.append(_fmt_block(s.get("result_text", ""), _RESULT_CHARS_MD)) + lines.append("```") + lines.append("") + if final and (final.get("thinking") or final.get("narration")): + lines.append("## Final (after last tool call)") + if final.get("thinking"): + lines.append(f"**thinking:** {_fmt_block(final['thinking'], _RESULT_CHARS_MD)}") + if final.get("narration"): + lines.append(f"**narration / answer:** {_fmt_block(final['narration'], _RESULT_CHARS_MD)}") + lines.append("") + return "\n".join(lines) + + +def extract_run(run_dir: Path, row: Optional[dict[str, Any]] = None, + write: 
bool = True) -> dict[str, Any]: + """Extract trace + summary for a single run directory. + + ``row`` is the matching results.jsonl record (for gold/pred/tokens). If + omitted, gold/pred attribution falls back to empty lists but the + trajectory + behaviour summary are still produced. + """ + stdout_path = run_dir / "logs" / "stdout.jsonl" + prompt_path = run_dir / "prompt.txt" + events = _load_events(stdout_path) + steps = build_steps(events) + final = final_blocks(events) + prompt_text = prompt_path.read_text() if prompt_path.exists() else "" + + row = row or {} + pred_files = row.get("pred_files") or [] + gold_files = row.get("gold_files") or [] + attribution = attribute_files(pred_files, gold_files, steps, prompt_text) + summary = summarize(steps, attribution) + + meta = { + "task_id": row.get("task_id"), + "config": row.get("config"), + "model": row.get("model"), + "mode": row.get("mode"), + "prompt_mode": row.get("prompt_mode"), + "run_idx": row.get("run_idx"), + "outcome": row.get("outcome"), + "file_recall": row.get("file_recall"), + "file_precision": row.get("file_precision"), + "acc_at_1": row.get("acc_at_1"), + "file_mrr": row.get("file_mrr"), + "input_tokens": row.get("input_tokens"), + "output_tokens": row.get("output_tokens"), + "reasoning_tokens": row.get("reasoning_tokens"), + "total_tokens": row.get("total_tokens"), + "usage_blocks": row.get("usage_blocks"), + "wall_clock_sec": row.get("wall_clock_sec"), + "gold_files": gold_files, + "pred_files": pred_files, + } + + if write: + with (run_dir / "trace.jsonl").open("w") as f: + f.write(json.dumps({"_meta": meta, "_summary": summary, + "_attribution": attribution, "_final": final}) + "\n") + for s in steps: + rec = dict(s) + # Cap inline result text in the structured file too. 
+ rt = rec.get("result_text", "") + if len(rt) > _RESULT_CHARS_JSONL: + rec["result_text"] = rt[:_RESULT_CHARS_JSONL] + rec["result_truncated"] = True + f.write(json.dumps(rec, ensure_ascii=False) + "\n") + (run_dir / "trace.md").write_text(render_md(meta, steps, attribution, summary, final)) + + return {"meta": meta, "summary": summary, "attribution": attribution, + "steps": steps, "final": final, "run_dir": str(run_dir)} + + +# Discovery ----------------------------------------------------------------- + + +def _load_rows(results_path: Path) -> dict[tuple, dict[str, Any]]: + """Index results.jsonl rows by (task_id, config, prompt_mode, run_idx).""" + rows: dict[tuple, dict[str, Any]] = {} + if not results_path.exists(): + return rows + for line in results_path.read_text().splitlines(): + line = line.strip() + if not line: + continue + try: + r = json.loads(line) + except json.JSONDecodeError: + continue + key = (r.get("task_id"), r.get("config"), r.get("prompt_mode", "neutral"), + int(r.get("run_idx", 0))) + rows[key] = r + return rows + + +def iter_run_dirs(cache_dir: Path, model: str, mode: Optional[str] = None) -> Iterable[Path]: + """Yield every run dir under ``cache_dir/runs/model[/mode]``.""" + base = cache_dir / "runs" / model + if not base.exists(): + return + modes = [mode] if mode else [p.name for p in base.iterdir() if p.is_dir()] + for m in modes: + mdir = base / m + if not mdir.is_dir(): + continue + for prompt_mode_dir in mdir.iterdir(): + if not prompt_mode_dir.is_dir(): + continue + for track_dir in prompt_mode_dir.iterdir(): + if not track_dir.is_dir(): + continue + for inst_dir in track_dir.iterdir(): + if (inst_dir / "logs" / "stdout.jsonl").exists(): + yield inst_dir + + +def _row_for_run(run_dir: Path, rows: dict[tuple, dict[str, Any]]) -> Optional[dict[str, Any]]: + # Path layout: .../runs///// + parts = run_dir.parts + try: + task_id = parts[-1] + track = parts[-2] + prompt_mode = parts[-3] + except IndexError: + return None + for run_idx 
in range(0, 8): + key = (task_id, track, prompt_mode, run_idx) + if key in rows: + return rows[key] + # Fallback: match on task_id+config only. + for (t, c, _pm, _ri), r in rows.items(): + if t == task_id and c == track: + return r + return None + + +def _auto_results_path(run_dir: Path) -> Optional[Path]: + """Locate the results.jsonl for a positional run dir. + + Layout is ``/runs/////`` + and results live at ``//results.jsonl``. Walk up to the + ``runs`` anchor, recover ```` and ````, and return that path + if it exists. This means a bare ``trace.py `` still joins gold/pred + (otherwise attribution runs blind and falsely reports "contributed nothing"). + """ + parts = run_dir.parts + if "runs" not in parts: + return None + ri = len(parts) - 1 - parts[::-1].index("runs") + if ri + 1 >= len(parts): + return None + cache_dir = Path(*parts[:ri]) if ri > 0 else Path(parts[0]) + model = parts[ri + 1] + candidate = cache_dir / model / "results.jsonl" + return candidate if candidate.exists() else None + + +def main(argv: Optional[list[str]] = None) -> int: + p = argparse.ArgumentParser(description="Extract decision-loop traces from benchmark runs.") + p.add_argument("run_dirs", nargs="*", help="explicit run dir(s) to extract") + p.add_argument("--cache-dir", help="extract every run under this cache dir") + p.add_argument("--model", default="claude-sonnet-4.6") + p.add_argument("--mode", default=None, help="restrict to one mode (e.g. 
localize)") + p.add_argument("--results", default=None, help="results.jsonl (default: //results.jsonl)") + args = p.parse_args(argv) + + targets: list[tuple[Path, Optional[dict[str, Any]]]] = [] + if args.cache_dir: + cache_dir = Path(args.cache_dir).resolve() + results_path = Path(args.results) if args.results else cache_dir / args.model / "results.jsonl" + rows = _load_rows(results_path) + for rd in iter_run_dirs(cache_dir, args.model, args.mode): + targets.append((rd, _row_for_run(rd, rows))) + for rd in args.run_dirs: + rdp = Path(rd).resolve() + row = None + results_path = Path(args.results) if args.results else _auto_results_path(rdp) + if results_path: + row = _row_for_run(rdp, _load_rows(results_path)) + targets.append((rdp, row)) + + if not targets: + print("no run dirs found") + return 1 + + print(f"extracting {len(targets)} run(s)…") + for rd, row in targets: + out = extract_run(rd, row=row, write=True) + s = out["summary"] + m = out["meta"] + print(f" {m.get('task_id')} [{m.get('config')}/{m.get('prompt_mode')}] " + f"recall={m.get('file_recall')} tools={s['tool_calls_total']} " + f"kinds={s['tool_calls_by_kind']} empty={s['empty_result_count']} " + f"err={s['tool_error_count']} hit_src={s['gold_hit_source_counts']} " + f"-> {rd}/trace.md") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/bench/cli/cg-mcp b/bench/cli/cg-mcp new file mode 100755 index 00000000..be6c09bb --- /dev/null +++ b/bench/cli/cg-mcp @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Bash-callable entry point for the code-graph MCP CLI. Mirrors `cg` +# but speaks JSON-RPC over stdio to a spawned `cgraph-mcp` server +# instead of HTTP to the FastAPI service. Runner adds bench/cli to PATH. 
+exec "${BENCH_PYTHON:-python3}" -m bench.cli.cg_mcp "$@" diff --git a/bench/cli/cg_mcp.py b/bench/cli/cg_mcp.py new file mode 100644 index 00000000..95c91390 --- /dev/null +++ b/bench/cli/cg_mcp.py @@ -0,0 +1,140 @@ +"""`cg-mcp` — bash-callable CLI exposing code-graph's 8 MCP tools. + +This is the MCP-transport sibling of `cg`. Where `cg` calls the host +FastAPI service over HTTP, `cg-mcp` spawns the `cgraph-mcp` stdio +server (via the official MCP Python SDK) for every invocation and +dispatches one tool call. + +The MCP track is what external agents (Claude Code, Cursor, …) use +in production; benchmarking through it tells us how the *real-world* +integration behaves under SWE-bench, not just the in-process FastAPI +adapter. + +Subcommands mirror the MCP tool names: + + cg-mcp index_repo --path-or-url . [--branch B] [--ignore PAT ...] + cg-mcp search_code --project P --prefix STR [--branch B] [--limit N] + cg-mcp get_callers --project P --symbol-id ID [--branch B] [--limit N] + cg-mcp get_callees --project P --symbol-id ID [--branch B] [--limit N] + cg-mcp get_dependencies --project P --symbol-id ID [--branch B] [--limit N] + cg-mcp impact_analysis --project P --symbol-id ID [--direction IN|OUT] [--depth N] + cg-mcp find_path --project P --source-id ID --dest-id ID [--branch B] + cg-mcp ask --project P --question "..." [--branch B] + +Output: one JSON document per call on stdout. Errors print to stderr +and exit non-zero. + +Env: FALKORDB_HOST / FALKORDB_PORT are passed through to the spawned +server. Optionally set CGRAPH_MCP_TIMEOUT_SEC to override the +default 60s timeout. 
+""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from typing import Any + +from bench.agents import code_graph_mcp_adapter as cgm + + +def _print(obj: Any) -> None: + json.dump(obj, sys.stdout, indent=2, sort_keys=True, default=str) + sys.stdout.write("\n") + + +def _timeout() -> float: + try: + return float(os.getenv("CGRAPH_MCP_TIMEOUT_SEC", "60")) + except ValueError: + return 60.0 + + +def _add_project(p: argparse.ArgumentParser) -> None: + p.add_argument("--project", required=True) + p.add_argument("--branch", default=None) + + +def _add_symbol(p: argparse.ArgumentParser) -> None: + p.add_argument("--symbol-id", type=int, required=True, dest="symbol_id") + p.add_argument("--limit", type=int, default=50) + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(prog="cg-mcp", description=__doc__) + sub = parser.add_subparsers(dest="cmd", required=True) + + ir = sub.add_parser("index_repo") + ir.add_argument("--path-or-url", required=True, dest="path_or_url") + ir.add_argument("--branch", default=None) + ir.add_argument("--ignore", nargs="*", default=None) + + sc = sub.add_parser("search_code") + _add_project(sc) + sc.add_argument("--prefix", required=True) + sc.add_argument("--limit", type=int, default=10) + + for name in ("get_callers", "get_callees", "get_dependencies"): + p = sub.add_parser(name) + _add_project(p) + _add_symbol(p) + + ia = sub.add_parser("impact_analysis") + _add_project(ia) + ia.add_argument("--symbol-id", type=int, required=True, dest="symbol_id") + ia.add_argument("--direction", choices=["IN", "OUT"], default="IN") + ia.add_argument("--depth", type=int, default=3) + + fp = sub.add_parser("find_path") + _add_project(fp) + fp.add_argument("--source-id", type=int, required=True, dest="source_id") + fp.add_argument("--dest-id", type=int, required=True, dest="dest_id") + + aq = sub.add_parser("ask") + _add_project(aq) + aq.add_argument("--question", required=True) 
+ + args = parser.parse_args(argv) + timeout = _timeout() + + # Inject timeout for adapter calls. + cgm.DEFAULT_TIMEOUT_SEC = timeout + + try: + if args.cmd == "index_repo": + _print(cgm.index_repo(args.path_or_url, branch=args.branch, ignore=args.ignore)) + elif args.cmd == "search_code": + _print(cgm.search_code(args.prefix, args.project, branch=args.branch, limit=args.limit)) + elif args.cmd == "get_callers": + _print(cgm.get_callers(args.symbol_id, args.project, branch=args.branch, limit=args.limit)) + elif args.cmd == "get_callees": + _print(cgm.get_callees(args.symbol_id, args.project, branch=args.branch, limit=args.limit)) + elif args.cmd == "get_dependencies": + _print(cgm.get_dependencies(args.symbol_id, args.project, branch=args.branch, limit=args.limit)) + elif args.cmd == "impact_analysis": + _print( + cgm.impact_analysis( + args.symbol_id, + args.project, + branch=args.branch, + direction=args.direction, + depth=args.depth, + ) + ) + elif args.cmd == "find_path": + _print(cgm.find_path(args.source_id, args.dest_id, args.project, branch=args.branch)) + elif args.cmd == "ask": + _print(cgm.ask(args.question, args.project, branch=args.branch)) + else: # pragma: no cover — argparse already enforces this + parser.error(f"unknown subcommand: {args.cmd}") + except Exception as e: # noqa: BLE001 — surface everything to the agent + print(f"cg-mcp error: {e}", file=sys.stderr) + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bench/datasets/swe_bench.py b/bench/datasets/swe_bench.py index 6f89373a..a1e3428c 100644 --- a/bench/datasets/swe_bench.py +++ b/bench/datasets/swe_bench.py @@ -30,9 +30,13 @@ from __future__ import annotations +import hashlib +import hmac import json import os import random +import re +import secrets import shutil import subprocess from dataclasses import dataclass @@ -42,6 +46,9 @@ from bench.runners.mini_runner import Task DATASET_NAME = "princeton-nlp/SWE-bench_Verified" +# Loc-Bench (LocAgent, ACL 
def network_block_enabled() -> bool:
    """Report whether answer-leakage hardening applies to this run.

    Hardening is ON by default. It is switched off only when
    ``BENCH_BLOCK_NETWORK`` is set to one of ``0``, ``false``, ``no``,
    ``off`` or the empty string (surrounding whitespace and letter case are
    ignored). It exists because tracing repeatedly caught the agent fetching
    the gold file list from GitHub, or reading the cloned ``.git`` post-fix
    ref, which turned localization misses into fake recall=1.0 wins.
    """
    raw = os.environ.get("BENCH_BLOCK_NETWORK")
    if raw is not None:
        normalized = raw.strip().lower()
        for opt_out in ("0", "false", "no", "off", ""):
            if normalized == opt_out:
                return False
    return True
+ """ + if value is None: + return [] if isinstance(value, list): return list(value) if isinstance(value, str): - return list(json.loads(value)) + s = value.strip() + if not s: + return [] + return list(json.loads(s)) raise TypeError(f"unsupported list field: {type(value)!r}") @@ -91,14 +165,21 @@ def load_instances( *, split: str = "test", cache_dir: Path | None = None, + dataset_name: str | None = None, ) -> list[SweBenchInstance]: - """Load all SWE-bench Verified instances from HuggingFace.""" + """Load SWE-bench instances from HuggingFace. + + Defaults to `princeton-nlp/SWE-bench_Verified`. Pass `dataset_name` (e.g. + `SWE-bench-Live/SWE-bench-Live`, which is schema-compatible and exposes a + `verified` split) to evaluate a contamination-free / less-pretraining- + saturated corpus. + """ from datasets import load_dataset # local import — heavy kwargs: dict[str, Any] = {"split": split} if cache_dir is not None: kwargs["cache_dir"] = str(cache_dir) - ds = load_dataset(DATASET_NAME, **kwargs) + ds = load_dataset(dataset_name or DATASET_NAME, **kwargs) out: list[SweBenchInstance] = [] for row in ds: @@ -108,11 +189,13 @@ def load_instances( repo=row["repo"], base_commit=row["base_commit"], problem_statement=row["problem_statement"], - test_patch=row["test_patch"], - fail_to_pass=_parse_list_field(row["FAIL_TO_PASS"]), - pass_to_pass=_parse_list_field(row["PASS_TO_PASS"]), + test_patch=row.get("test_patch") or "", + fail_to_pass=_parse_list_field(row.get("FAIL_TO_PASS")), + pass_to_pass=_parse_list_field(row.get("PASS_TO_PASS")), environment_setup_commit=row.get("environment_setup_commit") or "", version=row.get("version") or "", + patch=row.get("patch") or "", + category=row.get("category") or "", ) ) return out @@ -214,6 +297,287 @@ def instance_to_task(inst: SweBenchInstance, repo_path: Path) -> Task: ) +# --------------------------------------------------------------------------- +# Localization ground truth (LocAgent-style) +# 
--------------------------------------------------------------------------- + +# Paths we exclude from the "files to modify" gold set: tests, docs, and +# anything that isn't Python source. Localization asks for the *implementation* +# files, so an agent that correctly avoids tests shouldn't be penalized. +_TEST_PATH_RE = re.compile( + r"(^|/)(tests?|testing|test)(/|$)" # tests/ dir + r"|(^|/)conftest\.py$" # pytest conftest + r"|(^|/)test_[^/]*\.py$" # test_*.py + r"|[^/]*_test\.py$" # *_test.py +) +_DOC_PATH_RE = re.compile(r"(^|/)docs?(/|$)|\.(rst|md|txt|cfg|ini|toml)$") + + +def is_source_file(path: str) -> bool: + """True for non-test, non-doc Python source files.""" + if not path.endswith(".py"): + return False + if _TEST_PATH_RE.search(path): + return False + if _DOC_PATH_RE.search(path): + return False + return True + + +def gold_changed_files(patch: str, *, source_only: bool = True) -> list[str]: + """Repo-relative files touched by a unified diff, in patch order. + + Reads `+++ b/` headers (skips /dev/null deletions). When + `source_only`, filters to non-test non-doc Python files. + """ + files: list[str] = [] + for line in patch.splitlines(): + if not line.startswith("+++ "): + continue + target = line[4:].strip() + if target == "/dev/null": + continue + # strip the leading "b/" git prefix if present + if target.startswith("b/"): + target = target[2:] + if target in files: + continue + if source_only and not is_source_file(target): + continue + files.append(target) + return files + + +def _patch_hunk_ranges(patch: str) -> dict[str, list[tuple[int, int]]]: + """Map each target file -> list of (start, end) NEW-file line ranges + that the gold patch modifies. 
Used for symbol-level localization.""" + ranges: dict[str, list[tuple[int, int]]] = {} + cur: str | None = None + for line in patch.splitlines(): + if line.startswith("+++ "): + target = line[4:].strip() + if target.startswith("b/"): + target = target[2:] + cur = None if target == "/dev/null" else target + if cur is not None: + ranges.setdefault(cur, []) + continue + if line.startswith("@@"): + m = re.search(r"\+(\d+)(?:,(\d+))?", line) + if m and cur is not None: + start = int(m.group(1)) + count = int(m.group(2) or "1") + ranges[cur].append((start, start + max(count - 1, 0))) + continue + return ranges + + +def gold_symbols(inst: SweBenchInstance, repo_path: Path) -> dict[str, list[str]]: + """Best-effort Python symbol-level gold: for each gold source file, + the set of enclosing top-level/def/class symbol names whose body the + gold patch modifies. Maps NEW-file hunk line ranges to enclosing + ast.FunctionDef/AsyncFunctionDef/ClassDef. Files that don't parse or + don't map are silently skipped (reported as unmappable upstream). + """ + import ast + + out: dict[str, list[str]] = {} + ranges = _patch_hunk_ranges(inst.patch) + for rel, rngs in ranges.items(): + if not is_source_file(rel): + continue + fpath = repo_path / rel + if not fpath.exists(): + continue + try: + tree = ast.parse(fpath.read_text()) + except (SyntaxError, UnicodeDecodeError): + continue + # Build (start,end,qualname) for every def/class. 
+ spans: list[tuple[int, int, str]] = [] + + def _walk(node: ast.AST, prefix: str) -> None: + for child in ast.iter_child_nodes(node): + if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + qual = f"{prefix}{child.name}" + start = child.lineno + end = getattr(child, "end_lineno", start) + spans.append((start, end, qual)) + _walk(child, qual + ".") + else: + _walk(child, prefix) + + _walk(tree, "") + hit: list[str] = [] + for (hs, he) in rngs: + # innermost enclosing symbol per hunk + best: tuple[int, str] | None = None + for (s, e, q) in spans: + if s <= hs <= e or s <= he <= e or (hs <= s and he >= e): + size = e - s + if best is None or size < best[0]: + best = (size, q) + if best and best[1] not in hit: + hit.append(best[1]) + if hit: + out[rel] = hit + return out + + +def leakage_flags(inst: SweBenchInstance, gold_files: list[str]) -> dict[str, bool]: + """Annotate whether the issue text trivially leaks the gold location.""" + text = inst.problem_statement or "" + basenames = {Path(f).name for f in gold_files} + return { + "mentions_gold_path": any(f in text for f in gold_files), + "mentions_gold_basename": any(b in text for b in basenames), + "contains_traceback": ("Traceback (most recent call last)" in text) + or ("\n File \"" in text), + } + + +def is_structural(inst: SweBenchInstance) -> bool: + """A task stresses structural navigation if its gold source patch + spans >=2 source files OR >=2 distinct directories.""" + files = gold_changed_files(inst.patch, source_only=True) + if len(files) >= 2: + return True + dirs = {str(Path(f).parent) for f in files} + return len(dirs) >= 2 + + +def select_structural( + instances: Iterable[SweBenchInstance], + *, + seed: int = DEFAULT_SEED, + n: int | None = None, + repos: set[str] | None = None, + python_only: bool = False, + no_leak: bool = False, +) -> list[SweBenchInstance]: + """Deterministically sample instances whose gold patch is multi-file/ + multi-dir (structural-navigation 
stressors). + + `repos`: if given, restrict to these `owner/name` repos (used to target + large, less-pretraining-saturated codebases on the SWE-bench-Live corpus). + `python_only`: require at least one `.py` gold source file (the navigation + tools — tree-sitter / jedi — are Python-only). + `no_leak`: drop instances whose problem statement names a gold file's path + or basename (the "structural-hard" gate — forces real multi-hop navigation + rather than single-hop lookup of an explicitly-named file). + """ + pool = [i for i in instances if is_structural(i)] + if repos is not None: + pool = [i for i in pool if i.repo in repos] + if python_only: + pool = [ + i + for i in pool + if any(f.endswith(".py") for f in gold_changed_files(i.patch, source_only=True)) + ] + if no_leak: + kept = [] + for i in pool: + gold = gold_changed_files(i.patch, source_only=True) + lf = leakage_flags(i, gold) + if not lf["mentions_gold_path"] and not lf["mentions_gold_basename"]: + kept.append(i) + pool = kept + rng = random.Random(seed) + rng.shuffle(pool) + return pool[:n] if n is not None else pool + + +def prepare_localize_worktree( + inst: SweBenchInstance, + *, + repos_dir: Path = REPOS_DIR, + worktrees_dir: Path | None = None, +) -> Path: + """Materialize a TEST-FREE worktree under a distinct name. + + The distinct dirname matters: the code-graph backend keys its index on + the worktree dirname, so a fresh name forces a clean re-index that does + NOT contain the test_patch files (which would leak the bug location). + + Naming: + * unhardened (``BENCH_BLOCK_NETWORK=0``): ``{instance_id}__loc`` + (preserves prior-run provenance). + * hardened (default): ``loc-`` so the + dirname does NOT embed the upstream PR/issue number, and the cloned + ``.git`` is stripped so the post-fix oracle is unreachable offline. 
+ """ + hardened = network_block_enabled() + wt_dir = worktrees_dir or (DEFAULT_CACHE_ROOT / "worktrees-localize") + src = _ensure_repo_clone(inst.repo, repos_dir) + name = opaque_worktree_name(inst.instance_id) if hardened else f"{inst.instance_id}__loc" + dest = wt_dir / name + if dest.exists(): + shutil.rmtree(dest, ignore_errors=True) + if dest.exists(): + # A locked/partial dir survived rmtree (e.g. an open handle from a + # prior interrupted run). Move it aside so the clone can proceed. + import time as _t + dest.rename(dest.with_name(f"{dest.name}.stale.{int(_t.time())}")) + dest.parent.mkdir(parents=True, exist_ok=True) + # Clone with a single retry. We have observed a transient `git clone` + # exit-128 on the *first* clone of a freshly-cleaned worktree dir (the + # next config's clone of the same instance then succeeds). Re-clean and + # retry once; surface git's stderr if it still fails so it's diagnosable. + last_err = "" + for attempt in range(2): + if dest.exists(): + shutil.rmtree(dest, ignore_errors=True) + res = _git(["clone", str(src), str(dest)], check=False) + if res.returncode == 0: + break + last_err = (res.stderr or res.stdout or "").strip() + print( + f"[warn] git clone {dest.name} failed (attempt {attempt + 1}/2): " + f"{last_err}", + flush=True, + ) + else: + raise RuntimeError(f"git clone failed for {dest}: {last_err}") + _git(["fetch", "origin", inst.base_commit], cwd=dest, check=False) + # The cached clone (origin) only has commits reachable from the default + # branch. Loc-Bench base_commits are sometimes unreachable from it (PR + # bases, rewritten history). GitHub serves any reachable SHA directly, so + # fall back to fetching the commit straight from the upstream URL. 
+ if _git(["cat-file", "-e", inst.base_commit], cwd=dest, check=False).returncode != 0: + url = f"https://github.com/{inst.repo}.git" + _git(["fetch", "--depth", "1", url, inst.base_commit], cwd=dest, check=False) + _git(["checkout", "--detach", inst.base_commit], cwd=dest) + if hardened: + # Strip the offline oracle: the cloned ``.git`` retains ``origin`` plus a + # local default-branch ref at the post-fix tip, so ``git log/diff + # origin/`` would reveal the gold change with no network. The + # localize path needs no git history (gold comes from the dataset patch; + # analyze_folder ignores ``.git``), so removing it is safe. + strip_git_oracle(dest) + return dest + + +def strip_git_oracle(root: Path) -> None: + """Remove every ``.git`` (directory OR gitdir-pointer file) under ``root``. + + A bare ``rmtree(root/'.git')`` only handles the top-level repo dir. It misses + (a) submodule checkouts, whose ``.git`` is a *file* containing a + ``gitdir: ...`` pointer back into the superproject, and (b) any nested git + checkout. Any surviving ``.git`` lets ``git`` rediscover history from inside + the worktree, re-exposing the post-fix oracle. Remove them all so the + worktree is genuinely history-free. 
+ """ + for git_path in sorted(root.rglob(".git"), key=lambda p: len(p.parts), reverse=True): + if git_path.is_dir() and not git_path.is_symlink(): + shutil.rmtree(git_path, ignore_errors=True) + else: + try: + git_path.unlink() + except OSError: + pass + + # --------------------------------------------------------------------------- # Verification (approximate — official harness needs Docker) # --------------------------------------------------------------------------- @@ -239,6 +603,19 @@ def verify_instance( cmd = [py, "-m", "pytest", "-q", "--no-header", "-p", "no:cacheprovider", *test_ids] res = subprocess.run(cmd, cwd=str(repo_path), capture_output=True, text=True) - ok = res.returncode == 0 summary = res.stdout[-500:] + res.stderr[-500:] + # Distinguish "tests ran and failed" (authoritative-ish negative) from + # "we could not run tests at all" (no pytest in env, collection crash). + # The latter must NOT be reported as a real failure — the authoritative + # grade comes from the SWE-bench Docker harness (bench.runners. + # swebench_verify). pytest uses returncode 2-5 for usage/collection/internal + # errors, and 1 for genuine test failures; 0 is pass. + could_not_run = ( + "No module named pytest" in summary + or "no tests ran" in summary + or res.returncode >= 2 + ) + if could_not_run and res.returncode != 1: + return False, "UNGRADED: " + summary + ok = res.returncode == 0 return ok, summary diff --git a/bench/mcp/__init__.py b/bench/mcp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bench/mcp/lsp_server.py b/bench/mcp/lsp_server.py new file mode 100644 index 00000000..08b55cab --- /dev/null +++ b/bench/mcp/lsp_server.py @@ -0,0 +1,120 @@ +"""FastMCP server exposing LSP navigation primitives for the ``lsp`` arm. 
+ +This is the symmetric counterpart to the code-graph MCP server: both arms +surface a small navigation tool set to the Copilot agent over stdio, so the +4-arm comparison isolates *what each backend can answer*, not the harness. + +Backend: multilspy's ``SyncLanguageServer`` (jedi-language-server for Python), +wrapped by :mod:`bench.agents.lsp_adapter`. A single jedi subprocess is kept +alive for the whole session (see ``LSPClient.start``) so per-call startup cost +isn't paid on every tool call — the agent-token comparison stays fair. + +Tools (mirroring bench/agents/lsp_adapter): + - goto_definition(file, line, col) -> [{path, line, col}] + - find_references(file, line, col) -> [{path, line, col}] + - hover(file, line, col) -> {text} + - document_symbols(file) -> [{name, kind, path, line, col}] + +Positions are 0-based (LSP convention): the first line of a file is line 0, +the first column is col 0. This matches raw LSP/multilspy semantics. + +Required env (set by the benchmark runner): + LSP_REPO_ROOT absolute path to the repo being analyzed (the agent's cwd) + LSP_LANGUAGE optional; defaults to "python" + LSP_ENV_PATH optional; environment_path passed to jedi for import + resolution (defaults to the server interpreter's prefix) +""" + +from __future__ import annotations + +import os +from typing import Any + +from mcp.server.fastmcp import FastMCP + +from bench.agents.lsp_adapter import DEFAULT_SHIM, LSPClient + +app: FastMCP = FastMCP("lsp") + +# Single persistent client/server for the process lifetime. Started lazily on +# the first tool call so server construction errors surface as a tool error +# (visible in the trajectory) rather than killing stdio startup. 
+_client: LSPClient | None = None + + +def _get_client() -> LSPClient: + global _client + if _client is None: + repo_root = os.environ.get("LSP_REPO_ROOT") or os.getcwd() + language = os.environ.get("LSP_LANGUAGE", "python") + env_path = os.environ.get("LSP_ENV_PATH") or None + client = LSPClient( + repo_root=repo_root, + language=language, + shim=DEFAULT_SHIM, + environment_path=env_path, + ) + client.start() + _client = client + return _client + + +@app.tool( + name="goto_definition", + description=( + "Resolve the symbol at a 0-based (line, col) in `file` to its " + "definition site(s); returns [{path, line, col}]. Use after you have " + "located a symbol's position (e.g. via grep) to jump to where it is " + "defined. line/col are 0-based: the first line is 0." + ), +) +async def goto_definition(file: str, line: int, col: int) -> list[dict[str, Any]]: + return _get_client().goto_definition(file, int(line), int(col)) + + +@app.tool( + name="find_references", + description=( + "Find all references to the symbol at a 0-based (line, col) in `file`; " + "returns [{path, line, col}] across the repo (capped). Use to discover " + "which other source files use a symbol. line/col are 0-based." + ), +) +async def find_references(file: str, line: int, col: int) -> list[dict[str, Any]]: + return _get_client().find_references(file, int(line), int(col)) + + +@app.tool( + name="hover", + description=( + "Get the signature + 1-sentence docstring for the symbol at a 0-based " + "(line, col) in `file`; returns {text}. line/col are 0-based." + ), +) +async def hover(file: str, line: int, col: int) -> dict[str, Any]: + return _get_client().hover(file, int(line), int(col)) + + +@app.tool( + name="document_symbols", + description=( + "List the symbols (functions, classes, methods) defined in `file` with " + "their 0-based positions; returns [{name, kind, path, line, col}]. Use " + "to map a file's structure without reading the whole file." 
+ ), +) +async def document_symbols(file: str) -> list[dict[str, Any]]: + return _get_client().document_symbols(file) + + +def main() -> None: + """Run the LSP MCP server over stdio.""" + try: + app.run(transport="stdio") + finally: + if _client is not None: + _client.stop() + + +if __name__ == "__main__": + main() diff --git a/bench/mcp/noisy_inject.py b/bench/mcp/noisy_inject.py new file mode 100644 index 00000000..5170f180 --- /dev/null +++ b/bench/mcp/noisy_inject.py @@ -0,0 +1,150 @@ +"""Runtime NOISY negative-control injection for ``search_code`` results. + +This module is the SINGLE source of truth for the adoption-calibration pilot's +NOISY arm (prereg §6): after the real graph result, append K deterministic +"distractor" files (verified-non-gold siblings, pre-computed offline into a +manifest) so the experiment can measure whether the agent KEEPS a plausible but +wrong graph-surfaced candidate (a false positive) or correctly DROPS it (a true +negative). + +Design constraints (mirrors ``rel_explain`` and validated with rubber-duck): + * The injection lives INSIDE the registered tool, before FastMCP serializes + the return value, so distractors flow through the exact same schema and + output path as real results (no JSON-RPC proxy, no registry surgery). + * It is ENV-GATED and DEFAULT-OFF: with ``BENCH_NOISY_MANIFEST`` / + ``BENCH_NOISY_TASK`` unset, ``maybe_inject`` is a no-op and ``search_code`` + output is byte-identical to production. Only the NOISY arm sets the env. + * PURE core (``inject``) so the no-LLM dry-run / unit tests can exercise the + real intervention against a canned result list with no FalkorDB, no agent. + * Distractors are appended AFTER the real result and never duplicate a file + already present, so they cannot displace a genuine hit. 
+ +Env contract (set by the bench runner only for the NOISY condition): + * ``BENCH_NOISY_MANIFEST`` -- path to the JSON manifest produced by + ``bench.analysis.adopt_controls.build_noisy_manifest`` (top-level + ``{"k", "seed", "coverage", "manifest": {task -> {... "distractors": [...]}}}``). + * ``BENCH_NOISY_TASK`` -- the task id key for THIS run's instance. + * ``BENCH_NOISY_K`` -- optional override of how many distractors to + append (default: the manifest's ``k``, else ``DEFAULT_K``). +""" + +from __future__ import annotations + +import json +import logging +import os +from functools import lru_cache +from pathlib import Path +from typing import Any + +log = logging.getLogger(__name__) + +ENV_MANIFEST = "BENCH_NOISY_MANIFEST" +ENV_TASK = "BENCH_NOISY_TASK" +ENV_K = "BENCH_NOISY_K" + +DEFAULT_K = 2 + +# Provenance marker stamped on every injected record so the offline diagnostic +# can distinguish a NOISY distractor from a genuine co-override sibling. +VIA_NOISY = "noisy_inject" + + +def build_distractor_record(file: str) -> dict[str, Any]: + """A single injected distractor in the flat ``rank_kind:"related"`` schema. + + Mirrors the shape ``search_code`` already uses for flat-appended siblings so + the agent and the offline scorer see a uniform candidate list. ``file_id`` is + ``None`` (a distractor is identified by path, not a query-relevant node id). + """ + return { + "file": file, + "file_id": None, + "score": None, + "name": None, + "line": None, + "label": "File", + "rank_kind": "related", + "confidence": "medium", + "via": VIA_NOISY, + "related_to": None, + "shared_methods": [], + } + + +def inject( + out: list[dict[str, Any]], + distractors: list[dict[str, Any]], + k: int, +) -> list[dict[str, Any]]: + """Append up to ``k`` distractor records to ``out`` (pure, in place). + + Skips any distractor whose ``file`` already appears in ``out`` (a real hit is + never duplicated/displaced). Returns ``out`` for convenience. 
+ """ + if k <= 0 or not distractors: + return out + present = {r.get("file") for r in out} + appended = 0 + for d in distractors: + if appended >= k: + break + f = d.get("file") + if not f or f in present: + continue + present.add(f) + out.append(build_distractor_record(f)) + appended += 1 + return out + + +@lru_cache(maxsize=8) +def _load_manifest(path: str) -> dict[str, Any]: + """Load + cache the manifest JSON (cache keyed by path string).""" + return json.loads(Path(path).read_text()) + + +def distractors_for_task(manifest: dict[str, Any], task: str) -> list[dict[str, Any]]: + """The ``distractors`` list for ``task`` from a loaded manifest, or ``[]``.""" + entry = manifest.get("manifest", {}).get(task) + if not entry: + return [] + return entry.get("distractors", []) or [] + + +def _resolve_k(manifest: dict[str, Any]) -> int: + raw = os.getenv(ENV_K) + if raw is not None and raw.strip(): + try: + return max(0, int(raw)) + except ValueError: + log.warning("noisy_inject: bad %s=%r, falling back to manifest k", ENV_K, raw) + mk = manifest.get("k") + if isinstance(mk, int) and mk >= 0: + return mk + return DEFAULT_K + + +def maybe_inject(out: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Env-gated entry point called by ``search_code`` before returning. + + No-op (byte-identical output) unless BOTH ``BENCH_NOISY_MANIFEST`` and + ``BENCH_NOISY_TASK`` are set. A misconfiguration (missing file, unknown task, + fewer than K distractors) is logged and otherwise tolerated so a NOISY run + degrades to "fewer distractors" rather than crashing the agent mid-task. 
+ """ + manifest_path = os.getenv(ENV_MANIFEST) + task = os.getenv(ENV_TASK) + if not manifest_path or not task: + return out + try: + manifest = _load_manifest(manifest_path) + except (OSError, ValueError) as e: + log.warning("noisy_inject: cannot load manifest %s: %s", manifest_path, e) + return out + distractors = distractors_for_task(manifest, task) + if not distractors: + log.warning("noisy_inject: no distractors for task %r in %s", task, manifest_path) + return out + k = _resolve_k(manifest) + return inject(out, distractors, k) diff --git a/bench/mcp/rel_explain.py b/bench/mcp/rel_explain.py new file mode 100644 index 00000000..25c6b794 --- /dev/null +++ b/bench/mcp/rel_explain.py @@ -0,0 +1,226 @@ +"""Factual relationship/provenance explanations for ``search_code`` results. + +This module is the SINGLE source of truth for the optional, env-gated +``relationship_explanation`` / ``match_provenance`` strings attached to +``search_code`` output. It is deliberately PURE (no FastMCP, no graph, no I/O) +and dependency-free so that: + + * the production tool (``structural.py``) can call it at query time, and + * the offline A/B "reader" harness (bench tree, a *different* worktree/venv) + can import it and re-annotate captured ``search_code`` outputs with the + EXACT same logic — guaranteeing the offline mechanism test exercises the + real intervention, not a re-implementation that could drift. + +Design constraints (validated with rubber-duck): + * FACTUAL, not directive. Strings describe the STRUCTURAL relationship or the + matched query provenance. They never tell the agent what to answer + ("you should include this file"), which would overfit/game the benchmark. + * Derived only from data already present in the result entry (``via``, + ``shared_methods``, ``related_to``) or trivially verifiable against the + query (token overlap with ``name``/``file``). 
+ * A length-matched, semantically EMPTY ``placebo`` is provided so the A/B can + isolate "explanation content" from "extra prose / salience". + +Modes (string, case-insensitive): + * ``"off"`` -- no annotation (control arm; current production default). + * ``"explain"`` -- attach the real factual explanation/provenance. + * ``"placebo"`` -- attach a length-matched neutral filler (salience control). +""" + +from __future__ import annotations + +import re +from typing import Any, Optional + +OFF = "off" +EXPLAIN = "explain" +PLACEBO = "placebo" +VALID_MODES = (OFF, EXPLAIN, PLACEBO) + +# Field names attached to result entries. +RELATED_FIELD = "relationship_explanation" +DIRECT_FIELD = "match_provenance" + +_CAMEL_RE = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z0-9]+|[A-Z]+") +_WORD_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*") +_MIN_TOK = 4 # ignore short/common tokens when reporting query provenance + + +def normalize_mode(value: Optional[str]) -> str: + """Coerce an env value to a valid mode; unknown/empty -> ``off``.""" + if not value: + return OFF + v = str(value).strip().lower() + if v in ("1", "true", "yes", "on"): + return EXPLAIN + if v in ("0", "false", "no"): + return OFF + return v if v in VALID_MODES else OFF + + +def _subtokens(ident: str) -> set[str]: + out: set[str] = set() + for part in re.split(r"[_\s]+", ident or ""): + for m in _CAMEL_RE.findall(part): + if m: + out.add(m.lower()) + return out + + +def _query_tokens(query: str) -> set[str]: + toks: set[str] = set() + for w in _WORD_RE.findall(query or ""): + toks |= _subtokens(w) + toks.add(w.lower()) + return {t for t in toks if len(t) >= _MIN_TOK} + + +def _fmt_methods(methods: Any) -> str: + if not methods: + return "" + if isinstance(methods, str): + methods = [methods] + parts = [f"`{m}`" for m in methods if m] + if not parts: + return "" + if len(parts) == 1: + return parts[0] + if len(parts) == 2: + return f"{parts[0]} and {parts[1]}" + return ", ".join(parts[:-1]) + f", and {parts[-1]}" + + 
+def related_explanation(entry: dict[str, Any], related_to: Optional[str]) -> Optional[str]: + """Factual explanation of WHY a related file is coupled to ``related_to``. + + ``entry`` is a ``likely_related_files`` item or a flat ``rank_kind=related`` + object carrying ``via`` (co_override|shared_method) and ``shared_methods``. + ``related_to`` is the primary (seed) file this sibling attaches to; for flat + entries it is the entry's own ``related_to`` field. + """ + via = entry.get("via") + methods = _fmt_methods(entry.get("shared_methods")) + seed = related_to or entry.get("related_to") + seed_s = f"`{seed}`" if seed else "a top-ranked file" + if via == "co_override": + base = ( + f"Overrides the same base method {methods} as {seed_s}" + if methods + else f"Overrides the same base method as {seed_s}" + ) + return ( + f"{base} (co-override sibling). Files that override a shared base " + f"method are frequent co-edit candidates that a textual search misses." + ) + if via == "shared_method": + base = ( + f"Defines the same method name {methods} as {seed_s}" + if methods + else f"Defines a same-named method as {seed_s}" + ) + return ( + f"{base} (shared-method sibling, often a co-change companion). " + f"Not linked by a resolved inheritance edge, so a name lookup would " + f"not connect them." + ) + # Unknown channel: fall back to a minimal factual statement. + if seed: + return f"Structurally coupled to {seed_s} in the code graph." + return None + + +def direct_provenance(entry: dict[str, Any], query: str) -> Optional[str]: + """Honest provenance for a DIRECT (primary ranked) hit. + + Reports which query terms verifiably appear in the hit's representative + symbol ``name`` or its ``file`` path. Makes no claim of relevance beyond the + literal token overlap; when there is none, it states the ranking was driven + by symbol/docstring relevance (BM25/centrality) rather than inventing a + match. This keeps direct-hit annotations FACTUAL, not directive. 
+ """ + name = entry.get("name") or "" + file = entry.get("file") or "" + qtok = _query_tokens(query) + if not qtok: + return None + name_hits = sorted(qtok & _subtokens(name)) + path_hits = sorted(qtok & _subtokens(file.replace("/", " ").replace(".", " "))) + if name_hits: + terms = _fmt_methods(name_hits) + where = f"symbol `{name}`" if name else "a symbol in this file" + return f"Query term {terms} appears in {where}." + if path_hits: + terms = _fmt_methods(path_hits) + return f"Query term {terms} appears in the file path." + return ( + "Ranked by symbol-name/docstring relevance to the query " + "(no exact query term in the file path or representative symbol)." + ) + + +# --------------------------------------------------------------------------- +# Length-matched placebo (salience control) +# --------------------------------------------------------------------------- + +# A neutral vocabulary with NO file names, symbol names, or structural-coupling +# terms. Used to build filler of comparable length to a real explanation so the +# A/B can attribute any adoption change to explanation CONTENT, not to the mere +# presence of extra prose near the entry. 
+_PLACEBO_WORDS = ( + "this entry is part of the indexed repository and was returned by the " + "search operation along with other candidate entries for your review at " + "this time as additional general information about the result listing here" +).split() + + +def placebo_for(real_text: Optional[str]) -> Optional[str]: + """Return a neutral filler string of length comparable to ``real_text``.""" + if not real_text: + return None + target = len(real_text) + words: list[str] = [] + n = 0 + i = 0 + while n < target: + w = _PLACEBO_WORDS[i % len(_PLACEBO_WORDS)] + words.append(w) + n += len(w) + 1 + i += 1 + s = " ".join(words) + return s[:target].rstrip() + + +# --------------------------------------------------------------------------- +# Top-level annotators (operate IN PLACE on a list of search_code result objs) +# --------------------------------------------------------------------------- + + +def annotate_results(results: list[dict[str, Any]], query: str, mode: str) -> list[dict[str, Any]]: + """Attach explanation/provenance fields to a list of search_code objects. + + Mutates and returns ``results``. ``mode`` is one of ``VALID_MODES``. In + ``placebo`` mode the SAME fields are attached but with length-matched + neutral filler, so the two arms differ only in CONTENT, not in which entries + carry a field or roughly how many tokens they add. 
+ """ + mode = normalize_mode(mode) + if mode == OFF: + return results + explain = mode == EXPLAIN + + for prim in results: + if not isinstance(prim, dict): + continue + is_related = prim.get("rank_kind") == "related" + if is_related: + real = related_explanation(prim, prim.get("related_to")) + prim[RELATED_FIELD] = real if explain else placebo_for(real) + else: + real = direct_provenance(prim, query) + if real is not None: + prim[DIRECT_FIELD] = real if explain else placebo_for(real) + for rel in prim.get("likely_related_files", []) or []: + if isinstance(rel, dict): + r = related_explanation(rel, prim.get("file")) + rel[RELATED_FIELD] = r if explain else placebo_for(r) + return results diff --git a/bench/runners/compare_models.py b/bench/runners/compare_models.py new file mode 100644 index 00000000..b7c1563e --- /dev/null +++ b/bench/runners/compare_models.py @@ -0,0 +1,118 @@ +"""Compare two model runs of the SWE-bench fix benchmark (e.g. Opus vs Sonnet). + +Reads two results.jsonl files (same instance set, same 4 configs) and reports, +per config: resolved accuracy, summed input/output tokens, and estimated USD +cost under list pricing. Prints a paired Opus-vs-Sonnet table and the overall +price delta. + +Token accounting note: `input_tokens` here is the cumulative count the agent +loop sends across all steps (history is re-sent each turn), so it is "tokens +processed", not unique context. We price it as-is and apply the SAME accounting +to both models, so the *ratio* is the honest comparison; absolute dollars are +an upper bound for a no-prompt-caching setup. 
+ +Usage: + python -m bench.runners.compare_models \ + --a bench/cache/opus/results.jsonl --a-name opus --a-model opus \ + --b bench/cache/sonnet-n40/results.jsonl --b-name sonnet --b-model sonnet +""" + +from __future__ import annotations + +import argparse +import json +from collections import defaultdict +from pathlib import Path +from typing import Any + +# List price per 1M tokens (USD), input / output. +PRICING = { + "opus": (15.0, 75.0), # Claude Opus 4.x + "sonnet": (3.0, 15.0), # Claude Sonnet 4.5 + "haiku": (0.80, 4.0), +} + +CONFIG_ORDER = ["baseline", "lsp", "code_graph", "code_graph_mcp"] + + +def load(path: Path) -> dict[str, list[dict[str, Any]]]: + by: dict[str, list[dict[str, Any]]] = defaultdict(list) + for line in path.read_text().splitlines(): + if not line.strip(): + continue + r = json.loads(line) + by[r.get("config", "?")].append(r) + return by + + +def cost_usd(in_tok: int, out_tok: int, model: str) -> float: + pin, pout = PRICING[model] + return in_tok / 1e6 * pin + out_tok / 1e6 * pout + + +def config_stats(rows: list[dict[str, Any]], model: str) -> dict[str, Any]: + n = len(rows) + in_sum = sum(r.get("input_tokens", 0) for r in rows) + out_sum = sum(r.get("output_tokens", 0) for r in rows) + resolved = sum(1 for r in rows if r.get("outcome") == "resolved") + return { + "n": n, + "in_sum": in_sum, + "out_sum": out_sum, + "resolved": resolved, + "acc": round(100 * resolved / n, 1) if n else 0.0, + "usd": round(cost_usd(in_sum, out_sum, model), 2), + } + + +def main(argv: list[str] | None = None) -> int: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("--a", type=Path, required=True) + p.add_argument("--a-name", default="A") + p.add_argument("--a-model", default="opus", choices=list(PRICING)) + p.add_argument("--b", type=Path, required=True) + p.add_argument("--b-name", default="B") + p.add_argument("--b-model", default="sonnet", choices=list(PRICING)) + p.add_argument("--json-out", type=Path, default=None) + args = 
p.parse_args(argv) + + a = load(args.a) + b = load(args.b) + configs = [c for c in CONFIG_ORDER if c in a or c in b] + + report: dict[str, Any] = {"a": args.a_name, "b": args.b_name, "configs": {}} + print(f"\n{'config':>16} | {args.a_name:>26} | {args.b_name:>26} | price") + print(f"{'':>16} | {'acc in_tok out_tok $':>26} | " + f"{'acc in_tok out_tok $':>26} | A/B$") + print("-" * 92) + tot_a_usd = tot_b_usd = 0.0 + for c in configs: + sa = config_stats(a.get(c, []), args.a_model) + sb = config_stats(b.get(c, []), args.b_model) + tot_a_usd += sa["usd"] + tot_b_usd += sb["usd"] + ratio = round(sa["usd"] / sb["usd"], 2) if sb["usd"] else None + report["configs"][c] = {"a": sa, "b": sb, "price_ratio_a_over_b": ratio} + print(f"{c:>16} | {sa['acc']:>4}% {sa['in_sum']:>9} {sa['out_sum']:>7} " + f"${sa['usd']:>7} | {sb['acc']:>4}% {sb['in_sum']:>9} " + f"{sb['out_sum']:>7} ${sb['usd']:>7} | {ratio}x") + + print("-" * 92) + print(f"{'TOTAL $':>16} | {'':>19}${tot_a_usd:>7.2f} | " + f"{'':>19}${tot_b_usd:>7.2f} | " + f"{round(tot_a_usd / tot_b_usd, 2) if tot_b_usd else None}x") + report["total"] = { + "a_usd": round(tot_a_usd, 2), + "b_usd": round(tot_b_usd, 2), + "a_over_b": round(tot_a_usd / tot_b_usd, 2) if tot_b_usd else None, + } + + if args.json_out: + args.json_out.parent.mkdir(parents=True, exist_ok=True) + args.json_out.write_text(json.dumps(report, indent=2)) + print(f"\nwrote {args.json_out}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/bench/runners/copilot_runner.py b/bench/runners/copilot_runner.py new file mode 100644 index 00000000..5a52ce7b --- /dev/null +++ b/bench/runners/copilot_runner.py @@ -0,0 +1,1868 @@ +"""Benchmark harness driving the **real GitHub Copilot CLI** over SWE-bench. + +Unlike `mini_runner` (a scripted ReAct loop with a hard step cap), this runner +invokes the production Copilot CLI headlessly so the measured token / accuracy +numbers reflect how people actually use the agent. 
It compares tracks that +differ ONLY in their MCP wiring: + + * ``copilot_no_mcp`` -- Copilot's native tools, no extra MCP servers. + * ``code_graph`` -- same, plus our ``cgraph-mcp`` stdio server. + * ``lsp`` -- (reserved) same, plus an LSP-backed MCP server. + +For each ``(instance, track)`` it: + 1. prepares a fresh worktree at the instance base commit, + 2. (code_graph only) deletes any stale FalkorDB graph and re-indexes, + 3. builds a neutral prompt from the SWE-bench problem statement, + 4. runs ``copilot`` headless with a wall-clock timeout, + 5. parses tokens (summed from the debug process logs), premium requests and + tool calls, + 6. extracts the patch via ``git diff `` (junk-excluded), + 7. writes a results row in the shared ``mini_runner`` schema so the existing + Docker grader (``swebench_verify.py``) works unchanged. + +Grading is intentionally deferred: run this to generate patches + token rows, +then grade with the official SWE-bench Docker harness. +""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import shutil +import signal +import subprocess +import sys +import time +import traceback +import uuid +from pathlib import Path +from typing import Any + +from bench.datasets import swe_bench + +RUNNER_VERSION = "copilot-runner/2" + +# Marks the measurement epoch for answer-leakage hardening + thinking-on + +# full-trace capture. Recorded on every row so rows from different harness +# generations are never silently pooled. +# +# harden/2: closed the git-walk-up leak. Stripping the worktree's own ``.git`` +# did NOT stop ``git`` (run by the agent or by the indexer's branch detection) +# from traversing UP to the enclosing harness repo, which leaked its branch name +# and commit messages (revealing the benchmark intent) and mis-keyed the index +# under the parent branch. 
Fixed by recursively stripping ``.git``, pinning the +# index to the ``_default`` branch, scrubbing inherited ``GIT_*`` vars, and +# fencing the agent's git with ``GIT_CEILING_DIRECTORIES``. harden/1 rows where +# the agent ran git are suspect and must not be pooled with harden/2. +HARNESS_HARDENING_VERSION = "harden/2" + +# Reasoning effort for scored runs. Thinking is now ENABLED by default so the +# agent's deliberation is captured in the trace; the reasoning-token cost is +# accounted SEPARATELY (see parse_tokens_from_logs -> reasoning_tokens) so the +# base token comparison across arms stays interpretable. All arms in an epoch +# share one effort level. Override with COPILOT_REASONING_EFFORT. +DEFAULT_REASONING_EFFORT = os.environ.get("BENCH_REASONING_EFFORT", "medium") + + +def _resolve_reasoning_effort() -> str | None: + """Effort level to pass to copilot, or None to omit the flag entirely.""" + effort = os.environ.get("COPILOT_REASONING_EFFORT", DEFAULT_REASONING_EFFORT) + if not effort or effort.lower() == "off": + return None + return effort + + +# Tracks that only need different Copilot MCP wiring. +NO_MCP = "copilot_no_mcp" +CODE_GRAPH = "code_graph" +LSP = "lsp" +VALID_TRACKS = (NO_MCP, CODE_GRAPH, LSP) + +DEFAULT_CACHE = Path(__file__).resolve().parents[1] / "cache" / "copilot" + +# Dirs that must never end up in an extracted patch even if Copilot or a tool +# left them untracked in the worktree. +_PATCH_EXCLUDES = ( + "__pycache__", + ".pytest_cache", + ".mypy_cache", + ".ruff_cache", + ".tox", + ".eggs", + "node_modules", + ".git", + ".venv", + "venv", + "build", + "dist", +) + +# The code-graph MCP server lives in a sibling worktree and is launched via a +# wrapper that fixes PYTHONPATH (see _write_mcp_config). 
+DEFAULT_MCP_SERVER_ROOT = Path( + os.environ.get( + "CGRAPH_MCP_SERVER_ROOT", + "/Users/dvirdukhan/Code/code-graph/.worktrees/mcp-smoke", + ) +) + +# The LSP-backed MCP server (bench/mcp/lsp_server.py) lives in THIS bench tree +# (mcp-t17), but must be launched with the mcp-smoke venv python because that is +# the only environment with BOTH `mcp`/FastMCP AND `multilspy`. Its wrapper also +# prepends the mcp-smoke venv `bin/` to PATH so the `jedi-language-server` +# console script multilspy launches by bare name is found. +LSP_BENCH_ROOT = Path( + os.environ.get("LSP_BENCH_ROOT", str(Path(__file__).resolve().parents[2])) +) +DEFAULT_LSP_SERVER_PYTHON_ROOT = Path( + os.environ.get("LSP_SERVER_PYTHON_ROOT", str(DEFAULT_MCP_SERVER_ROOT)) +) + + +# --------------------------------------------------------------------------- +# Prompt assembly (symmetric across tracks; only the capability note differs) +# --------------------------------------------------------------------------- + +FIX = "fix" +LOCALIZE = "localize" +VALID_MODES = (FIX, LOCALIZE) + +# The strict line the localization agent must end on. Re-used by the parser. +LOCALIZE_SENTINEL = "FINAL_LOCALIZATION_JSON:" + +_BASE_PROMPT = """\ +You are fixing a bug in the Python repository checked out at {cwd}. + +{problem} + +Inspect the repository to understand the relevant code before editing, then +make the minimal source change that fixes the issue. Do not modify test files. +{capability} +When you are done, stop and give a one-line summary of what you changed.""" + +_LOCALIZE_PROMPT = """\ +You are localizing (not fixing) a bug in the Python repository checked out at {cwd}. + +{problem} + +Investigate the repository to determine which SOURCE files must be edited to fix +this issue. Do NOT modify any files. Do NOT run or edit tests. 
+{capability} +When you are confident, finish your FINAL assistant message with a single line in +EXACTLY this format (most-likely file first, repo-root-relative paths, Python +source files only, no test or doc files): + +{sentinel} ["pkg/module_a.py", "pkg/module_b.py"] + +Write that line as plain text in your own final message. Do NOT emit it through a +shell command, `echo`, a file write, or any tool call.""" + +# Lane 1 adoption-calibration frozen text (prereg §5). Do NOT edit without +# amending the pre-registration; the experiment's validity depends on the exact +# wording (negative-control / non-overfitting requirement). +# +# SEM (lever a): edge-semantics clause appended verbatim to the code_graph +# capability preamble. NO frequency/benchmark prior (the rejected wording "the +# edit site is often a caller or a sibling, not the matched symbol" is forbidden). +_ADOPT_SEM_CLAUSE = ( + "Graph edges (calls, imports, inheritance, overrides, definitions) are " + "evidence that code is RELATED — not evidence that a connected file is the " + "location you must change. Treat every graph result as a hypothesis. Keep a " + "candidate in your final answer only when the code you have read supports " + "that the file participates directly in the behavior the task asks you to " + "change; drop it otherwise. Relatedness alone is not a reason to keep or to " + "drop." +) +# RAT (lever b): mandatory keep/drop-with-reason step injected into the localize +# prompt body, BEFORE the FINAL sentinel instruction. +_ADOPT_RAT_STEP = ( + "Before your final answer, list every file the graph surfaced and, for each, " + "write one line: `KEEP ` or " + "`DROP `. Your final answer must be consistent with these " + "decisions. You may add files the graph did not surface." +) +# RAT localize variant: identical to _LOCALIZE_PROMPT but with _ADOPT_RAT_STEP +# inserted after the capability note and before the FINAL sentinel instruction. 
+_LOCALIZE_PROMPT_RAT = """\ +You are localizing (not fixing) a bug in the Python repository checked out at {cwd}. + +{problem} + +Investigate the repository to determine which SOURCE files must be edited to fix +this issue. Do NOT modify any files. Do NOT run or edit tests. +{capability} +{rat_step} +When you are confident, finish your FINAL assistant message with a single line in +EXACTLY this format (most-likely file first, repo-root-relative paths, Python +source files only, no test or doc files): + +{sentinel} ["pkg/module_a.py", "pkg/module_b.py"] + +Write that line as plain text in your own final message. Do NOT emit it through a +shell command, `echo`, a file write, or any tool call.""" + +# Valid Lane 1 arm names. CTRL == canonical nudge base (prereg §2 amended: a +# neutral preamble yields ~0% spontaneous adoption on strong models, leaving +# nothing to calibrate, so CTRL is pinned to _CAP_CODE_GRAPH_NUDGE). +ADOPT_ARMS = ("ctrl", "sem", "rat") + +_CAP_NO_MCP = ( + "No external MCP tools are available; use Copilot's built-in file, search " + "and edit tools." +) +# Matched no-MCP nudge: parallels the code_graph search-first mandate without +# naming any specific tool, so the comparison isolates the graph, not the +# "search before grep" instruction. +_CAP_NO_MCP_NUDGE = ( + "No external MCP tools are available. Before resorting to plain text search " + "(grep/rg), begin by broadly mapping the repository structure to locate the " + "relevant symbols and how they relate; use Copilot's built-in file, search " + "and edit tools." +) +_CAP_CODE_GRAPH = ( + "A code-graph MCP server is available exposing code-navigation tools " + "(search_code, get_callers, get_callees, get_dependencies, impact_analysis, " + "find_path). The repository has ALREADY been indexed under project=\"{project}\" " + "and is ready to query immediately — do NOT call index_repo; call the " + "navigation tools directly with project=\"{project}\". 
Prefer precise " + "code-navigation tools over plain text search when they help. Do not use the " + "`ask` tool." +) +# Nudged code_graph: mandate an initial search_code call to measure the tool's +# value when the model is forced to engage it (the neutral prompt yields ~0% +# spontaneous adoption on strong models). +_CAP_CODE_GRAPH_NUDGE = ( + "A code-graph MCP server is available exposing code-navigation tools " + "(search_code, get_callers, get_callees, get_dependencies, impact_analysis, " + "find_path). The repository has ALREADY been indexed under project=\"{project}\" " + "and is ready to query — do NOT call index_repo. You MUST begin by calling " + "search_code(project=\"{project}\") to " + "locate the relevant symbols BEFORE any plain text search, and prefer these " + "graph tools over grep throughout your investigation. Do not use the `ask` tool." +) + + +# Traversal-mandate variant: gated by CGRAPH_TRAVERSE_NUDGE=1 + --nudge. Forces the +# model to actually traverse (get_callers/get_callees/find_path) from candidate +# symbols, isolating whether traversal — not just search-first — helps localization. +_CAP_CODE_GRAPH_TRAVERSE = ( + "A code-graph MCP server is available exposing code-navigation tools " + "(search_code, get_callers, get_callees, get_dependencies, impact_analysis, " + "find_path). The repository has ALREADY been indexed under project=\"{project}\" " + "— do NOT call index_repo. You MUST follow this workflow: (1) call search_code(project=\"{project}\") " + "to locate candidate symbols; (2) for your top candidate symbol(s) you MUST call " + "get_callers AND get_callees (and find_path between candidates when relevant), and " + "inspect the files those calls surface, BEFORE finalizing your answer; (3) prefer " + "these graph tools over grep throughout. Do not use the `ask` tool." +) + + +# Spike variant (Spike 1a: IMPORTS + OVERRIDES edges): gated by CGRAPH_SPIKE_NUDGE=1 +# + --nudge. 
Forces the model to exercise the NEW edge types — get_importers +# (file<-file IMPORTS) and get_overrides (subclass.method->ancestor.method) — which +# can bridge to gold files that the CALLS/DEFINES/EXTENDS call-graph never reached. +_CAP_CODE_GRAPH_SPIKE = ( + "A code-graph MCP server is available exposing code-navigation tools " + "(search_code, get_callers, get_callees, get_dependencies, impact_analysis, " + "find_path, get_importers, get_overrides). The repository has ALREADY been " + "indexed under project=\"{project}\" — do NOT call index_repo. You MUST follow this workflow: " + "(1) call search_code(project=\"{project}\") to locate candidate symbols and " + "their files; (2) for your top candidate file(s) you MUST call " + "get_importers (to find which other source files import them) AND, for any " + "candidate class/method, get_overrides (to find ancestor or subclass methods " + "that share its behavior); inspect the files those calls surface BEFORE " + "finalizing your answer; (3) prefer these graph tools over grep throughout. " + "Do not use the `ask` tool." +) + + +# Substitution+stop variant: gated by CGRAPH_SUBST_NUDGE=1 + --nudge. Targets the +# observed thrash failure mode (agent ignores a correct high-confidence rank-1 hit, +# chases a wrong hypothesis with broad grep sweeps, and never stops). Instructs the +# agent to TRUST the ranked search_code output (the top hits and their +# likely_related_files) as the candidate answer set, confirm with at most 1-2 file +# views, then STOP — substituting the graph for grep rather than running both. +_CAP_CODE_GRAPH_SUBST = ( + "A code-graph MCP server is available exposing code-navigation tools " + "(search_code, get_callers, get_callees, get_dependencies, impact_analysis, " + "find_path). The repository has ALREADY been indexed under project=\"{project}\" " + "— do NOT call index_repo. 
You MUST follow this workflow: (1) call " + "search_code(project=\"{project}\") with a CONCEPTUAL free-text query describing " + "the buggy behavior and area; (2) TRUST the ranked results — the top-ranked files " + "and the likely_related_files attached to them ARE your candidate answer set. " + "Confirm with AT MOST 1-2 targeted file views; (3) do NOT run broad grep/find " + "sweeps to second-guess a confident high-ranked hit, and do NOT keep searching " + "once the ranked results plus a quick view agree — STOP and answer. Substitute " + "the graph for grep; do not run both. Do not use the `ask` tool." +) + + +# LSP capability note. The LSP MCP server exposes jedi-backed navigation tools +# (goto_definition, find_references, hover, document_symbols). Positions are +# 0-based (LSP convention) while grep/view are 1-based — the agent must adjust. +_CAP_LSP = ( + "An LSP MCP server is available exposing jedi-backed Python navigation tools " + "(goto_definition, find_references, hover, document_symbols). Paths are " + "repo-root-relative; line/character positions are 0-based (subtract 1 from " + "the 1-based line numbers grep/view report). Prefer these precise " + "navigation tools over plain text search when they help." +) +# Nudged LSP: mandate an initial navigation call to measure the tool's value +# when the model is forced to engage it. +_CAP_LSP_NUDGE = ( + "An LSP MCP server is available exposing jedi-backed Python navigation tools " + "(goto_definition, find_references, hover, document_symbols). Paths are " + "repo-root-relative; line/character positions are 0-based (subtract 1 from " + "the 1-based line numbers grep/view report). You MUST begin by calling " + "document_symbols on a likely-relevant file (or goto_definition on a symbol " + "from the problem statement) BEFORE any plain text search, and prefer these " + "LSP tools over grep throughout your investigation." 
def _capability(track: str, project: str, *, nudge: bool, adopt_arm: str | None = None) -> str:
    """Pick the capability note for a track.

    For the code_graph track, a Lane 1 ``adopt_arm`` takes priority over every
    env-gated nudge variant: all arms use the canonical nudge text and "sem"
    additionally appends the edge-semantics clause. Otherwise, with ``nudge``
    set, the first of CGRAPH_SUBST_NUDGE / CGRAPH_SPIKE_NUDGE /
    CGRAPH_TRAVERSE_NUDGE that equals "1" selects its variant, falling back to
    the plain nudge text. The lsp track has a plain and a nudged note, and any
    other track gets the no-MCP note (plain or nudged).
    """
    if track == LSP:
        return _CAP_LSP_NUDGE if nudge else _CAP_LSP
    if track != CODE_GRAPH:
        return _CAP_NO_MCP_NUDGE if nudge else _CAP_NO_MCP

    if adopt_arm is not None:
        base = _CAP_CODE_GRAPH_NUDGE.format(project=project)
        return f"{base} {_ADOPT_SEM_CLAUSE}" if adopt_arm == "sem" else base

    if not nudge:
        return _CAP_CODE_GRAPH.format(project=project)

    # Checked in priority order; the first enabled switch wins.
    gated_variants = (
        ("CGRAPH_SUBST_NUDGE", _CAP_CODE_GRAPH_SUBST),
        ("CGRAPH_SPIKE_NUDGE", _CAP_CODE_GRAPH_SPIKE),
        ("CGRAPH_TRAVERSE_NUDGE", _CAP_CODE_GRAPH_TRAVERSE),
    )
    for env_var, template in gated_variants:
        if os.environ.get(env_var) == "1":
            return template.format(project=project)
    return _CAP_CODE_GRAPH_NUDGE.format(project=project)


def build_prompt(
    track: str,
    cwd: Path,
    problem: str,
    project: str,
    *,
    nudge: bool = False,
    mode: str = FIX,
    adopt_arm: str | None = None,
) -> str:
    """Assemble the agent prompt for one (track, mode) combination.

    The capability note comes from ``_capability``; when
    ``swe_bench.network_block_enabled()`` is true the hardening line is
    appended to it on a new line. Localize mode fills the localize template
    (the RAT variant when ``adopt_arm == "rat"``); any other mode fills the
    fix template.

    Raises:
        ValueError: if ``adopt_arm`` is given for a track/mode other than
            code_graph + localize, or is not one of ``ADOPT_ARMS``.
    """
    if adopt_arm is not None:
        if track != CODE_GRAPH or mode != LOCALIZE:
            raise ValueError(
                f"adopt_arm={adopt_arm!r} requires track={CODE_GRAPH} and mode={LOCALIZE}; "
                f"got track={track!r} mode={mode!r}"
            )
        if adopt_arm not in ADOPT_ARMS:
            raise ValueError(f"unknown adopt_arm={adopt_arm!r}; expected one of {ADOPT_ARMS}")

    capability = _capability(track, project, nudge=nudge, adopt_arm=adopt_arm)
    if swe_bench.network_block_enabled():
        capability = f"{capability}\n{_HARDEN_PROMPT_LINE}"

    issue = problem.strip()
    if mode != LOCALIZE:
        return _BASE_PROMPT.format(cwd=cwd, problem=issue, capability=capability)

    fields = {
        "cwd": cwd,
        "problem": issue,
        "capability": capability,
        "sentinel": LOCALIZE_SENTINEL,
    }
    if adopt_arm == "rat":
        return _LOCALIZE_PROMPT_RAT.format(rat_step=_ADOPT_RAT_STEP, **fields)
    return _LOCALIZE_PROMPT.format(**fields)
_write_mcp_wrapper(run_dir: Path, server_root: Path) -> Path: + """Write the stdio launcher for cgraph-mcp. + + The server's editable install is only importable with the server worktree + on PYTHONPATH, so the wrapper cd's there and sets PYTHONPATH before exec'ing + the server's venv python. Validated to start in ~1.7s from any cwd. + """ + py = server_root / ".venv" / "bin" / "python" + if not py.exists(): + raise FileNotFoundError(f"cgraph-mcp server python not found: {py}") + wrapper = run_dir / "cgraph-mcp-wrapper.sh" + wrapper.write_text( + "#!/bin/bash\n" + f'cd "{server_root}"\n' + f'export PYTHONPATH="{server_root}:$PYTHONPATH"\n' + f'exec "{py}" -c "from api.mcp.server import main; main()"\n' + ) + wrapper.chmod(0o755) + return wrapper + + +def _write_mcp_config( + run_dir: Path, + wrapper: Path, + falkor_host: str, + falkor_port: int, + extra_env: dict[str, str] | None = None, +) -> Path: + cfg = run_dir / "cg-mcp-config.json" + env = { + "FALKORDB_HOST": falkor_host, + "FALKORDB_PORT": str(falkor_port), + } + if extra_env: + env.update(extra_env) + cfg.write_text( + json.dumps( + { + "mcpServers": { + "code-graph": { + "command": str(wrapper), + "args": [], + "env": env, + } + } + }, + indent=2, + ) + ) + return cfg + + +def _write_lsp_wrapper(run_dir: Path, repo_path: Path) -> Path: + """Write the stdio launcher for the LSP MCP server (bench/mcp/lsp_server.py). + + The server module lives in this bench tree (LSP_BENCH_ROOT) but must run on + the mcp-smoke venv python (the only env with both `mcp` and `multilspy`). The + wrapper also prepends that venv's `bin/` to PATH so multilspy can exec the + `jedi-language-server` console script by bare name, and points the adapter at + the target repo via LSP_REPO_ROOT. 
+ """ + py = DEFAULT_LSP_SERVER_PYTHON_ROOT / ".venv" / "bin" / "python" + if not py.exists(): + raise FileNotFoundError(f"lsp-mcp server python not found: {py}") + venv_bin = DEFAULT_LSP_SERVER_PYTHON_ROOT / ".venv" / "bin" + wrapper = run_dir / "lsp-mcp-wrapper.sh" + wrapper.write_text( + "#!/bin/bash\n" + f'cd "{LSP_BENCH_ROOT}"\n' + f'export PATH="{venv_bin}:$PATH"\n' + f'export PYTHONPATH="{LSP_BENCH_ROOT}:$PYTHONPATH"\n' + f'export LSP_REPO_ROOT="{repo_path}"\n' + 'export LSP_LANGUAGE="python"\n' + f'exec "{py}" -c "from bench.mcp.lsp_server import main; main()"\n' + ) + wrapper.chmod(0o755) + return wrapper + + +def _write_lsp_mcp_config(run_dir: Path, wrapper: Path) -> Path: + cfg = run_dir / "lsp-mcp-config.json" + cfg.write_text( + json.dumps( + { + "mcpServers": { + "lsp": { + "command": str(wrapper), + "args": [], + "env": {}, + } + } + }, + indent=2, + ) + ) + return cfg + + +def _falkor_settings() -> tuple[str, int]: + return ( + os.environ.get("FALKORDB_HOST", "127.0.0.1"), + int(os.environ.get("FALKORDB_PORT", "6379")), + ) + + +def ensure_indexed(repo_path: Path, *, fresh: bool = True) -> float: + """Delete any stale graph for this worktree and (re)index it. + + Returns indexing wall-clock seconds. Indexes via the running code-graph + HTTP API (``/api/analyze_folder``); the agent's cgraph-mcp reads the same + FalkorDB instance, so the graph ``code:{repo_path.name}:_default`` is what + the agent will query with ``project=repo_path.name``. + + ``branch="_default"`` is passed EXPLICITLY so the index lands on the exact + key the agent (which omits ``branch``) reads. Without it the API falls back + to ``detect_branch(worktree)`` = ``git rev-parse``; when the hardened path + has stripped the worktree's ``.git``, git walks UP to the enclosing harness + repo and returns ITS branch, so the index would land under that branch key + while the agent queries an empty ``_default`` graph. 
+ """ + import httpx + import redis + + host, port = _falkor_settings() + repo_name = repo_path.name + graph = f"code:{repo_name}:_default" + + if fresh: + try: + r = redis.Redis(host=host, port=port, decode_responses=True, socket_timeout=2) + if graph in (r.execute_command("GRAPH.LIST") or []): + r.execute_command("GRAPH.DELETE", graph) + print(f"[index] dropped stale {graph}") + except Exception as exc: # noqa: BLE001 + print(f"[index] WARN could not drop {graph}: {exc!r}") + + base = os.environ.get("CODEGRAPH_URL", "http://127.0.0.1:5000").rstrip("/") + token = os.environ.get("SECRET_TOKEN") or os.environ.get("CODEGRAPH_TOKEN") + headers = {"Authorization": f"Bearer {token}"} if token else {} + default_ignore = [ + ".git", "venv", ".venv", "node_modules", "__pycache__", + "rubi/rules", "build", "dist", ".tox", ".eggs", + ] + t0 = time.time() + with httpx.Client(timeout=7200.0, headers=headers) as c: + # Preflight: confirm the API server points at the same FalkorDB the + # agent's MCP server will read, else the agent queries an empty graph. + try: + h = c.get(f"{base}/api/_health", timeout=10.0) + if h.status_code == 200: + hp = int(h.json().get("falkordb_port", port)) + if hp != port: + raise RuntimeError( + f"API server FalkorDB port {hp} != runner port {port}; " + "agent and indexer would see different graphs." + ) + except httpx.HTTPError: + pass # _health is best-effort + resp = c.post( + f"{base}/api/analyze_folder", + json={"path": str(repo_path), "ignore": default_ignore, "branch": "_default"}, + ) + if resp.status_code != 200: + raise RuntimeError( + f"analyze_folder {resp.status_code}: {resp.text[:300]}. " + f"Check ALLOWED_ANALYSIS_DIR covers {repo_path}." 
+ ) + dt = time.time() - t0 + print(f"[index] indexed {repo_name} in {dt:.1f}s") + return dt + + +# --------------------------------------------------------------------------- +# Copilot invocation +# --------------------------------------------------------------------------- + +COPILOT_MAX_ATTEMPTS = 3 +COPILOT_RETRY_BACKOFF_SEC = 15.0 + +# Substrings that mark a transient startup/network failure (token validation +# fetch failed, connection resets) rather than a real model run. +_TRANSIENT_STARTUP_MARKERS = ( + "could not be validated", + "fetch failed", + "econnreset", + "etimedout", + "enotfound", + "socket hang up", + "network", + "getaddrinfo", +) + + +def _is_transient_startup_failure( + returncode: int | None, stdout: str, stderr: str +) -> bool: + """True when Copilot exited early without producing any result stream. + + A genuine run always emits at least one JSON line on stdout. A transient + auth/network failure exits non-zero with empty stdout and a recognizable + error on stderr; those rows must be retried, not scored as recall=0. + """ + if returncode in (0, None): + return False + if stdout and stdout.strip(): + return False + blob = (stderr or "").lower() + return any(marker in blob for marker in _TRANSIENT_STARTUP_MARKERS) + + +# --------------------------------------------------------------------------- +# Answer-leakage hardening (default ON; opt out with BENCH_BLOCK_NETWORK=0) +# --------------------------------------------------------------------------- +# Shell commands that can exfiltrate the gold answer from the network or from a +# git remote. Denied as ``shell(:*)`` so the agent's tool layer refuses +# them outright (deny takes precedence over --allow-all-tools). These are a +# defense-in-depth layer, NOT a hermetic jail: a determined agent can still +# reach the network via python/node/etc., which is why detect_network_leak() +# backstops every run and trips signals are quarantined from scored numbers. 
# Shell command prefixes the agent is refused; each one becomes a
# ``--deny-tool=shell(<cmd>:*)`` flag in _network_deny_flags().
_DENY_SHELL_CMDS = (
    "curl", "wget", "gh", "nc", "ncat", "ssh", "scp", "telnet",
    "git fetch", "git pull", "git clone", "git remote",
    "git ls-remote", "git push",
)

# GitHub domains that serve merged-PR file lists / patches / commits; each one
# becomes a ``--deny-url=<domain>`` flag. The model endpoint
# (*.githubcopilot.com) and localhost are deliberately NOT listed.
_DENY_URLS = (
    "github.com",
    "*.github.com",
    "api.github.com",
    "raw.githubusercontent.com",
    "*.githubusercontent.com",
    "codeload.github.com",
    "patch-diff.githubusercontent.com",
    "objects.githubusercontent.com",
)


def _network_deny_flags() -> list[str]:
    """Build the copilot CLI flags that block network/remote lookups.

    Order: the ``web_fetch`` tool exclusion, then one shell deny flag per
    entry of ``_DENY_SHELL_CMDS``, then one URL deny flag per entry of
    ``_DENY_URLS``.
    """
    return [
        "--excluded-tools=web_fetch",
        *(f"--deny-tool=shell({cmd}:*)" for cmd in _DENY_SHELL_CMDS),
        *(f"--deny-url={url}" for url in _DENY_URLS),
    ]


def _git_ceiling_dirs(cwd: Path) -> str:
    """Return a ``GIT_CEILING_DIRECTORIES`` value for the worktree ``cwd``.

    The value lists the worktree's parent directory in both its resolved and
    its as-given (lexical) spelling, de-duplicated, sorted, and joined with
    ``os.pathsep``. Both spellings are included so a symlinked path is covered
    either way.
    """
    parents = {str(cwd.resolve().parent), str(cwd.parent)}
    return os.pathsep.join(sorted(parents))
# Substrings in a (lower-cased) shell command that indicate an attempt to reach
# the gold answer via the network or a git remote / the cloned-repo oracle.
# NOTE: ".git" paths are handled separately (see _GIT_EXCLUSION_RE /
# _GIT_READ_RE) because they legitimately appear in benign
# `find -not -path '*/.git/*'` / grep `--exclude-dir=.git` exclusions.
_LEAK_CMD_PATTERNS = (
    "github.com", "githubusercontent", "/pull/", "pull/", "/commit/",
    ".patch", ".diff", "curl", "wget", " gh ", "gh pr", "gh api",
    "git fetch", "git pull", "git ls-remote", "git remote",
    "log origin", "diff origin", "rev-parse origin", "show origin",
    # git-escape attempts: explicitly re-pointing git past the GIT_CEILING
    # fence to reach the enclosing harness repo (branch name + commit messages).
    "git -c ", "git --git-dir", "--git-dir=", "--work-tree",
    "env -u git", "git_ceiling", "git_dir=", "git_work_tree",
    "cache/repos", "urllib", "requests.get", "http.client",
    "socket.", "urlopen", "httpx", "fetch(",
)
# Strips BENIGN ``.git`` references (path-exclusion filters) from a command
# before it is tested for a genuine ``.git`` *read*. Without this, every
# ``find . -not -path '*/.git/*'`` directory listing trips a false leak.
_GIT_EXCLUSION_RE = re.compile(
    r"""(?:!\s*)?-?-?(?:not\s+)?                # optional ! / - / --not
        (?:-path|-ipath|exclude(?:-dir)?)\s*    # find -path / grep --exclude-dir
        =?\s*['"]?[^'"\s]*\.git[^'"\s]*['"]?    # a token containing .git
    """,
    re.VERBOSE,
)
# Verbs/redirections that indicate an actual READ of git internals.
# Deliberately excludes grep/rg/sed/awk: those take benign ``.git`` exclusion
# globs; a genuine read through them is caught by the specific-file
# alternative (``.git/HEAD`` etc.).
#
# Two details matter here:
#   * The pattern is applied to a LOWER-CASED command, so it is compiled with
#     IGNORECASE; otherwise the upper-case names (HEAD, COMMIT_EDITMSG,
#     ORIG_HEAD) could never match.
#   * ``\b`` is applied to the word verbs only. After ``open(`` the next
#     character is normally a quote, which is not a word boundary, so a
#     trailing ``\b`` there would make ``open('.git/...')`` unmatchable.
_GIT_READ_RE = re.compile(
    r"(?:(?:cat|less|more|head|tail|strings|xxd|od|cp|rsync)\b|open\()"
    r"[^|;&]*\.git/"
    r"|<\s*[^|;&]*\.git/"  # input redirection from a .git file
    r"|\.git/(?:HEAD|refs|logs|objects|COMMIT_EDITMSG|ORIG_HEAD|packed-refs)",
    re.IGNORECASE,
)
# Path substrings whose READ would leak the answer or the harness's own state.
_LEAK_PATH_PATTERNS = (
    "cache/repos", "/.git/", "results.jsonl", "gold", "mapping",
    "trace.jsonl", "trace.md",
)


def _scan_leak_arguments(name: str, args: dict[str, Any]) -> list[str]:
    """Return leak signals for a single tool-execution-start event.

    Args:
        name: tool name from the event (may be empty).
        args: the tool's ``arguments`` dict.

    Returns:
        Signal strings, possibly with duplicates:
        ``"<name>:url=<first 120 chars>"`` for a fetch-style tool (returned
        immediately, nothing else is inspected); ``"bash:<pattern>"`` for each
        ``_LEAK_CMD_PATTERNS`` hit in ``command``/``cmd``/``script``;
        ``"bash:.git-read"`` for a read of git internals; ``"path:<pattern>"``
        for each ``_LEAK_PATH_PATTERNS`` hit in ``path``/``file``/``filename``.
    """
    signals: list[str] = []
    lname = str(name or "").lower()
    if lname in ("web_fetch", "fetch") or lname.endswith("-fetch"):
        url = str(args.get("url") or args.get("uri") or "")
        signals.append(f"{name}:url={url[:120]}")
        return signals
    # Shell / bash: inspect the command string.
    cmd = args.get("command") or args.get("cmd") or args.get("script")
    if isinstance(cmd, str) and cmd:
        low = cmd.lower()
        signals.extend(
            f"bash:{pat.strip()}" for pat in _LEAK_CMD_PATTERNS if pat in low
        )
        # ".git" needs context: drop benign path-exclusion filters first and
        # only flag what still looks like a READ of git internals.
        if ".git" in low:
            stripped = _GIT_EXCLUSION_RE.sub(" ", low)
            if _GIT_READ_RE.search(stripped):
                signals.append("bash:.git-read")
    # File-reader tools: inspect the path.
    path = args.get("path") or args.get("file") or args.get("filename")
    if isinstance(path, str) and path:
        low = path.lower()
        signals.extend(
            f"path:{pat.strip()}" for pat in _LEAK_PATH_PATTERNS if pat in low
        )
    return signals


def detect_network_leak(stdout: str) -> dict[str, Any]:
    """Scan the event stream for attempts to reach the gold answer off-task.

    Each line of ``stdout`` is parsed as JSON. Lines that are blank, are not
    valid JSON, or decode to something other than an object are ignored.
    Events whose ``type`` starts with ``tool.execution_start`` are inspected;
    the tool name and ``arguments`` dict are taken from ``data`` when present,
    otherwise from the top level of the event.

    Returns:
        ``{"network_leak": bool, "leak_signals": [...]}`` where the list is
        de-duplicated and sorted.
    """
    signals: list[str] = []
    for line in stdout.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            ev = json.loads(line)
        except json.JSONDecodeError:
            continue
        # A bare number, string, list or null is valid JSON but not an event.
        if not isinstance(ev, dict):
            continue
        if not str(ev.get("type", "")).startswith("tool.execution_start"):
            continue
        data = ev.get("data") if isinstance(ev.get("data"), dict) else {}
        name = data.get("name") or data.get("toolName") or ev.get("toolName") or ""
        args = data.get("arguments")
        if not isinstance(args, dict):
            top = ev.get("arguments")
            args = top if isinstance(top, dict) else {}
        signals.extend(_scan_leak_arguments(name, args))
    deduped = sorted(set(signals))
    return {"network_leak": bool(deduped), "leak_signals": deduped}
+ """ + hardened = swe_bench.network_block_enabled() + leak = detect_network_leak(stdout) + return { + "harness_hardening_version": HARNESS_HARDENING_VERSION, + "network_block_mode": hardened, + "opaque_path_mode": hardened, + "git_sanitized": hardened and not (repo_path / ".git").exists(), + "git_walk_up_blocked": hardened, + "reasoning_effort": _resolve_reasoning_effort(), + "reasoning_tokens": int(reasoning_tokens or 0), + "network_leak": leak["network_leak"], + "leak_signals": leak["leak_signals"], + } + + +def run_copilot( + *, + prompt: str, + model: str, + cwd: Path, + log_dir: Path, + mcp_config: Path | None, + wall_time: float, +) -> dict[str, Any]: + """Invoke Copilot headless. Returns {stdout_jsonl, returncode, timed_out, wall}.""" + # Copilot runs with cwd=worktree and resolves a relative --log-dir against + # THAT cwd, which would scatter process logs under the worktree. Force + # absolute so logs land where the parser reads them. + log_dir = log_dir.resolve() + log_dir.mkdir(parents=True, exist_ok=True) + env = dict(os.environ) + hardened = swe_bench.network_block_enabled() + if hardened: + # Remove the opaque-name salt and any GitHub credentials so the agent + # process cannot recover them. + env = _harden_env(env) + # Fence the agent's git: with the worktree's own .git stripped, a bare + # `git log`/`git status` would otherwise walk UP to the enclosing harness + # repo and leak its branch name + commit messages (which reveal the + # benchmark intent). GIT_CEILING_DIRECTORIES stops the upward search at + # the worktree's parent. Listed dirs are NOT crossed, so git sees no + # repository from inside the (history-free) worktree. Both the resolved + # and lexical parent are listed to defeat symlinked paths. 
+ env["GIT_CEILING_DIRECTORIES"] = _git_ceiling_dirs(cwd) + t0 = time.time() + timed_out = False + stdout, stderr, returncode = "", "", None + # Transient startup failures (OAuth token validation hitting a network blip, + # connection resets) make Copilot exit in ~1s with empty stdout. Those rows + # would otherwise be scored as recall=0 false negatives, so retry them. + for attempt in range(1, COPILOT_MAX_ATTEMPTS + 1): + session_id = str(uuid.uuid4()) + cmd = [ + "copilot", "-p", prompt, + "--model", model, + "--output-format", "json", + "--no-remote", + "--disable-builtin-mcps", + "--allow-all-tools", + ] + # Under hardening, confine the `view` file tool to the worktree (via + # --add-dir alone) instead of --allow-all-paths, so it cannot read the + # sibling cloned-repo `.git` oracle or the harness's own results/gold + # files. Shell reads are backstopped by deny-globs + the leak detector. + if not hardened: + cmd.append("--allow-all-paths") + cmd += [ + "--add-dir", str(cwd), + "--log-level", "debug", + "--log-dir", str(log_dir), + "--session-id", session_id, + ] + # Thinking is ENABLED for scored runs so the agent's tool-choice + # deliberation is captured in the trace. The reasoning-token cost is + # accounted separately (parse_tokens_from_logs -> reasoning_tokens) so + # the base token comparison across arms stays interpretable. Set + # COPILOT_REASONING_EFFORT=off to disable. + _effort = _resolve_reasoning_effort() + if _effort: + cmd += ["--effort", _effort] + # Network/remote exfiltration block (defense-in-depth; detector backstops). + if hardened: + cmd += _network_deny_flags() + if mcp_config is not None: + cmd += ["--additional-mcp-config", f"@{mcp_config}"] + + timed_out = False + # start_new_session=True puts Copilot + its children (MCP server, shells) + # in a fresh process group we can signal as a unit on timeout. 
+ proc = subprocess.Popen( + cmd, + cwd=str(cwd), + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env, + start_new_session=True, + ) + try: + stdout, stderr = proc.communicate(timeout=wall_time) + except subprocess.TimeoutExpired: + timed_out = True + _kill_group(proc.pid) + try: + stdout, stderr = proc.communicate(timeout=30) + except subprocess.TimeoutExpired: + stdout, stderr = "", "" + returncode = proc.returncode + + if timed_out or not _is_transient_startup_failure(returncode, stdout, stderr): + break + if attempt < COPILOT_MAX_ATTEMPTS: + print( + f"[retry] copilot startup failure (rc={returncode}, attempt " + f"{attempt}/{COPILOT_MAX_ATTEMPTS}); backing off " + f"{COPILOT_RETRY_BACKOFF_SEC}s. stderr={stderr.strip()[:160]!r}" + ) + time.sleep(COPILOT_RETRY_BACKOFF_SEC) + + wall = time.time() - t0 + (log_dir / "stdout.jsonl").write_text(stdout or "") + (log_dir / "stderr.txt").write_text(stderr or "") + startup_failed = _is_transient_startup_failure(returncode, stdout, stderr) and not timed_out + return { + "stdout": stdout or "", + "stderr": stderr or "", + "returncode": returncode, + "timed_out": timed_out, + "startup_failed": startup_failed, + "wall": wall, + } + + +def _kill_group(pid: int) -> None: + """Best-effort terminate a process and its group. + + On macOS ``os.killpg`` can raise ``PermissionError`` (EPERM) when a child + has changed session/owner or is mid-reap. That must never turn a recoverable + timeout into a fatal exception, so all signalling errors are swallowed and we + fall back to signalling the direct pid. 
+ """ + try: + pgid = os.getpgid(pid) + except (ProcessLookupError, PermissionError, OSError): + pgid = None + for sig in (signal.SIGTERM, signal.SIGKILL): + signalled = False + if pgid is not None: + try: + os.killpg(pgid, sig) + signalled = True + except ProcessLookupError: + return + except (PermissionError, OSError): + pgid = None + if not signalled: + try: + os.kill(pid, sig) + except ProcessLookupError: + return + except (PermissionError, OSError): + pass + time.sleep(2) + + +# --------------------------------------------------------------------------- +# Parsing: tokens (debug logs), premium / files (result event), tool calls +# --------------------------------------------------------------------------- + +# A genuine Copilot model-response usage block. We require all of these keys so +# stray JSON (e.g. an MCP tool result or the server's own stderr) can't be +# mis-counted as token usage. +_USAGE_REQUIRED = ("prompt_tokens", "completion_tokens", "total_tokens", "prompt_tokens_details") + + +def parse_tokens_from_logs(log_dir: Path) -> dict[str, int]: + """Sum token usage across every model-response block in this run's logs. + + Copilot fans out multiple requests per turn; each writes a pretty-printed + ``"usage": { ... }`` block to ``process-*.log``. We sum them all. The log + dir is per-run, so there is no cross-run contamination. 
+ """ + totals = { + "input_tokens": 0, + "output_tokens": 0, + "total_tokens": 0, + "cached_input_tokens": 0, + "cache_creation_tokens": 0, + "reasoning_tokens": 0, + "usage_blocks": 0, + } + for log in sorted(log_dir.glob("process-*.log")): + text = log.read_text(errors="replace") + for block in _iter_usage_blocks(text): + if not all(k in block for k in _USAGE_REQUIRED): + continue + totals["input_tokens"] += int(block.get("prompt_tokens", 0)) + totals["output_tokens"] += int(block.get("completion_tokens", 0)) + totals["total_tokens"] += int(block.get("total_tokens", 0)) + details = block.get("prompt_tokens_details") or {} + totals["cached_input_tokens"] += int(details.get("cached_tokens", 0)) + totals["cache_creation_tokens"] += int(details.get("cache_creation_tokens", 0)) + # Thinking tokens are a subset of completion_tokens; surfaced + # separately so the base (non-reasoning) output is comparable across + # arms even with thinking enabled. + cdetails = block.get("completion_tokens_details") or {} + totals["reasoning_tokens"] += int(cdetails.get("reasoning_tokens", 0) or 0) + totals["usage_blocks"] += 1 + return totals + + +def _iter_usage_blocks(text: str): + """Yield parsed JSON objects for each ``"usage": {...}`` in the log text. + + Brace-balanced scan from the opening ``{`` so multi-line pretty-printed + blocks parse correctly. 
+ """ + for m in re.finditer(r'"usage"\s*:\s*\{', text): + start = m.end() - 1 # position of the opening brace + depth = 0 + for i in range(start, len(text)): + ch = text[i] + if ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + blob = text[start : i + 1] + try: + yield json.loads(blob) + except json.JSONDecodeError: + pass + break + + +def parse_result_event(stdout: str) -> dict[str, Any]: + """Extract premium-request count + files modified from the result event.""" + out = {"premium_requests": 0, "files_modified": [], "is_error": None, "num_turns": None} + for line in stdout.splitlines(): + line = line.strip() + if not line: + continue + try: + ev = json.loads(line) + except json.JSONDecodeError: + continue + if ev.get("type") != "result": + continue + data = ev.get("data", ev) + usage = data.get("usage") or {} + out["premium_requests"] = int(usage.get("premiumRequests", 0) or 0) + code_changes = usage.get("codeChanges") or data.get("codeChanges") or {} + out["files_modified"] = list(code_changes.get("filesModified", []) or []) + out["is_error"] = data.get("isError") + out["num_turns"] = data.get("numTurns") + return out + + +def parse_tool_calls(stdout: str) -> tuple[int, dict[str, int]]: + """Count tool invocations by name from execution-start events.""" + by_name: dict[str, int] = {} + total = 0 + for line in stdout.splitlines(): + line = line.strip() + if not line: + continue + try: + ev = json.loads(line) + except json.JSONDecodeError: + continue + etype = ev.get("type", "") + if not etype.startswith("tool.execution_start"): + continue + data = ev.get("data", {}) + name = data.get("name") or data.get("toolName") or "unknown" + if name is None: + name = "unknown" + by_name[name] = by_name.get(name, 0) + 1 + total += 1 + return total, by_name + + +# A code-graph MCP tool call shows up with this prefix in the tool name +# (e.g. ``code-graph-search_code``). Used for nudge-compliance metrics. 
_GRAPH_TOOL_PREFIX = "code-graph"


def parse_tool_sequence(stdout: str) -> list[str]:
    """Return tool names in invocation order (for first-tool / compliance).

    Reads ``tool.execution_start`` events in both the nested (``data.name`` /
    ``data.toolName``) and flat top-level (``toolName``) shapes. Events with no
    name appear as ``"unknown"``. Non-JSON lines and JSON lines that are not
    objects are ignored.
    """
    seq: list[str] = []
    for line in stdout.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            ev = json.loads(line)
        except json.JSONDecodeError:
            continue
        # A scalar/array line is valid JSON but has no ``.get``.
        if not isinstance(ev, dict):
            continue
        # str() guards against a null / non-string "type".
        if not str(ev.get("type", "")).startswith("tool.execution_start"):
            continue
        data = ev.get("data")
        if not isinstance(data, dict):
            data = {}
        name = data.get("name") or data.get("toolName") or ev.get("toolName") or "unknown"
        seq.append(str(name))
    return seq


def _tool_prefix_for_track(track: str) -> str:
    """The MCP server name prefix that identifies the track's nav tool calls."""
    if track == LSP:
        return "lsp"
    return _GRAPH_TOOL_PREFIX


def _is_graph_tool(name: str, prefix: str = _GRAPH_TOOL_PREFIX) -> bool:
    """Return True when ``name`` is non-empty and starts with ``prefix``."""
    return bool(name) and name.startswith(prefix)


def nudge_compliance(stdout: str, track: str = CODE_GRAPH) -> dict[str, Any]:
    """Measure whether/how the agent engaged the track's MCP nav tools.

    Returns:
        ``first_tool`` (name of the first tool call, or ``None`` if there were
        none), ``first_is_graph`` (whether that first call used the track's
        prefix) and ``graph_calls`` (how many calls used the prefix).
    """
    prefix = _tool_prefix_for_track(track)
    seq = parse_tool_sequence(stdout)
    first = seq[0] if seq else None
    graph_calls = sum(1 for n in seq if _is_graph_tool(n, prefix))
    return {
        "first_tool": first,
        "first_is_graph": bool(first and _is_graph_tool(first, prefix)),
        "graph_calls": graph_calls,
    }


# ---------------------------------------------------------------------------
# Localization (LocAgent-style): extract the agent's predicted files
# ---------------------------------------------------------------------------


def extract_agent_text(stdout: str) -> str:
    """Concatenate the agent's own message text (not tool output) in order.

    Scans both ``assistant.message`` (finalized) and ``assistant.message_delta``
    (streaming) so the sentinel is recoverable across CLI versions. Finalized
    messages stream after their deltas, so the last sentinel occurrence (which
    the parser keys on) lands in a complete message.

    Only non-blank string ``data.content`` values are kept; they are joined
    with newlines. Malformed lines and events without a ``data`` object are
    skipped.
    """
    parts: list[str] = []
    for line in stdout.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            ev = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(ev, dict):
            continue
        if ev.get("type") not in ("assistant.message", "assistant.message_delta"):
            continue
        data = ev.get("data")
        if not isinstance(data, dict):
            continue
        content = data.get("content")
        if isinstance(content, str) and content.strip():
            parts.append(content)
    return "\n".join(parts)


def _norm_path(path: str) -> str:
    """Normalize a predicted path to a repo-root-relative posix form.

    Strips surrounding whitespace and quotes, converts backslashes to ``/``,
    drops leading ``./`` segments, removes a leading ``a/`` and then a leading
    ``b/`` (diff-style prefixes), and finally strips leading slashes.
    """
    p = path.strip().strip("'\"").strip()
    p = p.replace("\\", "/")
    while p.startswith("./"):
        p = p[2:]
    for prefix in ("a/", "b/"):
        if p.startswith(prefix):
            p = p[len(prefix):]
    return p.lstrip("/")


def _array_brackets_balance(text: str, start: int) -> bool:
    """Return True if the ``[`` at ``start`` is eventually closed.

    Plain bracket counting (string-unaware); used only to pick between the
    ``unbalanced_array`` and ``json_error`` failure labels.
    """
    depth = 0
    for ch in text[start:]:
        if ch == "[":
            depth += 1
        elif ch == "]":
            depth -= 1
            if depth == 0:
                return True
    return False


def parse_localization(text: str) -> tuple[list[str], str | None, bool]:
    """Parse the predicted file list from the agent's final message.

    Returns ``(pred_files, parse_error, fallback)``. The strict path looks for
    the LAST ``FINAL_LOCALIZATION_JSON:`` sentinel followed by a JSON array. If
    the sentinel is missing/malformed, ``fallback`` is True and ``parse_error``
    carries the reason (headline numbers should drop / stratify these):
    ``sentinel_missing``, ``no_array``, ``unbalanced_array``,
    ``json_error:<msg>`` or ``not_a_list``.

    The array is decoded in place with ``json.JSONDecoder.raw_decode`` so that
    ``[`` / ``]`` inside a path string cannot truncate it. Non-string items are
    dropped; paths are normalized with ``_norm_path`` and de-duplicated while
    preserving order.
    """
    idx = text.rfind(LOCALIZE_SENTINEL)
    if idx == -1:
        return [], "sentinel_missing", True
    tail = text[idx + len(LOCALIZE_SENTINEL):]
    start = tail.find("[")
    if start == -1:
        return [], "no_array", True
    try:
        arr, _ = json.JSONDecoder().raw_decode(tail, start)
    except json.JSONDecodeError as exc:
        if _array_brackets_balance(tail, start):
            return [], f"json_error:{exc.msg}", True
        return [], "unbalanced_array", True
    if not isinstance(arr, list):
        return [], "not_a_list", True
    pred: list[str] = []
    for item in arr:
        if not isinstance(item, str):
            continue
        norm = _norm_path(item)
        if norm and norm not in pred:
            pred.append(norm)
    return pred, None, False


def score_localization(pred: list[str], gold: list[str]) -> dict[str, Any]:
    """Score predicted files vs gold (order-sensitive for acc@k / MRR).

    Both lists are normalized with ``_norm_path`` first. Recall and precision
    are set-based; ``acc_at_k`` is 1.0 when any gold file is among the first
    ``k`` predictions; ``file_mrr`` is the reciprocal rank of the first gold
    hit (0.0 if none). With no gold files every metric is 0 / False.
    """
    gold_set = {_norm_path(g) for g in gold}
    pred_norm = [_norm_path(p) for p in pred]
    pred_set = set(pred_norm)
    hits = gold_set & pred_set
    recall = len(hits) / len(gold_set) if gold_set else 0.0
    precision = len(hits) / len(pred_set) if pred_set else 0.0
    all_found = bool(gold_set) and gold_set.issubset(pred_set)

    def acc_at(k: int) -> float:
        topk = set(pred_norm[:k])
        return 1.0 if gold_set and (gold_set & topk) else 0.0

    mrr = 0.0
    for rank, path in enumerate(pred_norm, start=1):
        if path in gold_set:
            mrr = 1.0 / rank
            break
    return {
        "gold_files": sorted(gold_set),
        "pred_files": pred_norm,
        "file_recall": round(recall, 4),
        "file_precision": round(precision, 4),
        "file_all_found": all_found,
        "acc_at_1": acc_at(1),
        "acc_at_3": acc_at(3),
        "acc_at_5": acc_at(5),
        "file_mrr": round(mrr, 4),
    }


# ---------------------------------------------------------------------------
# Patch extraction
#
--------------------------------------------------------------------------- + + +def extract_patch(repo_path: Path, base_commit: str) -> dict[str, Any]: + """Capture all changes vs base as a single unified diff (junk-excluded). + + ``git add -A`` then ``git diff --cached `` captures committed, staged, + unstaged and untracked changes regardless of how Copilot left the tree. + Build/cache dirs are excluded via pathspec. + """ + excludes = [f":(exclude){d}" for d in _PATCH_EXCLUDES] + excludes += [f":(exclude)*/{d}/*" for d in _PATCH_EXCLUDES] + swe_bench._git(["add", "-A"], cwd=repo_path, check=False) + res = swe_bench._git( + ["diff", "--cached", base_commit, "--", ".", *excludes], + cwd=repo_path, + check=False, + ) + patch = res.stdout + files = _patched_files(patch) + touched_tests = any(not swe_bench.is_source_file(f) and f.endswith(".py") for f in files) or any( + swe_bench._TEST_PATH_RE.search(f) for f in files + ) + return {"patch": patch, "patched_files": files, "touched_tests": touched_tests} + + +def _patched_files(patch: str) -> list[str]: + files = [] + for line in patch.splitlines(): + if line.startswith("+++ b/"): + files.append(line[6:]) + return files + + +# --------------------------------------------------------------------------- +# Per-instance driver +# --------------------------------------------------------------------------- + + +def _resolve_run_dir( + cache_dir: Path, + *, + model: str, + mode: str, + prompt_mode: str, + track: str, + instance_id: str, + run_idx: int, +) -> Path: + """Build the run_dir for one trajectory. + + For multi-run pilots (run_idx>0) each repeat is nested under ``run`` so + logs are not overwritten; run_idx==0 keeps the bare layout for + backwards-compat with existing single-run caches. ``row_stdout_path()`` + resolves both layouts. 
+ """ + run_dir = cache_dir / "runs" / model / mode / prompt_mode / track / instance_id + if run_idx > 0: + run_dir = run_dir / f"run{run_idx}" + return run_dir + + +def _compute_prompt_mode( + *, adopt_arm: str | None, nudge: bool, inject_label: str | None = None +) -> str: + """Single source of truth for prompt_mode so main() and run_one() agree. + + The NOISY/GRAPH-WRONG distractor condition is orthogonal to the prompt arm, + so it is encoded as a suffix (e.g. ``adopt-sem-noisy``). CLEAN runs carry no + suffix and stay byte-identical to the plain arm prompt_mode. + """ + if adopt_arm is not None: + base = f"adopt-{adopt_arm}" + else: + base = "nudged" if nudge else "neutral" + if inject_label: + return f"{base}-{inject_label}" + return base + + +def _inject_env( + inst: swe_bench.SweBenchInstance, + *, + inject_manifest: Path | None, + inject_k: int | None, +) -> dict[str, str] | None: + """Build the env that gates server-side NOISY distractor injection. + + The keyed-by-task manifest is read inside the MCP server; here we just point + it at the manifest path and pin BENCH_NOISY_TASK to this instance so only + this task's distractors are injected. Returns None when injection is off. 
+ """ + if inject_manifest is None: + return None + env = { + "BENCH_NOISY_MANIFEST": str(inject_manifest), + "BENCH_NOISY_TASK": inst.instance_id, + } + if inject_k is not None: + env["BENCH_NOISY_K"] = str(inject_k) + return env + + +def run_one( + inst: swe_bench.SweBenchInstance, + *, + track: str, + model: str, + cache_dir: Path, + wall_time: float, + server_root: Path, + run_idx: int = 0, + nudge: bool = False, + mode: str = FIX, + adopt_arm: str | None = None, + inject_manifest: Path | None = None, + inject_label: str | None = None, + inject_k: int | None = None, +) -> dict[str, Any]: + if adopt_arm is not None and (track != CODE_GRAPH or mode != LOCALIZE): + raise ValueError( + f"adopt_arm={adopt_arm!r} requires track={CODE_GRAPH} and mode={LOCALIZE}; " + f"got track={track!r} mode={mode!r}" + ) + if adopt_arm is not None and adopt_arm not in ADOPT_ARMS: + raise ValueError(f"unknown adopt_arm={adopt_arm!r}; expected one of {ADOPT_ARMS}") + if inject_manifest is not None: + if track != CODE_GRAPH or mode != LOCALIZE: + raise ValueError( + f"inject_manifest requires track={CODE_GRAPH} and mode={LOCALIZE}; " + f"got track={track!r} mode={mode!r}" + ) + if not inject_label: + raise ValueError("inject_manifest requires a non-empty inject_label") + prompt_mode = _compute_prompt_mode( + adopt_arm=adopt_arm, nudge=nudge, inject_label=inject_label + ) + work_root = cache_dir / "worktrees" / track + work_root.mkdir(parents=True, exist_ok=True) + run_dir = _resolve_run_dir( + cache_dir, + model=model, + mode=mode, + prompt_mode=prompt_mode, + track=track, + instance_id=inst.instance_id, + run_idx=run_idx, + ) + if run_dir.exists(): + shutil.rmtree(run_dir) + run_dir.mkdir(parents=True, exist_ok=True) + + print(f"\n=== {inst.instance_id} [{track}] model={model} mode={mode} prompt={prompt_mode} ===") + + # Common base row fields (identity). 
+ base_row = { + "benchmark": "swe_bench_verified", + "task_id": inst.instance_id, + "config": track, + "model": model, + "mode": mode, + "prompt_mode": prompt_mode, + "run_idx": run_idx, + "runner": RUNNER_VERSION, + } + + if mode == LOCALIZE: + return _run_localize( + inst, track=track, model=model, run_dir=run_dir, work_root=work_root, + wall_time=wall_time, server_root=server_root, nudge=nudge, base_row=base_row, + adopt_arm=adopt_arm, inject_manifest=inject_manifest, inject_k=inject_k, + ) + + repo_path = swe_bench.prepare_worktree( + inst, worktrees_dir=work_root.resolve(), apply_test_patch=True + ) + + index_sec = None + mcp_config = None + if track == CODE_GRAPH: + index_sec = ensure_indexed(repo_path, fresh=True) + host, port = _falkor_settings() + wrapper = _write_mcp_wrapper(run_dir, server_root) + mcp_config = _write_mcp_config(run_dir, wrapper, host, port) + elif track == LSP: + wrapper = _write_lsp_wrapper(run_dir, repo_path) + mcp_config = _write_lsp_mcp_config(run_dir, wrapper) + + prompt = build_prompt( + track, repo_path, inst.problem_statement, repo_path.name, nudge=nudge, mode=mode + ) + (run_dir / "prompt.txt").write_text(prompt) + + result = run_copilot( + prompt=prompt, + model=model, + cwd=repo_path, + log_dir=run_dir / "logs", + mcp_config=mcp_config, + wall_time=wall_time, + ) + + tokens = parse_tokens_from_logs(run_dir / "logs") + result_ev = parse_result_event(result["stdout"]) + tool_total, tool_by_name = parse_tool_calls(result["stdout"]) + compliance = nudge_compliance(result["stdout"], track) + patch_info = extract_patch(repo_path, inst.base_commit) + + if result.get("startup_failed"): + print( + f"[error] {inst.instance_id} [{track}] copilot startup failed after " + f"{COPILOT_MAX_ATTEMPTS} attempts (rc={result['returncode']}); " + f"marking incomplete for re-run" + ) + return { + **base_row, + "index_sec": index_sec, + "timed_out": result["timed_out"], + "returncode": result["returncode"], + "outcome": "error", + "error": 
f"copilot_startup_failed: {result.get('stderr', '').strip()[:200]}", + "wall_clock_sec": round(result["wall"], 2), + "completed": False, + } + + row = { + **base_row, + "input_tokens": tokens["input_tokens"], + "output_tokens": tokens["output_tokens"], + "total_tokens": tokens["total_tokens"], + "cached_input_tokens": tokens["cached_input_tokens"], + "cache_creation_tokens": tokens["cache_creation_tokens"], + "usage_blocks": tokens["usage_blocks"], + "premium_requests": result_ev["premium_requests"], + "tool_calls_total": tool_total, + "tool_calls_by_name": tool_by_name, + "first_tool": compliance["first_tool"], + "first_is_graph": compliance["first_is_graph"], + "graph_calls": compliance["graph_calls"], + "files_modified": result_ev["files_modified"], + "touched_tests": patch_info["touched_tests"], + "index_sec": index_sec, + "timed_out": result["timed_out"], + "returncode": result["returncode"], + "outcome": "ungraded", + "patch": patch_info["patch"], + "wall_clock_sec": round(result["wall"], 2), + "completed": True, + **hardening_meta(repo_path, result["stdout"], tokens["reasoning_tokens"]), + } + _maybe_write_trace(run_dir, row) + print( + f"[done] {inst.instance_id} [{track}] in={row['input_tokens']} " + f"out={row['output_tokens']} premium={row['premium_requests']} " + f"tools={tool_total} graph={compliance['graph_calls']} " + f"patch_files={len(patch_info['patched_files'])} " + f"timed_out={result['timed_out']} wall={row['wall_clock_sec']}s" + ) + return row + + +def _run_localize( + inst: swe_bench.SweBenchInstance, + *, + track: str, + model: str, + run_dir: Path, + work_root: Path, + wall_time: float, + server_root: Path, + nudge: bool, + base_row: dict[str, Any], + adopt_arm: str | None = None, + inject_manifest: Path | None = None, + inject_k: int | None = None, +) -> dict[str, Any]: + """Localization driver: no edits, no Docker; score predicted files vs gold.""" + gold = swe_bench.gold_changed_files(inst.patch, source_only=True) + if not gold: + 
print(f"[skip] {inst.instance_id} [{track}] no source-only gold files") + return { + **base_row, + "outcome": "skipped_no_gold", + "completed": True, + "gold_files": [], + } + + # Distinct, test-free worktree forces a clean re-index with no test_patch + # leakage into the graph. + repo_path = swe_bench.prepare_localize_worktree( + inst, worktrees_dir=work_root.resolve() + ) + + index_sec = None + mcp_config = None + if track == CODE_GRAPH: + index_sec = ensure_indexed(repo_path, fresh=True) + host, port = _falkor_settings() + wrapper = _write_mcp_wrapper(run_dir, server_root) + extra_env = _inject_env(inst, inject_manifest=inject_manifest, inject_k=inject_k) + mcp_config = _write_mcp_config(run_dir, wrapper, host, port, extra_env=extra_env) + elif track == LSP: + wrapper = _write_lsp_wrapper(run_dir, repo_path) + mcp_config = _write_lsp_mcp_config(run_dir, wrapper) + + prompt = build_prompt( + track, repo_path, inst.problem_statement, repo_path.name, + nudge=nudge, mode=LOCALIZE, adopt_arm=adopt_arm, + ) + (run_dir / "prompt.txt").write_text(prompt) + + result = run_copilot( + prompt=prompt, + model=model, + cwd=repo_path, + log_dir=run_dir / "logs", + mcp_config=mcp_config, + wall_time=wall_time, + ) + + tokens = parse_tokens_from_logs(run_dir / "logs") + result_ev = parse_result_event(result["stdout"]) + tool_total, tool_by_name = parse_tool_calls(result["stdout"]) + compliance = nudge_compliance(result["stdout"], track) + + agent_text = extract_agent_text(result["stdout"]) + (run_dir / "agent_text.txt").write_text(agent_text) + pred, parse_error, fallback = parse_localization(agent_text) + scores = score_localization(pred, gold) + leak = swe_bench.leakage_flags(inst, gold) + + # A transient startup/network failure produces no model output; record it as + # an error (completed=False) so it is re-run rather than scored as recall=0. 
+ if result.get("startup_failed"): + print( + f"[error] {inst.instance_id} [{track}] copilot startup failed after " + f"{COPILOT_MAX_ATTEMPTS} attempts (rc={result['returncode']}); " + f"marking incomplete for re-run" + ) + return { + **base_row, + "index_sec": index_sec, + "index_fresh": track == CODE_GRAPH, + "timed_out": result["timed_out"], + "returncode": result["returncode"], + "outcome": "error", + "error": f"copilot_startup_failed: {result.get('stderr', '').strip()[:200]}", + "wall_clock_sec": round(result["wall"], 2), + "completed": False, + } + + row = { + **base_row, + "input_tokens": tokens["input_tokens"], + "output_tokens": tokens["output_tokens"], + "total_tokens": tokens["total_tokens"], + "cached_input_tokens": tokens["cached_input_tokens"], + "cache_creation_tokens": tokens["cache_creation_tokens"], + "usage_blocks": tokens["usage_blocks"], + "premium_requests": result_ev["premium_requests"], + "tool_calls_total": tool_total, + "tool_calls_by_name": tool_by_name, + "first_tool": compliance["first_tool"], + "first_is_graph": compliance["first_is_graph"], + "graph_calls": compliance["graph_calls"], + "index_sec": index_sec, + "index_fresh": track == CODE_GRAPH, + "timed_out": result["timed_out"], + "returncode": result["returncode"], + "parse_error": parse_error, + "parse_fallback": fallback, + "is_structural": swe_bench.is_structural(inst), + "mentions_gold_path": leak.get("mentions_gold_path"), + "mentions_gold_basename": leak.get("mentions_gold_basename"), + "contains_traceback": leak.get("contains_traceback"), + "outcome": "localized", + "wall_clock_sec": round(result["wall"], 2), + "completed": True, + **scores, + **hardening_meta(repo_path, result["stdout"], tokens["reasoning_tokens"]), + } + _maybe_write_trace(run_dir, row) + print( + f"[loc] {inst.instance_id} [{track}] recall={scores['file_recall']} " + f"acc@1={scores['acc_at_1']} mrr={scores['file_mrr']} " + f"pred={len(pred)} gold={len(scores['gold_files'])} " + 
f"graph={compliance['graph_calls']} parse_err={parse_error} " + f"in={row['input_tokens']} wall={row['wall_clock_sec']}s" + ) + return row + + +def _maybe_write_trace(run_dir: Path, row: dict[str, Any]) -> None: + """Best-effort decision-loop trace extraction; never break a run on error.""" + try: + from bench.analysis.trace import extract_run + + extract_run(run_dir, row=row, write=True) + except Exception as exc: # noqa: BLE001 - trace is diagnostic, not critical + print(f"[trace] extraction failed for {run_dir.name}: {exc}") + + +# --------------------------------------------------------------------------- +# Resume / IO +# --------------------------------------------------------------------------- + + +def _load_done(results_path: Path) -> set[tuple]: + done: set[tuple] = set() + if not results_path.exists(): + return done + for line in results_path.read_text().splitlines(): + line = line.strip() + if not line: + continue + try: + r = json.loads(line) + except json.JSONDecodeError: + continue + if r.get("completed") and r.get("runner") == RUNNER_VERSION: + done.add(( + r["task_id"], + r["config"], + r.get("model", ""), + r.get("mode", FIX), + r.get("prompt_mode", "neutral"), + int(r.get("run_idx", 0)), + )) + return done + + +def _append_row(results_path: Path, row: dict[str, Any]) -> None: + results_path.parent.mkdir(parents=True, exist_ok=True) + with results_path.open("a") as f: + f.write(json.dumps(row) + "\n") + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def _load_instance_ids(args) -> list[str]: + if args.instances_file: + ids = [ + ln.strip() + for ln in Path(args.instances_file).read_text().splitlines() + if ln.strip() and not ln.startswith("#") + ] + return ids + if args.instance: + return list(args.instance) + if args.select_structural: + return [] # resolved later against the loaded dataset + raise SystemExit( + "provide 
--instance ID [ID ...], --instances-file FILE, or --select-structural N" + ) + + +def main(argv: list[str] | None = None) -> int: + p = argparse.ArgumentParser(description="Drive Copilot CLI over SWE-bench Verified.") + p.add_argument("--instance", nargs="*", help="explicit instance id(s)") + p.add_argument("--instances-file", help="file with one instance id per line") + p.add_argument( + "--select-structural", type=int, default=0, + help="auto-select N structural instances (>=2 source files/dirs) for localization", + ) + p.add_argument( + "--track", action="append", choices=VALID_TRACKS, default=None, + help="track(s) to run (default: both)", + ) + p.add_argument("--model", default="claude-opus-4.8") + p.add_argument("--mode", choices=VALID_MODES, default=FIX, help="fix or localize") + p.add_argument( + "--nudge", action="store_true", + help="use the nudged prompt variant (forces structured search-first)", + ) + p.add_argument( + "--adopt-arm", choices=ADOPT_ARMS, default=None, + help="Lane 1 adoption-calibration arm (code_graph + localize only): " + "ctrl (=nudge base), sem (edge-semantics clause), rat (keep/drop step)", + ) + p.add_argument( + "--inject-manifest", default=None, + help="path to a NOISY/GRAPH-WRONG distractor manifest (code_graph + localize only); " + "enables server-side injection of verified non-gold candidates", + ) + p.add_argument( + "--inject-label", default=None, + help="condition label suffixed onto prompt_mode when injecting (e.g. 
'noisy', 'gwrong')", + ) + p.add_argument( + "--inject-k", type=int, default=None, + help="override number of distractors to inject (default: manifest k)", + ) + p.add_argument("--cache-dir", default=str(DEFAULT_CACHE)) + p.add_argument("--results", default=None, help="results jsonl (default: //results.jsonl)") + p.add_argument("--wall-time", type=float, default=1200.0, help="per-run wall-clock seconds") + p.add_argument("--server-root", default=str(DEFAULT_MCP_SERVER_ROOT)) + p.add_argument("--run-idx", type=int, default=0) + p.add_argument("--seed", type=int, default=swe_bench.DEFAULT_SEED, help="seed for --select-structural") + p.add_argument( + "--dataset", default=None, + help="HuggingFace dataset name (default: SWE-bench_Verified). " + "Use 'loc-bench' shorthand or a full id like czlll/Loc-Bench_V1.", + ) + p.add_argument( + "--no-leak", action="store_true", + help="with --select-structural: drop instances whose problem statement names a gold file (structural-hard gate)", + ) + args = p.parse_args(argv) + + tracks = args.track or list(VALID_TRACKS) + if args.adopt_arm is not None: + # Lane 1 arms are code_graph + localize only; pin the track/mode so the + # dedup key, run_dir and prompt all agree with run_one's guard. 
+ if args.mode != LOCALIZE: + raise SystemExit(f"--adopt-arm requires --mode {LOCALIZE}") + if tracks != [CODE_GRAPH]: + raise SystemExit(f"--adopt-arm requires --track {CODE_GRAPH} (only)") + cache_dir = Path(args.cache_dir).resolve() + inject_manifest: Path | None = None + if args.inject_manifest is not None: + if args.mode != LOCALIZE: + raise SystemExit(f"--inject-manifest requires --mode {LOCALIZE}") + if tracks != [CODE_GRAPH]: + raise SystemExit(f"--inject-manifest requires --track {CODE_GRAPH} (only)") + if not args.inject_label: + raise SystemExit("--inject-manifest requires --inject-label") + inject_manifest = Path(args.inject_manifest).resolve() + if not inject_manifest.is_file(): + raise SystemExit(f"--inject-manifest not found: {inject_manifest}") + results_path = ( + Path(args.results) + if args.results + else cache_dir / args.model / "results.jsonl" + ) + server_root = Path(args.server_root) + # Only suffix prompt_mode when injection is actually active. + effective_inject_label = args.inject_label if inject_manifest is not None else None + prompt_mode = _compute_prompt_mode( + adopt_arm=args.adopt_arm, nudge=args.nudge, inject_label=effective_inject_label, + ) + + ids = _load_instance_ids(args) + dataset_name = args.dataset + if dataset_name and dataset_name.lower() in ("loc-bench", "locbench"): + dataset_name = swe_bench.LOC_BENCH_DATASET + all_insts = {i.instance_id: i for i in swe_bench.load_instances(dataset_name=dataset_name)} + if ids: + missing = [i for i in ids if i not in all_insts] + if missing: + raise SystemExit(f"unknown instance ids: {missing}") + insts = [all_insts[i] for i in ids] + else: + insts = swe_bench.select_structural( + list(all_insts.values()), seed=args.seed, n=args.select_structural, + python_only=True, no_leak=args.no_leak, + ) + print(f"[plan] selected {len(insts)} structural instances: " + f"{[i.instance_id for i in insts]}") + + done = _load_done(results_path) + print(f"[plan] {len(insts)} instances x {len(tracks)} 
tracks; mode={args.mode} " + f"prompt={prompt_mode}; {len(done)} rows already complete; results -> {results_path}") + + for inst in insts: + for track in tracks: + key = (inst.instance_id, track, args.model, args.mode, prompt_mode, args.run_idx) + if key in done: + print(f"[skip] {inst.instance_id} [{track}] already complete") + continue + try: + row = run_one( + inst, + track=track, + model=args.model, + cache_dir=cache_dir, + wall_time=args.wall_time, + server_root=server_root, + run_idx=args.run_idx, + nudge=args.nudge, + mode=args.mode, + adopt_arm=args.adopt_arm, + inject_manifest=inject_manifest, + inject_label=effective_inject_label, + inject_k=args.inject_k, + ) + except Exception as exc: # noqa: BLE001 + print(f"[error] {inst.instance_id} [{track}]: {exc!r}", file=sys.stderr) + traceback.print_exc() + row = { + "benchmark": "swe_bench_verified", + "task_id": inst.instance_id, + "config": track, + "model": args.model, + "mode": args.mode, + "prompt_mode": prompt_mode, + "run_idx": args.run_idx, + "runner": RUNNER_VERSION, + "outcome": "error", + "error": repr(exc), + "patch": "", + "completed": False, + } + _append_row(results_path, row) + + print("[plan] done") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/bench/runners/copilot_tco.py b/bench/runners/copilot_tco.py new file mode 100644 index 00000000..628f0cdd --- /dev/null +++ b/bench/runners/copilot_tco.py @@ -0,0 +1,196 @@ +"""Total-cost-of-ownership (TCO) accounting for the Copilot benchmark tracks. + +Copilot's own token accounting captures only the **agent model** spend. For the +``code_graph`` track the *true* cost has three more components that the agent's +token count never sees: + + 1. **Indexing** -- one-time CPU to build the FalkorDB graph. With the + tree-sitter resolver (``CODE_GRAPH_PY_RESOLVER=tree_sitter``) this is + **LLM-free**, so it is pure compute that amortizes across every later + query of the same repo. 
We report it as wall-seconds (and an optional + compute-$ estimate), never blended into the per-task model cost. + 2. **FalkorDB hosting** -- a standing graph DB. Amortizable infra, reported + as a flat note, not a per-task charge. + 3. **GraphRAG ``ask`` side-LLM** -- the ONLY code-graph tool that calls an + LLM (NL->Cypher via ``MODEL_NAME``, default gemini-flash-lite). The + headline tracks exclude ``ask`` so this is normally **$0**; if a run does + call it, ``graphrag_*`` fields on the row meter it and it is added here. + +So the headline takeaway: with ``ask`` excluded and tree-sitter indexing, the +code-graph track adds **zero per-task side-LLM cost** over the no-MCP control -- +its only delta is amortizable infra. This module makes that explicit and prices +any ``ask`` usage when present. + +Usage: + uv run python -m bench.runners.copilot_tco --results +""" + +from __future__ import annotations + +import argparse +import json +from collections import defaultdict +from pathlib import Path +from typing import Any + +# List price per 1M tokens (USD): input / output. Illustrative (non-billing). +AGENT_PRICING = { + "opus": (15.0, 75.0), + "sonnet": (3.0, 15.0), + "haiku": (0.80, 4.0), +} + +# GraphRAG `ask` underlying model price per 1M tokens (input/output). +# Default MODEL_NAME is gemini/gemini-flash-lite-latest. Illustrative. +GRAPHRAG_PRICING = { + "gemini-flash-lite": (0.075, 0.30), + "gemini-flash": (0.15, 0.60), +} +DEFAULT_GRAPHRAG_MODEL = "gemini-flash-lite" + +# Rough on-demand compute price for indexing wall-time (1 vCPU-hour). Illustrative. 
def agent_key(model: str) -> str:
    """Map a Copilot model id (or shorthand) to an AGENT_PRICING key.

    The match is a case-insensitive substring test. Families are probed in
    the fixed order opus, sonnet, haiku and the first hit wins. A model id
    that names none of them is priced as ``"sonnet"``.
    """
    lowered = model.lower()
    for family in ("opus", "sonnet", "haiku"):
        if family in lowered:
            return family
    # Unknown model family: fall back to the sonnet price tier.
    return "sonnet"
Indexing is reported + # separately because it amortizes across all queries of the repo. + "per_task_tco_usd": round(agent_usd + g_usd, 4), + } + + +def aggregate(rows: list[dict[str, Any]]) -> dict[str, dict[str, Any]]: + by: dict[str, list[dict[str, Any]]] = defaultdict(list) + for r in rows: + if not r.get("completed"): + continue + by[r.get("config", "?")].append(r) + + out: dict[str, dict[str, Any]] = {} + for cfg, crows in by.items(): + n = len(crows) + tcos = [row_tco(r) for r in crows] + agent = sum(t["agent_usd"] for t in tcos) + graphrag = sum(t["graphrag_usd"] for t in tcos) + index = sum(t["index_usd_amortized_once"] for t in tcos) + premium = sum(t["premium_requests"] for t in tcos) + ask_calls = sum(t["graphrag_ask_calls"] for t in tcos) + resolved = sum(1 for r in crows if r.get("outcome") == "resolved") + out[cfg] = { + "n": n, + "resolved": resolved, + "agent_usd": round(agent, 2), + "graphrag_ask_calls": ask_calls, + "graphrag_usd": round(graphrag, 4), + "index_usd_one_time": round(index, 4), + "premium_requests": premium, + "per_task_tco_usd_sum": round(agent + graphrag, 2), + "per_task_tco_usd_mean": round((agent + graphrag) / n, 4) if n else 0.0, + } + return out + + +def _load(path: Path) -> list[dict[str, Any]]: + rows = [] + for line in path.read_text().splitlines(): + line = line.strip() + if line: + rows.append(json.loads(line)) + return rows + + +def main(argv: list[str] | None = None) -> int: + p = argparse.ArgumentParser(description="TCO accounting for Copilot benchmark runs.") + p.add_argument("--results", required=True, help="results jsonl") + p.add_argument("--json", action="store_true", help="emit JSON instead of a table") + args = p.parse_args(argv) + + rows = _load(Path(args.results)) + agg = aggregate(rows) + + if args.json: + print(json.dumps(agg, indent=2)) + return 0 + + print(f"\nTCO by track ({Path(args.results).name})\n") + hdr = ( + f"{'track':>16} | {'n':>3} | {'resolved':>8} | {'agent $':>9} | " + f"{'ask calls':>9} | 
{'ask $':>7} | {'index $ (1x)':>12} | " + f"{'premium':>7} | {'TCO $/task':>10}" + ) + print(hdr) + print("-" * len(hdr)) + for cfg in sorted(agg): + s = agg[cfg] + print( + f"{cfg:>16} | {s['n']:>3} | {s['resolved']:>8} | " + f"{s['agent_usd']:>9.2f} | {s['graphrag_ask_calls']:>9} | " + f"{s['graphrag_usd']:>7.4f} | {s['index_usd_one_time']:>12.4f} | " + f"{s['premium_requests']:>7} | {s['per_task_tco_usd_mean']:>10.4f}" + ) + print( + "\nNotes: agent $ = Copilot model tokens (list price). ask $ = GraphRAG " + "side-LLM (0 when `ask` excluded). index $ = one-time tree-sitter " + "indexing CPU, amortizes across all queries (LLM-free). FalkorDB hosting " + "is standing infra, not charged per task. Premium requests are Copilot's " + "real billing unit -- reported separately, never blended into $." + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/bench/runners/localize_runner.py b/bench/runners/localize_runner.py new file mode 100644 index 00000000..fc37f9d3 --- /dev/null +++ b/bench/runners/localize_runner.py @@ -0,0 +1,576 @@ +"""LocAgent-style code-localization benchmark. + +Where the full SWE-bench *fix* task ties all configs on accuracy (because +fixes are localized and grep suffices), this benchmark isolates the +**navigation** problem: given only the issue text, the agent must name the +source file(s) that need to change — without editing anything. We then score +file-level localization (recall / precision / Acc@k / MRR) and the token / +command cost each tool incurs to get there. + +Design (see plan.md 2026-05-30 23:30): + * Test-free worktree under a distinct name `{id}__loc` -> a FRESH FalkorDB + index that does NOT contain the test_patch (which would leak the answer). + * One shared, free-form instance template for every config (tools are + advertised by the per-config preamble; no forced first command). 
+ * Strict `FINAL_LOCALIZATION_JSON:` sentinel parsing with an explicit + `parse_error` flag; a regex fallback is recorded for diagnostics only and + never feeds the headline metric. + * Gold = non-test, non-doc Python files from the gold patch. + +Run: + uv run python -m bench.runners.localize_runner --set structural \ + --config baseline --config lsp --config code_graph --config code_graph_mcp \ + --limit 30 --model anthropic/claude-opus-4-... \ + --results bench/cache/opus-localize/results.jsonl +""" +from __future__ import annotations + +import argparse +import json +import os +import re +import signal +import subprocess +import time +from pathlib import Path +from typing import Any + +from bench.runners.mini_runner import ( + DEFAULT_CACHE_DIR, + VALID_CONFIGS, + _ensure_indexed, + _ensure_indexed_mcp, + config_env, + load_preamble, +) + +LOCALIZE_RESULTS = DEFAULT_CACHE_DIR / "opus-localize" / "results.jsonl" +LOCALIZE_TRAJECTORIES = DEFAULT_CACHE_DIR / "opus-localize" / "trajectories" + +SENTINEL = "FINAL_LOCALIZATION_JSON:" + +from minisweagent.environments.local import LocalEnvironment # noqa: E402 + + +class SafeLocalEnvironment(LocalEnvironment): + """LocalEnvironment whose timeout reliably reaps the whole process tree. + + The stock implementation runs ``subprocess.run(shell=True, timeout=...)``. + When a command spawns a grandchild that inherits the stdout pipe (e.g. a + jedi/multilspy language server that hangs while indexing a large repo such + as Django), the timeout kills only the shell and ``communicate()`` then + blocks *forever* waiting for the inherited pipe to close — wedging the whole + agent. We launch each command in its own session and ``SIGKILL`` the entire + process group on timeout, which closes the pipe and unblocks the read. + + Everything else (the ``COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT`` completion + check, template vars, serialization, pydantic config) is inherited. 
class TimeoutRetryModel:
    """Wrap a minisweagent model so each API call is bounded by a hard timeout.

    litellm's own ``timeout`` does not reliably interrupt the Azure Anthropic
    passthrough — we have observed an ESTABLISHED socket stall with the Python
    process blocked in a C-level read for 20+ min, CPU frozen, never returning.
    ``SIGALRM`` interrupts even a blocked syscall (PEP 475 re-raises from the
    handler), so we arm it around each ``query`` and retry on stall. The agent's
    own between-step wall-time check then actually becomes reachable.

    All other attributes/methods (cost, n_calls, serialize, format_message, …)
    are delegated to the wrapped model.

    Note: ``query`` installs a ``SIGALRM`` handler, which Python only permits
    from the main thread of the main interpreter (and only on platforms that
    provide ``SIGALRM``).
    """

    def __init__(self, inner: Any, *, per_call_timeout: int = 180, retries: int = 3):
        """Store the wrapped model and the retry policy.

        Args:
            inner: Model object exposing ``query(messages, **kwargs)``.
            per_call_timeout: Whole seconds allowed per ``query`` attempt.
            retries: Extra attempts after the first one stalls.
        """
        self._inner = inner
        self._per_call_timeout = per_call_timeout
        self._retries = retries

    def query(self, messages: list[dict[str, str]], **kwargs) -> dict:
        """Run ``inner.query`` under a per-attempt alarm, retrying on stall.

        Returns:
            Whatever the wrapped model's ``query`` returns on the first
            attempt that completes.

        Raises:
            TimeoutError: Every attempt timed out; the last one is re-raised.
            RuntimeError: No attempt was made at all (``retries`` below 0).
        """
        attempts = self._retries + 1

        # The handler does not depend on the attempt number, so build it once.
        def _on_alarm(signum, frame):  # noqa: ARG001
            raise TimeoutError(
                f"model.query stalled > {self._per_call_timeout}s"
            )

        last_exc = None
        for attempt in range(1, attempts + 1):
            prev = signal.signal(signal.SIGALRM, _on_alarm)
            signal.alarm(self._per_call_timeout)
            try:
                return self._inner.query(messages, **kwargs)
            except TimeoutError as exc:
                last_exc = exc
                # Only announce a retry when one will actually happen.
                action = "retrying" if attempt < attempts else "giving up"
                print(
                    f"[warn] model stalled (attempt {attempt}/"
                    f"{attempts}); {action}",
                    flush=True,
                )
            finally:
                # Always disarm and restore, on success, stall or other error.
                signal.alarm(0)
                signal.signal(signal.SIGALRM, prev)
        raise last_exc if last_exc else RuntimeError("model.query failed")

    def __getattr__(self, name: str) -> Any:
        # Delegate everything we don't override (cost, n_calls, serialize, …).
        # __getattr__ runs only after normal lookup failed. If ``_inner`` itself
        # is missing (instance built without __init__, e.g. by copy/pickle),
        # reading ``self._inner`` below would re-enter this method forever, so
        # fail with a plain AttributeError instead of a RecursionError.
        if name == "_inner":
            raise AttributeError(name)
        return getattr(self._inner, name)
+ +The issue: + +{{{{task}}}} + +To submit, run a single bash command whose stdout is exactly these two lines +(this is how you end the task): + + echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT + echo '{SENTINEL} ["pkg/module/foo.py", "pkg/other.py"]' + +Replace the array with the real repo-relative source file paths you believe +must change, most-likely first. List only implementation files (exclude +tests). The text after `{SENTINEL}` MUST be a valid JSON array of strings. +""" + +_PY_PATH_RE = re.compile(r"[A-Za-z0-9_./-]+\.py") + +# Optional forced-workflow ablation. The free-form primary measures *natural* +# tool adoption (which on this model is near-zero — the agent defaults to +# grep/find). To measure the tool's *intrinsic* value when adoption is +# guaranteed, prepend a per-config mandate to invoke the navigation tool first. +_FORCE_TOOL_SNIPPET = { + "lsp": ( + "MANDATORY WORKFLOW: Before running any grep/find/cat, you MUST use the " + "`lsp` tool at least once to locate a relevant symbol's definition or " + "references (e.g. `lsp goto-definition ` or " + "`lsp find-references ...`). Prefer `lsp` over text search throughout.\n\n" + ), + "code_graph": ( + "MANDATORY WORKFLOW: Before running any grep/find/cat, you MUST use the " + "`cg` tool at least once to locate candidate symbols " + "(`cg search_code --prefix `) and trace cross-file structure " + "(`cg get-callers` / `cg get-dependencies` / `cg impact-analysis`). " + "Prefer `cg` over text search throughout.\n\n" + ), + "code_graph_mcp": ( + "MANDATORY WORKFLOW: Before running any grep/find/cat, you MUST use the " + "`cg-mcp` tool at least once to locate candidate symbols " + "(`cg-mcp search_code --prefix `) and trace cross-file structure " + "(`cg-mcp get_callers` / `cg-mcp get_dependencies` / " + "`cg-mcp impact_analysis`). 
def _all_text(traj: dict[str, Any]) -> str:
    """Join the text the model itself produced, one chunk per line.

    Only messages whose role is ``assistant`` or ``exit`` contribute, so the
    example sentinel shown in the instance prompt (a non-assistant message)
    can never be read back as the agent's answer. String content is taken
    verbatim. List content contributes the ``text`` value of every dict
    segment whose ``text`` is a string. A string ``info.submission`` is
    appended last.
    """
    chunks: list[str] = []
    for message in traj.get("messages", []):
        if message.get("role") in ("assistant", "exit"):
            content = message.get("content", "")
            if isinstance(content, str):
                chunks.append(content)
            elif isinstance(content, list):
                chunks.extend(
                    segment["text"]
                    for segment in content
                    if isinstance(segment, dict)
                    and isinstance(segment.get("text"), str)
                )
    # The captured submission, when present, goes after all message text.
    submission = traj.get("info", {}).get("submission")
    if isinstance(submission, str):
        chunks.append(submission)
    return "\n".join(chunks)
def score_localization(pred: list[str], gold: list[str]) -> dict[str, Any]:
    """Score a ranked file prediction against the gold file list.

    Args:
        pred: Predicted file paths, most likely first.
        gold: Gold file paths.

    Returns:
        A dict with:
        ``file_recall`` / ``file_precision`` -- set overlap ratios, rounded
        to 4 places (0.0 when the respective denominator set is empty);
        ``file_all_found`` -- every gold file appears somewhere in ``pred``;
        ``acc_at_1`` / ``acc_at_3`` / ``acc_at_5`` -- every gold file appears
        within the first k predictions;
        ``file_mrr`` -- reciprocal rank of the first gold hit in ``pred``
        order, rounded to 4 places (0.0 without a hit).
        With an empty ``gold`` all boolean fields are ``False``.
    """
    gold_files = set(gold)
    pred_files = set(pred)
    hit_count = len(gold_files & pred_files)

    def _covered_within(k: int) -> bool:
        # An empty gold set never counts as "covered".
        return bool(gold_files) and gold_files.issubset(pred[:k])

    # Reciprocal rank of the earliest prediction that is a gold file.
    reciprocal_rank = next(
        (1.0 / rank for rank, path in enumerate(pred, start=1) if path in gold_files),
        0.0,
    )

    scores: dict[str, Any] = {}
    scores["file_recall"] = round(hit_count / len(gold_files) if gold_files else 0.0, 4)
    scores["file_precision"] = round(hit_count / len(pred_files) if pred_files else 0.0, 4)
    scores["file_all_found"] = bool(gold_files) and gold_files.issubset(pred_files)
    for k in (1, 3, 5):
        scores[f"acc_at_{k}"] = _covered_within(k)
    scores["file_mrr"] = round(reciprocal_rank, 4)
    return scores
+ if config == "code_graph": + _ensure_indexed(repo_path) + elif config == "code_graph_mcp": + _ensure_indexed_mcp(repo_path) + + from minisweagent.agents.default import DefaultAgent + from minisweagent.models.litellm_model import LitellmModel + + env_vars = config_env(config, repo_path) + env = SafeLocalEnvironment(cwd=str(repo_path), env=env_vars, timeout=120) + agent = DefaultAgent( + TimeoutRetryModel( + LitellmModel( + model_name=model_name, + model_kwargs={"timeout": 180}, + ), + per_call_timeout=180, + retries=3, + ), + env, + system_template=load_preamble(config), + instance_template=build_instance_template(config, force_tool=force_tool), + step_limit=step_limit, + cost_limit=cost_limit, + wall_time_limit_seconds=wall_time_limit_seconds, + ) + + started = time.time() + exit_status = "ok" + try: + agent.run(task=inst.problem_statement) + except Exception as exc: # noqa: BLE001 + exit_status = f"error:{type(exc).__name__}" + wall = round(time.time() - started, 3) + traj = agent.serialize() + + pred, parse_error, fallback = parse_prediction(traj) + sc = score_localization(pred, gold_files) + + from bench.metrics import task_metrics_from_trajectory + + tm = task_metrics_from_trajectory( + traj, benchmark="swe_localize", task_id=inst.instance_id, + config=config, wall_clock_sec=wall, + ) + + row = { + "benchmark": "swe_localize", + "task_id": inst.instance_id, + "config": config, + "force_tool": force_tool, + "input_tokens": tm.input_tokens, + "output_tokens": tm.output_tokens, + "tool_calls_total": tm.tool_calls_total, + "tool_calls_by_name": tm.tool_calls_by_name, + "wall_clock_sec": wall, + "exit_status": exit_status, + "gold_files": gold_files, + "gold_files_count": len(gold_files), + "gold_dirs_count": len({str(Path(f).parent) for f in gold_files}), + "gold_symbols": gold_syms, + "symbol_mappable": bool(gold_syms), + "predicted_files": pred, + "predicted_files_fallback": fallback[:20], + "parse_error": parse_error, + "is_structural": sb.is_structural(inst), 
def _write_trajectory(task_id: str, config: str, traj: dict[str, Any], d: Path) -> None:
    """Persist one trajectory as ``<d>/<task_id>__<config>.json``.

    The directory ``d`` is created first (parents included) if it does not
    exist. The trajectory is written as 2-space indented JSON with sorted
    keys; values JSON cannot encode are stringified via ``default=str``.
    """
    d.mkdir(parents=True, exist_ok=True)
    target = d / f"{task_id}__{config}.json"
    payload = json.dumps(traj, indent=2, sort_keys=True, default=str)
    target.write_text(payload)
" + "Use SWE-bench-Live/SWE-bench-Live for a contamination-free, " + "less-pretraining-saturated corpus.") + p.add_argument("--split", default="test", + help="dataset split (SWE-bench-Live exposes test/lite/verified/full)") + p.add_argument("--repos", default=None, + help="comma-separated owner/name allowlist for --set structural " + "(target large, less-saturated repos)") + p.add_argument("--python-only", action="store_true", + help="require >=1 .py gold file (tools are Python-only)") + args = p.parse_args(argv) + + configs = args.config or ["baseline", "lsp", "code_graph"] + + from bench.datasets import swe_bench as sb + + all_insts = sb.load_instances(split=args.split, dataset_name=args.dataset) + by_id = {i.instance_id: i for i in all_insts} + + if args.set == "cached": + ids: list[str] = [] + src = args.cached_ids + if src and src.exists(): + for line in src.read_text().splitlines(): + line = line.strip() + if line: + ids.append(json.loads(line)["task_id"] if line.startswith("{") else line) + else: + # derive from the prior fix-run results file + prior = DEFAULT_CACHE_DIR / "opus" / "results.jsonl" + seen: set[str] = set() + for line in prior.read_text().splitlines(): + tid = json.loads(line)["task_id"] + if tid not in seen: + seen.add(tid) + ids.append(tid) + insts = [by_id[i] for i in ids if i in by_id] + elif args.set == "structural": + repo_allow = ( + {r.strip() for r in args.repos.split(",") if r.strip()} + if args.repos + else None + ) + insts = sb.select_structural( + all_insts, n=args.limit, repos=repo_allow, python_only=args.python_only + ) + else: + insts = all_insts + + if args.limit is not None: + insts = insts[: args.limit] + + # Drop instances with no source-file gold (e.g. test/doc-only patches). 
+ insts = [i for i in insts if sb.gold_changed_files(i.patch, source_only=True)] + + print(f"[localize] {len(insts)} instances x {len(configs)} configs " + f"({args.set} set), model={args.model}") + args.results.parent.mkdir(parents=True, exist_ok=True) + + done: set[tuple[str, str]] = set() + if args.results.exists(): + for line in args.results.read_text().splitlines(): + if not line.strip(): + continue + r = json.loads(line) + done.add((r["task_id"], r["config"])) + + with args.results.open("a") as out: + for inst in insts: + for cfg in configs: + if (inst.instance_id, cfg) in done: + print(f"[resume] {inst.instance_id}/{cfg} exists; skip") + continue + print(f"[run] {inst.instance_id}/{cfg} ...", flush=True) + try: + res = run_localize_task( + inst, cfg, model_name=args.model, + step_limit=args.step_limit, cost_limit=args.cost_limit, + wall_time_limit_seconds=args.wall_time, + force_tool=args.force_tool, + ) + except Exception as exc: # noqa: BLE001 + print(f"[error] {inst.instance_id}/{cfg}: {exc!r}", flush=True) + continue + out.write(json.dumps(res["row"]) + "\n") + out.flush() + _write_trajectory(inst.instance_id, cfg, res["trajectory"], args.trajectories) + r = res["row"] + print(f"[done] {inst.instance_id}/{cfg} " + f"acc@1={r['acc_at_1']} recall={r['file_recall']} " + f"in={r['input_tokens']} parse_err={r['parse_error']}", flush=True) + print("[localize] DONE") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/bench/runners/mini_runner.py b/bench/runners/mini_runner.py index 3689c0aa..49481c07 100644 --- a/bench/runners/mini_runner.py +++ b/bench/runners/mini_runner.py @@ -49,7 +49,7 @@ DEFAULT_CACHE_DIR = BENCH_DIR / "cache" DEFAULT_RESULTS = DEFAULT_CACHE_DIR / "results.jsonl" -VALID_CONFIGS = ("baseline", "lsp", "code_graph") +VALID_CONFIGS = ("baseline", "lsp", "code_graph", "code_graph_mcp") # --------------------------------------------------------------------------- @@ -155,11 +155,40 @@ class Task: """ 
+INSTANCE_TEMPLATE_CODE_GRAPH_MCP = """\ +You are working in the repository at {{cwd}}. +The code-graph MCP server has already indexed this repository under the +project name `$PROJECT_NAME` on branch `$BRANCH` (use the env vars +literally). + +The task to solve: + +{{task}} + +**Required workflow.** Before reading or editing any file, your first +bash command MUST be: + + `cg-mcp search_code --project "$PROJECT_NAME" --branch "$BRANCH" --prefix ` + +Then use `cg-mcp get_callers --project "$PROJECT_NAME" --branch "$BRANCH" --symbol-id ` +to expand relationships before doing any textual search. Use +`cg-mcp impact_analysis ... --symbol-id --depth 3` before +non-trivial edits. + +When you believe the task is complete, finish your turn with a final +message that contains a unified diff of your changes inside a fenced +``` block, then exit. Do not commit; the harness reads the diff via +`git diff`. +""" + + def load_instance_template(config: str) -> str: if config == "lsp": return INSTANCE_TEMPLATE_LSP if config == "code_graph": return INSTANCE_TEMPLATE_CODE_GRAPH + if config == "code_graph_mcp": + return INSTANCE_TEMPLATE_CODE_GRAPH_MCP return INSTANCE_TEMPLATE @@ -210,6 +239,23 @@ def config_env(config: str, repo_path: Path) -> dict[str, str]: # The agent's preamble references $REPO_NAME — set it to the # worktree dirname, which is what analyze_folder used as the id. env["REPO_NAME"] = repo_path.name + elif config == "code_graph_mcp": + # MCP transport: agent calls `cg-mcp …` which spawns the + # `cgraph-mcp` stdio server per call. FalkorDB coordinates + # are passed through verbatim. + env.setdefault("FALKORDB_HOST", os.environ.get("FALKORDB_HOST", "127.0.0.1")) + env.setdefault("FALKORDB_PORT", os.environ.get("FALKORDB_PORT", "6379")) + # `cgraph-mcp` must be on PATH; the runner installs the + # falkordb-code-graph package into the same interpreter, so + # prepending the venv bin gives us the entry point. 
+ venv_bin = str(Path(sys.executable).parent) + env["PATH"] = f"{venv_bin}:{env['PATH']}" + # The preamble references $PROJECT_NAME and $BRANCH; project + # name matches what `index_repo` derives from the folder + # (= worktree dirname), and branch is the per-instance tag we + # used when indexing. + env["PROJECT_NAME"] = repo_path.name + env["BRANCH"] = os.environ.get("CGRAPH_MCP_BRANCH", "_default") return env @@ -235,17 +281,63 @@ def _ensure_indexed(repo_path: Path) -> None: print(f"[index] {repo_name} already indexed; skip") return print(f"[index] analyzing {repo_path} ...") - with httpx.Client(timeout=600.0, headers=headers) as c: + # Default ignore set: auto-generated / vendored / pathological dirs + # that either contain no useful symbols or send jedi into a + # multi-hour resolve loop (e.g. sympy/integrals/rubi/rules has + # 3000-line files with hundreds of unresolvable symbols per line). + default_ignore = [ + ".git", "venv", ".venv", "node_modules", "__pycache__", + "rubi/rules", # sympy: blocks indexing for ~hours otherwise + "build", "dist", ".tox", ".eggs", + ] + with httpx.Client(timeout=7200.0, headers=headers) as c: r = c.post( f"{base}/api/analyze_folder", - json={"path": str(repo_path), "ignore": []}, + json={"path": str(repo_path), "ignore": default_ignore}, ) if r.status_code != 200: - print(f"[index] WARN analyze_folder returned {r.status_code}: {r.text[:200]}") - else: - print(f"[index] indexed {repo_name}") + raise RuntimeError( + f"analyze_folder returned {r.status_code}: {r.text[:300]}. " + f"Check ALLOWED_ANALYSIS_DIR on the API server covers {repo_path}." + ) + print(f"[index] indexed {repo_name}") + except Exception as exc: + raise RuntimeError(f"failed to index {repo_name} at {repo_path}: {exc}") from exc + + +def _ensure_indexed_mcp(repo_path: Path) -> None: + """MCP-track equivalent of _ensure_indexed. 
+ + Drives the `index_repo` MCP tool in-process via the bench adapter + (avoids spawning a second cgraph-mcp just to bootstrap; the agent + will spawn its own per call). Same skip-if-present optimization + as the HTTP path: cheap GRAPH.LIST scan against FalkorDB. + """ + from bench.agents import code_graph_mcp_adapter as cgm + import redis + + repo_name = repo_path.name + branch = os.environ.get("CGRAPH_MCP_BRANCH", "_default") + host = os.environ.get("FALKORDB_HOST", "127.0.0.1") + port = int(os.environ.get("FALKORDB_PORT", "6379")) + expected_graph = f"code:{repo_name}:{branch}" + try: + r = redis.Redis(host=host, port=port, decode_responses=True, socket_timeout=2) + if expected_graph in (r.execute_command("GRAPH.LIST") or []): + print(f"[index-mcp] {expected_graph} already indexed; skip") + return except Exception as exc: # noqa: BLE001 - print(f"[index] WARN failed to index {repo_name}: {exc!r}") + print(f"[index-mcp] WARN list_graphs failed ({exc!r}); will attempt index anyway") + + print(f"[index-mcp] indexing {repo_path} as {expected_graph} ...") + try: + payload = cgm.index_repo(str(repo_path), branch=branch) + if isinstance(payload, dict) and payload.get("error"): + print(f"[index-mcp] WARN index_repo error: {payload['error']!r}") + else: + print(f"[index-mcp] indexed: {payload}") + except Exception as exc: # noqa: BLE001 + print(f"[index-mcp] WARN failed to index {repo_name}: {exc!r}") # --------------------------------------------------------------------------- @@ -569,7 +661,7 @@ def main(argv: list[str] | None = None) -> int: p = argparse.ArgumentParser(description="code-graph benchmark runner") p.add_argument("--config", choices=VALID_CONFIGS, action="append", - help="one of baseline / lsp / code_graph; repeatable. " + help="one of baseline / lsp / code_graph / code_graph_mcp; repeatable. 
" "Default: all three.") mode = p.add_mutually_exclusive_group(required=True) mode.add_argument("--dry-run", action="store_true", @@ -600,6 +692,11 @@ def main(argv: list[str] | None = None) -> int: "needs GITHUB_TOKEN with models:read scope); " "'github_copilot/gpt-4o' (uses your Copilot session, " "device-code OAuth on first call).") + p.add_argument("--instances-file", type=Path, default=None, + help="Path to a file listing instance_ids to run EXACTLY " + "(one per line, or a results .jsonl with a task_id " + "field). Overrides --stage/--limit sampling so a run " + "can be reproduced against a prior model's exact set.") p.add_argument("--step-limit", type=int, default=50) p.add_argument("--cost-limit", type=float, default=3.0) p.add_argument("--wall-time", type=int, default=1200) @@ -619,12 +716,40 @@ def main(argv: list[str] | None = None) -> int: from bench.metrics import append_jsonl insts = sample_instances(load_instances(), stage=args.stage) - if args.limit is not None: + if args.instances_file is not None: + wanted: list[str] = [] + seen: set[str] = set() + for line in args.instances_file.read_text().splitlines(): + line = line.strip() + if not line: + continue + if line.startswith("{"): + import json as _json + tid = _json.loads(line).get("task_id") + else: + tid = line + if tid and tid not in seen: + seen.add(tid) + wanted.append(tid) + pool = {i.instance_id: i for i in load_instances()} + missing = [t for t in wanted if t not in pool] + if missing: + raise SystemExit(f"instances-file ids not in dataset: {missing[:5]}") + insts = [pool[t] for t in wanted] + print(f"[swe-bench] instances-file override: {len(insts)} instances") + elif args.limit is not None: insts = insts[: args.limit] print(f"[swe-bench] stage={args.stage} running {len(insts)} instances " f"x {len(configs)} configs = {len(insts) * len(configs)} trajectories") for inst in insts: for cfg in configs: + # Resume support: if a trajectory file for this (instance, cfg) + # already exists, skip 
the run entirely. Lets us recover from + # crashes / kills without re-spending tokens on completed work. + existing_traj = args.trajectories / f"{inst.instance_id}__{cfg}.json" + if existing_traj.exists(): + print(f"[resume] {inst.instance_id}/{cfg}: trajectory exists, skip") + continue # Fresh worktree per (instance, config) to avoid cross-talk. wt = prepare_worktree(inst) # Rename so each cfg gets a distinct path. @@ -640,6 +765,8 @@ def main(argv: list[str] | None = None) -> int: # call returns nothing and the agent abandons the tool. if cfg == "code_graph": _ensure_indexed(cfg_wt) + elif cfg == "code_graph_mcp": + _ensure_indexed_mcp(cfg_wt) cfg_rows = run_batch( [task], [cfg], @@ -655,7 +782,15 @@ def main(argv: list[str] | None = None) -> int: ) rows.extend(cfg_rows) ok, summary = verify_instance(inst, cfg_wt) - cfg_rows[-1]["metrics"].outcome = "resolved" if ok else "failed" + # Inline verify is a best-effort signal only; the authoritative + # grade comes from the SWE-bench Docker harness (run separately + # via bench.runners.swebench_verify against the stored patch). + # If pytest couldn't even run here (e.g. missing in the launch + # env), record `ungraded` rather than a misleading `failed`. + if summary.startswith("UNGRADED:"): + cfg_rows[-1]["metrics"].outcome = "ungraded" + else: + cfg_rows[-1]["metrics"].outcome = "resolved" if ok else "failed" if not ok: cfg_rows[-1]["verify_summary"] = summary[-200:] append_jsonl(args.results, cfg_rows[-1]["metrics"]) diff --git a/bench/runners/nav_multihop_agent.py b/bench/runners/nav_multihop_agent.py new file mode 100644 index 00000000..accf10eb --- /dev/null +++ b/bench/runners/nav_multihop_agent.py @@ -0,0 +1,753 @@ +"""Multi-hop navigation PREMIUM agent arm (Lane 2). 
+ +Drives the Copilot CLI over the validated multi-hop nav question set across the +three arms — ``no_mcp`` (baseline, builtin grep/view only), ``lsp`` (jedi MCP), +and ``code_graph`` (FalkorDB code-graph MCP) — and scores each answer against the +jedi oracle gold with set-F1 (file + qualname) and, for path questions, boolean +reachability correctness. Also records the realized token / tool-call / premium +cost per arm (prereg H2: median token reduction). + +This is the agent counterpart to the FREE ``nav_multihop_gate.py`` answerability +gate. The gate proved the GRAPH DATA is compact + correct on uxarray; this runner +measures whether an AGENT wielding each tool actually reaches that answer, and at +what cost. + +Design notes / invariants (see session checkpoint "Multi-hop nav gate"): +* The code_graph arm queries the PRE-BUILT fixed-resolver graph by project name + (default ``mh_uxarray`` on FalkorDB :6380). It does NOT re-index — the running + staging API server lacks the resolver fix (commit 8fa2a43), so re-indexing + would silently rebuild a BROKEN graph and invalidate the comparison. +* All three arms run with cwd = the SAME uxarray worktree the graph was indexed + from and the oracle gold is relative to, so paths align across arms. +* The MCP nav tools actually exposed are find_symbol / search_code / + get_neighbors / impact_analysis / find_path (NOT get_callers/get_callees — + those are internal helpers). The `ask` GraphRAG tool was dropped (it errored + 100% of the time without a Gemini key). find_symbol bridges a symbol name to + its integer node id, which the relationship tools require. The code_graph + capability note names the real tools. 
+ +Usage: + .venv/bin/python -m bench.runners.nav_multihop_agent \ + --questions /tmp/ux_questions.json \ + --project mh_uxarray --port 6380 \ + --model claude-sonnet-4.6 \ + --arms no_mcp lsp code_graph \ + --out /tmp/ux_nav_agent.json +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + +from bench.runners.copilot_runner import ( + CODE_GRAPH, + LSP, + NO_MCP, + DEFAULT_MCP_SERVER_ROOT, + RUNNER_VERSION, + extract_agent_text, + nudge_compliance, + parse_result_event, + parse_tokens_from_logs, + parse_tool_calls, + run_copilot, + _write_lsp_mcp_config, + _write_lsp_wrapper, + _write_mcp_config, + _write_mcp_wrapper, +) +from bench.runners.nav_multihop_gate import _prf + +ARMS = (NO_MCP, LSP, CODE_GRAPH) + +# Question types whose answer is a SET of (path, qualname); the remaining type +# ("path") is a reachability boolean + an optional example chain. +SET_TYPES = ("callers", "callees", "blast_radius") + +NAV_SENTINEL = "FINAL_NAV_JSON:" + +# --------------------------------------------------------------------------- +# Capability notes — symmetric across arms; each names ONLY its own mechanism. +# The code_graph note lists the REAL exposed tools and the type->tool mapping. +# --------------------------------------------------------------------------- + +_CAP_NO_MCP = ( + "No external MCP navigation tools are available. Use Copilot's built-in file " + "reading and text search (grep/rg) tools to trace the call relationships " + "yourself." +) + +_CAP_LSP = ( + "An LSP MCP server is available exposing jedi-backed Python navigation tools " + "(goto_definition, find_references, hover, document_symbols). Paths are " + "repo-root-relative; line/character positions are 0-based (subtract 1 from " + "the 1-based line numbers grep/view report). 
To find CALLERS of a function, " + "use find_references on its definition; to find CALLEES, read the function " + "body and goto_definition on each name it calls. Prefer these precise " + "navigation tools over plain text search when they help." +) + +_CAP_CODE_GRAPH = ( + "A code-graph MCP server is available, already indexed under " + 'project="{project}" (do NOT call index_repo). Workflow: (1) call ' + 'find_symbol(name, project="{project}", file=) to resolve a ' + "function/method/class to its integer symbol_id. The question gives you the " + "exact qualname and file, so pass the leaf name (e.g. the part after the last " + "dot) plus that file to disambiguate; the result with file_match=true is the " + "one you want. (2) get_neighbors(symbol_id, project, relation=\"CALLS\", " + "direction=\"IN\") returns the direct CALLERS, direction=\"OUT\" returns the " + "direct CALLEES. (3) impact_analysis(symbol_id, project, direction=\"IN\", " + "depth=3) returns the transitive callers (blast radius) up to 3 hops. (4) For " + "a reachability question, resolve BOTH endpoints with find_symbol, then " + "find_path(source_id, dest_id, project) returns a call chain between them (an " + "empty result means unreachable). Each returned node carries its file and " + "name, so you can answer directly from the graph without grepping. Prefer " + "these precise graph tools over plain text search." 
+) + + +def _capability(track: str, project: str) -> str: + if track == CODE_GRAPH: + return _CAP_CODE_GRAPH.format(project=project) + if track == LSP: + return _CAP_LSP + return _CAP_NO_MCP + + +_OUTPUT_SPEC_SET = ( + '{{"items": [{{"path": "pkg/module.py", "qualname": "ClassName.method"}}, ' + '{{"path": "pkg/other.py", "qualname": "module_level_function"}}]}}' +) +_OUTPUT_SPEC_PATH = ( + '{{"reachable": true, "path": [{{"path": "pkg/a.py", "qualname": "A.f"}}, ' + '{{"path": "pkg/b.py", "qualname": "B.g"}}]}}' +) + +_PROMPT = """\ +You are answering a CODE NAVIGATION question about the Python repository checked +out at {cwd}. + +QUESTION: +{question} + +Investigate the repository to determine the answer. Do NOT modify any files. Do +NOT run or edit tests. +{capability} +When you are confident, finish your FINAL assistant message with a single line in +EXACTLY this format: + +{sentinel} {output_spec} + +Rules for that line: +- Use repo-root-relative POSIX paths to .py source files. +- `qualname` is the dotted name of the function/method, e.g. `ClassName.method` + or `module_level_function` (no file path, no parentheses, no arguments). +{type_rule} +- Write that line as plain text in your OWN final message. Do NOT emit it through + a shell command, `echo`, a file write, or any tool call.""" + +_TYPE_RULE_SET = ( + "- List EVERY matching function. Include both source and test functions." +) +_TYPE_RULE_PATH = ( + "- If a call chain exists, set reachable=true and give ONE such ordered chain " + "from source to target in `path`. If NO chain exists, set reachable=false and " + "path=[]." 
+) + + +def build_nav_prompt(track: str, cwd: Path, q: dict, project: str) -> str: + is_path = q["type"] == "path" + return _PROMPT.format( + cwd=cwd, + question=q["question"].strip(), + capability=_capability(track, project), + sentinel=NAV_SENTINEL, + output_spec=_OUTPUT_SPEC_PATH if is_path else _OUTPUT_SPEC_SET, + type_rule=_TYPE_RULE_PATH if is_path else _TYPE_RULE_SET, + ) + + +# --------------------------------------------------------------------------- +# Answer parsing + scoring +# --------------------------------------------------------------------------- + + +def _norm_path(p: str) -> str: + p = (p or "").strip().strip("'\"").strip().replace("\\", "/") + while p.startswith("./"): + p = p[2:] + return p.lstrip("/") + + +def _norm_qual(s: str) -> str: + return (s or "").strip().strip("'\"").strip() + + +def _extract_json_object(text: str) -> tuple[dict | None, str | None]: + """Pull the JSON object that follows the last NAV_SENTINEL occurrence.""" + idx = text.rfind(NAV_SENTINEL) + if idx == -1: + return None, "sentinel_missing" + tail = text[idx + len(NAV_SENTINEL):] + start = tail.find("{") + if start == -1: + return None, "no_object" + depth = 0 + end = -1 + in_str = False + esc = False + for i in range(start, len(tail)): + c = tail[i] + if in_str: + if esc: + esc = False + elif c == "\\": + esc = True + elif c == '"': + in_str = False + continue + if c == '"': + in_str = True + elif c == "{": + depth += 1 + elif c == "}": + depth -= 1 + if depth == 0: + end = i + break + if end == -1: + return None, "unbalanced_object" + try: + return json.loads(tail[start:end + 1]), None + except json.JSONDecodeError as exc: + return None, f"json_error:{exc.msg}" + + +def parse_nav_answer(text: str, qtype: str) -> tuple[dict, str | None]: + obj, err = _extract_json_object(text) + if obj is None: + if qtype == "path": + return {"reachable": None, "path": []}, err + return {"items": []}, err + if qtype == "path": + reachable = obj.get("reachable") + if 
isinstance(reachable, str): + reachable = reachable.strip().lower() == "true" + path = obj.get("path") or [] + items = [ + (_norm_path(it.get("path", "")), _norm_qual(it.get("qualname", ""))) + for it in path + if isinstance(it, dict) + ] + return {"reachable": bool(reachable), "path": items}, None + raw = obj.get("items") + if not isinstance(raw, list): + return {"items": []}, "items_not_a_list" + items = [ + (_norm_path(it.get("path", "")), _norm_qual(it.get("qualname", ""))) + for it in raw + if isinstance(it, dict) + ] + return {"items": items}, None + + +def _leaf(qual: str) -> str: + return _norm_qual(qual).split(".")[-1] + + +def _gold_set(q: dict) -> set[tuple[str, str]]: + return {(_norm_path(g["path"]), _norm_qual(g["qualname"])) for g in q["gold"]} + + +def _loose_set(items: list[tuple[str, str]]) -> set[tuple[str, str]]: + """Lenient identity: (path, last dotted component) — tolerates agents that + emit a bare leaf or a different qualname prefix than the oracle.""" + return {(p, _leaf(qn)) for p, qn in items if p} + + +def _endpoint_match(item: tuple[str, str], spec: dict) -> bool: + """A predicted (path, qual) matches a path-question endpoint spec when the + file matches and EITHER the full qualname or just the leaf agrees.""" + p, qn = item + sp = _norm_path(spec["path"]) + if p != sp: + return False + return _norm_qual(qn) == _norm_qual(spec["qualname"]) or _leaf(qn) == _leaf(spec["qualname"]) + + +def score_nav(q: dict, pred: dict) -> dict[str, Any]: + qtype = q["type"] + if qtype == "path": + gold_reachable = bool(q["gold"]["reachable"]) + pred_reachable = pred.get("reachable") + boolean_correct = (pred_reachable is not None) and (pred_reachable == gold_reachable) + ppath = pred.get("path", []) + # For a claimed-reachable answer, demand a non-empty chain whose + # endpoints are the requested source and target (beyond a lucky bool). 
+ endpoints_correct = False + if pred_reachable and ppath: + src = q["symbol"]["source"] + tgt = q["symbol"]["target"] + endpoints_correct = _endpoint_match(ppath[0], src) and _endpoint_match(ppath[-1], tgt) + # The scored credit: negatives need only the correct boolean; positives + # additionally need a well-formed chain with correct endpoints. + if gold_reachable: + path_correct = bool(boolean_correct and endpoints_correct) + else: + path_correct = bool(boolean_correct) + return { + "gold_reachable": gold_reachable, + "pred_reachable": pred_reachable, + "boolean_correct": bool(boolean_correct), + "endpoints_correct": bool(endpoints_correct), + "path_correct": path_correct, + "pred_path_len": len(ppath), + } + gold = _gold_set(q) + pred_set = {(p, qn) for p, qn in pred.get("items", []) if p} + gold_files = {p for p, _ in gold} + pred_files = {p for p, _ in pred_set} + return { + "qual_prf": _prf(pred_set, gold), + "loose_qual_prf": _prf(_loose_set(pred.get("items", [])), _loose_set(list(gold))), + "file_prf": _prf(pred_files, gold_files), + "pred_n": len(pred_set), + "gold_n": len(gold), + } + + +# --------------------------------------------------------------------------- +# Per-(question, arm) run +# --------------------------------------------------------------------------- + + +def _nav_calls(tool_by_name: dict[str, int], track: str) -> int: + prefix = "lsp" if track == LSP else "code-graph" + return sum(n for k, n in tool_by_name.items() if k.startswith(prefix)) + + +def _gate_caps(gate_path: Path | None) -> dict[str, dict]: + """Load per-question GRAPH answerability caps from the agentless gate output. + + The gate computed the graph's Cypher answer vs gold for every question, so its + per-question file/qual F1 is the CEILING a code_graph agent can reach by + perfectly transcribing the tool output. Folding it in lets us attribute a + code_graph agent shortfall to either the agent (below cap) or the data (low + cap) — the rubber-duck's #1 must-fix. 
+ """ + if not gate_path or not gate_path.exists(): + return {} + caps: dict[str, dict] = {} + for r in json.loads(gate_path.read_text()).get("rows", []): + if r["type"] == "path": + caps[r["id"]] = { + "graph_reachable": r.get("graph_reachable"), + "graph_path_correct": r.get("correct"), + } + else: + caps[r["id"]] = { + "graph_file_f1": r.get("file_prf", {}).get("f1"), + "graph_qual_f1": r.get("qual_prf", {}).get("f1"), + "graph_file_recall": r.get("file_prf", {}).get("recall"), + "grep_file_recall": r.get("grep_file_recall"), + } + return caps + + +def run_one_nav( + q: dict, + *, + track: str, + model: str, + worktree: Path, + project: str, + port: int, + server_root: Path, + out_dir: Path, + wall_time: float, + cap: dict | None = None, +) -> dict[str, Any]: + run_dir = out_dir / "runs" / track / q["id"].replace("/", "_").replace("::", "__") + if run_dir.exists(): + import shutil + + shutil.rmtree(run_dir) + run_dir.mkdir(parents=True, exist_ok=True) + + mcp_config = None + if track == CODE_GRAPH: + wrapper = _write_mcp_wrapper(run_dir, server_root) + mcp_config = _write_mcp_config(run_dir, wrapper, "127.0.0.1", port) + elif track == LSP: + wrapper = _write_lsp_wrapper(run_dir, worktree) + mcp_config = _write_lsp_mcp_config(run_dir, wrapper) + + prompt = build_nav_prompt(track, worktree, q, project) + (run_dir / "prompt.txt").write_text(prompt) + + print(f"\n=== {q['id']} [{track}] type={q['type']} model={model} ===") + result = run_copilot( + prompt=prompt, + model=model, + cwd=worktree, + log_dir=run_dir / "logs", + mcp_config=mcp_config, + wall_time=wall_time, + ) + + tokens = parse_tokens_from_logs(run_dir / "logs") + result_ev = parse_result_event(result["stdout"]) + tool_total, tool_by_name = parse_tool_calls(result["stdout"]) + compliance = nudge_compliance(result["stdout"], track) + agent_text = extract_agent_text(result["stdout"]) + (run_dir / "agent_text.txt").write_text(agent_text) + + pred, parse_error = parse_nav_answer(agent_text, q["type"]) + + 
base = { + "id": q["id"], + "type": q["type"], + "hop": q["hop"], + "config": track, + "model": model, + "runner": RUNNER_VERSION, + **(cap or {}), + } + + if result.get("startup_failed"): + print(f"[error] {q['id']} [{track}] copilot startup failed") + return { + **base, + "outcome": "error", + "error": (result.get("stderr") or "").strip()[:200], + "completed": False, + } + + scores = score_nav(q, pred) + row = { + **base, + **scores, + "parse_error": parse_error, + "input_tokens": tokens["input_tokens"], + "output_tokens": tokens["output_tokens"], + "total_tokens": tokens["total_tokens"], + "premium_requests": result_ev["premium_requests"], + "tool_calls_total": tool_total, + "tool_calls_by_name": tool_by_name, + "nav_tool_calls": _nav_calls(tool_by_name, track), + "first_tool": compliance["first_tool"], + "timed_out": result["timed_out"], + "wall_clock_sec": round(result["wall"], 2), + "outcome": "answered", + "completed": True, + } + _print_row(row) + return row + + +def _print_row(row: dict) -> None: + if row["type"] == "path": + verdict = "OK" if row.get("path_correct") else "X" + detail = ( + f"reach pred={row.get('pred_reachable')} gold={row.get('gold_reachable')} " + f"ends={row.get('endpoints_correct')} {verdict}" + ) + else: + f = row.get("file_prf", {}) + qf = row.get("qual_prf", {}) + cap = row.get("graph_file_f1") + detail = ( + f"fileF1={f.get('f1')} qualF1={qf.get('f1')} pred_n={row.get('pred_n')} " + f"cap(graph_fileF1)={cap}" + ) + print( + f"[nav] {row['id']} [{row['config']}] {detail} " + f"in={row['input_tokens']} out={row['output_tokens']} " + f"navtools={row['nav_tool_calls']} tools={row['tool_calls_total']} " + f"parse_err={row.get('parse_error')} wall={row['wall_clock_sec']}s" + ) + + +# --------------------------------------------------------------------------- +# Aggregate report +# --------------------------------------------------------------------------- + + +def _mean(xs: list[float]) -> float: + return round(sum(xs) / len(xs), 4) if 
xs else 0.0 + + +def _median(xs: list[float]) -> float: + if not xs: + return 0.0 + s = sorted(xs) + n = len(s) + mid = n // 2 + return round((s[mid] if n % 2 else (s[mid - 1] + s[mid]) / 2), 2) + + +def aggregate(rows: list[dict]) -> dict[str, Any]: + report: dict[str, Any] = {} + arms = sorted({r["config"] for r in rows if r.get("completed")}) + for arm in arms: + ar = [r for r in rows if r["config"] == arm and r.get("completed")] + per_type: dict[str, Any] = {} + for typ in ("callers", "callees", "blast_radius", "path"): + tr = [r for r in ar if r["type"] == typ] + if not tr: + continue + if typ == "path": + per_type[typ] = { + "n": len(tr), + "path_acc": _mean([1.0 if r.get("path_correct") else 0.0 for r in tr]), + "boolean_acc": _mean([1.0 if r.get("boolean_correct") else 0.0 for r in tr]), + "median_total_tokens": _median([r["total_tokens"] for r in tr]), + "median_nav_calls": _median([r["nav_tool_calls"] for r in tr]), + } + else: + row = { + "n": len(tr), + "file_f1": _mean([r["file_prf"]["f1"] for r in tr]), + "qual_f1": _mean([r["qual_prf"]["f1"] for r in tr]), + "loose_qual_f1": _mean([r["loose_qual_prf"]["f1"] for r in tr]), + "file_recall": _mean([r["file_prf"]["recall"] for r in tr]), + "median_total_tokens": _median([r["total_tokens"] for r in tr]), + "median_nav_calls": _median([r["nav_tool_calls"] for r in tr]), + } + caps = [r["graph_file_f1"] for r in tr if r.get("graph_file_f1") is not None] + if caps: + row["graph_file_f1_cap"] = _mean(caps) + per_type[typ] = row + multihop = [r for r in ar if r["hop"] == "multihop" and r["type"] != "path"] + report[arm] = { + "n": len(ar), + "by_type": per_type, + "median_total_tokens": _median([r["total_tokens"] for r in ar]), + "median_input_tokens": _median([r["input_tokens"] for r in ar]), + "median_output_tokens": _median([r["output_tokens"] for r in ar]), + "median_premium": _median([r["premium_requests"] for r in ar]), + "multihop_file_f1": _mean([r["file_prf"]["f1"] for r in multihop]) if multihop else 
None, + "parse_errors": sum(1 for r in ar if r.get("parse_error")), + } + report["_paired"] = _paired_deltas(rows) + return report + + +def _paired_deltas(rows: list[dict]) -> dict[str, Any]: + """Per-question paired file-F1 (set types) and path_correct deltas with a + bootstrap 90% CI — small-n honesty (rubber-duck #5).""" + import random + + by_arm: dict[str, dict[str, float]] = {} + for r in rows: + if not r.get("completed"): + continue + if r["type"] == "path": + val = 1.0 if r.get("path_correct") else 0.0 + else: + val = r["file_prf"]["f1"] + by_arm.setdefault(r["config"], {})[r["id"]] = val + out: dict[str, Any] = {} + arms = sorted(by_arm) + if CODE_GRAPH not in arms: + return out + for other in [a for a in arms if a != CODE_GRAPH]: + common = sorted(set(by_arm[CODE_GRAPH]) & set(by_arm[other])) + diffs = [by_arm[CODE_GRAPH][i] - by_arm[other][i] for i in common] + if not diffs: + continue + rng = random.Random(13) + boot = [] + for _ in range(2000): + sample = [diffs[rng.randrange(len(diffs))] for _ in diffs] + boot.append(sum(sample) / len(sample)) + boot.sort() + out[f"code_graph_minus_{other}"] = { + "n_paired": len(diffs), + "mean_delta": round(sum(diffs) / len(diffs), 4), + "ci90": [round(boot[int(0.05 * len(boot))], 4), round(boot[int(0.95 * len(boot))], 4)], + "wins": sum(1 for d in diffs if d > 1e-9), + "losses": sum(1 for d in diffs if d < -1e-9), + "ties": sum(1 for d in diffs if abs(d) <= 1e-9), + } + return out + + +def _print_report(report: dict) -> None: + print("\n" + "=" * 78) + print("MULTI-HOP NAV — PREMIUM AGENT ARM") + print("=" * 78) + for arm, a in report.items(): + if arm == "_paired": + continue + mh = a["multihop_file_f1"] + print(f"\n### {arm} (n={a['n']}, parse_errors={a['parse_errors']})") + print( + f" median_tokens total={a['median_total_tokens']} " + f"in={a['median_input_tokens']} out={a['median_output_tokens']} " + f"median_premium={a['median_premium']} " + f"multihop_file_f1={mh}" + ) + for typ, t in a["by_type"].items(): + 
if typ == "path": + print( + f" {typ:<13} n={t['n']} path_acc={t['path_acc']} " + f"bool_acc={t['boolean_acc']} med_tok={t['median_total_tokens']} " + f"med_navcalls={t['median_nav_calls']}" + ) + else: + cap = t.get("graph_file_f1_cap") + print( + f" {typ:<13} n={t['n']} fileF1={t['file_f1']} qualF1={t['qual_f1']} " + f"looseQ={t['loose_qual_f1']} fileRec={t['file_recall']} " + f"med_tok={t['median_total_tokens']} med_navcalls={t['median_nav_calls']}" + + (f" [graph_cap_fileF1={cap}]" if cap is not None else "") + ) + paired = report.get("_paired") or {} + if paired: + print("\n### paired deltas (code_graph minus other; file-F1 / path_correct)") + for k, v in paired.items(): + print( + f" {k}: mean_delta={v['mean_delta']} ci90={v['ci90']} " + f"W/L/T={v['wins']}/{v['losses']}/{v['ties']} (n={v['n_paired']})" + ) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def _load_done(results_path: Path) -> set[tuple]: + done: set[tuple] = set() + if not results_path.exists(): + return done + for line in results_path.read_text().splitlines(): + line = line.strip() + if not line: + continue + try: + r = json.loads(line) + except json.JSONDecodeError: + continue + if r.get("completed") and r.get("runner") == RUNNER_VERSION: + done.add((r["id"], r["config"], r.get("model", ""))) + return done + + +def main(argv: list[str] | None = None) -> int: + p = argparse.ArgumentParser(description="Multi-hop nav premium agent arm.") + p.add_argument("--questions", required=True) + p.add_argument("--worktree", default=None, + help="repo cwd for all arms (default: questions JSON 'worktree')") + p.add_argument("--project", default="mh_uxarray", + help="code_graph project name of the PRE-BUILT fixed graph") + p.add_argument("--port", type=int, default=6380, help="FalkorDB port") + p.add_argument("--model", default="claude-sonnet-4.6") + p.add_argument("--arms", 
nargs="*", default=list(ARMS), choices=list(ARMS)) + p.add_argument("--types", nargs="*", default=None, + choices=["callers", "callees", "blast_radius", "path"]) + p.add_argument("--ids", nargs="*", default=None, help="restrict to these question ids") + p.add_argument("--limit", type=int, default=None, help="first N questions (post-filter)") + p.add_argument("--server-root", default=str(DEFAULT_MCP_SERVER_ROOT)) + p.add_argument("--gate", default="/tmp/ux_gate.json", + help="agentless gate output for per-question graph caps") + p.add_argument("--seed", type=int, default=13, help="run-order shuffle seed") + p.add_argument("--wall-time", type=float, default=900.0) + p.add_argument("--out", default="/tmp/ux_nav_agent.json") + p.add_argument("--results", default=None, + help="append-only jsonl for resume (default: .jsonl)") + p.add_argument("--no-resume", action="store_true") + args = p.parse_args(argv) + + data = json.loads(Path(args.questions).read_text()) + worktree = Path(args.worktree or data["worktree"]).resolve() + if not worktree.exists(): + raise SystemExit(f"worktree not found: {worktree}") + + qs = data["questions"] + if args.types: + qs = [q for q in qs if q["type"] in args.types] + if args.ids: + idset = set(args.ids) + qs = [q for q in qs if q["id"] in idset] + if args.limit: + qs = qs[: args.limit] + + out_path = Path(args.out) + out_dir = out_path.with_suffix("") + out_dir.mkdir(parents=True, exist_ok=True) + results_path = Path(args.results) if args.results else out_path.with_suffix(".jsonl") + + done = set() if args.no_resume else _load_done(results_path) + server_root = Path(args.server_root) + caps = _gate_caps(Path(args.gate) if args.gate else None) + + print( + f"worktree={worktree}\nproject={args.project} port={args.port} " + f"model={args.model}\narms={args.arms} questions={len(qs)} " + f"caps_loaded={len(caps)} already_done={len(done)}" + ) + + rows: list[dict] = [] + if results_path.exists(): + for line in 
results_path.read_text().splitlines(): + line = line.strip() + if line: + try: + rows.append(json.loads(line)) + except json.JSONDecodeError: + pass + + # Randomize (question, arm) order with a fixed seed so provider-side prompt + # caching / drift can't systematically favor whichever arm always runs first + # (rubber-duck #7). + import random + + worklist = [(q, arm) for q in qs for arm in args.arms] + random.Random(args.seed).shuffle(worklist) + + for q, arm in worklist: + key = (q["id"], arm, args.model) + if key in done: + continue + row = run_one_nav( + q, + track=arm, + model=args.model, + worktree=worktree, + project=args.project, + port=args.port, + server_root=server_root, + out_dir=out_dir, + wall_time=args.wall_time, + cap=caps.get(q["id"]), + ) + rows.append(row) + with results_path.open("a") as f: + f.write(json.dumps(row) + "\n") + + completed = [r for r in rows if r.get("completed")] + report = aggregate(completed) + _print_report(report) + out_path.write_text(json.dumps( + { + "worktree": str(worktree), + "project": args.project, + "model": args.model, + "n_rows": len(completed), + "report": report, + "rows": rows, + }, + indent=2, + )) + print(f"\nwrote {out_path} (jsonl: {results_path})") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/bench/runners/nav_multihop_gate.py b/bench/runners/nav_multihop_gate.py new file mode 100644 index 00000000..a4fe901f --- /dev/null +++ b/bench/runners/nav_multihop_gate.py @@ -0,0 +1,291 @@ +"""Agentless answerability/compression gate for the multi-hop nav experiment (Lane 2). + +This is the FREE gate that runs BEFORE any premium agent spend. For each generated +question (callers / callees / blast_radius / path) it computes three things: + + (a) ORACLE gold -- already embedded in the question record (jedi-based, + graph-independent; see nav_multihop_oracle.py). 
+ (b) CODE_GRAPH answer -- a single Cypher query over the fixed-resolver CALLS + graph (1-hop reverse/forward, [:CALLS*1..3] closure, or + shortestPath) mapped to the SAME (relpath, qualname) + identity the oracle uses, plus the byte/token size of + that compact structured answer. + (c) NO-TOOL grep -- the raw `grep -rn ""` evidence the un-tooled agent + must scan, plus the FILE set grep trivially yields. + +We then score set-F1 (qualname-level AND file-level) of the graph answer vs the +oracle, and compare evidence-token compactness graph-vs-grep. The GATE DECISION: +if code_graph is not both compact AND correct -- especially on the >=2-hop +questions (blast_radius, path) where grep is expensive and wrong -- do NOT spend +premium on the agent arm. + +Usage: + .venv/bin/python -m bench.runners.nav_multihop_gate \ + --questions /tmp/ux_questions.json \ + --graph code:mh_uxarray:_default --worktree [--port 6380] [--out gate.json] +""" +from __future__ import annotations + +import argparse +import json +import re +import statistics as st +import subprocess +from pathlib import Path +from typing import Any, Optional + +from bench.runners.struct_query_bench import _graph_query +from bench.runners.nav_oracle_bench import Scope, build_scopes, enclosing_qualname + + +# --------------------------------------------------------------------------- +# token estimate (chars/4 heuristic, consistent across graph & grep so the +# RELATIVE compactness comparison is fair regardless of the absolute tokenizer) +# --------------------------------------------------------------------------- + +def _toks(s: str) -> int: + return (len(s) + 3) // 4 + + +# --------------------------------------------------------------------------- +# graph -> (relpath, qualname) mapping, shared by all query types +# --------------------------------------------------------------------------- + +def _map_nodes(rows: list[list[Any]], worktree: Path, + scope_cache: dict[Path, list[Scope]]) -> set[tuple[str, 
str]]: + """rows are [name, path, src_start]; map each to (relpath, enclosing qualname).""" + out: set[tuple[str, str]] = set() + for row in rows: + if len(row) < 2 or row[0] is None or row[1] is None: + continue + nm, pth = str(row[0]), str(row[1]) + start = int(row[2]) if len(row) > 2 and row[2] is not None else None + mp = Path(pth) + try: + rel = str(mp.relative_to(worktree)) + except ValueError: + rel = mp.name + qual = nm + if start is not None and mp.exists(): + if mp not in scope_cache: + scope_cache[mp] = build_scopes(mp) + q = enclosing_qualname(scope_cache[mp], start) + if q != "": + qual = q + out.add((rel, qual)) + return out + + +def _subject_match(leaf: str, subj_path: str) -> str: + """Cypher predicate selecting the subject node by leaf name + def path suffix.""" + leaf_q = leaf.replace("'", "\\'") + path_q = subj_path.replace("'", "\\'") + return f"s.name = '{leaf_q}' AND s.path ENDS WITH '{path_q}'" + + +def graph_callers(graph, port, leaf, subj_path, worktree, sc): + cy = (f"MATCH (c)-[:CALLS]->(s) WHERE {_subject_match(leaf, subj_path)} " + "RETURN DISTINCT c.name, c.path, c.src_start") + return _map_nodes(_graph_query(graph, cy, port), worktree, sc) + + +def graph_callees(graph, port, leaf, subj_path, worktree, sc): + cy = (f"MATCH (s)-[:CALLS]->(c) WHERE {_subject_match(leaf, subj_path)} " + "RETURN DISTINCT c.name, c.path, c.src_start") + return _map_nodes(_graph_query(graph, cy, port), worktree, sc) + + +def graph_blast(graph, port, leaf, subj_path, worktree, sc, depth=3): + cy = (f"MATCH (c)-[:CALLS*1..{depth}]->(s) WHERE {_subject_match(leaf, subj_path)} " + "RETURN DISTINCT c.name, c.path, c.src_start") + return _map_nodes(_graph_query(graph, cy, port), worktree, sc) + + +def graph_reachable(graph, port, src_leaf, src_path, dst_leaf, dst_path, depth=8): + cy = ( + f"MATCH (a) WHERE a.name='{src_leaf}' AND a.path ENDS WITH '{src_path}' " + f"MATCH (b) WHERE b.name='{dst_leaf}' AND b.path ENDS WITH '{dst_path}' " + f"WITH a, b MATCH p = 
(a)-[:CALLS*1..{depth}]->(b) RETURN count(p) > 0 LIMIT 1" + ) + rows = _graph_query(graph, cy, port) + if rows and rows[0]: + v = rows[0][0] + if isinstance(v, str): + return v.strip().lower() == "true" + return bool(v) + return False + + +# --------------------------------------------------------------------------- +# no-tool grep baseline +# --------------------------------------------------------------------------- + +def grep_evidence(leaf: str, worktree: Path) -> tuple[str, set[str]]: + """Return (raw grep output the agent must scan, set of files containing `leaf`).""" + try: + res = subprocess.run( + ["grep", "-rn", "--include=*.py", r"\b" + re.escape(leaf) + r"\b", str(worktree)], + capture_output=True, text=True, timeout=60, + ) + raw = res.stdout + except Exception: + raw = "" + files: set[str] = set() + for ln in raw.splitlines(): + fp = ln.split(":", 1)[0] + try: + files.add(str(Path(fp).relative_to(worktree))) + except ValueError: + files.add(Path(fp).name) + return raw, files + + +# --------------------------------------------------------------------------- +# scoring +# --------------------------------------------------------------------------- + +def _prf(pred: set, gold: set) -> dict[str, float]: + tp = len(pred & gold) + fp = len(pred - gold) + fn = len(gold - pred) + p = tp / (tp + fp) if (tp + fp) else (1.0 if not fn else 0.0) + r = tp / (tp + fn) if (tp + fn) else (1.0 if not fp else 0.0) + f1 = 2 * p * r / (p + r) if (p + r) else 0.0 + return {"tp": tp, "fp": fp, "fn": fn, + "precision": round(p, 3), "recall": round(r, 3), "f1": round(f1, 3)} + + +def _gold_set(q: dict) -> set[tuple[str, str]]: + return {(g["path"], g["qualname"]) for g in q["gold"]} + + +def _files(s: set[tuple[str, str]]) -> set[str]: + return {p for p, _ in s} + + +def run(questions_path: Path, graph: str, worktree: Path, port: int) -> dict: + data = json.loads(questions_path.read_text()) + qs = data["questions"] + sc: dict[Path, list[Scope]] = {} + rows: list[dict] = [] + + 
for q in qs: + typ = q["type"] + rec: dict[str, Any] = {"id": q["id"], "type": typ, "hop": q["hop"]} + + if typ == "path": + srt, dst = q["symbol"]["source"], q["symbol"]["target"] + gold_reach = q["gold"]["reachable"] + pred_reach = graph_reachable( + graph, port, srt["leaf"], srt["path"], dst["leaf"], dst["path"]) + rec["gold_reachable"] = gold_reach + rec["graph_reachable"] = pred_reach + rec["correct"] = (pred_reach == gold_reach) + # graph evidence = a single bool + the path; grep cannot answer reachability + rec["graph_tokens"] = _toks(json.dumps({"reachable": pred_reach})) + graw, _ = grep_evidence(srt["leaf"], worktree) + graw2, _ = grep_evidence(dst["leaf"], worktree) + rec["grep_tokens"] = _toks(graw) + _toks(graw2) + rows.append(rec) + continue + + leaf = q["symbol"]["leaf"] + subj_path = q["symbol"]["path"] + gold = _gold_set(q) + + if typ == "callers": + pred = graph_callers(graph, port, leaf, subj_path, worktree, sc) + elif typ == "callees": + pred = graph_callees(graph, port, leaf, subj_path, worktree, sc) + elif typ == "blast_radius": + pred = graph_blast(graph, port, leaf, subj_path, worktree, sc, + depth=q.get("depth", 3)) + else: + continue + + rec["qual_prf"] = _prf(pred, gold) + rec["file_prf"] = _prf(_files(pred), _files(gold)) + # graph evidence the agent reads = the compact structured answer + graph_ans = json.dumps(sorted([{"path": p, "qualname": qn} for p, qn in pred], + key=lambda d: (d["path"], d["qualname"]))) + rec["graph_tokens"] = _toks(graph_ans) + rec["graph_n"] = len(pred) + rec["gold_n"] = len(gold) + # grep baseline: raw evidence size + the file recall it trivially yields + graw, gfiles = grep_evidence(leaf, worktree) + rec["grep_tokens"] = _toks(graw) + rec["grep_file_recall"] = round( + len(gfiles & _files(gold)) / len(_files(gold)), 3) if _files(gold) else 1.0 + rows.append(rec) + + return {"graph": graph, "worktree": str(worktree), + "n": len(rows), "rows": rows} + + +def _agg(rows, types, key, sub=None): + vals = [] + for r 
in rows: + if r["type"] not in types: + continue + v = r.get(key) + if sub and isinstance(v, dict): + v = v.get(sub) + if isinstance(v, (int, float)): + vals.append(v) + return round(st.mean(vals), 3) if vals else None + + +def report(res: dict) -> None: + rows = res["rows"] + print(f"\n=== AGENTLESS MULTI-HOP GATE === graph={res['graph']} n={res['n']}\n") + + setq = ["callers", "callees", "blast_radius"] + print("SET QUESTIONS (graph CALLS answer vs jedi oracle):") + print(f"{'type':<14}{'qF1':>7}{'fileF1':>8}{'qRec':>7}{'qPrec':>7}" + f"{'gTok':>8}{'grepTok':>9}{'grepFRec':>9}") + for t in setq: + trows = [r for r in rows if r["type"] == t] + if not trows: + continue + print(f"{t:<14}" + f"{_agg(rows,[t],'qual_prf','f1'):>7}" + f"{_agg(rows,[t],'file_prf','f1'):>8}" + f"{_agg(rows,[t],'qual_prf','recall'):>7}" + f"{_agg(rows,[t],'qual_prf','precision'):>7}" + f"{_agg(rows,[t],'graph_tokens'):>8.0f}" + f"{_agg(rows,[t],'grep_tokens'):>9.0f}" + f"{_agg(rows,[t],'grep_file_recall'):>9}") + + prows = [r for r in rows if r["type"] == "path"] + if prows: + corr = sum(1 for r in prows if r["correct"]) + print(f"\nPATH QUESTIONS (reachability bool, graph shortestPath vs oracle):") + print(f" correct {corr}/{len(prows)} " + f"avg graph_tokens={_agg(rows,['path'],'graph_tokens'):.0f} " + f"avg grep_tokens={_agg(rows,['path'],'grep_tokens'):.0f}") + + # gate signal: compactness ratio + correctness on >=2-hop + multihop = [r for r in rows if r["hop"] == "multihop" and "qual_prf" in r] + if multihop: + print(f"\n>=2-HOP (blast_radius) qF1={_agg(rows,['blast_radius'],'qual_prf','f1')} " + f"fileF1={_agg(rows,['blast_radius'],'file_prf','f1')}") + print() + + +def main(argv=None): + ap = argparse.ArgumentParser() + ap.add_argument("--questions", required=True) + ap.add_argument("--graph", required=True) + ap.add_argument("--worktree", required=True) + ap.add_argument("--port", type=int, default=6380) + ap.add_argument("--out", default=None) + a = ap.parse_args(argv) + res = 
run(Path(a.questions), a.graph, Path(a.worktree), a.port) + report(res) + if a.out: + Path(a.out).write_text(json.dumps(res, indent=1)) + print(f"wrote {a.out}") + + +if __name__ == "__main__": + main() diff --git a/bench/runners/nav_multihop_oracle.py b/bench/runners/nav_multihop_oracle.py new file mode 100644 index 00000000..35e77e0d --- /dev/null +++ b/bench/runners/nav_multihop_oracle.py @@ -0,0 +1,550 @@ +"""Independent (graph-blind) multi-hop call-graph oracle for Lane 2. + +Builds a FORWARD call graph ``caller -> callee`` for a worktree using **jedi** +(goto on every call site) + ``ast`` (scopes / call-site enumeration), with ZERO +input from FalkorDB / the tree-sitter analyzer under test. From that one graph we +derive all four question types' ground truth: + + * callers(S) = reverse 1-hop -> {u : u -> S} + * callees(S) = forward 1-hop -> {v : S -> v} + * blast_radius(S) = reverse transitive closure, depth<=D -> who is affected if S changes + * path(A, B) = forward reachability + one valid path (edges all in the oracle) + +Independence discipline (prereg-multihop-nav §2): the oracle never reads the +graph; jedi is a different engine from the tree-sitter resolver we benchmark, so +grading the graph/agent against it is non-circular. Node identity is +``(relpath, qualname)`` -- the SAME identity ``nav_oracle_bench`` uses -- so graph +and agent answers are directly comparable. + +The forward graph is expensive (one jedi goto per call site) so it is cached to +``/.nav_oracle_cache.json`` keyed by a digest of the .py file set + +mtimes; pass ``--rebuild`` to force. 
+ +Usage: + .venv/bin/python -m bench.runners.nav_multihop_oracle --worktree [--rebuild] [--depth 3] +""" +from __future__ import annotations + +import argparse +import ast +import hashlib +import json +import time +from collections import deque +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +from bench.runners.nav_oracle_bench import ( + Scope, + build_scopes, + enclosing_qualname, + find_definitions, + _iter_py, + _parse, +) + +# A node in the call graph: (relpath, qualname). qualname uses dotted scope path +# (e.g. "Grid.calculate_face_areas"); module-level call sites are "". +Node = tuple[str, str] + + +# --------------------------------------------------------------------------- +# Call-site enumeration (ast, source-only) +# --------------------------------------------------------------------------- + +@dataclass +class CallSite: + caller_qual: str # enclosing function/class qualname, or "" + callee_name: str # the identifier being called (attr or bare name) + line: int # 1-based line of the callee identifier + col: int # 0-based column of the callee identifier (jedi seed) + + +def _callee_ident(func: ast.AST) -> Optional[tuple[str, int, int]]: + """For a Call.func node return (name, line, col0) pointing AT the callee + identifier, suitable for seeding jedi.goto. Handles bare ``foo(`` (Name) and + method ``obj.bar(`` (Attribute). Returns None for unresolvable forms + (subscripts, calls-of-calls, lambdas).""" + if isinstance(func, ast.Name): + return func.id, func.lineno, func.col_offset + if isinstance(func, ast.Attribute): + # The attribute name sits at the END of the attribute expression. 
+ end_line = getattr(func, "end_lineno", func.lineno) + end_col = getattr(func, "end_col_offset", None) + if end_col is None: + return None + return func.attr, end_line, end_col - len(func.attr) + return None + + +def enumerate_call_sites(path: Path, scopes: list[Scope]) -> list[CallSite]: + tree = _parse(path) + if tree is None: + return [] + out: list[CallSite] = [] + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + ident = _callee_ident(node.func) + if ident is None: + continue + name, line, col = ident + caller = enclosing_qualname(scopes, node.func.lineno) + out.append(CallSite(caller_qual=caller, callee_name=name, line=line, col=col)) + return out + + +# --------------------------------------------------------------------------- +# Forward call graph (jedi-resolved) +# --------------------------------------------------------------------------- + +@dataclass +class CallGraph: + worktree: str + fwd: dict[Node, set[Node]] = field(default_factory=dict) # caller -> {callee} + rev: dict[Node, set[Node]] = field(default_factory=dict) # callee -> {caller} + nodes: set[Node] = field(default_factory=set) + # def_index: qualname-leaf -> set of Nodes defining it (for question phrasing) + by_name: dict[str, set[Node]] = field(default_factory=dict) + + def add_edge(self, u: Node, v: Node) -> None: + if u == v: + return + self.fwd.setdefault(u, set()).add(v) + self.rev.setdefault(v, set()).add(u) + self.nodes.add(u) + self.nodes.add(v) + + def successors(self, s: Node) -> set[Node]: + return set(self.fwd.get(s, set())) + + def predecessors(self, s: Node) -> set[Node]: + return set(self.rev.get(s, set())) + + def reverse_closure(self, s: Node, depth: int) -> set[Node]: + """All nodes that transitively reach s within <=depth hops (excl. 
s).""" + seen: set[Node] = set() + frontier = deque([(s, 0)]) + while frontier: + cur, d = frontier.popleft() + if d >= depth: + continue + for u in self.rev.get(cur, set()): + if u not in seen and u != s: + seen.add(u) + frontier.append((u, d + 1)) + return seen + + def forward_path(self, a: Node, b: Node, max_depth: int = 8) -> Optional[list[Node]]: + """One shortest forward path a->...->b (BFS), or None if unreachable.""" + if a == b: + return [a] + prev: dict[Node, Node] = {a: a} + frontier = deque([(a, 0)]) + while frontier: + cur, d = frontier.popleft() + if d >= max_depth: + continue + for v in self.fwd.get(cur, set()): + if v not in prev: + prev[v] = cur + if v == b: + path = [b] + while path[-1] != a: + path.append(prev[path[-1]]) + return list(reversed(path)) + frontier.append((v, d + 1)) + return None + + +def _def_node_for(name_obj, worktree: Path, scope_cache: dict[Path, list[Scope]]) -> Optional[Node]: + """Map a jedi goto result to a (relpath, qualname) node, or None if outside + the worktree / not a func/class definition.""" + mod = getattr(name_obj, "module_path", None) + if mod is None: + return None + mp = Path(mod) + try: + rel = str(mp.relative_to(worktree)) + except ValueError: + return None # stdlib / site-packages + typ = getattr(name_obj, "type", None) + if typ not in ("function", "class"): + return None + line = getattr(name_obj, "line", None) + if line is None: + return None + if mp not in scope_cache: + scope_cache[mp] = build_scopes(mp) + qual = enclosing_qualname(scope_cache[mp], line) + if qual == "": + # def at module top: enclosing scope IS the def, so this should not happen; + # guard by using the leaf name. 
+ qual = getattr(name_obj, "name", "") + return (rel, qual) + + +def build_call_graph(worktree: Path, *, progress: bool = True) -> CallGraph: + import jedi + + cg = CallGraph(worktree=str(worktree)) + project = jedi.Project(str(worktree)) + scope_cache: dict[Path, list[Scope]] = {} + files = list(_iter_py(worktree)) + t0 = time.time() + for i, path in enumerate(files): + if path not in scope_cache: + scope_cache[path] = build_scopes(path) + scopes = scope_cache[path] + try: + rel = str(path.relative_to(worktree)) + except ValueError: + continue + # register every defined scope as a node (so isolated defs still exist) + for s in scopes: + node = (rel, s.qualname) + cg.nodes.add(node) + leaf = s.qualname.rsplit(".", 1)[-1] + cg.by_name.setdefault(leaf, set()).add(node) + sites = enumerate_call_sites(path, scopes) + try: + script = jedi.Script(path=str(path), project=project) + except Exception: + continue + for cs in sites: + try: + targets = script.goto(cs.line, cs.col, follow_imports=True, + follow_builtin_imports=False) + except Exception: + continue + for t in targets: + callee = _def_node_for(t, worktree, scope_cache) + if callee is None: + continue + cg.add_edge((rel, cs.caller_qual), callee) + if progress and (i + 1) % 20 == 0: + print(f" [oracle] {i + 1}/{len(files)} files " + f"edges={sum(len(v) for v in cg.fwd.values())} " + f"{time.time() - t0:.0f}s", flush=True) + if progress: + print(f" [oracle] DONE {len(files)} files nodes={len(cg.nodes)} " + f"edges={sum(len(v) for v in cg.fwd.values())} {time.time() - t0:.0f}s", + flush=True) + return cg + + +# --------------------------------------------------------------------------- +# Cache +# --------------------------------------------------------------------------- + +def _digest(worktree: Path) -> str: + h = hashlib.sha256() + for p in sorted(_iter_py(worktree)): + try: + st = p.stat() + h.update(str(p).encode()) + h.update(str(int(st.st_mtime)).encode()) + h.update(str(st.st_size).encode()) + except OSError: 
+ continue + return h.hexdigest()[:16] + + +def _cache_path(worktree: Path) -> Path: + return worktree / ".nav_oracle_cache.json" + + +def load_or_build(worktree: Path, *, rebuild: bool = False) -> CallGraph: + cp = _cache_path(worktree) + dig = _digest(worktree) + if cp.exists() and not rebuild: + try: + raw = json.loads(cp.read_text()) + if raw.get("digest") == dig: + cg = CallGraph(worktree=str(worktree)) + for u, v in raw["edges"]: + cg.add_edge(tuple(u), tuple(v)) + for n in raw.get("nodes", []): + cg.nodes.add(tuple(n)) + leaf = tuple(n)[1].rsplit(".", 1)[-1] + cg.by_name.setdefault(leaf, set()).add(tuple(n)) + print(f" [oracle] loaded cache {cp.name} " + f"(nodes={len(cg.nodes)} edges={sum(len(v) for v in cg.fwd.values())})") + return cg + except (json.JSONDecodeError, KeyError, ValueError): + pass + cg = build_call_graph(worktree) + edges = [[list(u), list(v)] for u, vs in cg.fwd.items() for v in vs] + cp.write_text(json.dumps({ + "digest": dig, + "nodes": [list(n) for n in sorted(cg.nodes)], + "edges": edges, + })) + print(f" [oracle] wrote cache {cp.name} ({len(edges)} edges)") + return cg + + +def _distinctive(qual: str, generic: set[str]) -> bool: + leaf = qual.rsplit(".", 1)[-1] + return (len(leaf) >= 5 and not leaf.startswith("_") + and leaf not in generic and leaf.lower() not in generic) + + +def generate_questions( + cg: CallGraph, worktree: Path, *, seed: int, per_type: int, depth: int = 3, +) -> list[dict]: + """Graph-blind question universe sampled from the INDEPENDENT oracle (never + from FalkorDB). One question per symbol/type with cardinality bands so we + measure navigation, not list-enumeration (prereg + rubber-duck §3). + + callers 1-hop : gold-set size 3..30 + callees 1-hop : gold-set size 3..30 + blast_radius >=2hop: reverse closure(<=depth) size 5..50 AND > indeg + (genuine multi-hop, not just the 1-hop caller set) + path >=2hop: B reachable from A with path length >=3 nodes; + plus a few unreachable negatives. 
+ + Single-definition gate (find_definitions) keeps the question referent + unambiguous and the leaf name usable as a graph/agent lookup key. + """ + import random + from bench.runners.struct_query_bench import _GENERIC + + rng = random.Random(seed) + # cache single-def leaf names to avoid repeated ast scans + def_cache: dict[str, int] = {} + + def single_def(leaf: str) -> bool: + if leaf not in def_cache: + def_cache[leaf] = len(find_definitions(worktree, leaf)) + return def_cache[leaf] == 1 + + nodes = sorted(cg.nodes) + rng.shuffle(nodes) + + def _is_test(node: Node) -> bool: + rel, qual = node + leaf = qual.rsplit(".", 1)[-1] + return ("/test" in rel or rel.startswith("test") + or "/tests/" in rel or leaf.startswith("test_") + or "conftest" in rel) + + # subject pool excludes test/fixture nodes — graph value lives in library code + nodes = [n for n in nodes if not _is_test(n)] + + def phrase_set(node: Node) -> dict: + return {"path": node[0], "qualname": node[1]} + + out: list[dict] = [] + + # ---- callers (1-hop reverse) ---- + picked = 0 + for n in nodes: + if picked >= per_type: + break + rel, qual = n + leaf = qual.rsplit(".", 1)[-1] + if not _distinctive(qual, _GENERIC): + continue + callers = cg.predecessors(n) + if not (3 <= len(callers) <= 30): + continue + if not single_def(leaf): + continue + out.append({ + "id": f"callers::{rel}::{qual}", + "type": "callers", "hop": "1hop", + "symbol": {"path": rel, "qualname": qual, "leaf": leaf}, + "question": (f"List every function that directly CALLS the function " + f"`{qual}` (defined in `{rel}`). 
Return the caller functions."), + "gold": [phrase_set(c) for c in sorted(callers)], + }) + picked += 1 + + # ---- callees (1-hop forward) ---- + picked = 0 + for n in nodes: + if picked >= per_type: + break + rel, qual = n + leaf = qual.rsplit(".", 1)[-1] + if not _distinctive(qual, _GENERIC): + continue + callees = cg.successors(n) + if not (3 <= len(callees) <= 30): + continue + if not single_def(leaf): + continue + out.append({ + "id": f"callees::{rel}::{qual}", + "type": "callees", "hop": "1hop", + "symbol": {"path": rel, "qualname": qual, "leaf": leaf}, + "question": (f"List every function that the function `{qual}` " + f"(defined in `{rel}`) directly CALLS. Return the callee functions."), + "gold": [phrase_set(c) for c in sorted(callees)], + }) + picked += 1 + + # ---- blast_radius (>=2-hop reverse closure) ---- + picked = 0 + for n in nodes: + if picked >= per_type: + break + rel, qual = n + leaf = qual.rsplit(".", 1)[-1] + if not _distinctive(qual, _GENERIC): + continue + indeg = len(cg.predecessors(n)) + closure = cg.reverse_closure(n, depth) + if not (5 <= len(closure) <= 50): + continue + if len(closure) <= indeg: # must extend beyond the 1-hop caller set + continue + if not single_def(leaf): + continue + out.append({ + "id": f"blast::{rel}::{qual}", + "type": "blast_radius", "hop": "multihop", + "symbol": {"path": rel, "qualname": qual, "leaf": leaf}, + "question": (f"If the signature/behaviour of `{qual}` (defined in `{rel}`) " + f"changes, which functions are potentially AFFECTED? 
Include all " + f"functions that reach `{qual}` through up to {depth} levels of " + f"calls (transitive callers)."), + "gold": [phrase_set(c) for c in sorted(closure)], + "depth": depth, + }) + picked += 1 + + # ---- path (>=2-hop forward reachability) — ~half positive, ~half negative ---- + n_pos = (per_type + 1) // 2 + n_neg = per_type - n_pos + picked = 0 + src_pool = [n for n in nodes if cg.successors(n) and _distinctive(n[1], _GENERIC)] + attempts = 0 + seen_pairs: set[tuple[Node, Node]] = set() + while picked < n_pos and attempts < len(src_pool) * 4 and src_pool: + attempts += 1 + a = rng.choice(src_pool) + # forward reachable nodes within a few hops + reach: list[Node] = [] + frontier = deque([(a, 0)]) + seen = {a} + while frontier: + cur, d = frontier.popleft() + if d >= 5: + continue + for v in cg.fwd.get(cur, set()): + if v not in seen: + seen.add(v) + if d + 1 >= 2: + reach.append(v) + frontier.append((v, d + 1)) + reach = [b for b in reach if _distinctive(b[1], _GENERIC) and not _is_test(b)] + if not reach: + continue + b = rng.choice(reach) + if (a, b) in seen_pairs: + continue + path = cg.forward_path(a, b) + if path is None or len(path) < 3: + continue + if not (single_def(a[1].rsplit(".", 1)[-1]) and single_def(b[1].rsplit(".", 1)[-1])): + continue + seen_pairs.add((a, b)) + out.append({ + "id": f"path::{a[0]}::{a[1]}->{b[0]}::{b[1]}", + "type": "path", "hop": "multihop", + "symbol": {"source": {"path": a[0], "qualname": a[1], "leaf": a[1].rsplit('.', 1)[-1]}, + "target": {"path": b[0], "qualname": b[1], "leaf": b[1].rsplit('.', 1)[-1]}}, + "question": (f"Is there a chain of function calls starting from `{a[1]}` " + f"(in `{a[0]}`) that eventually reaches `{b[1]}` (in `{b[0]}`)? 
" + f"If yes, give one such call path."), + "gold": {"reachable": True, "path": [phrase_set(p) for p in path]}, + }) + picked += 1 + + # negatives: A,B both real symbols with NO forward path A->B + picked_neg = 0 + cand = [n for n in nodes if _distinctive(n[1], _GENERIC) and not _is_test(n)] + attempts = 0 + while picked_neg < n_neg and attempts < len(cand) * 8 and len(cand) > 2: + attempts += 1 + a = rng.choice(cand) + b = rng.choice(cand) + if a == b or (a, b) in seen_pairs: + continue + # require a has outgoing edges (otherwise trivially unreachable) + if not cg.successors(a): + continue + if cg.forward_path(a, b) is not None: + continue + if not (single_def(a[1].rsplit(".", 1)[-1]) and single_def(b[1].rsplit(".", 1)[-1])): + continue + seen_pairs.add((a, b)) + out.append({ + "id": f"path::{a[0]}::{a[1]}->{b[0]}::{b[1]}", + "type": "path", "hop": "multihop", + "symbol": {"source": {"path": a[0], "qualname": a[1], "leaf": a[1].rsplit('.', 1)[-1]}, + "target": {"path": b[0], "qualname": b[1], "leaf": b[1].rsplit('.', 1)[-1]}}, + "question": (f"Is there a chain of function calls starting from `{a[1]}` " + f"(in `{a[0]}`) that eventually reaches `{b[1]}` (in `{b[0]}`)? 
" + f"If yes, give one such call path."), + "gold": {"reachable": False, "path": []}, + }) + picked_neg += 1 + + return out + + +def _stats(cg: CallGraph, depth: int) -> dict: + indeg = [len(cg.rev.get(n, set())) for n in cg.nodes] + outdeg = [len(cg.fwd.get(n, set())) for n in cg.nodes] + import statistics as st + nz_in = [d for d in indeg if d] + nz_out = [d for d in outdeg if d] + return { + "nodes": len(cg.nodes), + "edges": sum(outdeg), + "nodes_with_callers": len(nz_in), + "nodes_with_callees": len(nz_out), + "median_indeg_nz": st.median(nz_in) if nz_in else 0, + "median_outdeg_nz": st.median(nz_out) if nz_out else 0, + "max_indeg": max(indeg) if indeg else 0, + "max_outdeg": max(outdeg) if outdeg else 0, + } + + +def main(argv: Optional[list[str]] = None) -> int: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--worktree", required=True, type=Path) + ap.add_argument("--rebuild", action="store_true") + ap.add_argument("--depth", type=int, default=3) + ap.add_argument("--questions", type=int, default=0, + help="if >0, generate this many questions PER TYPE and write --out") + ap.add_argument("--seed", type=int, default=1234) + ap.add_argument("--out") + args = ap.parse_args(argv) + wt = args.worktree.resolve() + cg = load_or_build(wt, rebuild=args.rebuild) + print(json.dumps(_stats(cg, args.depth), indent=2, default=str)) + if args.questions: + qs = generate_questions(cg, wt, seed=args.seed, per_type=args.questions, depth=args.depth) + from collections import Counter + byt = Counter(q["type"] for q in qs) + byh = Counter(q["hop"] for q in qs) + print(f"\ngenerated {len(qs)} questions by_type={dict(byt)} by_hop={dict(byh)}") + for q in qs: + if q["type"] == "path": + gold = "reachable" if q["gold"]["reachable"] else "no-path" + print(f" [{q['type']:12} {q['hop']:8}] {q['id'][:70]} gold={gold}") + else: + print(f" [{q['type']:12} {q['hop']:8}] {q['symbol']['qualname']:32} gold_n={len(q['gold'])}") + if args.out: + 
Path(args.out).write_text(json.dumps({"worktree": str(wt), "questions": qs}, indent=2)) + print(f"wrote {args.out}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/bench/runners/nav_oracle_bench.py b/bench/runners/nav_oracle_bench.py new file mode 100644 index 00000000..7e2cbe37 --- /dev/null +++ b/bench/runners/nav_oracle_bench.py @@ -0,0 +1,340 @@ +"""Deterministic graph-vs-oracle navigation accuracy bench (Lane 2, the FREE half). + +Fixes the circular-validation flaw of ``struct_query_bench`` (which grades the graph +against its own CALLS edges). Here the GROUND TRUTH for "who calls S" comes from an +INDEPENDENT oracle -- jedi find-references -- run on the same worktree source, never +from the graph. We then score the graph's CALLS answer against that oracle. + +Independence discipline (prereg-multihop-nav §2, §6): + * Symbol definitions are located via ``ast`` over the source, NOT via the graph. + * Symbols with 0 or >1 definitions in the worktree are DROPPED (oracle-uncertain); + we log how many, keeping the gold clean at the stated cost of generalization. + * jedi references are filtered to CALL sites and mapped to their enclosing function + via ``ast`` (source-only), so the comparison unit -- (relpath, caller_qualname) -- + is computed without consulting the graph. + +Comparison unit: the SET of caller functions, identified by (relpath, qualname). +Module-level call sites map to qualname ``""``. We report per-symbol set +precision / recall / F1 of GRAPH vs ORACLE, macro-averaged, plus the raw disagreement +lists (graph-only and oracle-only callers) for hand audit. 
@dataclass
class Scope:
    """Line span of one function/class definition plus its dotted qualname."""

    start: int  # 1-based first line (the def/class line)
    end: int  # 1-based last line (inclusive)
    qualname: str


# Directory names that mark vendored / build noise. They are matched against
# path components BELOW the worktree root only (see _iter_py).
_SKIPPED_DIRS = frozenset({".git", "node_modules", ".tox", "build", "dist", ".venv"})


def _iter_py(worktree: Path):
    """Yield every ``*.py`` file under ``worktree`` outside the skipped dirs.

    The skip test looks only at the components relative to ``worktree``. The
    worktree's own ancestors are ignored, so a checkout that happens to live
    under e.g. ``/ci/build/...`` is still scanned instead of being silently
    emptied.
    """
    for p in worktree.rglob("*.py"):
        # Skip typical vendored / test-noise dirs that pollute the oracle.
        if _SKIPPED_DIRS.intersection(p.relative_to(worktree).parts):
            continue
        yield p


def _parse(path: Path) -> Optional[ast.AST]:
    """Parse ``path`` into an AST; return None when it cannot be read or parsed.

    Undecodable bytes are replaced rather than raising. Unreadable files
    (permissions, dangling symlinks) are treated like unparsable ones so a
    single bad file does not abort the whole scan.
    """
    try:
        return ast.parse(path.read_text(encoding="utf-8", errors="replace"))
    except (SyntaxError, ValueError, OSError):
        return None


def find_definitions(worktree: Path, name: str) -> list[tuple[Path, int, int]]:
    """All (path, lineno, name_col) where a function/class `name` is DEFINED.

    Source-only (ast), independent of the graph. name_col is the 0-based column of
    the identifier itself (not the `def`/`class` keyword), suitable for seeding jedi.

    The column is derived from the node's ``col_offset`` plus the width of the
    introducing keyword(s), so ``async def`` is accounted for. This assumes
    the conventional single space after each keyword.
    """
    keyword_width = {
        ast.ClassDef: len("class "),
        ast.FunctionDef: len("def "),
        # col_offset of an async function points at ``async``, not ``def``.
        ast.AsyncFunctionDef: len("async def "),
    }
    out: list[tuple[Path, int, int]] = []
    for p in _iter_py(worktree):
        tree = _parse(p)
        if tree is None:
            continue
        for node in ast.walk(tree):
            width = keyword_width.get(type(node))
            if width is None or node.name != name:
                continue
            out.append((p, node.lineno, node.col_offset + width))
    return out


def build_scopes(path: Path) -> list[Scope]:
    """Flat list of every function/class scope in a file with line ranges + qualname.

    Returns an empty list when the file cannot be parsed. Nested definitions
    get dotted qualnames (``Outer.inner``); parents precede their children.
    """
    tree = _parse(path)
    if tree is None:
        return []
    scopes: list[Scope] = []

    def walk(node: ast.AST, prefix: str) -> None:
        for child in ast.iter_child_nodes(node):
            if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                qual = f"{prefix}{child.name}"
                # Fall back to a one-line span if the node carries no end_lineno.
                end = getattr(child, "end_lineno", None) or child.lineno
                scopes.append(Scope(child.lineno, end, qual))
                walk(child, qual + ".")
            else:
                # Definitions may hide inside if/try/with bodies: keep descending
                # without extending the qualname.
                walk(child, prefix)

    walk(tree, "")
    return scopes


def enclosing_qualname(scopes: list[Scope], line: int) -> str:
    """Innermost scope containing `line`, or '' if none."""
    best: Optional[Scope] = None
    for s in scopes:
        if s.start <= line <= s.end:
            if best is None or s.start > best.start:  # innermost = latest start
                best = s
    return best.qualname if best else ""
def oracle_callers(
    worktree: Path, defs: list[tuple[Path, int, int]], name: str,
    scope_cache: dict[Path, list[Scope]],
) -> Optional[set[tuple[str, str]]]:
    """jedi find-references -> set of (relpath, caller_qualname) call sites.

    Returns None if jedi can't resolve (oracle failure -> caller drops the symbol).
    Unions references across all definition sites (already gated to a single def by
    the caller, but kept general).

    ``scope_cache`` is filled in place with the scopes of every module that
    contains an accepted call site, so repeated calls share the ast work.
    References outside ``worktree``, in unreadable files, or on lines that do
    not exist in the file on disk are skipped.
    """
    import jedi

    callers: set[tuple[str, str]] = set()
    # Source lines per module, read at most once per call. None marks a file
    # that could not be read, so it is not retried for every reference.
    lines_cache: dict[Path, Optional[list[str]]] = {}
    project = jedi.Project(str(worktree))
    for dpath, dline, dcol in defs:
        try:
            script = jedi.Script(path=str(dpath), project=project)
            refs = script.get_references(line=dline, column=dcol, include_builtins=False)
        except Exception:
            # Deliberately broad: any jedi failure means "oracle unavailable"
            # for this symbol, which the caller records as a drop.
            return None
        for r in refs:
            if r.is_definition():
                continue
            mod = r.module_path
            if mod is None or r.line is None or r.column is None:
                continue
            mp = Path(mod)
            try:
                rel = str(mp.relative_to(worktree))
            except ValueError:
                continue  # reference outside the worktree (stdlib/site-packages)
            if mp not in lines_cache:
                try:
                    # Split on "\n" only (read_text already normalises \r\n and
                    # \r). str.splitlines() also breaks on form feed and other
                    # separators that Python line numbering ignores, which
                    # would shift every later line index.
                    lines_cache[mp] = mp.read_text(
                        encoding="utf-8", errors="replace"
                    ).split("\n")
                except OSError:
                    lines_cache[mp] = None
            lines = lines_cache[mp]
            if lines is None or not 1 <= r.line <= len(lines):
                continue
            if not _is_call_site(lines[r.line - 1], r.column, name):
                continue
            if mp not in scope_cache:
                scope_cache[mp] = build_scopes(mp)
            qual = enclosing_qualname(scope_cache[mp], r.line)
            callers.add((rel, qual))
    return callers
We map each + caller's (path, name) to the SAME (relpath, qualname) identity the oracle uses by + re-deriving qualname from src_start via the source ast, so the two sets are + comparable. Falls back to the bare caller name if scope lookup misses.""" + cypher = ( + f"MATCH (c)-[:CALLS]->(s {{name:'{name}'}}) " + "RETURN DISTINCT c.name, c.path, c.src_start" + ) + rows = _graph_query(graph, cypher, port) + out: set[tuple[str, str]] = set() + scope_cache: dict[Path, list[Scope]] = {} + for row in rows: + if len(row) < 2 or row[0] is None or row[1] is None: + continue + cname, cpath = str(row[0]), str(row[1]) + start = int(row[2]) if len(row) > 2 and row[2] is not None else None + mp = Path(cpath) + try: + rel = str(mp.relative_to(worktree)) + except ValueError: + rel = mp.name + qual = cname + if start is not None and mp.exists(): + if mp not in scope_cache: + scope_cache[mp] = build_scopes(mp) + q = enclosing_qualname(scope_cache[mp], start) + if q != "": + qual = q + out.add((rel, qual)) + return out + + +# --------------------------------------------------------------------------- +# Sampling + scoring +# --------------------------------------------------------------------------- + +def sample_caller_symbols( + graph: str, port: int, *, n: int, seed: int, + fanin_lo: int = 3, fanin_hi: int = 80, +) -> list[dict[str, Any]]: + """Distinctively-named callees with banded fan-in (reuses struct_query_bench + rationale: 3..80 avoids precise-grep-already and generic-megahub extremes).""" + import random + cypher = ( + "MATCH (c)-[:CALLS]->(s) WHERE s:Searchable " + "WITH s.name AS name, count(c) AS fanin " + f"WHERE fanin >= {fanin_lo} AND fanin <= {fanin_hi} " + "RETURN name, fanin ORDER BY name" + ) + rows = _graph_query(graph, cypher, port) + pairs = [] + for row in rows: + if len(row) < 2: + continue + name, fan = row[0], row[1] + if not name or name in _GENERIC or str(name).startswith("__") or len(str(name)) < 4: + continue + pairs.append((str(name), int(fan))) + rng 
def _prf(graph_set: set, oracle_set: set) -> dict[str, float]:
    """Set precision / recall / F1 of the graph answer against the oracle.

    Degenerate denominators follow one convention: when there is nothing to
    be precise about (graph set empty) precision is 1.0 only if the oracle is
    empty too, otherwise 0.0; recall mirrors that when the oracle set is
    empty. F1 is 0.0 when precision and recall are both 0. The three ratios
    are rounded to 4 decimals; tp/fp/fn are raw counts.
    """
    hits = len(graph_set & oracle_set)
    spurious = len(graph_set - oracle_set)
    missed = len(oracle_set - graph_set)

    if hits + spurious:
        precision = hits / (hits + spurious)
    elif missed:
        precision = 0.0
    else:
        precision = 1.0

    if hits + missed:
        recall = hits / (hits + missed)
    elif spurious:
        recall = 0.0
    else:
        recall = 1.0

    if precision + recall:
        f_score = 2 * precision * recall / (precision + recall)
    else:
        f_score = 0.0

    return {
        "tp": hits,
        "fp": spurious,
        "fn": missed,
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1": round(f_score, 4),
    }
round(st.median(r["f1"] for r in rows), 4), + "exact_match_rate": round(sum(1 for r in rows if r["fp"] == 0 and r["fn"] == 0) / len(rows), 4), + } + + +def main(argv: Optional[list[str]] = None) -> int: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--graph", required=True, help="FalkorDB graph key, e.g. code:loc-:_default") + ap.add_argument("--worktree", required=True, type=Path) + ap.add_argument("--n", type=int, default=30) + ap.add_argument("--seed", type=int, default=1234) + ap.add_argument("--port", type=int, default=6380) + ap.add_argument("--json") + args = ap.parse_args(argv) + + wt = args.worktree.resolve() + res = run(args.graph, wt, n=args.n, seed=args.seed, port=args.port) + summary = summarize(res.rows) + + print(f"graph={args.graph} worktree={wt.name}") + print(f"sampled n={args.n} scored={summary.get('n_scored', 0)} dropped={len(res.dropped)}") + print(f" GRAPH-vs-ORACLE (callers, 1-hop reverse CALLS):") + for k in ("macro_precision", "macro_recall", "macro_f1", "median_f1", "exact_match_rate"): + print(f" {k:18} = {summary.get(k)}") + print(f"\n per-symbol (sorted by f1):") + for r in sorted(res.rows, key=lambda x: x["f1"]): + print(f" {r['symbol']:28} fanin={r['fanin']:>3} " + f"P={r['precision']:.2f} R={r['recall']:.2f} F1={r['f1']:.2f} " + f"(g={r['n_graph']} o={r['n_oracle']} tp={r['tp']} fp={r['fp']} fn={r['fn']})") + if res.dropped: + from collections import Counter + dc = Counter(d["reason"] for d in res.dropped) + print(f"\n dropped: {dict(dc)}") + + if args.json: + Path(args.json).write_text(json.dumps( + {"graph": args.graph, "worktree": str(wt), "summary": summary, + "rows": res.rows, "dropped": res.dropped}, indent=2)) + print(f"\nwrote {args.json}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/bench/runners/reader_experiment.py b/bench/runners/reader_experiment.py new file mode 100644 index 00000000..60440eb8 --- /dev/null +++ b/bench/runners/reader_experiment.py @@ -0,0 
+1,507 @@ +"""Stage A offline "reader experiment" for the relationship_explanation lever. + +Background +---------- +The corrected stratified measurement (files/benchmark-report-corrected-stratified.md) +found that on the cg-n10-hardened batch the code-graph search tool *surfaces* gold +files well (exposure 0.824) but the agent *adopts* them poorly (adoption 0.536): +~70% of the recall loss is surfaced-but-dropped gold, not retrieval misses. + +This harness tests ONE cheap, non-overfitting lever before committing to an +expensive end-to-end factorial: does annotating each search_code result with a +FACTUAL ``relationship_explanation`` (WHY two files relate, e.g. "both override +``value_to_string``") increase how often a single-turn reader keeps a surfaced +gold file? + +It is fully OFFLINE w.r.t. the graph: it re-uses the search_code outputs already +captured in the cg-n10-hardened run dirs, re-annotates them per arm, and presents +them to an isolated single-turn Copilot agent that has NO tools and NO repository +access — so the only thing that varies between arms is the annotation schema. + +Arms (rubber-duck-mandated) +--------------------------- +* ``off`` — captured results as-is (no relationship_explanation). +* ``placebo`` — length-matched neutral filler in the same field (controls for + "more text in the result" rather than "the explanation content"). +* ``explain`` — the factual relationship_explanation. + +The surfaced set is IDENTICAL across arms ("fixed-opportunity adoption"): every +arm sees the same files; only the annotation differs. + +Primary metric +-------------- +Paired per-instance file_recall (NOT raw adoption). Secondary: fixed-opportunity +adoption (of gold files surfaced anywhere in the captures, how many the reader +predicts), acc@k, MRR, and tokens. Paired bootstrap CIs across the shared task +set. 
+ +Usage +----- + python -m bench.runners.reader_experiment \ + --run-dir bench/cache/cg-n10-hardened \ + --model claude-opus-4.8 \ + --out bench/cache/reader-stageA + +Add ``--dry-run`` to render prompts + arms without spending any LLM tokens. +""" + +from __future__ import annotations + +import argparse +import copy +import json +import os +import random +import subprocess +import sys +import time +import uuid +from pathlib import Path +from typing import Any + +# --- repo-local imports ----------------------------------------------------- +_THIS = Path(__file__).resolve() +_BENCH = _THIS.parents[1] # .../bench +_REPO = _BENCH.parent # .../mcp-t17 +if str(_BENCH) not in sys.path: + sys.path.insert(0, str(_BENCH)) + +from runners import copilot_runner as cr # noqa: E402 +from analysis.reader_capture import capture_search_calls # noqa: E402 + +# rel_explain lives in the mcp-smoke worktree (the live server the bench uses). +_REL_EXPLAIN_DIR = ( + _REPO.parent.parent / ".worktrees" / "mcp-smoke" / "api" / "mcp" / "tools" +) +if _REL_EXPLAIN_DIR.is_dir() and str(_REL_EXPLAIN_DIR) not in sys.path: + sys.path.insert(0, str(_REL_EXPLAIN_DIR)) +import rel_explain # noqa: E402 + +ARMS = ("off", "placebo", "explain") + +# Boundary markers in the captured prompt.txt (see plan + sample inspection). 
def _load_gold(run_dir: Path, model: str) -> dict[str, list[str]]:
    """Map task_id -> gold_files for the code_graph config rows.

    Reads ``<run_dir>/<model>/results.jsonl`` (one JSON object per non-blank
    line). Only rows whose ``config`` is ``"code_graph"`` are considered, and
    the first such row seen for a task wins. A missing or null ``gold_files``
    becomes an empty list.
    """
    records_path = run_dir / model / "results.jsonl"
    gold_by_task: dict[str, list[str]] = {}
    stripped = (raw.strip() for raw in records_path.read_text().splitlines())
    for record in map(json.loads, filter(None, stripped)):
        if record.get("config") == "code_graph":
            task_id = record.get("task_id")
            if task_id and task_id not in gold_by_task:
                gold_by_task[task_id] = list(record.get("gold_files") or [])
    return gold_by_task
def surfaced_files(calls: list[dict[str, Any]]) -> set[str]:
    """Every file the captures put in front of the agent (primary + related).

    Collects the ``file`` of each search result and of each entry in its
    ``likely_related_files`` (when present), normalised with
    ``cr._norm_path``. Missing or empty file values are ignored.
    """

    def raw_paths():
        # Primary hit first, then its related files, in capture order.
        for call in calls:
            for hit in call["results"]:
                yield hit.get("file")
                for related in hit.get("likely_related_files") or []:
                    yield related.get("file")

    return {cr._norm_path(path) for path in raw_paths() if path}
def render_prompt(problem: str, calls: list[dict[str, Any]]) -> str:
    """Assemble the single-turn reader prompt.

    Layout: the reader instructions, the bug report, then one section per
    captured search_code call (1-based ordinal, the query's repr, and the
    results as JSON with ``indent=1`` and non-ASCII kept verbatim), followed
    by the sentinel-format instruction.
    """
    sections: list[str] = []
    for ordinal, call in enumerate(calls, start=1):
        header = f"\n--- search_code call {ordinal}: query={call['query']!r} ---\n"
        payload = json.dumps(call["results"], indent=1, ensure_ascii=False)
        sections.append(header + payload + "\n")

    preamble = (
        _READER_INSTRUCTIONS
        + "\n=== BUG REPORT ===\n"
        + problem
        + "\n"
        + "\n=== CODE-NAVIGATION SEARCH RESULTS ===\n"
    )
    return preamble + "".join(sections) + _READER_SENTINEL_INSTR
def _final_text(stdout: str) -> str:
    """Extract assistant-visible text from the Copilot CLI JSONL event stream.

    Each non-blank line is parsed as a JSON object; lines that are not valid
    JSON or not objects are ignored. Sources, in order of preference:

    1. ``assistant.message`` events (``data.content`` as a string, or a list
       of blocks carrying a string ``text``) plus the legacy
       ``{type: "assistant", message: {content: [...]}}`` shape, joined with
       newlines.
    2. ``assistant.message_delta`` fragments (first non-empty string among
       ``content`` / ``delta`` / ``text``), concatenated.
    3. The last usable ``result`` event (a string ``data``, or the first
       non-empty string among ``content`` / ``text`` / ``result``).

    Returns ``""`` when none of these is present.
    """

    def first_text(mapping: dict, keys: tuple) -> str | None:
        # First non-empty string value among ``keys``, else None.
        for key in keys:
            candidate = mapping.get(key)
            if isinstance(candidate, str) and candidate:
                return candidate
        return None

    whole: list[str] = []
    pieces: list[str] = []
    terminal = ""

    for raw in stdout.splitlines():
        raw = raw.strip()
        if not raw:
            continue
        try:
            event = json.loads(raw)
        except json.JSONDecodeError:
            continue
        if not isinstance(event, dict):
            continue

        kind = event.get("type")
        payload = event.get("data")
        payload_is_dict = isinstance(payload, dict)

        if kind == "assistant.message" and payload_is_dict:
            body = payload.get("content")
            if isinstance(body, str):
                if body:
                    whole.append(body)
            elif isinstance(body, list):
                whole.extend(
                    part["text"]
                    for part in body
                    if isinstance(part, dict) and isinstance(part.get("text"), str)
                )
            continue

        if kind == "assistant.message_delta" and payload_is_dict:
            fragment = first_text(payload, ("content", "delta", "text"))
            if fragment is not None:
                pieces.append(fragment)
            continue

        if kind == "result":
            if isinstance(payload, str):
                terminal = payload
            elif payload_is_dict:
                found = first_text(payload, ("content", "text", "result"))
                if found is not None:
                    terminal = found
            continue

        # Legacy single-object shape (older CLI): {type:"assistant", message:{content:[...]}}
        legacy = event.get("message")
        if kind == "assistant" and isinstance(legacy, dict):
            for part in legacy.get("content") or []:
                if isinstance(part, dict) and part.get("type") == "text":
                    whole.append(part.get("text") or "")

    if whole:
        return "\n".join(whole)
    if pieces:
        return "".join(pieces)
    return terminal
def paired_bootstrap(
    deltas: list[float], n: int = 10000, seed: int = 20260604
) -> dict[str, float]:
    """Mean delta + 95% bootstrap CI over paired per-instance deltas.

    Resamples ``deltas`` with replacement ``n`` times using a ``random.Random``
    seeded with ``seed`` (so results are reproducible) and reports the 2.5th
    and 97.5th percentile of the resampled means. An empty ``deltas`` yields
    all-zero statistics with ``n == 0``.

    Raises:
        ValueError: if ``n`` is smaller than 1 while ``deltas`` is non-empty
            (there would be no resamples to take percentiles from).
    """
    if not deltas:
        return {"mean": 0.0, "ci_low": 0.0, "ci_high": 0.0, "n": 0}
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    rng = random.Random(seed)
    means = []
    m = len(deltas)
    for _ in range(n):
        sample = [deltas[rng.randrange(m)] for _ in range(m)]
        means.append(sum(sample) / m)
    means.sort()
    return {
        "mean": sum(deltas) / m,
        "ci_low": means[int(0.025 * n)],
        "ci_high": means[int(0.975 * n)],
        "n": m,
    }


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> int:
    """CLI entry point: render every (task, arm) prompt and optionally run it.

    For each loaded task and each requested arm the annotated prompt is
    written to ``<out>/runs/<task>/<arm>/prompt.txt``. Unless ``--dry-run`` is
    given, the isolated reader is then invoked, its answer is scored, and one
    row per (task, arm) is appended to ``<out>/reader_results.jsonl`` before
    the summary is printed.

    Returns 0 on success and 2 when no tasks could be loaded.
    """
    ap = argparse.ArgumentParser(description="Stage A reader experiment")
    ap.add_argument("--run-dir", required=True, type=Path,
                    help="cg-n10-hardened cache dir (contains /results.jsonl + runs/)")
    ap.add_argument("--model", default="claude-opus-4.8")
    ap.add_argument("--out", required=True, type=Path, help="output cache dir")
    ap.add_argument("--arms", default=",".join(ARMS),
                    help="comma-separated subset of off,placebo,explain")
    ap.add_argument("--wall-time", type=float, default=420.0)
    ap.add_argument("--limit", type=int, default=0, help="limit #tasks (0=all)")
    ap.add_argument("--dry-run", action="store_true",
                    help="render arms+prompts, write them, no LLM calls")
    args = ap.parse_args()

    arms = [a.strip() for a in args.arms.split(",") if a.strip()]
    for a in arms:
        if rel_explain.normalize_mode(a) != a:
            print(f"WARNING: arm {a!r} normalizes to {rel_explain.normalize_mode(a)!r}")

    tasks = load_tasks(args.run_dir, args.model)
    if args.limit:
        tasks = tasks[: args.limit]
    if not tasks:
        print("No tasks loaded — check --run-dir/--model.", file=sys.stderr)
        return 2
    print(f"Loaded {len(tasks)} tasks; arms={arms}; model={args.model}")

    args.out.mkdir(parents=True, exist_ok=True)
    results_path = args.out / "reader_results.jsonl"
    rows: list[dict[str, Any]] = []

    # A dry run produces no result rows, so it must not open the results file
    # for writing: doing so would truncate the output of an earlier real run
    # that used the same --out directory.
    out_f = None if args.dry_run else results_path.open("w")
    try:
        for ti, task in enumerate(tasks, start=1):
            tid = task["task_id"]
            gold = set(task["gold_files"])
            surf = surfaced_files(task["calls"])
            surf_gold = surf & gold
            for arm in arms:
                acalls = annotate_calls(task["calls"], arm)
                prompt = render_prompt(task["problem"], acalls)
                log_dir = args.out / "runs" / tid / arm
                log_dir.mkdir(parents=True, exist_ok=True)
                (log_dir / "prompt.txt").write_text(prompt)
                tag = f"[{ti}/{len(tasks)}] {tid} :: {arm}"
                if args.dry_run:
                    print(f"{tag} (dry-run) prompt={len(prompt)}B "
                          f"surfaced_gold={len(surf_gold)}/{len(gold)}")
                    continue
                t0 = time.time()
                rr = run_isolated_reader(
                    prompt=prompt, model=args.model,
                    log_dir=log_dir, wall_time=args.wall_time,
                )
                final = _final_text(rr["stdout"])
                pred, perr, fallback = cr.parse_localization(final)
                score = cr.score_localization(pred, sorted(gold))
                toks = cr.parse_tokens_from_logs(log_dir)
                row = {
                    "task_id": tid,
                    "arm": arm,
                    "model": args.model,
                    "gold_files": sorted(gold),
                    "surfaced_gold": sorted(surf_gold),
                    "pred_files": pred,
                    "file_recall": score["file_recall"],
                    "file_precision": score["file_precision"],
                    "file_all_found": score["file_all_found"],
                    "acc_at_1": score["acc_at_1"],
                    "acc_at_3": score["acc_at_3"],
                    "acc_at_5": score["acc_at_5"],
                    "file_mrr": score["file_mrr"],
                    "adoption": adoption(pred, surf_gold),
                    "parse_error": perr,
                    "parse_fallback": fallback,
                    "input_tokens": toks["input_tokens"],
                    "output_tokens": toks["output_tokens"],
                    "reasoning_tokens": toks["reasoning_tokens"],
                    "usage_blocks": toks["usage_blocks"],
                    "timed_out": rr["timed_out"],
                    "returncode": rr["returncode"],
                    "wall_sec": round(time.time() - t0, 1),
                    "prompt_bytes": len(prompt),
                }
                rows.append(row)
                # Flush per row so a crash mid-run keeps the finished rows.
                out_f.write(json.dumps(row) + "\n")
                out_f.flush()
                print(f"{tag} recall={score['file_recall']} "
                      f"adopt={row['adoption']} in={toks['input_tokens']} "
                      f"err={perr} wall={row['wall_sec']}s")
    finally:
        if out_f is not None:
            out_f.close()

    if args.dry_run or not rows:
        print(f"\nWrote prompts under {args.out/'runs'}. Done (dry-run={args.dry_run}).")
        return 0

    _summarize(rows, arms, args.out)
    return 0


def _summarize(rows: list[dict[str, Any]], arms: list[str], out: Path) -> None:
    """Print the per-arm table and paired deltas, then write ``summary.json``.

    ``rows`` are the per-(task, arm) result dicts; a later row for the same
    (arm, task) replaces an earlier one. Metric values that are ``None`` are
    left out of every average, and an arm with no usable values for a metric
    prints 0 for it rather than failing. Deltas are computed against the
    ``off`` arm when present, otherwise against the first arm, over the tasks
    both arms share.
    """
    by: dict[str, dict[str, dict[str, Any]]] = {a: {} for a in arms}
    for r in rows:
        by[r["arm"]][r["task_id"]] = r

    def col(arm: str, key: str) -> list[float]:
        return [v[key] for v in by[arm].values() if v.get(key) is not None]

    import statistics as st

    def mean_or_zero(values: list[float]) -> float:
        # st.mean() raises StatisticsError on an empty column.
        return st.mean(values) if values else 0

    print("\n================ STAGE A SUMMARY ================")
    hdr = f"{'arm':10s} {'recall':>8s} {'adopt':>8s} {'acc@1':>7s} {'mrr':>7s} {'in_med':>9s}"
    print(hdr)
    for a in arms:
        rec = col(a, "file_recall")
        ado = col(a, "adoption")
        a1 = col(a, "acc_at_1")
        mrr = col(a, "file_mrr")
        intok = col(a, "input_tokens")
        print(f"{a:10s} {mean_or_zero(rec):8.3f} "
              f"{mean_or_zero(ado):8.3f} "
              f"{mean_or_zero(a1):7.3f} {mean_or_zero(mrr):7.3f} "
              f"{int(st.median(intok)) if intok else 0:9d}")

    # Paired deltas vs 'off' baseline.
    base = "off" if "off" in arms else arms[0]
    print(f"\nPaired deltas vs {base!r} (95% bootstrap CI):")
    for a in arms:
        if a == base:
            continue
        common = sorted(set(by[a]) & set(by[base]))
        for metric in ("file_recall", "adoption"):
            deltas = []
            for t in common:
                va, vb = by[a][t].get(metric), by[base][t].get(metric)
                if va is None or vb is None:
                    continue
                deltas.append(va - vb)
            bs = paired_bootstrap(deltas)
            print(f" {a:8s} Δ{metric:13s} "
                  f"mean={bs['mean']:+.3f} CI[{bs['ci_low']:+.3f},{bs['ci_high']:+.3f}] n={bs['n']}")

    summary_path = out / "summary.json"
    summary_path.write_text(json.dumps(
        {"arms": arms, "n_tasks": len(by[arms[0]]), "rows": rows}, indent=1))
    print(f"\nWrote {summary_path}")


if __name__ == "__main__":
    raise SystemExit(main())
+ * Sample distinctively-named Function/Class symbols with a meaningful but + non-pathological caller fan-in. + * For each symbol and each query type, compute: + graph_tokens = tiktoken size of the graph's structured answer + (caller/callee/definition list: "name @ relpath:line") + raw_tokens = tiktoken size of the evidence an agent reads WITHOUT the + graph: `rg -nw ` over the repo's .py files (the + lines it must scan and disambiguate) + ratio = raw_tokens / graph_tokens (>1 => graph saves tokens) + * Aggregate with paired statistics (median ratio, geometric mean, win-rate), + never raw sums (one megahub symbol would dominate). + +Token counts use tiktoken cl100k as a standard, reproducible proxy for context +size; absolute Claude token counts differ slightly but ratios are stable. + +Usage: + python -m bench.runners.struct_query_bench \ + --repo-graph code:conan-io__conan-16987__loc:_default \ + --worktree bench/cache/worktrees-localize/conan-io__conan-16987__loc \ + --sample 40 --out bench/cache/struct-query/conan.jsonl +""" + +from __future__ import annotations + +import argparse +import json +import random +import statistics as st +import subprocess +from pathlib import Path +from typing import Any + +DEFAULT_SEED = 20260526 + +# Generic method names whose name-based CALLS edges are too noisy to be a +# meaningful "callers of X" answer (every .run()/.get() in the repo links here). 
def _graph_query(graph: str, cypher: str, port: int) -> list[list[Any]]:
    """Run a Cypher query against FalkorDB, returning structured data rows.

    Uses the redis Python client and FalkorDB's RESP result framing: the reply
    is [header, data_rows, stats]; data_rows is a list of rows, each a list of
    column values. Far more robust than parsing redis-cli text output.

    A reply that is not a list of at least two elements (no result set)
    yields ``[]``. The client opened for the query is closed before
    returning, so sampling many symbols does not accumulate open clients.
    """
    import redis

    client = redis.Redis(host="127.0.0.1", port=port, decode_responses=True)
    try:
        reply = client.execute_command("GRAPH.QUERY", graph, cypher)
    finally:
        client.close()
    if not isinstance(reply, list) or len(reply) < 2:
        return []
    # Normalise every row to a list; a bare scalar row becomes a 1-column row.
    return [
        list(row) if isinstance(row, (list, tuple)) else [row]
        for row in reply[1]
    ]
+ """ + cypher = ( + "MATCH (c)-[:CALLS]->(s) WHERE s:Searchable " + f"WITH s.name AS name, count(c) AS fanin " + f"WHERE fanin >= {fanin_lo} AND fanin <= {fanin_hi} " + "RETURN name, fanin ORDER BY name" + ) + rows = _graph_query(graph, cypher, port) + pairs: list[tuple[str, int]] = [] + for row in rows: + if len(row) < 2: + continue + name, fan = row[0], row[1] + if not name or name in _GENERIC or str(name).startswith("__"): + continue + if len(str(name)) < 4: + continue + try: + pairs.append((str(name), int(fan))) + except (ValueError, TypeError): + continue + rng = random.Random(seed) + rng.shuffle(pairs) + return [{"name": nm, "fanin": fn} for nm, fn in pairs[:n]] + + +def graph_callers_answer(graph: str, port: int, symbol: str, worktree: str) -> str: + """The structured caller list the graph returns for `who calls `.""" + cypher = ( + f"MATCH (c)-[:CALLS]->(s {{name:'{symbol}'}}) " + "RETURN DISTINCT c.name, c.path ORDER BY c.name LIMIT 200" + ) + rows = _graph_query(graph, cypher, port) + lines = [] + for row in rows: + if len(row) < 2: + continue + cname, cpath = row[0], row[1] + lines.append(f"{cname} @ {_relpath(str(cpath), worktree)}") + return f"callers of {symbol} ({len(lines)}):\n" + "\n".join(lines) + + +def grep_evidence(symbol: str, worktree: str) -> str: + """The raw evidence an agent reads WITHOUT the graph: word-boundary grep + of the symbol across the repo's Python files (the lines it must scan to + find and disambiguate real call sites).""" + out = subprocess.run( + ["rg", "-nw", "--no-heading", "-g", "*.py", symbol, worktree], + capture_output=True, + text=True, + timeout=120, + ) + # Strip the absolute worktree prefix to mimic what the agent actually sees + # (relative paths), so we don't inflate raw tokens with long abs paths. 
+ return out.stdout.replace(worktree.rstrip("/") + "/", "") + + +def run( + graph: str, worktree: str, *, port: int, n: int, seed: int, +) -> list[dict[str, Any]]: + syms = sample_symbols(graph, port, n=n, seed=seed) + results = [] + for s in syms: + name = s["name"] + g = graph_callers_answer(graph, port, name, worktree) + r = grep_evidence(name, worktree) + gt, rt = _tok(g), _tok(r) + results.append({ + "symbol": name, + "fanin": s["fanin"], + "graph_tokens": gt, + "raw_tokens": rt, + "ratio": round(rt / gt, 3) if gt else None, + "grep_hits": r.count("\n"), + }) + return results + + +def summarize(rows: list[dict[str, Any]]) -> dict[str, Any]: + ratios = [r["ratio"] for r in rows if r["ratio"]] + wins = sum(1 for x in ratios if x > 1.0) + geomean = ( + st.geometric_mean(ratios) if ratios else None + ) + return { + "n": len(rows), + "median_ratio": round(st.median(ratios), 3) if ratios else None, + "geomean_ratio": round(geomean, 3) if geomean else None, + "win_rate": f"{wins}/{len(ratios)}", + "median_graph_tokens": int(st.median([r["graph_tokens"] for r in rows])) if rows else 0, + "median_raw_tokens": int(st.median([r["raw_tokens"] for r in rows])) if rows else 0, + } + + +def main(argv: list[str] | None = None) -> int: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("--repo-graph", required=True, help="FalkorDB graph key") + p.add_argument("--worktree", required=True, help="repo worktree path (for grep + relpaths)") + p.add_argument("--port", type=int, default=6380) + p.add_argument("--sample", type=int, default=40) + p.add_argument("--seed", type=int, default=DEFAULT_SEED) + p.add_argument("--out", type=Path, default=None) + args = p.parse_args(argv) + + wt = str(Path(args.worktree).resolve()) + rows = run(args.repo_graph, wt, port=args.port, n=args.sample, seed=args.seed) + summ = summarize(rows) + print(json.dumps({"summary": summ}, indent=2)) + for r in sorted(rows, key=lambda x: -(x["ratio"] or 0)): + print(f" {r['symbol']:32} 
fanin={r['fanin']:>4} " + f"graph={r['graph_tokens']:>5} raw={r['raw_tokens']:>7} " + f"ratio={r['ratio']}") + if args.out: + args.out.parent.mkdir(parents=True, exist_ok=True) + with args.out.open("w") as f: + f.write(json.dumps({"summary": summ}) + "\n") + for r in rows: + f.write(json.dumps(r) + "\n") + print(f"\nwrote {args.out}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/bench/scripts/start-api.sh b/bench/scripts/start-api.sh new file mode 100755 index 00000000..4e55f673 --- /dev/null +++ b/bench/scripts/start-api.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Launch the code-graph API server with the fast tree-sitter Python +# resolver enabled (PR #691 + #692). This is what the bench harness +# expects to talk to at 127.0.0.1:5000. +# +# Usage: +# bench/scripts/start-api.sh # default port 5000 +# bench/scripts/start-api.sh --port 5001 +# +# Prereqs: +# - FalkorDB running. For native falkordb on 6380 set +# FALKORDB_HOST=127.0.0.1 FALKORDB_PORT=6380 before invoking. +# - uv on PATH. +# - cwd must be a code-graph worktree containing api/ with PR #691 +# and PR #692 applied (i.e. the dvirdukhan/query-cache branch tip +# or staging once those are merged). + +set -euo pipefail + +PORT=5000 +while [[ $# -gt 0 ]]; do + case "$1" in + --port) PORT="$2"; shift 2 ;; + *) echo "Unknown arg: $1" >&2; exit 1 ;; + esac +done + +# Tree-sitter static resolver — turns Python indexing from minutes to +# seconds. Default is still jedi, so callers must opt in explicitly. +export CODE_GRAPH_PY_RESOLVER="${CODE_GRAPH_PY_RESOLVER:-tree_sitter}" + +# Allow the bench harness to analyze any folder; the bench worktrees +# live under bench/cache/worktrees. +export ALLOWED_ANALYSIS_DIR="${ALLOWED_ANALYSIS_DIR:-/}" + +# Public mode: bench harness does not bother with bearer tokens. 
+export CODE_GRAPH_PUBLIC="${CODE_GRAPH_PUBLIC:-1}" + +echo "[start-api] CODE_GRAPH_PY_RESOLVER=$CODE_GRAPH_PY_RESOLVER" +echo "[start-api] CODE_GRAPH_PUBLIC=$CODE_GRAPH_PUBLIC" +echo "[start-api] FALKORDB_HOST=${FALKORDB_HOST:-127.0.0.1} FALKORDB_PORT=${FALKORDB_PORT:-6379}" +echo "[start-api] Listening on 127.0.0.1:$PORT" + +exec uv run uvicorn api.index:app --host 127.0.0.1 --port "$PORT" diff --git a/bench/tools/code_graph_mcp/system_preamble.md b/bench/tools/code_graph_mcp/system_preamble.md new file mode 100644 index 00000000..bf5af4a1 --- /dev/null +++ b/bench/tools/code_graph_mcp/system_preamble.md @@ -0,0 +1,72 @@ +# code-graph (MCP) preamble + +You are an autonomous coding agent solving a software-engineering task. +Your sole tool is bash: every action you take is a shell command that +is executed in the repository's working directory. + +## Code-navigation workflow — use this BEFORE grep/find + +A code-graph **MCP server** (`cgraph-mcp`) is available for this repo. +**Before reading or editing code, locate the relevant symbols through +`cg-mcp` rather than grepping the file tree** — it's faster, returns +precise `{id, file, line}` records, and reveals caller / callee / +impact relationships you would otherwise reconstruct by hand. Fall +back to bash only when `cg-mcp` cannot answer the question. + +`$PROJECT_NAME` and `$BRANCH` are exported for you (do not guess). +The graph is already indexed against the current commit. + +Typical loop: + +1. `cg-mcp search_code --project "$PROJECT_NAME" --prefix ` — + locate a function/class by name. Pick the `id` of the best hit. +2. `cg-mcp get_callers --project "$PROJECT_NAME" --symbol-id ` — + "who calls this?" before refactoring. +3. `cg-mcp impact_analysis --project "$PROJECT_NAME" --symbol-id + --depth 3` — full transitive blast radius. Use this BEFORE + non-trivial edits. +4. Read the implicated file(s) with `sed -n` / `cat`, then edit. 
+ +## Available `cg-mcp` sub-commands + +- `cg-mcp search_code --project P --prefix STR [--limit N]` — + prefix search; returns `[{id, name, label, file, line}, ...]`. +- `cg-mcp get_callers --project P --symbol-id ID [--limit N]` — + incoming CALLS edges (who calls X). +- `cg-mcp get_callees --project P --symbol-id ID [--limit N]` — + outgoing CALLS edges (what X calls). +- `cg-mcp get_dependencies --project P --symbol-id ID [--limit N]` — + all outgoing edges (CALLS + IMPORTS + DEFINES). +- `cg-mcp impact_analysis --project P --symbol-id ID + [--direction IN|OUT] [--depth N]` — + transitive blast radius (default IN, depth 3). +- `cg-mcp find_path --project P --source-id ID --dest-id ID` — + the call chain(s) between two symbols. +- `cg-mcp index_repo --path-or-url PATH [--branch B]` — + (re)index a folder or git URL. Only needed for repos that aren't + pre-indexed. + +You also have the usual Unix tools (`cat`, `grep`/`rg`, `find`, `sed`) +for cases the graph can't answer. + +## Rules of thumb + +1. **Always run `search_code` first** to turn a name into an `id`. +2. **`impact_analysis` before any non-trivial edit.** Even when you + think you know the answer — the transitive closure often surprises + you. +3. **Don't `grep` for callers.** `get_callers` is one cheap Cypher + hop; grep over a large repo costs tens of thousands of tokens. + +## Submission + +When you believe the task is complete, run a bash command whose first +line of stdout is exactly: + +``` +COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT +``` + +followed by your final answer or summary on subsequent lines. The +runner reads the working-tree `git diff` automatically; you do not +need to commit. diff --git a/bench/tools/code_graph_mcp/tools.yaml b/bench/tools/code_graph_mcp/tools.yaml new file mode 100644 index 00000000..3b676977 --- /dev/null +++ b/bench/tools/code_graph_mcp/tools.yaml @@ -0,0 +1,39 @@ +# SWE-agent tool bundle: code-graph MCP-transport config. 
+# +# This is the MCP-transport sibling of bench/tools/code_graph/tools.yaml. +# Same backend graph; different transport. Where `code_graph` calls the +# host FastAPI service over HTTP, `code_graph_mcp` spawns the +# `cgraph-mcp` stdio server for each tool call — the exact transport +# Claude Code / Cursor / Cline use in production. +# +# Tool names mirror the 8 MCP tools registered in api/mcp/tools/ +# (search_code, get_callers, get_callees, get_dependencies, +# impact_analysis, find_path, index_repo, ask). The bash agent calls +# them through the `cg-mcp ...` shim (see bench/cli/cg-mcp). +# +# IMPORTANT: `ask` (GraphRAG) is intentionally NOT in the tool list. +# Including it would double-count tokens (nested LLM agent). Same Q2 +# decision as the HTTP code_graph config — we benchmark the *graph*, +# not GraphRAG. + +extends: ../baseline/tools.yaml + +tools: + - index_repo # (path_or_url, branch?) -> indexing stats + - search_code # (project, prefix) -> [symbol] + - get_callers # (project, symbol_id) -> [caller] + - get_callees # (project, symbol_id) -> [callee] + - get_dependencies # (project, symbol_id) -> [dep] + - impact_analysis # (project, symbol_id, direction, depth) -> [impacted] + - find_path # (project, source_id, dest_id) -> [path] + +backend: + transport: mcp_stdio + command: cgraph-mcp + # Container has cgraph-mcp on PATH via `pip install -e .` against this + # repo. FALKORDB_HOST/PORT are passed through to the spawned MCP + # server, pointing at the same host FalkorDB the HTTP config uses. + env_passthrough: + - FALKORDB_HOST + - FALKORDB_PORT + - MODEL_NAME diff --git a/tests/bench/__init__.py b/tests/bench/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/bench/test_adopt_controls.py b/tests/bench/test_adopt_controls.py new file mode 100644 index 00000000..9b84c307 --- /dev/null +++ b/tests/bench/test_adopt_controls.py @@ -0,0 +1,169 @@ +"""Unit tests for the adoption-calibration controls + candidate metric. 
+ +Covers the FREE/offline pieces (no API, no live graph): + * candidate-level confusion matrix scores an injected NOISY distractor as FP + when the agent keeps it and TN when it drops it (the run-time NOISY arm's + correctness hinge); + * edit-critical relabel heuristic + manual override precedence; + * the degenerate-task convention for macro precision/recall (undefined tasks + are dropped from the macro average, not imputed). +""" + +from __future__ import annotations + +import json + +from bench.analysis import adopt_controls as ac +from bench.analysis.exposure_adoption import candidate_calibration, classify_run + + +def _write_stdout(path, primaries): + """Write a minimal Copilot-CLI stdout.jsonl with one search_code call that + surfaces ``primaries`` (list of file paths) as ranked primary hits.""" + contents = [{"type": "text", "text": json.dumps({"file": f})} for f in primaries] + events = [ + {"type": "tool.execution_start", + "data": {"toolCallId": "c1", "mcpToolName": "search_code"}}, + {"type": "tool.execution_complete", + "data": {"toolCallId": "c1", "result": {"contents": contents}}}, + ] + path.write_text("\n".join(json.dumps(e) for e in events)) + return path + + +def test_kept_distractor_scores_fp(tmp_path): + sp = _write_stdout(tmp_path / "s.jsonl", ["pkg/gold.py", "pkg/distractor.py"]) + cls = classify_run(sp, gold_files=["pkg/gold.py"], + pred_files=["pkg/gold.py", "pkg/distractor.py"]) + c = cls["cand"] + assert c["tp"] == 1 # surfaced gold kept + assert c["fp"] == 1 # surfaced non-gold (distractor) kept + assert c["fn"] == 0 and c["tn"] == 0 + + +def test_dropped_distractor_scores_tn(tmp_path): + sp = _write_stdout(tmp_path / "s.jsonl", ["pkg/gold.py", "pkg/distractor.py"]) + cls = classify_run(sp, gold_files=["pkg/gold.py"], pred_files=["pkg/gold.py"]) + c = cls["cand"] + assert c["tp"] == 1 and c["tn"] == 1 # gold kept, distractor correctly dropped + assert c["fp"] == 0 and c["fn"] == 0 + + +def test_dropped_gold_scores_fn(tmp_path): + sp = 
_write_stdout(tmp_path / "s.jsonl", ["pkg/gold.py"]) + cls = classify_run(sp, gold_files=["pkg/gold.py"], pred_files=[]) + assert cls["cand"]["fn"] == 1 and cls["cand"]["tp"] == 0 + + +def test_incidental_gold_excluded_from_matrix(tmp_path): + # gold surfaced but marked NOT edit-critical -> neither TP nor FN when dropped + sp = _write_stdout(tmp_path / "s.jsonl", ["pkg/gold.py"]) + cls = classify_run(sp, gold_files=["pkg/gold.py"], pred_files=[], + edit_critical=[]) + c = cls["cand"] + assert c["tp"] == 0 and c["fn"] == 0 + assert cls["cand"]["detail"]["pkg/gold.py"] == "incidental_gold" + + +def test_edit_critical_heuristic_and_override(): + gold = ["pkg/core.py", "tests/test_core.py", "pkg/migrations/0001_init.py"] + crit, inc = ac.edit_critical_split(gold) + assert crit == ["pkg/core.py"] + assert set(inc) == {"tests/test_core.py", "pkg/migrations/0001_init.py"} + + # override flips core.py to incidental and the test file to critical + ov = {"t1": {"pkg/core.py": "incidental", "tests/test_core.py": "critical"}} + crit2, inc2 = ac.edit_critical_split(gold, task="t1", overrides=ov) + assert "pkg/core.py" in inc2 + assert "tests/test_core.py" in crit2 + + +def test_macro_drops_undefined_tasks(): + # task A: one TP, one TN -> P=1.0 R=1.0; task B: all TN (no kept, no surfaced + # gold) -> precision & recall undefined -> must be DROPPED from macro, not 0. 
+ runs = [ + {"task": "A", "cand": {"tp": 1, "fp": 0, "fn": 0, "tn": 1}}, + {"task": "B", "cand": {"tp": 0, "fp": 0, "fn": 0, "tn": 3}}, + ] + cal = candidate_calibration(runs) + # only task A contributes to the macro average (B is undefined on both axes) + assert cal["macro"]["precision"] == 1.0 + assert cal["macro"]["recall"] == 1.0 + + +def test_gold_symbols_skip_dunders(tmp_path): + f = tmp_path / "m.py" + f.write_text("class C:\n def __init__(self): pass\n def real_method(self): pass\n" + "def top_fn(): pass\n") + syms = ac.gold_symbols_offline(tmp_path, "m.py") + assert "__init__" not in syms + assert {"C", "real_method", "top_fn"} <= set(syms) + + +# --- identity-aware log lookup (no cross-wiring across prompt_modes) ---------- +def _write_batch_run(batch_root, model, mode, prompt_mode, task, primaries, + run_idx=0): + """Materialize a runs////code_graph/[/run] + /logs/stdout.jsonl with the given surfaced primaries.""" + base = batch_root / "runs" / model / mode / prompt_mode / "code_graph" / task + if run_idx: + base = base / f"run{run_idx}" + logs = base / "logs" + logs.mkdir(parents=True, exist_ok=True) + _write_stdout(logs / "stdout.jsonl", primaries) + + +def test_row_stdout_path_does_not_cross_wire_prompt_modes(tmp_path): + from bench.analysis.exposure_adoption import row_stdout_path + model = "m" + batch = tmp_path / "batch" + # Same task under two arms, each surfacing a DISTINCT file. 
+ _write_batch_run(batch, model, "localize", "adopt-ctrl", "task-1", ["pkg/ctrl.py"]) + _write_batch_run(batch, model, "localize", "adopt-sem", "task-1", ["pkg/sem.py"]) + ctrl_row = {"config": "code_graph", "mode": "localize", + "prompt_mode": "adopt-ctrl", "task_id": "task-1", "run_idx": 0} + sem_row = {**ctrl_row, "prompt_mode": "adopt-sem"} + ctrl_log = row_stdout_path(batch, model, ctrl_row).read_text() + sem_log = row_stdout_path(batch, model, sem_row).read_text() + assert "pkg/ctrl.py" in ctrl_log and "pkg/sem.py" not in ctrl_log + assert "pkg/sem.py" in sem_log and "pkg/ctrl.py" not in sem_log + + +def test_row_stdout_path_separates_run_idx(tmp_path): + from bench.analysis.exposure_adoption import row_stdout_path + model = "m" + batch = tmp_path / "batch" + _write_batch_run(batch, model, "localize", "adopt-ctrl", "task-1", ["pkg/r0.py"], run_idx=0) + _write_batch_run(batch, model, "localize", "adopt-ctrl", "task-1", ["pkg/r1.py"], run_idx=1) + r0 = {"config": "code_graph", "mode": "localize", "prompt_mode": "adopt-ctrl", + "task_id": "task-1", "run_idx": 0} + r1 = {**r0, "run_idx": 1} + assert "pkg/r0.py" in row_stdout_path(batch, model, r0).read_text() + assert "pkg/r1.py" in row_stdout_path(batch, model, r1).read_text() + + +def test_macro_strict_scores_drop_everything_as_zero(): + # task A: clean keep -> F1=1. task B: gold SURFACED but ALL dropped + # (tp=0, fn>0) -> a real adoption failure -> macro_strict must include it as + # F1=0 (not silently drop it), while the lenient macro drops it. 
+ runs = [ + {"task": "A", "cand": {"tp": 1, "fp": 0, "fn": 0, "tn": 1}}, + {"task": "B", "cand": {"tp": 0, "fp": 0, "fn": 2, "tn": 0}}, + ] + cal = candidate_calibration(runs) + assert cal["macro"]["f1"] == 1.0 # B dropped from lenient macro + assert cal["macro_strict"]["f1"] == 0.5 # B counted as F1=0 -> (1+0)/2 + assert cal["n_tasks_gold_dropped_failures"] == 1 + + +def test_macro_strict_keeps_dropping_no_surfaced_gold_tasks(): + # task with no surfaced gold (tp=0, fn=0) is genuinely undefined and stays + # dropped from BOTH macros. + runs = [ + {"task": "A", "cand": {"tp": 1, "fp": 0, "fn": 0, "tn": 1}}, + {"task": "B", "cand": {"tp": 0, "fp": 0, "fn": 0, "tn": 3}}, + ] + cal = candidate_calibration(runs) + assert cal["macro_strict"]["f1"] == 1.0 + assert cal["macro_strict"]["n"] == 1 + assert cal["n_tasks_dropped_undefined"] == 1 diff --git a/tests/bench/test_adopt_diag.py b/tests/bench/test_adopt_diag.py new file mode 100644 index 00000000..97e62925 --- /dev/null +++ b/tests/bench/test_adopt_diag.py @@ -0,0 +1,176 @@ +"""Unit tests for the Lane 1 per-arm diagnostics (``adopt_diag``). + +Covers the FREE/offline pieces (no API, no live graph): + * RAT keep/drop parsing + consistency audit (compliant, consistent, + dropped-but-kept conflict, kept-then-omitted erosion); + * end-to-end ``diagnose`` over a tiny CTRL/RAT batch: arm detection, exposure + recall, GRAPH-WRONG subset freezing, and token summary. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path + +from bench.analysis import adopt_diag as ad + + +def test_parse_rat_decisions_keep_drop_and_backticks(): + text = ( + "Here are my decisions:\n" + "KEEP pkg/a.py — directly implements the save path\n" + "- `DROP` `pkg/b.py` — only a caller, not the edit site\n" + "DROP pkg/c.py: unrelated sibling\n" + "I also note keep is a verb used in prose but not line-initial here.\n" + ) + d = ad.parse_rat_decisions(text) + assert d == {"pkg/a.py": "keep", "pkg/b.py": "drop", "pkg/c.py": "drop"} + + +def test_rat_audit_consistent_when_no_dropped_file_kept(): + text = "KEEP pkg/a.py — yes\nDROP pkg/b.py — no\n" + a = ad.rat_audit(text, pred_files=["pkg/a.py"]) + assert a["compliant"] and a["consistent"] + assert a["n_keep"] == 1 and a["n_drop"] == 1 + assert a["dropped_but_kept"] == [] + assert a["kept_omitted"] == [] + + +def test_rat_audit_flags_dropped_but_kept_conflict(): + text = "KEEP pkg/a.py — yes\nDROP pkg/b.py — no\n" + a = ad.rat_audit(text, pred_files=["pkg/a.py", "pkg/b.py"]) + assert not a["consistent"] + assert a["dropped_but_kept"] == ["pkg/b.py"] + + +def test_rat_audit_flags_kept_then_omitted(): + text = "KEEP pkg/a.py — yes\nKEEP pkg/b.py — yes\n" + a = ad.rat_audit(text, pred_files=["pkg/a.py"]) + assert a["consistent"] # nothing dropped-then-kept + assert a["kept_omitted"] == ["pkg/b.py"] + + +def test_rat_audit_non_compliant_when_no_decisions(): + a = ad.rat_audit("I think the answer is pkg/a.py.", pred_files=["pkg/a.py"]) + assert not a["compliant"] + assert a["n_keep"] == 0 and a["n_drop"] == 0 + + +# --------------------------------------------------------------------------- +# End-to-end batch fixture +# --------------------------------------------------------------------------- + +MODEL = "test-model" + + +def _write_run(batch_root: Path, arm: str, task: str, primaries: list[str], + agent_text: str) -> None: + run_dir = batch_root / "runs" / MODEL / "localize" / arm 
/ "code_graph" / task + (run_dir / "logs").mkdir(parents=True, exist_ok=True) + contents = [{"type": "text", "text": json.dumps({"file": f})} for f in primaries] + events = [ + {"type": "tool.execution_start", + "data": {"toolCallId": "c1", "mcpToolName": "search_code"}}, + {"type": "tool.execution_complete", + "data": {"toolCallId": "c1", "result": {"contents": contents}}}, + ] + (run_dir / "logs" / "stdout.jsonl").write_text( + "\n".join(json.dumps(e) for e in events) + ) + (run_dir / "agent_text.txt").write_text(agent_text) + + +def _row(arm: str, task: str, gold: list[str], pred: list[str], **extra) -> dict: + base = { + "benchmark": "swe_bench_verified", + "task_id": task, + "config": "code_graph", + "model": MODEL, + "mode": "localize", + "prompt_mode": arm, + "run_idx": 0, + "runner": "copilot-runner/2", + "completed": True, + "gold_files": gold, + "pred_files": pred, + "total_tokens": 1000, + "output_tokens": 400, + "reasoning_tokens": 150, + "input_tokens": 600, + "premium_requests": 1, + "num_turns": 5, + } + base.update(extra) + return base + + +def _build_batch(tmp_path: Path) -> Path: + batch_root = tmp_path / "cache" + results_path = batch_root / MODEL / "results.jsonl" + results_path.parent.mkdir(parents=True, exist_ok=True) + + rows = [] + # task t1: graph surfaces gold at rank-1 (CLEAN, not graph-wrong). + # CTRL drops the gold (adoption failure); RAT keeps it. + _write_run(batch_root, "adopt-ctrl", "t1", ["pkg/gold.py"], "") + _write_run(batch_root, "adopt-rat", "t1", ["pkg/gold.py"], + "KEEP pkg/gold.py — implements it\n") + rows.append(_row("adopt-ctrl", "t1", ["pkg/gold.py"], [])) # FN + rows.append(_row("adopt-rat", "t1", ["pkg/gold.py"], ["pkg/gold.py"])) # TP + + # task t2: rank-1 hit is non-gold (GRAPH-WRONG). CTRL keeps the wrong rank-1 + # (FP); RAT drops it and keeps the real gold surfaced at rank-2. 
+ _write_run(batch_root, "adopt-ctrl", "t2", ["pkg/wrong.py", "pkg/real.py"], "") + _write_run(batch_root, "adopt-rat", "t2", ["pkg/wrong.py", "pkg/real.py"], + "DROP pkg/wrong.py — only related\nKEEP pkg/real.py — the edit site\n") + rows.append(_row("adopt-ctrl", "t2", ["pkg/real.py"], ["pkg/wrong.py"])) + rows.append(_row("adopt-rat", "t2", ["pkg/real.py"], ["pkg/real.py"])) + + results_path.write_text("\n".join(json.dumps(r) for r in rows)) + return results_path + + +def test_diagnose_detects_arms_and_exposure(tmp_path): + results_path = _build_batch(tmp_path) + rep = ad.diagnose(results_path, ref_arm="adopt-ctrl") + assert rep["arms_present"] == ["adopt-ctrl", "adopt-rat"] + # Every gold file was surfaced in both arms -> exposure_recall == 1.0. + assert rep["arms"]["adopt-ctrl"]["exposure"]["exposure_recall"] == 1.0 + assert rep["arms"]["adopt-rat"]["exposure"]["exposure_recall"] == 1.0 + # CTRL dropped both surfaced golds; RAT kept both. + assert rep["arms"]["adopt-ctrl"]["exposure"]["adoption_rate"] == 0.0 + assert rep["arms"]["adopt-rat"]["exposure"]["adoption_rate"] == 1.0 + + +def test_diagnose_graph_wrong_subset_frozen_from_ref(tmp_path): + results_path = _build_batch(tmp_path) + rep = ad.diagnose(results_path, ref_arm="adopt-ctrl") + # t2's rank-1 (pkg/wrong.py) is non-gold -> GRAPH-WRONG; t1 is not. 
+ assert rep["graph_wrong"]["tasks"] == ["t2"] + assert rep["graph_wrong"]["ref_arm"] == "adopt-ctrl" + + +def test_diagnose_rat_audit_and_consistency(tmp_path): + results_path = _build_batch(tmp_path) + rep = ad.diagnose(results_path, ref_arm="adopt-ctrl") + ra = rep["arms"]["adopt-rat"]["rat_audit"] + assert ra["n"] == 2 + assert ra["compliance_rate"] == 1.0 # both RAT runs emitted decisions + assert ra["consistency_rate"] == 1.0 # no dropped file was kept + assert ra["n_dropped_but_kept"] == 0 + + +def test_diagnose_rat_calibration_beats_ctrl(tmp_path): + results_path = _build_batch(tmp_path) + rep = ad.diagnose(results_path, ref_arm="adopt-ctrl") + ctrl_f1 = rep["arms"]["adopt-ctrl"]["calibration_clean"]["macro_strict"]["f1"] + rat_f1 = rep["arms"]["adopt-rat"]["calibration_clean"]["macro_strict"]["f1"] + assert rat_f1 > ctrl_f1 + + +def test_token_summary_visible_output_excludes_reasoning(tmp_path): + rows = [_row("adopt-rat", "t1", ["g.py"], ["g.py"])] + ts = ad.token_summary(rows) + # output 400 - reasoning 150 = 250 visible + assert ts["median_visible_output_tokens"] == 250 + assert ts["median_total_tokens"] == 1000 diff --git a/tests/bench/test_cg_mcp_adapter.py b/tests/bench/test_cg_mcp_adapter.py new file mode 100644 index 00000000..7d6f9274 --- /dev/null +++ b/tests/bench/test_cg_mcp_adapter.py @@ -0,0 +1,178 @@ +"""Tests for the MCP-transport bench adapter (`cg-mcp`). + +Heavy end-to-end test (talks to real cgraph-mcp + FalkorDB) is gated +behind the same `_falkordb_reachable` check as the existing MCP tests. +Light tests run unconditionally and validate the argparse surface and +`_extract` shape handling. 
+""" + +from __future__ import annotations + +import json +import os +import socket +import subprocess +import sys +from pathlib import Path + +import pytest + +from bench.agents import code_graph_mcp_adapter as cgm +from bench.cli import cg_mcp + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def _mcp_server_available() -> bool: + """The benchmark MCP adapter requires the in-repo `cgraph-mcp` server. + + On branches that pre-date the MCP stack (e.g. this branch's base, + `fix-find-symbol-nested-name`), `api.mcp.server` is absent. The + end-to-end test must skip there; it will run on staging once the + MCP stack lands. + """ + try: + import api.mcp.server # noqa: F401 + return True + except ImportError: + return False + + +def _falkordb_reachable() -> bool: + host = os.environ.get("FALKORDB_HOST", "127.0.0.1") + port = int(os.environ.get("FALKORDB_PORT", "6390")) + try: + with socket.create_connection((host, port), timeout=1): + return True + except OSError: + return False + + +# ── light unit tests ────────────────────────────────────────────────── + + +class _FakeChunk: + def __init__(self, text: str) -> None: + self.text = text + + +class _FakeResult: + def __init__(self, content, structured=None, is_error=False): + self.content = content + self.structuredContent = structured + self.isError = is_error + + +def test_extract_prefers_text_chunk_json(): + r = _FakeResult([_FakeChunk('{"id": 7, "name": "foo"}')]) + assert cgm._extract(r) == {"id": 7, "name": "foo"} + + +def test_extract_falls_back_to_structured_result_wrapper(): + r = _FakeResult(content=[], structured={"result": [1, 2, 3]}) + assert cgm._extract(r) == [1, 2, 3] + + +def test_extract_returns_raw_text_when_not_json(): + r = _FakeResult([_FakeChunk("not json at all")]) + assert cgm._extract(r) == "not json at all" + + +def test_cli_rejects_unknown_subcommand(capsys): + with pytest.raises(SystemExit): + cg_mcp.main(["totally_bogus"]) + + +def test_cli_index_repo_parses_ignore_list(monkeypatch): 
+ captured: dict = {} + + def fake_index_repo(path_or_url, branch=None, ignore=None): + captured.update(path_or_url=path_or_url, branch=branch, ignore=ignore) + return {"ok": True, **captured} + + monkeypatch.setattr(cgm, "index_repo", fake_index_repo) + rc = cg_mcp.main( + [ + "index_repo", + "--path-or-url", + "/tmp/x", + "--branch", + "main", + "--ignore", + ".venv", + "node_modules", + ] + ) + assert rc == 0 + assert captured["path_or_url"] == "/tmp/x" + assert captured["branch"] == "main" + assert captured["ignore"] == [".venv", "node_modules"] + + +# ── heavy end-to-end test ───────────────────────────────────────────── + + +@pytest.mark.skipif( + not _mcp_server_available(), + reason="api.mcp.server not present — requires MCP stack to be merged", +) +@pytest.mark.skipif(not _falkordb_reachable(), reason="FalkorDB unreachable") +def test_cg_mcp_search_code_end_to_end(tmp_path): + """Spawn the actual cg-mcp shim against a freshly-indexed fixture.""" + fixture = REPO_ROOT / "tests" / "mcp" / "fixtures" / "sample_project" + if not fixture.exists(): + pytest.skip("MCP sample fixture not present") + + env = os.environ.copy() + env["FALKORDB_HOST"] = os.environ.get("FALKORDB_HOST", "127.0.0.1") + env["FALKORDB_PORT"] = os.environ.get("FALKORDB_PORT", "6390") + env["BENCH_PYTHON"] = sys.executable + # Ensure cgraph-mcp is on PATH for the spawned subprocess. + venv_bin = str(Path(sys.executable).parent) + env["PATH"] = f"{venv_bin}:{env.get('PATH', '')}" + + # Index the fixture under a deterministic project/branch. 
+ project = "sample_project" + branch = f"benchmcp-{os.getpid()}" + idx = subprocess.run( + [ + str(REPO_ROOT / "bench" / "cli" / "cg-mcp"), + "index_repo", + "--path-or-url", + str(fixture), + "--branch", + branch, + ], + env=env, + capture_output=True, + text=True, + timeout=120, + ) + assert idx.returncode == 0, idx.stderr + idx_payload = json.loads(idx.stdout) + assert "graph_name" in idx_payload + assert idx_payload["num_nodes"] > 0 + + # Then search for any known symbol from the fixture. + sr = subprocess.run( + [ + str(REPO_ROOT / "bench" / "cli" / "cg-mcp"), + "search_code", + "--project", + project, + "--branch", + branch, + "--prefix", + "a", # broad prefix to match something in the fixture + "--limit", + "3", + ], + env=env, + capture_output=True, + text=True, + timeout=60, + ) + assert sr.returncode == 0, sr.stderr + out = json.loads(sr.stdout) + assert out is not None diff --git a/tests/bench/test_copilot_runner.py b/tests/bench/test_copilot_runner.py new file mode 100644 index 00000000..73f1f729 --- /dev/null +++ b/tests/bench/test_copilot_runner.py @@ -0,0 +1,660 @@ +"""Unit tests for the Copilot benchmark runner parsers + TCO accounting. + +These run unconditionally (no FalkorDB / Copilot needed): they exercise the +log/JSONL parsing and cost math against synthetic inputs plus the captured +spike fixtures when present. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from bench.runners import copilot_runner as cr +from bench.runners import copilot_tco as tco + + +# --------------------------------------------------------------------------- +# Token-block parsing +# --------------------------------------------------------------------------- + + +def _write_log(tmp_path: Path, name: str, text: str) -> Path: + d = tmp_path / "logs" + d.mkdir(exist_ok=True) + (d / name).write_text(text) + return d + + +_USAGE_BLOCK = """\ +some preamble line + "usage": { + "prompt_tokens": 1000, + "completion_tokens": 50, + "total_tokens": 1050, + "prompt_tokens_details": { + "cached_tokens": 200, + "cache_creation_tokens": 800 + }, + "completion_tokens_details": { "reasoning_tokens": 0 } + } +trailing +""" + + +def test_parse_tokens_sums_multiple_blocks(tmp_path): + text = _USAGE_BLOCK + "\n" + _USAGE_BLOCK + d = _write_log(tmp_path, "process-1.log", text) + out = cr.parse_tokens_from_logs(d) + assert out["input_tokens"] == 2000 + assert out["output_tokens"] == 100 + assert out["total_tokens"] == 2100 + assert out["cached_input_tokens"] == 400 + assert out["cache_creation_tokens"] == 1600 + assert out["usage_blocks"] == 2 + + +def test_parse_tokens_ignores_non_model_usage(tmp_path): + # An MCP tool result or stray JSON with a "usage" key but missing the + # required model-response fields must NOT be counted. 
+ stray = '{ "usage": { "premiumRequests": 15, "totalApiDurationMs": 100 } }' + text = _USAGE_BLOCK + "\n" + stray + d = _write_log(tmp_path, "process-1.log", text) + out = cr.parse_tokens_from_logs(d) + assert out["usage_blocks"] == 1 + assert out["input_tokens"] == 1000 + + +def test_parse_tokens_multiple_log_files(tmp_path): + d = _write_log(tmp_path, "process-1.log", _USAGE_BLOCK) + (d / "process-2.log").write_text(_USAGE_BLOCK) + out = cr.parse_tokens_from_logs(d) + assert out["usage_blocks"] == 2 + assert out["input_tokens"] == 2000 + + +# --------------------------------------------------------------------------- +# Result-event + tool-call parsing +# --------------------------------------------------------------------------- + + +def test_parse_result_event(): + stdout = "\n".join([ + json.dumps({"type": "assistant", "data": {}}), + json.dumps({ + "type": "result", + "data": { + "usage": { + "premiumRequests": 12, + "codeChanges": {"filesModified": ["a.py", "b.py"]}, + }, + "isError": False, + "numTurns": 7, + }, + }), + ]) + out = cr.parse_result_event(stdout) + assert out["premium_requests"] == 12 + assert out["files_modified"] == ["a.py", "b.py"] + assert out["is_error"] is False + assert out["num_turns"] == 7 + + +def test_parse_tool_calls_counts_mcp_and_shell(): + stdout = "\n".join([ + json.dumps({"type": "tool.execution_start", "data": {"name": "bash"}}), + json.dumps({"type": "tool.execution_start", "data": {"name": "bash"}}), + json.dumps({"type": "tool.execution_start", "data": {"name": "code-graph-search_code"}}), + json.dumps({"type": "tool.execution_complete", "data": {}}), + ]) + total, by_name = cr.parse_tool_calls(stdout) + assert total == 3 + assert by_name == {"bash": 2, "code-graph-search_code": 1} + + +def test_parsers_tolerate_garbage_lines(): + stdout = "not json\n\n" + json.dumps({"type": "result", "data": {"usage": {"premiumRequests": 1}}}) + assert cr.parse_result_event(stdout)["premium_requests"] == 1 + assert 
cr.parse_tool_calls("garbage\n{bad") == (0, {}) + + +# --------------------------------------------------------------------------- +# Patch helpers +# --------------------------------------------------------------------------- + + +def test_patched_files_extraction(): + patch = ( + "diff --git a/x/y.py b/x/y.py\n" + "--- a/x/y.py\n" + "+++ b/x/y.py\n" + "@@ -1 +1 @@\n-a\n+b\n" + "diff --git a/tests/test_z.py b/tests/test_z.py\n" + "--- a/tests/test_z.py\n" + "+++ b/tests/test_z.py\n" + "@@ -1 +1 @@\n-c\n+d\n" + ) + assert cr._patched_files(patch) == ["x/y.py", "tests/test_z.py"] + + +# --------------------------------------------------------------------------- +# Prompt assembly +# --------------------------------------------------------------------------- + + +def test_prompt_excludes_ask_for_code_graph(tmp_path): + p = cr.build_prompt(cr.CODE_GRAPH, tmp_path, "Fix the bug.", "django__django-10973") + assert "django__django-10973" in p + assert "Do not use the `ask` tool" in p + assert "search_code" in p + + +def test_prompt_no_mcp_has_no_tool_sales(): + p = cr.build_prompt(cr.NO_MCP, Path("/tmp/x"), "Fix it.", "proj") + assert "MCP" in p # capability note present + assert "search_code" not in p + + +# --------------------------------------------------------------------------- +# TCO accounting +# --------------------------------------------------------------------------- + + +def test_tco_no_ask_is_agent_only(): + row = { + "task_id": "t1", "config": "code_graph", "model": "claude-opus-4.8", + "input_tokens": 1_000_000, "output_tokens": 100_000, + "premium_requests": 20, "index_sec": 60.0, "completed": True, + } + out = tco.row_tco(row) + # opus: 1M in * $15 + 0.1M out * $75 = 15 + 7.5 = 22.5 + assert out["agent_usd"] == pytest.approx(22.5, abs=0.01) + assert out["graphrag_usd"] == 0.0 + assert out["per_task_tco_usd"] == pytest.approx(22.5, abs=0.01) + assert out["index_usd_amortized_once"] > 0 + + +def test_tco_meters_ask_when_present(): + row = { + "task_id": 
"t1", "config": "code_graph_ask", "model": "claude-sonnet-4.6", + "input_tokens": 0, "output_tokens": 0, + "graphrag_ask_calls": 3, "graphrag_input_tokens": 1_000_000, + "graphrag_output_tokens": 100_000, "completed": True, + } + out = tco.row_tco(row) + # gemini-flash-lite: 1M * 0.075 + 0.1M * 0.30 = 0.075 + 0.03 = 0.105 + assert out["graphrag_usd"] == pytest.approx(0.105, abs=1e-4) + assert out["graphrag_ask_calls"] == 3 + + +def test_tco_aggregate_groups_by_config(): + rows = [ + {"config": "copilot_no_mcp", "model": "claude-opus-4.8", "input_tokens": 100, "output_tokens": 10, "outcome": "resolved", "completed": True}, + {"config": "code_graph", "model": "claude-opus-4.8", "input_tokens": 100, "output_tokens": 10, "outcome": "failed", "completed": True}, + {"config": "code_graph", "model": "claude-opus-4.8", "input_tokens": 0, "output_tokens": 0, "outcome": "x", "completed": False}, # skipped + ] + agg = tco.aggregate(rows) + assert agg["copilot_no_mcp"]["n"] == 1 + assert agg["copilot_no_mcp"]["resolved"] == 1 + assert agg["code_graph"]["n"] == 1 # incomplete row excluded + + +def test_agent_key_mapping(): + assert tco.agent_key("claude-opus-4.8") == "opus" + assert tco.agent_key("claude-sonnet-4.6") == "sonnet" + assert tco.agent_key("claude-haiku-4.5") == "haiku" + + +# --------------------------------------------------------------------------- +# Prompt assembly: nudge + localize modes +# --------------------------------------------------------------------------- + + +def test_prompt_nudge_code_graph_mandates_search(tmp_path): + p = cr.build_prompt(cr.CODE_GRAPH, tmp_path, "Fix it.", "proj", nudge=True) + assert "MUST begin by calling search_code(project=\"proj\")" in p + assert "Do not use the `ask` tool" in p + + +def test_prompt_nudge_no_mcp_is_matched_control(tmp_path): + p = cr.build_prompt(cr.NO_MCP, tmp_path, "Fix it.", "proj", nudge=True) + # Matched "search-first" control with no tool sales / no graph verbs. 
+ assert "Before resorting to plain text search" in p + assert "search_code" not in p + + +def test_prompt_localize_emits_sentinel_contract(tmp_path): + p = cr.build_prompt(cr.CODE_GRAPH, tmp_path, "Bug.", "proj", mode=cr.LOCALIZE) + assert cr.LOCALIZE_SENTINEL in p + assert "Do NOT modify any files" in p + assert "Do NOT emit it through a" in p + + +# --------------------------------------------------------------------------- +# Lane 1 adoption-calibration arms (CTRL / SEM / RAT) +# --------------------------------------------------------------------------- + + +def test_adopt_ctrl_capability_equals_canonical_nudge(tmp_path): + # CTRL must be byte-identical to the canonical nudge capability (prereg §2 + # amended: CTRL == _CAP_CODE_GRAPH_NUDGE), independent of env-gated variants. + cap = cr._capability(cr.CODE_GRAPH, "proj", nudge=False, adopt_arm="ctrl") + assert cap == cr._CAP_CODE_GRAPH_NUDGE.format(project="proj") + + +def test_adopt_arms_bypass_env_gates(tmp_path, monkeypatch): + # The SUBST/SPIKE/TRAVERSE env gates must NOT affect arm capabilities. + monkeypatch.setenv("CGRAPH_SUBST_NUDGE", "1") + monkeypatch.setenv("CGRAPH_SPIKE_NUDGE", "1") + monkeypatch.setenv("CGRAPH_TRAVERSE_NUDGE", "1") + cap = cr._capability(cr.CODE_GRAPH, "proj", nudge=True, adopt_arm="ctrl") + assert cap == cr._CAP_CODE_GRAPH_NUDGE.format(project="proj") + assert "TRUST the ranked results" not in cap + assert "get_importers" not in cap + + +def test_adopt_sem_appends_frozen_clause_only(monkeypatch, tmp_path): + monkeypatch.delenv("BENCH_BLOCK_NETWORK", raising=False) + p = cr.build_prompt( + cr.CODE_GRAPH, tmp_path, "Bug.", "proj", mode=cr.LOCALIZE, adopt_arm="sem" + ) + # SEM == CTRL (canonical nudge) + frozen edge-semantics clause. + assert "MUST begin by calling search_code(project=\"proj\")" in p + assert "Relatedness alone is not a reason to keep or to drop." in p + assert "evidence that code is RELATED" in p + # Rejected wording (benchmark prior) must never appear. 
+ assert "often a caller or a sibling" not in p + # SEM has no extra keep/drop step (that is RAT's lever). + assert "KEEP " not in p + + +def test_adopt_rat_injects_keep_drop_step_before_sentinel(tmp_path): + p = cr.build_prompt( + cr.CODE_GRAPH, tmp_path, "Bug.", "proj", mode=cr.LOCALIZE, adopt_arm="rat" + ) + assert "KEEP " in p and "DROP " in p + assert "list every file the graph surfaced" in p + # The keep/drop step must precede the FINAL sentinel instruction. + assert p.index("list every file the graph surfaced") < p.index(cr.LOCALIZE_SENTINEL) + # RAT shares the CTRL base, not the SEM clause. + assert "MUST begin by calling search_code(project=\"proj\")" in p + assert "Relatedness alone is not a reason" not in p + + +def test_adopt_arm_guard_rejects_non_code_graph(tmp_path): + with pytest.raises(ValueError): + cr.build_prompt( + cr.NO_MCP, tmp_path, "Bug.", "proj", mode=cr.LOCALIZE, adopt_arm="ctrl" + ) + + +def test_adopt_arm_guard_rejects_non_localize(tmp_path): + with pytest.raises(ValueError): + cr.build_prompt( + cr.CODE_GRAPH, tmp_path, "Bug.", "proj", mode=cr.FIX, adopt_arm="ctrl" + ) + + +def test_adopt_arm_guard_rejects_unknown_value(tmp_path): + # Programmatic callers bypass argparse choices; an unknown arm must not + # silently fall through to CTRL behavior while logging prompt_mode=adopt-bad. 
+ with pytest.raises(ValueError): + cr.build_prompt( + cr.CODE_GRAPH, tmp_path, "Bug.", "proj", mode=cr.LOCALIZE, adopt_arm="bad" + ) + + +# --------------------------------------------------------------------------- +# NOISY/GRAPH-WRONG distractor injection wiring +# --------------------------------------------------------------------------- + + +class _Inst: + def __init__(self, instance_id): + self.instance_id = instance_id + + +def test_compute_prompt_mode_suffixes_only_when_injecting(): + assert cr._compute_prompt_mode(adopt_arm="sem", nudge=True, inject_label=None) == "adopt-sem" + assert ( + cr._compute_prompt_mode(adopt_arm="sem", nudge=True, inject_label="noisy") + == "adopt-sem-noisy" + ) + assert cr._compute_prompt_mode(adopt_arm=None, nudge=True, inject_label=None) == "nudged" + assert ( + cr._compute_prompt_mode(adopt_arm=None, nudge=False, inject_label="gwrong") + == "neutral-gwrong" + ) + + +def test_inject_env_pins_task_and_manifest(): + inst = _Inst("django__django-1") + env = cr._inject_env(inst, inject_manifest=Path("/tmp/m.json"), inject_k=3) + assert env == { + "BENCH_NOISY_MANIFEST": "/tmp/m.json", + "BENCH_NOISY_TASK": "django__django-1", + "BENCH_NOISY_K": "3", + } + + +def test_inject_env_omits_k_when_unset(): + env = cr._inject_env(_Inst("t1"), inject_manifest=Path("/tmp/m.json"), inject_k=None) + assert "BENCH_NOISY_K" not in env + assert env["BENCH_NOISY_TASK"] == "t1" + + +def test_inject_env_none_when_disabled(): + assert cr._inject_env(_Inst("t1"), inject_manifest=None, inject_k=None) is None + + +def test_write_mcp_config_clean_is_falkor_only(tmp_path): + w = tmp_path / "wrap.sh" + w.write_text("#!/bin/bash\n") + cfg = cr._write_mcp_config(tmp_path, w, "h", 6379) + env = json.loads(cfg.read_text())["mcpServers"]["code-graph"]["env"] + assert env == {"FALKORDB_HOST": "h", "FALKORDB_PORT": "6379"} + + +def test_write_mcp_config_threads_extra_env(tmp_path): + w = tmp_path / "wrap.sh" + w.write_text("#!/bin/bash\n") + cfg = 
cr._write_mcp_config( + tmp_path, w, "h", 6379, extra_env={"BENCH_NOISY_TASK": "t1"} + ) + env = json.loads(cfg.read_text())["mcpServers"]["code-graph"]["env"] + assert env["FALKORDB_HOST"] == "h" + assert env["BENCH_NOISY_TASK"] == "t1" + + +def test_run_one_inject_guard_rejects_non_localize(tmp_path): + with pytest.raises(ValueError): + cr.run_one( + _Inst("x"), track=cr.CODE_GRAPH, model="m", cache_dir=tmp_path, + wall_time=1.0, server_root=tmp_path, mode=cr.FIX, + inject_manifest=Path("/tmp/m.json"), inject_label="noisy", + ) + + +def test_run_one_inject_guard_requires_label(tmp_path): + with pytest.raises(ValueError): + cr.run_one( + _Inst("x"), track=cr.CODE_GRAPH, model="m", cache_dir=tmp_path, + wall_time=1.0, server_root=tmp_path, mode=cr.LOCALIZE, + inject_manifest=Path("/tmp/m.json"), inject_label=None, + ) + + +# --------------------------------------------------------------------------- +# Localization extraction + scoring +# --------------------------------------------------------------------------- + + +def _msg(content): + return json.dumps({"type": "assistant.message", "data": {"content": content}}) + + +def test_extract_agent_text_concats_messages(): + stdout = "\n".join([ + _msg("first thought"), + json.dumps({"type": "tool.execution_start", "data": {"name": "bash"}}), + json.dumps({"type": "user.message", "data": {"content": "IGNORE ME"}}), + _msg(""), # empty tool-only turn + _msg("final answer"), + ]) + text = cr.extract_agent_text(stdout) + assert "first thought" in text + assert "final answer" in text + assert "IGNORE ME" not in text + + +def test_parse_localization_strict_sentinel(): + text = 'Reasoning...\nFINAL_LOCALIZATION_JSON: ["a/pkg/x.py", "./pkg/y.py"]' + pred, err, fallback = cr.parse_localization(text) + assert pred == ["pkg/x.py", "pkg/y.py"] + assert err is None + assert fallback is False + + +def test_parse_localization_uses_last_sentinel(): + text = ( + 'FINAL_LOCALIZATION_JSON: ["wrong.py"]\n' + 'on reflection...\n' + 
'FINAL_LOCALIZATION_JSON: ["right.py"]' + ) + pred, err, fallback = cr.parse_localization(text) + assert pred == ["right.py"] + assert err is None + + +def test_parse_localization_missing_sentinel_flags_fallback(): + pred, err, fallback = cr.parse_localization("no marker here") + assert pred == [] + assert err == "sentinel_missing" + assert fallback is True + + +def test_parse_localization_malformed_array(): + pred, err, fallback = cr.parse_localization("FINAL_LOCALIZATION_JSON: [oops") + assert pred == [] + assert fallback is True + assert err == "unbalanced_array" + + +def test_score_localization_recall_and_mrr(): + scores = cr.score_localization( + pred=["pkg/b.py", "pkg/a.py"], gold=["pkg/a.py", "pkg/c.py"] + ) + assert scores["file_recall"] == 0.5 + assert scores["file_precision"] == 0.5 + assert scores["file_all_found"] is False + assert scores["acc_at_1"] == 0.0 # first pred (b.py) not gold + assert scores["acc_at_3"] == 1.0 + assert scores["file_mrr"] == 0.5 # a.py at rank 2 + + +def test_score_localization_all_found(): + scores = cr.score_localization(pred=["a.py", "b.py"], gold=["a.py", "b.py"]) + assert scores["file_all_found"] is True + assert scores["file_recall"] == 1.0 + assert scores["acc_at_1"] == 1.0 + assert scores["file_mrr"] == 1.0 + + +# --------------------------------------------------------------------------- +# Nudge compliance metrics +# --------------------------------------------------------------------------- + + +def test_nudge_compliance_first_is_graph(): + stdout = "\n".join([ + json.dumps({"type": "tool.execution_start", "data": {"name": "code-graph-search_code"}}), + json.dumps({"type": "tool.execution_start", "data": {"name": "bash"}}), + json.dumps({"type": "tool.execution_start", "data": {"name": "code-graph-get_callers"}}), + ]) + c = cr.nudge_compliance(stdout) + assert c["first_tool"] == "code-graph-search_code" + assert c["first_is_graph"] is True + assert c["graph_calls"] == 2 + + +def test_nudge_compliance_no_graph(): + 
stdout = json.dumps({"type": "tool.execution_start", "data": {"name": "bash"}}) + c = cr.nudge_compliance(stdout) + assert c["first_tool"] == "bash" + assert c["first_is_graph"] is False + assert c["graph_calls"] == 0 + + +# --------------------------------------------------------------------------- +# Resume identity includes mode / prompt_mode / model +# --------------------------------------------------------------------------- + + +def test_load_done_keys_include_mode_and_prompt(tmp_path): + results = tmp_path / "results.jsonl" + rows = [ + {"task_id": "t1", "config": "code_graph", "model": "claude-opus-4.8", + "mode": "fix", "prompt_mode": "neutral", "run_idx": 0, + "runner": cr.RUNNER_VERSION, "completed": True}, + {"task_id": "t1", "config": "code_graph", "model": "claude-opus-4.8", + "mode": "localize", "prompt_mode": "nudged", "run_idx": 0, + "runner": cr.RUNNER_VERSION, "completed": True}, + ] + results.write_text("\n".join(json.dumps(r) for r in rows)) + done = cr._load_done(results) + assert ("t1", "code_graph", "claude-opus-4.8", "fix", "neutral", 0) in done + assert ("t1", "code_graph", "claude-opus-4.8", "localize", "nudged", 0) in done + # A different prompt_mode is NOT considered done. 
+ assert ("t1", "code_graph", "claude-opus-4.8", "fix", "nudged", 0) not in done + + +# --------------------------------------------------------------------------- +# Real captured fixture (when present) +# --------------------------------------------------------------------------- + +_FIXTURE = Path(__file__).resolve().parents[1].parent / "bench" / "cache" / "copilot-spike" / "logs" / "mcp-probe3" + + +@pytest.mark.skipif(not (_FIXTURE / "..").exists() or not _FIXTURE.exists(), reason="spike fixture absent") +def test_real_fixture_tokens_parse(): + out = cr.parse_tokens_from_logs(_FIXTURE) + assert out["usage_blocks"] >= 1 + assert out["input_tokens"] > 0 + + +# --------------------------------------------------------------------------- +# Leak hardening: git-walk-up fence + index branch pin (harden/2) +# --------------------------------------------------------------------------- + +import os # noqa: E402 + +from bench.datasets import swe_bench # noqa: E402 + + +def test_strip_git_oracle_removes_nested_git(tmp_path): + root = tmp_path / "wt" + (root / "pkg").mkdir(parents=True) + # top-level repo .git directory + (root / ".git").mkdir() + (root / ".git" / "config").write_text("[remote]\n") + # submodule .git is a FILE holding a gitdir pointer, not a directory + (root / "pkg" / ".git").write_text("gitdir: ../../.git/modules/pkg\n") + # real source must survive + (root / "pkg" / "mod.py").write_text("x = 1\n") + + swe_bench.strip_git_oracle(root) + + assert not (root / ".git").exists() + assert not (root / "pkg" / ".git").exists() + assert (root / "pkg" / "mod.py").read_text() == "x = 1\n" + + +def test_harden_env_scrubs_git_and_creds(monkeypatch): + monkeypatch.setenv("GITHUB_TOKEN", "secret") + monkeypatch.setenv("GIT_DIR", "/somewhere/.git") + monkeypatch.setenv("GIT_WORK_TREE", "/somewhere") + monkeypatch.setenv("GIT_COMMON_DIR", "/somewhere/.git") + out = cr._harden_env(dict(os.environ)) + assert "GITHUB_TOKEN" not in out + assert "GIT_DIR" not in out + assert 
"GIT_WORK_TREE" not in out + assert "GIT_COMMON_DIR" not in out + assert out["GIT_CONFIG_NOSYSTEM"] == "1" + + +def test_git_ceiling_dirs_points_at_worktree_parent(tmp_path): + wt = tmp_path / "worktrees" / "code_graph" / "loc-abc123" + wt.mkdir(parents=True) + ceiling = cr._git_ceiling_dirs(wt) + parent = str((tmp_path / "worktrees" / "code_graph").resolve()) + assert parent in ceiling.split(os.pathsep) + + +def test_hardening_meta_harden2_flags(monkeypatch, tmp_path): + monkeypatch.setenv("BENCH_BLOCK_NETWORK", "1") + meta = cr.hardening_meta(tmp_path, "", 0) # tmp_path has no .git + assert meta["harness_hardening_version"] == "harden/2" + assert meta["git_walk_up_blocked"] is True + assert meta["git_sanitized"] is True + assert meta["network_block_mode"] is True + assert meta["opaque_path_mode"] is True + + +def test_hardening_meta_on_by_default(monkeypatch, tmp_path): + # Hardening is default-ON: tracing repeatedly caught the agent fetching the + # gold file list from GitHub, turning localization misses into fake recall=1.0. + monkeypatch.delenv("BENCH_BLOCK_NETWORK", raising=False) + meta = cr.hardening_meta(tmp_path, "", 0) + assert meta["git_walk_up_blocked"] is True + assert meta["network_block_mode"] is True + + +def test_hardening_meta_explicit_opt_out(monkeypatch, tmp_path): + # Hardening only disengages when explicitly set to a falsy value. + monkeypatch.setenv("BENCH_BLOCK_NETWORK", "0") + meta = cr.hardening_meta(tmp_path, "", 0) + assert meta["git_walk_up_blocked"] is False + assert meta["network_block_mode"] is False + + +def test_leak_scan_flags_git_escape_attempts(): + # `git -C ..` re-points discovery above the ceiling fence. + sig = cr._scan_leak_arguments("bash", {"command": "git -C .. log --oneline -1"}) + assert sig, "git -C escape should be flagged" + # Explicit --git-dir bypass. 
+ sig2 = cr._scan_leak_arguments("bash", {"command": "git --git-dir=/x/.git log"}) + assert sig2, "git --git-dir escape should be flagged" + # Unsetting the ceiling env before running git. + sig3 = cr._scan_leak_arguments("bash", {"command": "env -u GIT_CEILING_DIRECTORIES git log"}) + assert sig3, "env -u GIT_CEILING escape should be flagged" + # A benign in-worktree command is NOT flagged. + assert cr._scan_leak_arguments("bash", {"command": "ls -la && cat README.md"}) == [] + + +def test_resolve_run_dir_layout_matches_row_stdout_path(tmp_path): + # The producer (_resolve_run_dir) and the consumer (exposure_adoption. + # row_stdout_path) must agree on the on-disk layout, including the run + # nesting introduced for multi-run pilots. This pins that contract so a + # change to one side can't silently desync log lookup. + from bench.analysis import exposure_adoption as ea + + cache_dir = tmp_path / "batch" + model = "claude-opus-4.8" + common = dict( + model=model, + mode="localize", + prompt_mode="adopt-sem", + track="code_graph", + instance_id="django__django-12345", + ) + for run_idx in (0, 1, 3): + run_dir = cr._resolve_run_dir(cache_dir, run_idx=run_idx, **common) + log_dir = run_dir / "logs" + log_dir.mkdir(parents=True, exist_ok=True) + (log_dir / "stdout.jsonl").write_text("{}\n") + + row = { + "mode": common["mode"], + "prompt_mode": common["prompt_mode"], + "config": common["track"], + "task_id": common["instance_id"], + "run_idx": run_idx, + } + resolved = ea.row_stdout_path(cache_dir, model, row) + assert resolved == log_dir / "stdout.jsonl" + + +def test_resolve_run_dir_nests_only_for_nonzero_idx(tmp_path): + base = dict( + model="m", + mode="fix", + prompt_mode="neutral", + track="lsp", + instance_id="t-1", + ) + bare = cr._resolve_run_dir(tmp_path, run_idx=0, **base) + nested = cr._resolve_run_dir(tmp_path, run_idx=2, **base) + assert bare.name == "t-1" + assert nested.name == "run2" and nested.parent.name == "t-1" diff --git 
a/tests/bench/test_localize_runner.py b/tests/bench/test_localize_runner.py new file mode 100644 index 00000000..db5870f9 --- /dev/null +++ b/tests/bench/test_localize_runner.py @@ -0,0 +1,218 @@ +"""Offline unit tests for the LocAgent-style localization runner. + +These guard the two bugs found during the live smoke run: + 1. The parser must read ONLY the agent's own output (assistant/exit/ + submission), never the example sentinel embedded in the user prompt. + 2. Scoring (recall / Acc@k / MRR) must follow the LocAgent definition + where Acc@k means "all gold files recovered within top-k". +""" +from __future__ import annotations + +from bench.runners.localize_runner import ( + SENTINEL, + parse_prediction, + score_localization, +) + + +def _traj(messages, submission=None): + info = {} + if submission is not None: + info["submission"] = submission + return {"messages": messages, "info": info} + + +def test_parser_ignores_example_in_user_prompt(): + """The instance prompt contains an EXAMPLE sentinel; it must be skipped.""" + user_prompt = ( + "Name the files that must change. 
End with a line like:\n" + f'{SENTINEL} ["pkg/module/foo.py","pkg/other.py"]\n' + ) + assistant = ( + "I investigated the code base.\n" + f'{SENTINEL} ["app/real_target.py","app/helper.py"]' + ) + traj = _traj( + [ + {"role": "system", "content": "be helpful"}, + {"role": "user", "content": user_prompt}, + {"role": "assistant", "content": assistant}, + ] + ) + pred, err, _ = parse_prediction(traj) + assert err is False + assert pred == ["app/real_target.py", "app/helper.py"] + # the example files must never leak in + assert "pkg/module/foo.py" not in pred + + +def test_parser_reads_submission_field(): + sub = f'{SENTINEL} ["pkg/b.py"]' + traj = _traj( + [ + {"role": "user", "content": f'{SENTINEL} ["example/x.py"]'}, + ], + submission=sub, + ) + pred, err, _ = parse_prediction(traj) + assert err is False + assert pred == ["pkg/b.py"] + + +def test_parser_uses_last_sentinel_in_agent_text(): + assistant = ( + f'{SENTINEL} ["first/guess.py"]\n' + "...reconsidered...\n" + f'{SENTINEL} ["final/answer.py"]' + ) + traj = _traj([{"role": "assistant", "content": assistant}]) + pred, err, _ = parse_prediction(traj) + assert err is False + assert pred == ["final/answer.py"] + + +def test_parser_missing_sentinel_is_error(): + traj = _traj([{"role": "assistant", "content": "no answer here"}]) + pred, err, _ = parse_prediction(traj) + assert err is True + assert pred == [] + + +def test_parser_normalizes_diff_prefixes_and_dotslash(): + # git-diff style `a/`,`b/` prefixes are stripped; `./` is stripped. 
+ assistant = f'{SENTINEL} ["a/django/x.py", "./sympy/y.py"]' + traj = _traj([{"role": "assistant", "content": assistant}]) + pred, err, _ = parse_prediction(traj) + assert err is False + assert pred == ["django/x.py", "sympy/y.py"] + + +def test_score_all_found_and_acc_at_k(): + gold = ["pkg/a.py", "pkg/b.py"] + # both gold present but need top-2 -> acc@1 False, acc@3 True + pred = ["pkg/a.py", "pkg/b.py"] + s = score_localization(pred, gold) + assert s["file_recall"] == 1.0 + assert s["file_all_found"] is True + assert s["acc_at_1"] is False + assert s["acc_at_3"] is True + assert s["acc_at_5"] is True + assert s["file_mrr"] == 1.0 + + +def test_score_partial_recall(): + gold = ["pkg/a.py", "pkg/b.py"] + pred = ["pkg/a.py", "pkg/wrong.py"] + s = score_localization(pred, gold) + assert s["file_recall"] == 0.5 + assert s["file_all_found"] is False + assert s["acc_at_5"] is False + assert s["file_mrr"] == 1.0 # first prediction is gold + + +def test_score_mrr_second_position(): + gold = ["pkg/b.py"] + pred = ["pkg/wrong.py", "pkg/b.py"] + s = score_localization(pred, gold) + assert s["file_mrr"] == 0.5 + assert s["acc_at_1"] is False + assert s["acc_at_3"] is True + + +def test_score_empty_prediction(): + s = score_localization([], ["pkg/a.py"]) + assert s["file_recall"] == 0.0 + assert s["file_all_found"] is False + assert s["file_mrr"] == 0.0 + + +def test_safe_env_kills_pipe_holding_grandchild_promptly(): + """The exact deadlock: a command backgrounds a child that keeps the stdout + pipe open and sleeps far longer than the timeout. The stock + subprocess.run(timeout=) would block in communicate(); SafeLocalEnvironment + must return promptly with a timeout marker. + """ + import time as _t + + from bench.runners.localize_runner import SafeLocalEnvironment + + env = SafeLocalEnvironment(cwd="/tmp", env={}, timeout=2) + # `sleep 60 &` inherits stdout; parent echoes then exits but the child + # holds the pipe open for 60s. 
+ started = _t.time() + out = env.execute({"command": "sleep 60 & echo started; sleep 60"}) + elapsed = _t.time() - started + assert elapsed < 15, f"did not reap promptly (took {elapsed:.1f}s)" + assert out["returncode"] == -1 + assert "timed out" in out["output"] + +def test_timeout_retry_model_interrupts_stall_then_succeeds(): + """A model whose query blocks past the per-call timeout must be interrupted + by SIGALRM and retried; once a call returns quickly the wrapper yields it.""" + import time as _t + + from bench.runners.localize_runner import TimeoutRetryModel + + class FlakyModel: + def __init__(self): + self.calls = 0 + self.cost = 1.23 # attribute that must be delegated + + def query(self, messages, **kwargs): + self.calls += 1 + if self.calls == 1: + _t.sleep(30) # stall: SIGALRM must interrupt this + return {"role": "assistant", "content": "ok"} + + inner = FlakyModel() + wrapped = TimeoutRetryModel(inner, per_call_timeout=1, retries=2) + started = _t.time() + out = wrapped.query([{"role": "user", "content": "hi"}]) + elapsed = _t.time() - started + assert out["content"] == "ok" + assert inner.calls == 2 # first stalled+interrupted, second succeeded + assert elapsed < 10, f"did not interrupt the stall promptly ({elapsed:.1f}s)" + assert wrapped.cost == 1.23 # delegation works + + +def test_timeout_retry_model_raises_after_exhausting_retries(): + import time as _t + + from bench.runners.localize_runner import TimeoutRetryModel + + class DeadModel: + def query(self, messages, **kwargs): + _t.sleep(30) + + wrapped = TimeoutRetryModel(DeadModel(), per_call_timeout=1, retries=1) + started = _t.time() + try: + wrapped.query([{"role": "user", "content": "hi"}]) + raised = False + except TimeoutError: + raised = True + elapsed = _t.time() - started + assert raised + assert elapsed < 10 + +def test_build_instance_template_forced_vs_freeform(): + from bench.runners.localize_runner import ( + LOCALIZE_INSTANCE_TEMPLATE, + build_instance_template, + ) + + # 
Free-form: identical to the shared template for every config. + for cfg in ("baseline", "lsp", "code_graph", "code_graph_mcp"): + assert build_instance_template(cfg, force_tool=False) == LOCALIZE_INSTANCE_TEMPLATE + + # Forced: tool configs get a mandate prefix naming their tool. + lsp_t = build_instance_template("lsp", force_tool=True) + assert lsp_t != LOCALIZE_INSTANCE_TEMPLATE + assert lsp_t.endswith(LOCALIZE_INSTANCE_TEMPLATE) + assert "MANDATORY" in lsp_t and "lsp" in lsp_t + + cg_t = build_instance_template("code_graph", force_tool=True) + assert "MANDATORY" in cg_t and "cg search_code" in cg_t + + # baseline has no tool -> forced is a no-op (still the shared template). + assert build_instance_template("baseline", force_tool=True) == LOCALIZE_INSTANCE_TEMPLATE diff --git a/tests/bench/test_struct_query_bench.py b/tests/bench/test_struct_query_bench.py new file mode 100644 index 00000000..914c4b57 --- /dev/null +++ b/tests/bench/test_struct_query_bench.py @@ -0,0 +1,55 @@ +"""Unit tests for the deterministic structural-query compression benchmark. + +Only the pure (FalkorDB-free) helpers are covered here; the graph/grep paths +are integration-tested by running the module against a live indexed repo. 
+""" + +from __future__ import annotations + +import pytest + +from bench.runners import struct_query_bench as sqb + + +def test_relpath_strips_worktree(): + assert sqb._relpath("/wt/pkg/mod.py", "/wt") == "pkg/mod.py" + + +def test_relpath_falls_back_to_basename_outside_worktree(): + assert sqb._relpath("/other/pkg/mod.py", "/wt") == "mod.py" + + +def test_tok_counts_tokens(): + pytest.importorskip("tiktoken") + assert sqb._tok("hello world") > 0 + # longer text => more tokens + assert sqb._tok("a b c d e f g h") > sqb._tok("a b") + + +def test_summarize_paired_ratio_stats(): + rows = [ + {"ratio": 4.0, "graph_tokens": 100, "raw_tokens": 400}, + {"ratio": 2.0, "graph_tokens": 50, "raw_tokens": 100}, + {"ratio": 0.5, "graph_tokens": 200, "raw_tokens": 100}, + {"ratio": 1.0, "graph_tokens": 80, "raw_tokens": 80}, + ] + s = sqb.summarize(rows) + assert s["n"] == 4 + # median of [0.5, 1.0, 2.0, 4.0] = 1.5 + assert s["median_ratio"] == 1.5 + # 2 of 4 strictly above 1.0 + assert s["win_rate"] == "2/4" + assert s["geomean_ratio"] is not None + + +def test_summarize_handles_empty(): + s = sqb.summarize([]) + assert s["n"] == 0 + assert s["median_ratio"] is None + assert s["win_rate"] == "0/0" + + +def test_generic_names_excluded_constant(): + # sanity: the common-name filter contains the obvious megahub offenders + for n in ("run", "get", "__init__", "update"): + assert n in sqb._GENERIC diff --git a/tests/bench/test_trace.py b/tests/bench/test_trace.py new file mode 100644 index 00000000..60253ed7 --- /dev/null +++ b/tests/bench/test_trace.py @@ -0,0 +1,154 @@ +"""Unit tests for the trace extractor's before-call thoughts + final blocks. + +These cover the post-benchmark behaviour-analysis requirements: surface the +agent's thinking/narration BEFORE each tool call, capture the trailing +reasoning/answer after the last call, and thread reasoning_tokens into meta. 
+""" +from __future__ import annotations + +from bench.analysis.trace import build_steps, final_blocks, render_md + + +def _ev(etype: str, **data): + return {"type": etype, "data": data} + + +def _make_events(): + # Turn 0: reason + narrate, then fire TWO sibling tool calls. + # Turn 1: narrate, then one tool call. + # Trailing: closing reasoning + final answer after the last tool. + return [ + _ev("assistant.reasoning", content="I should find the symbol first."), + _ev("assistant.message", content="Searching the graph."), + _ev("tool.execution_start", toolName="code-graph-search_code", + toolCallId="c1", turnId=0, arguments={"query": "foo"}), + _ev("tool.execution_start", toolName="grep", + toolCallId="c2", turnId=0, arguments={"pattern": "foo"}), + _ev("tool.execution_complete", toolCallId="c1", success=True, + result={"content": [{"type": "text", "text": "hit"}]}), + _ev("tool.execution_complete", toolCallId="c2", success=True, + result={"content": [{"type": "text", "text": "match"}]}), + _ev("assistant.reasoning", content="Now I narrow down the file."), + _ev("assistant.message", content="Reading the file."), + _ev("tool.execution_start", toolName="view", + toolCallId="c3", turnId=1, arguments={"path": "a.py"}), + _ev("tool.execution_complete", toolCallId="c3", success=True, + result={"content": [{"type": "text", "text": "src"}]}), + _ev("assistant.reasoning", content="The fix lives in a.py."), + _ev("assistant.message", content="FINAL_LOCALIZATION_JSON: [\"a.py\"]"), + ] + + +def test_thinking_before_attaches_to_first_tool_of_turn(): + steps = build_steps(_make_events()) + assert len(steps) == 3 + # Step 0 (first tool of turn 0) carries the turn's thinking + narration. + assert steps[0]["thinking_before"] == "I should find the symbol first." + assert steps[0]["narration_before"] == "Searching the graph." + # Step 1 is a sibling in the same turn -> before-window is empty. 
+ assert steps[1]["thinking_before"] == "" + assert steps[1]["narration_before"] == "" + assert steps[0]["turn"] == steps[1]["turn"] == 0 + # Step 2 (turn 1) carries turn 1's thinking + narration. + assert steps[2]["thinking_before"] == "Now I narrow down the file." + assert steps[2]["narration_before"] == "Reading the file." + + +def test_final_blocks_capture_trailing_answer(): + final = final_blocks(_make_events()) + assert final["thinking"] == "The fix lives in a.py." + assert "FINAL_LOCALIZATION_JSON" in final["narration"] + + +def test_render_md_shows_thoughts_before_call_and_reasoning_tokens(): + events = _make_events() + steps = build_steps(events) + final = final_blocks(events) + meta = {"input_tokens": 100, "output_tokens": 10, + "reasoning_tokens": 42, "total_tokens": 110} + md = render_md(meta, steps, [], { + "tool_calls_total": 3, "tool_calls_by_kind": {}, "structural_adopted": True, + "structural_calls": 1, "first_tool": "code-graph-search_code", + "empty_result_count": 0, "tool_error_count": 0, "redundant_call_count": 0, + "gold_hit_source_counts": {}, + }, final) + assert "of which reasoning: 42" in md + assert "thinking (before call):" in md + assert "narration (before call):" in md + # The thought renders before the call line for step 0. 
+ assert md.index("I should find the symbol first.") < md.index('"query": "foo"') + assert "## Final (after last tool call)" in md + + +def test_final_blocks_empty_when_no_trailing_content(): + events = [ + _ev("tool.execution_start", toolName="grep", toolCallId="c1", + turnId=0, arguments={}), + _ev("tool.execution_complete", toolCallId="c1", success=True, + result={"content": [{"type": "text", "text": "x"}]}), + ] + final = final_blocks(events) + assert final["thinking"] == "" + assert final["narration"] == "" + + +def test_cost_without_benefit_charges_unused_structural_tokens(): + """A structural tool that surfaces a gold file is beneficial; an empty or + unused structural call is charged as cost-without-benefit.""" + from bench.analysis.trace import attribute_files, summarize + + steps = [ + # Step 0: graph call that surfaces the gold file -> beneficial. + {"step": 0, "turn": 0, "tool": "code-graph-search_code", "kind": "graph", + "arguments": {"query": "centroid"}, "success": True, "empty": False, + "result_text": "uxarray/grid/coordinates.py prepare_points", + "result_tokens_est": 50}, + # Step 1: graph call returning empty -> wasted (0 tokens but a wasted call). + {"step": 1, "turn": 0, "tool": "code-graph-search_code", "kind": "graph", + "arguments": {"query": "nope"}, "success": True, "empty": True, + "result_text": "", "result_tokens_est": 0}, + # Step 2: verbose lsp dump never tied to a gold prediction -> wasted. + {"step": 2, "turn": 1, "tool": "lsp-document_symbols", "kind": "lsp", + "arguments": {"file": "x.py"}, "success": True, "empty": False, + "result_text": "lots of symbols", "result_tokens_est": 2714}, + # Step 3: builtin grep -> not under test, not charged. 
+ {"step": 3, "turn": 2, "tool": "grep", "kind": "builtin_reader", + "arguments": {"pattern": "y"}, "success": True, "empty": False, + "result_text": "noise", "result_tokens_est": 100}, + ] + pred = ["uxarray/grid/coordinates.py"] + gold = ["uxarray/grid/coordinates.py"] + attribution = attribute_files(pred, gold, steps, prompt_text="") + cwb = summarize(steps, attribution)["cost_without_benefit"] + + assert cwb["benefited"] is True + # graph step 0 (50 tok) benefited; step 1 empty wasted; lsp 2714 wasted. + assert cwb["beneficial_tokens"] == 50 + assert cwb["wasted_tokens"] == 2714 + assert cwb["wasted_calls"] == 2 # empty graph + unused lsp + assert cwb["structural_result_tokens"] == 50 + 0 + 2714 + assert cwb["by_kind"]["lsp"]["wasted_tokens"] == 2714 + assert cwb["by_kind"]["graph"]["wasted_calls"] == 1 + # builtin grep tokens are NOT counted as structural cost. + assert "builtin_reader" not in cwb["by_kind"] + + +def test_cost_without_benefit_flags_zero_contribution(): + """Structural tool used but it never surfaced the gold file -> benefited False + and all its tokens are wasted.""" + from bench.analysis.trace import attribute_files, summarize + + steps = [ + {"step": 0, "turn": 0, "tool": "code-graph-search_code", "kind": "graph", + "arguments": {"query": "q"}, "success": True, "empty": False, + "result_text": "some/other/file.py", "result_tokens_est": 300}, + ] + # Agent predicted the gold from its own prior / builtin, not the graph. + pred = ["the/gold.py"] + gold = ["the/gold.py"] + attribution = attribute_files(pred, gold, steps, prompt_text="") + cwb = summarize(steps, attribution)["cost_without_benefit"] + + assert cwb["benefited"] is False + assert cwb["wasted_tokens"] == 300 + assert cwb["wasted_fraction"] == 1.0