diff --git a/bench/agents/code_graph_adapter.py b/bench/agents/code_graph_adapter.py index 96f5cfdf..a4728835 100644 --- a/bench/agents/code_graph_adapter.py +++ b/bench/agents/code_graph_adapter.py @@ -140,6 +140,85 @@ def note_edit(self, repo: str, path: str) -> dict[str, Any]: except httpx.HTTPError as exc: return {"ok": False, "error": str(exc), "path": path} + # ------------------------------------------------------------------ + # v2 agent verbs — parity with the MCP transport. + # ------------------------------------------------------------------ + # Each hits a /api/v2/* endpoint that wraps the same async function + # the FastMCP server exposes. Output shape is identical between + # transports, so cg / cg-mcp benchmarks measure transport overhead + # rather than API-surface differences. + + def search_code(self, project: str, prefix: str, branch: str | None = None, + limit: int = 10) -> list[dict[str, Any]]: + body: dict[str, Any] = {"project": project, "prefix": prefix, "limit": limit} + if branch: + body["branch"] = branch + r = self._client.post("/api/v2/search_code", json=body) + r.raise_for_status() + return r.json() + + def get_callers(self, project: str, symbol_id: int, branch: str | None = None, + limit: int = 50) -> list[dict[str, Any]]: + body: dict[str, Any] = {"project": project, "symbol_id": symbol_id, "limit": limit} + if branch: + body["branch"] = branch + r = self._client.post("/api/v2/get_callers", json=body) + r.raise_for_status() + return r.json() + + def get_callees(self, project: str, symbol_id: int, branch: str | None = None, + limit: int = 50) -> list[dict[str, Any]]: + body: dict[str, Any] = {"project": project, "symbol_id": symbol_id, "limit": limit} + if branch: + body["branch"] = branch + r = self._client.post("/api/v2/get_callees", json=body) + r.raise_for_status() + return r.json() + + def get_dependencies(self, project: str, symbol_id: int, branch: str | None = None, + limit: int = 50) -> list[dict[str, Any]]: + body: dict[str, 
Any] = {"project": project, "symbol_id": symbol_id, "limit": limit} + if branch: + body["branch"] = branch + r = self._client.post("/api/v2/get_dependencies", json=body) + r.raise_for_status() + return r.json() + + def impact_analysis(self, project: str, symbol_id: int, + branch: str | None = None, + direction: str = "IN", + depth: int = 3) -> list[dict[str, Any]]: + body: dict[str, Any] = { + "project": project, "symbol_id": symbol_id, + "direction": direction, "depth": depth, + } + if branch: + body["branch"] = branch + r = self._client.post("/api/v2/impact_analysis", json=body) + r.raise_for_status() + return r.json() + + def find_path_v2(self, project: str, source_id: int, dest_id: int, + branch: str | None = None, + max_paths: int = 10) -> list[dict[str, Any]]: + body: dict[str, Any] = { + "project": project, "source_id": source_id, "dest_id": dest_id, + "max_paths": max_paths, + } + if branch: + body["branch"] = branch + r = self._client.post("/api/v2/find_path", json=body) + r.raise_for_status() + return r.json() + + def ask_v2(self, project: str, question: str, branch: str | None = None) -> Any: + body: dict[str, Any] = {"project": project, "question": question} + if branch: + body["branch"] = branch + r = self._client.post("/api/v2/ask", json=body) + r.raise_for_status() + return r.json() + # Convenience function aliases — the SWE-agent tool registry expects # top-level callables. Each spins up a short-lived client; for hot loops diff --git a/bench/cli/cg.py b/bench/cli/cg.py index e31e623e..d684f5b8 100644 --- a/bench/cli/cg.py +++ b/bench/cli/cg.py @@ -1,18 +1,26 @@ -"""`cg` — bash-callable CLI exposing code-graph primitives. - -mini-swe-agent only uses bash, so each "tool" we want the agent to have is -just a CLI it can invoke. This script wraps bench/agents/code_graph_adapter -behind a small argparse interface and prints JSON results to stdout, one -JSON document per call. 
- -Usage examples (run inside the agent's bash environment): - - cg graph-entities --repo django - cg get-neighbors --repo django --ids 12 14 17 - cg find-paths --repo django --src 12 --dst 88 - cg auto-complete --repo django --prefix get_user - cg find-symbol --repo django --name get_user_model - cg note-edit --repo django --path src/django/contrib/auth/models.py +"""`cg` — bash-callable CLI exposing code-graph's 8 agent primitives over HTTP. + +This is the HTTP-transport sibling of `cg-mcp`. Both CLIs expose the **same +verb surface** (search_code, get_callers, get_callees, get_dependencies, +impact_analysis, find_path, ask, index_repo) over **the same underlying +async tool functions** (api.mcp.tools.structural and api.mcp.tools.ask), +so a benchmark comparison between them measures transport overhead, not +API differences. + +The agent calls these via bash: + + cg index_repo --path-or-url . [--branch B] [--ignore PAT ...] + cg search_code --project P --prefix STR [--branch B] [--limit N] + cg get_callers --project P --symbol-id ID [--branch B] [--limit N] + cg get_callees --project P --symbol-id ID [--branch B] [--limit N] + cg get_dependencies --project P --symbol-id ID [--branch B] [--limit N] + cg impact_analysis --project P --symbol-id ID [--branch B] [--direction IN|OUT] [--depth N] [--limit N] + cg find_path --project P --source-id ID --dest-id ID [--branch B] + cg ask --project P --question "..." [--branch B] + +Legacy verbs (graph-entities, get-neighbors, find-paths, auto-complete, +find-symbol, note-edit) remain for the React UI's backing tests but are +not exposed to the agent preamble. 
Required env vars (set by the runner): CODEGRAPH_URL base URL of the code-graph service @@ -29,12 +37,50 @@ from bench.agents.code_graph_adapter import CodeGraphClient -# ---------- Output compaction -------------------------------------------------- -# Every byte returned here is re-fed to the LLM on every subsequent turn (the -# context window grows monotonically until the trajectory ends). A neighbors -# call that returns 20 KB of raw JSON costs ~5K tokens, and at 50+ turns that -# compounds badly. The full FastAPI shape is needed by the React frontend, not -# by an agent — strip the noise here so the LLM sees only what it can act on. +# --------------------------------------------------------------------------- +# Output compaction — must match bench/cli/cg_mcp.py exactly for parity. +# --------------------------------------------------------------------------- +# +# Iter2 finding: every node returned by the v2 endpoints has an absolute +# worktree path under `file` (~130 chars). Stripping the project-name +# prefix saves ~100 chars × N entries, which compounds badly when the +# agent re-feeds tool output across 30-50 turns. 
+ +def _strip_worktree_prefix(path: Any, project: str | None) -> Any: + if not isinstance(path, str) or not project: + return path + needle = f"/{project}/" + idx = path.find(needle) + if idx < 0: + return path + return path[idx + len(needle):] + + +def _compact_entry(entry: Any, project: str | None) -> Any: + if not isinstance(entry, dict): + return entry + out: dict[str, Any] = {} + for k, v in entry.items(): + if v in (None, "", [], {}): + continue + if k == "file": + v = _strip_worktree_prefix(v, project) + out[k] = v + return out + + +def _compact_list(items: Any, project: str | None, limit: int | None) -> Any: + if not isinstance(items, list): + return items + compacted = [_compact_entry(x, project) for x in items] + if limit is not None and limit > 0: + compacted = compacted[:limit] + return compacted + + +# --------------------------------------------------------------------------- +# Legacy UI-verb compaction (kept so existing tests keep passing). +# --------------------------------------------------------------------------- _NODE_KEEP = ("id", "label", "labels", "name", "file", "src", "line", "start_line", "end_line") _EDGE_KEEP = ("id", "src_node", "dest_node", "relation") @@ -65,7 +111,6 @@ def _compact_edge(e: Any) -> Any: def _compact_neighbors(payload: dict[str, Any], limit: int | None) -> dict[str, Any]: - """Strip empty properties + alias and apply optional limit.""" if not isinstance(payload, dict): return payload n = payload.get("neighbors") or payload @@ -81,13 +126,6 @@ def _compact_neighbors(payload: dict[str, Any], limit: int | None) -> dict[str, def _compact_symbols(payload: Any) -> Any: - """Trim find-symbol / auto-complete records to the fields the agent needs. - - The HTTP responses vary in shape: - - find_symbol: ``[node, ...]`` - - auto_complete: ``{"branch": ..., "completions": [node, ...]}`` - Compact both consistently. 
- """ if isinstance(payload, list): return [_compact_node(x) for x in payload] if isinstance(payload, dict): @@ -100,49 +138,132 @@ def _compact_symbols(payload: Any) -> Any: def _print(obj: object) -> None: - # Compact separators shave ~30 % off vs the default indented form, which the - # LLM doesn't need (it ignores whitespace). json.dump(obj, sys.stdout, separators=(",", ":"), sort_keys=True, default=str) sys.stdout.write("\n") +# --------------------------------------------------------------------------- +# index_repo over HTTP. Hits /api/analyze_folder; for parity we keep the +# same kwargs as cg-mcp. +# --------------------------------------------------------------------------- + +def _index_repo(c: CodeGraphClient, path_or_url: str, + branch: str | None, ignore: list[str] | None) -> dict[str, Any]: + """Mirror cg-mcp index_repo but go through HTTP /api/analyze_folder.""" + body: dict[str, Any] = {"path": path_or_url, "ignore": ignore or []} + if branch: + body["branch"] = branch + r = c._client.post("/api/analyze_folder", json=body) + r.raise_for_status() + return r.json() + + def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(prog="cg", description=__doc__) sub = parser.add_subparsers(dest="cmd", required=True) + def _add_project(p: argparse.ArgumentParser) -> None: + p.add_argument("--project", required=True) + p.add_argument("--branch", default=None) + + def _add_symbol(p: argparse.ArgumentParser) -> None: + p.add_argument("--symbol-id", type=int, required=True, dest="symbol_id") + p.add_argument("--limit", type=int, default=50) + + # ---- v2 (MCP-parity) verbs ---- + ir = sub.add_parser("index_repo") + ir.add_argument("--path-or-url", required=True, dest="path_or_url") + ir.add_argument("--branch", default=None) + ir.add_argument("--ignore", nargs="*", default=None) + + sc = sub.add_parser("search_code") + _add_project(sc) + sc.add_argument("--prefix", required=True) + sc.add_argument("--limit", type=int, default=10) + + for 
name in ("get_callers", "get_callees", "get_dependencies"): + p = sub.add_parser(name) + _add_project(p) + _add_symbol(p) + + ia = sub.add_parser("impact_analysis") + _add_project(ia) + ia.add_argument("--symbol-id", type=int, required=True, dest="symbol_id") + ia.add_argument("--direction", choices=["IN", "OUT"], default="IN") + ia.add_argument("--depth", type=int, default=3) + ia.add_argument("--limit", type=int, default=50) + + fp2 = sub.add_parser("find_path") + _add_project(fp2) + fp2.add_argument("--source-id", type=int, required=True, dest="source_id") + fp2.add_argument("--dest-id", type=int, required=True, dest="dest_id") + + aq = sub.add_parser("ask") + _add_project(aq) + aq.add_argument("--question", required=True) + + # ---- legacy UI verbs (kept for existing tests) ---- def add_repo(p: argparse.ArgumentParser) -> None: - p.add_argument("--repo", required=True, help="repository name in the graph") - - ge = sub.add_parser("graph-entities") - add_repo(ge) + p.add_argument("--repo", required=True) - gn = sub.add_parser("get-neighbors") - add_repo(gn) + ge = sub.add_parser("graph-entities"); add_repo(ge) + gn = sub.add_parser("get-neighbors"); add_repo(gn) gn.add_argument("--ids", type=int, nargs="+", required=True) - gn.add_argument("--limit", type=int, default=50, - help="cap nodes/edges in response (default 50, 0 = unlimited)") - - fp = sub.add_parser("find-paths") - add_repo(fp) + gn.add_argument("--limit", type=int, default=50) + fp = sub.add_parser("find-paths"); add_repo(fp) fp.add_argument("--src", type=int, required=True) fp.add_argument("--dst", type=int, required=True) - - ac = sub.add_parser("auto-complete") - add_repo(ac) + ac = sub.add_parser("auto-complete"); add_repo(ac) ac.add_argument("--prefix", required=True) - - fs = sub.add_parser("find-symbol") - add_repo(fs) + fs = sub.add_parser("find-symbol"); add_repo(fs) fs.add_argument("--name", required=True) - - ne = sub.add_parser("note-edit") - add_repo(ne) + ne = 
sub.add_parser("note-edit"); add_repo(ne) ne.add_argument("--path", required=True) args = parser.parse_args(argv) with CodeGraphClient() as c: - if args.cmd == "graph-entities": + proj = getattr(args, "project", None) + # ---- v2 verbs ---- + if args.cmd == "index_repo": + _print(_index_repo(c, args.path_or_url, args.branch, args.ignore)) + elif args.cmd == "search_code": + _print(_compact_list( + c.search_code(args.project, args.prefix, branch=args.branch, limit=args.limit), + proj, args.limit, + )) + elif args.cmd == "get_callers": + _print(_compact_list( + c.get_callers(args.project, args.symbol_id, branch=args.branch, limit=args.limit), + proj, args.limit, + )) + elif args.cmd == "get_callees": + _print(_compact_list( + c.get_callees(args.project, args.symbol_id, branch=args.branch, limit=args.limit), + proj, args.limit, + )) + elif args.cmd == "get_dependencies": + _print(_compact_list( + c.get_dependencies(args.project, args.symbol_id, branch=args.branch, limit=args.limit), + proj, args.limit, + )) + elif args.cmd == "impact_analysis": + _print(_compact_list( + c.impact_analysis( + args.project, args.symbol_id, branch=args.branch, + direction=args.direction, depth=args.depth, + ), + proj, args.limit, + )) + elif args.cmd == "find_path": + # find_path_v2 is annotated as returning a list, which _compact_entry passes through untouched; keep cg_mcp.py in sync. + paths = c.find_path_v2(args.project, args.source_id, args.dest_id, branch=args.branch) + compact = _compact_list(paths, proj, None) if isinstance(paths, list) else _compact_entry(paths, proj) + _print(compact) + elif args.cmd == "ask": + _print(c.ask_v2(args.project, args.question, branch=args.branch)) + # ---- legacy verbs ---- + elif args.cmd == "graph-entities": _print(c.graph_entities(args.repo)) elif args.cmd == "get-neighbors": limit = args.limit if args.limit > 0 else None diff --git a/bench/runners/mini_runner.py b/bench/runners/mini_runner.py index 2e3d441f..02af96f4 100644 --- a/bench/runners/mini_runner.py +++ b/bench/runners/mini_runner.py @@ -132,7 +132,8 @@ class Task: INSTANCE_TEMPLATE_CODE_GRAPH = """\ You are working in the repository at {{cwd}}. 
The code-graph service has already indexed this repository under the -name `$REPO_NAME` (use the env var literally). +project name `$PROJECT_NAME` on branch `$BRANCH` (use the env vars +literally). The task to solve: @@ -141,13 +142,12 @@ class Task: **Required workflow.** Before reading or editing any file, your first bash command MUST be: - `cg find-symbol --repo "$REPO_NAME" --name ` + `cg search_code --project "$PROJECT_NAME" --branch "$BRANCH" --prefix <symbol>` -then use `cg get-neighbors --repo "$REPO_NAME" --ids ` to expand -relationships before doing any textual search. After every file edit, -run `cg note-edit --repo "$REPO_NAME" --path ` so subsequent -graph queries reflect your change. Reach for grep/sed/cat only for -content reading after `cg` has located the right place. +Then use `cg get_callers --project "$PROJECT_NAME" --branch "$BRANCH" --symbol-id <id>` +to expand relationships before doing any textual search. Use +`cg impact_analysis --project "$PROJECT_NAME" --branch "$BRANCH" --symbol-id <id> --depth 3` before +non-trivial edits. When you believe the task is complete, finish your turn with a final message that contains a unified diff of your changes inside a fenced @@ -237,8 +237,12 @@ def config_env(config: str, repo_path: Path) -> dict[str, str]: elif config == "code_graph": # The runner is responsible for ensuring the service is up. env.setdefault("CODEGRAPH_URL", "http://127.0.0.1:5000") - # The agent's preamble references $REPO_NAME — set it to the - # worktree dirname, which is what analyze_folder used as the id. + # Parity with MCP track: both tracks now use the same verbs and the + # same env-var contract. PROJECT_NAME / BRANCH match what the + # indexing pre-step registers. + env["PROJECT_NAME"] = repo_path.name + env["BRANCH"] = os.environ.get("CGRAPH_HTTP_BRANCH", "_default") + # Keep REPO_NAME for any legacy preambles / tests. 
env["REPO_NAME"] = repo_path.name elif config == "code_graph_mcp": # MCP transport: agent calls `cg-mcp …` which spawns the @@ -277,6 +281,7 @@ def _ensure_indexed(repo_path: Path) -> float: base = os.environ.get("CODEGRAPH_URL", "http://127.0.0.1:5000").rstrip("/") repo_name = repo_path.name + branch = os.environ.get("CGRAPH_HTTP_BRANCH", "_default") token = os.environ.get("SECRET_TOKEN") or os.environ.get("CODEGRAPH_TOKEN") headers = {"Authorization": f"Bearer {token}"} if token else {} @@ -310,15 +315,12 @@ def _ensure_indexed(repo_path: Path) -> float: # that schema churn. host = os.environ.get("FALKORDB_HOST", "127.0.0.1") port = int(os.environ.get("FALKORDB_PORT", "6379")) - expected_graph = repo_name # the HTTP path uses bare folder name as graph key + expected_graph = f"code:{repo_name}:{branch}" try: r = redis.Redis(host=host, port=port, decode_responses=True, socket_timeout=2) graphs = r.execute_command("GRAPH.LIST") or [] - # Match either bare name (legacy) or "code::" pattern. - if expected_graph in graphs or any( - g == repo_name or g.startswith(f"code:{repo_name}:") for g in graphs - ): - print(f"[index] {repo_name} already in FalkorDB; skip") + if expected_graph in graphs: + print(f"[index] {expected_graph} already in FalkorDB; skip") return 0.0 except Exception as exc: # noqa: BLE001 print(f"[index] WARN GRAPH.LIST precheck failed ({exc!r}); attempting index anyway") @@ -337,7 +339,7 @@ def _ensure_indexed(repo_path: Path) -> float: headers=headers) as c: r = c.post( f"{base}/api/analyze_folder", - json={"path": str(repo_path), "ignore": default_ignore}, + json={"path": str(repo_path), "ignore": default_ignore, "branch": branch}, ) if r.status_code != 200: raise RuntimeError( @@ -867,8 +869,9 @@ def main(argv: list[str] | None = None) -> int: default="smoke", help="SWE-bench stage (sample size). Only used with --swe-bench.") p.add_argument("--limit", type=int, default=None, - help="Cap number of instances sampled. 
Overrides --stage size " - "for quick checks (e.g. --limit 1).") + help="Exact number of instances to sample. Overrides the " + "--stage size (e.g. --limit 1 for a quick check, " + "--limit 40 for a larger run).") p.add_argument("--results", type=Path, default=DEFAULT_RESULTS) p.add_argument("--trajectories", type=Path, default=DEFAULT_CACHE_DIR / "trajectories") p.add_argument("--model", default="anthropic/claude-sonnet-4-5", @@ -903,9 +906,7 @@ def main(argv: list[str] | None = None) -> int: ) from bench.metrics import append_jsonl - insts = sample_instances(load_instances(), stage=args.stage) - if args.limit is not None: - insts = insts[: args.limit] + insts = sample_instances(load_instances(), stage=args.stage, n=args.limit) print(f"[swe-bench] stage={args.stage} running {len(insts)} instances " f"x {len(configs)} configs = {len(insts) * len(configs)} trajectories") for inst in insts: diff --git a/bench/tools/code_graph/system_preamble.md b/bench/tools/code_graph/system_preamble.md index 99ad597b..066030f9 100644 --- a/bench/tools/code_graph/system_preamble.md +++ b/bench/tools/code_graph/system_preamble.md @@ -1,30 +1,38 @@ -# code-graph preamble +# code-graph (HTTP) preamble You are an autonomous coding agent solving a software-engineering task. Your sole tool is bash: every action you take is a shell command that is executed in the repository's working directory. -A pre-indexed code-graph for this repo is available via `cg`. +A pre-indexed code-graph for this repo is available via the `cg` CLI +(talks to the code-graph HTTP service at `$CODEGRAPH_URL`). **Use `cg` to locate symbols before reading files or grepping.** -`$REPO_NAME` is exported. +`$PROJECT_NAME` and `$BRANCH` are exported. + +This CLI exposes the **same 8 verbs** as the MCP-track `cg-mcp` CLI; +they wrap the same underlying tool implementations. The only +difference is transport (HTTP vs stdio MCP). ## Workflow -1. `cg find-symbol --repo "$REPO_NAME" --name ` → `{id, file, line}`. -2. 
`cg get-neighbors --repo "$REPO_NAME" --ids [--limit 50]` → - callers / callees / definitions. Default limit 50 keeps output small; - pass `--limit 0` only if you truly need everything. -3. Read the file with `sed -n` / `cat`, then edit. -4. After every edit run `cg note-edit --repo "$REPO_NAME" --path `. +1. `cg search_code --project "$PROJECT_NAME" --prefix <symbol>` → + list of `{id, name, file, line}`. Pick the best `id`. +2. `cg get_callers --project "$PROJECT_NAME" --symbol-id <id>` — + who calls X. (Default `--limit 50`.) +3. `cg impact_analysis --project "$PROJECT_NAME" --symbol-id <id> --depth 3` — + transitive blast radius before any non-trivial edit. +4. Read ONLY the relevant span with `sed -n 'START,ENDp' <file>`, + anchored on the line number the graph already gave you (e.g. + `sed -n '430,470p'`). Then edit. ## Sub-commands -- `cg find-symbol --repo R --name NAME` -- `cg get-neighbors --repo R --ids N [N ...] [--limit N]` -- `cg find-paths --repo R --src N --dst N` -- `cg auto-complete --repo R --prefix STRING` -- `cg note-edit --repo R --path PATH` (call after every edit) -- `cg graph-entities --repo R` (large; rarely needed) +- `cg search_code --project P --prefix STR [--limit N]` +- `cg get_callers --project P --symbol-id ID [--limit N]` +- `cg get_callees --project P --symbol-id ID [--limit N]` +- `cg get_dependencies --project P --symbol-id ID [--limit N]` +- `cg impact_analysis --project P --symbol-id ID [--direction IN|OUT] [--depth N] [--limit N]` +- `cg find_path --project P --source-id ID --dest-id ID` ## Rules @@ -33,10 +41,12 @@ A pre-indexed code-graph for this repo is available via `cg`. earlier tool output in this conversation. - **Do not fall back to `grep`/`rg`/`find` silently.** If `cg` returns empty, say so in your next message before grepping. -- Standard Unix tools (`cat`, `grep`, `find`, `sed`) remain available - for cases the graph can't answer. 
- -## Submission +- **Never `cat` a whole source file.** The graph already gave you the + line number — read a bounded window with `sed -n 'START,ENDp'` + (widen by ~30 lines if you need more context). Full-file reads are + the single biggest source of wasted tokens. +- Standard Unix tools remain available for cases the graph can't + answer. When you believe the task is complete, run a bash command whose first line of stdout is exactly: diff --git a/bench/tools/code_graph_mcp/system_preamble.md b/bench/tools/code_graph_mcp/system_preamble.md index b20ae927..c9780520 100644 --- a/bench/tools/code_graph_mcp/system_preamble.md +++ b/bench/tools/code_graph_mcp/system_preamble.md @@ -17,7 +17,9 @@ A pre-indexed code-graph for this repo is available via the who calls X. (Default `--limit 50`.) 3. `cg-mcp impact_analysis --project "$PROJECT_NAME" --symbol-id --depth 3` — transitive blast radius before any non-trivial edit. -4. Read the file with `sed -n` / `cat`, then edit. +4. Read ONLY the relevant span with `sed -n 'START,ENDp' <file>`, + anchored on the line number the graph already gave you (e.g. + `sed -n '430,470p'`). Then edit. ## Sub-commands @@ -35,11 +37,13 @@ A pre-indexed code-graph for this repo is available via the earlier tool output in this conversation. - **Do not fall back to `grep`/`rg`/`find` silently.** If `cg-mcp` returns empty, say so in your next message before grepping. +- **Never `cat` a whole source file.** The graph already gave you the + line number — read a bounded window with `sed -n 'START,ENDp'` + (widen by ~30 lines if you need more context). Full-file reads are + the single biggest source of wasted tokens. - Standard Unix tools remain available for cases the graph can't answer. -## Submission - When you believe the task is complete, run a bash command whose first line of stdout is exactly: