From 1075704473c1052251e503fc2d65211e019bb003 Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Wed, 8 Apr 2026 22:26:09 +0000
Subject: [PATCH 1/4] feat(cli): add HuggingFace dataset import command

Add `agentv import huggingface` to import datasets from HuggingFace Hub
into AgentV EVAL.yaml format. Supports SWE-bench-style datasets with
automatic field mapping (instance_id -> test id, problem_statement ->
input, FAIL_TO_PASS -> code-grader assertions, repo -> docker workspace).

The command shells out to a Python script via `uv run` (per repo
convention for Python scripts). The script uses inline PEP 723 metadata
so `uv` auto-installs `datasets` and `pyyaml` dependencies.

Closes #978

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/import/huggingface.ts | 121 ++++++++
 apps/cli/src/commands/import/index.ts       |   4 +-
 scripts/import-huggingface.py               | 300 ++++++++++++++++++++
 3 files changed, 424 insertions(+), 1 deletion(-)
 create mode 100644 apps/cli/src/commands/import/huggingface.ts
 create mode 100644 scripts/import-huggingface.py

diff --git a/apps/cli/src/commands/import/huggingface.ts b/apps/cli/src/commands/import/huggingface.ts
new file mode 100644
index 000000000..93f9d8d50
--- /dev/null
+++ b/apps/cli/src/commands/import/huggingface.ts
@@ -0,0 +1,121 @@
+/**
+ * `agentv import huggingface` — Import a HuggingFace dataset into AgentV EVAL.yaml format.
+ *
+ * Wraps the Python script `scripts/import-huggingface.py` which uses the
+ * `datasets` library to load from HuggingFace Hub and converts instances
+ * (e.g. SWE-bench) into individual .EVAL.yaml files.
+ *
+ * The Python script is executed via `uv run` (per repo convention for Python
+ * scripts). The `uv` tool auto-installs script dependencies from the inline
+ * metadata block.
+ *
+ * Usage:
+ *   agentv import huggingface --repo SWE-bench/SWE-bench_Verified --split test --limit 10 --output evals/swebench/
+ */
+
+import { execFile } from 'node:child_process';
+import { existsSync } from 'node:fs';
+import path from 'node:path';
+import { command, number, option, optional, string } from 'cmd-ts';
+
+/**
+ * Resolve the path to the import-huggingface.py script.
+ *
+ * Searches upward from the CLI package directory to find the repo root
+ * (where scripts/ lives). Falls back to cwd-relative path.
+ */
+function findScript(): string {
+  // Try relative to this file's compiled location (apps/cli/dist/ or apps/cli/src/)
+  const candidates = [
+    path.resolve(__dirname, '..', '..', '..', '..', '..', 'scripts', 'import-huggingface.py'),
+    path.resolve(__dirname, '..', '..', '..', '..', 'scripts', 'import-huggingface.py'),
+    path.resolve(process.cwd(), 'scripts', 'import-huggingface.py'),
+  ];
+  for (const candidate of candidates) {
+    if (existsSync(candidate)) return candidate;
+  }
+  return candidates[candidates.length - 1]; // fallback to cwd-relative
+}
+
+export const importHuggingFaceCommand = command({
+  name: 'huggingface',
+  description: 'Import a HuggingFace dataset into AgentV EVAL.yaml format',
+  args: {
+    repo: option({
+      type: string,
+      long: 'repo',
+      description: 'HuggingFace dataset repository (e.g. SWE-bench/SWE-bench_Verified)',
+    }),
+    split: option({
+      type: optional(string),
+      long: 'split',
+      description: 'Dataset split to load (default: test)',
+    }),
+    limit: option({
+      type: optional(number),
+      long: 'limit',
+      description: 'Maximum number of instances to import',
+    }),
+    output: option({
+      type: optional(string),
+      long: 'output',
+      short: 'o',
+      description: 'Output directory for EVAL.yaml files (default: evals/)',
+    }),
+  },
+  handler: async ({ repo, split, limit, output }) => {
+    const scriptPath = findScript();
+
+    if (!existsSync(scriptPath)) {
+      console.error(`Error: Python script not found at ${scriptPath}`);
+      console.error(
+        'Make sure you are running from the agentv repository root, or install agentv from source.',
+      );
+      process.exit(1);
+    }
+
+    // Build arguments for the Python script
+    const args = [scriptPath, '--repo', repo];
+    if (split) args.push('--split', split);
+    if (limit !== undefined) args.push('--limit', String(limit));
+    if (output) args.push('--output', output);
+
+    console.log(`Importing from HuggingFace: ${repo} (split=${split ?? 'test'})...`);
+
+    // Execute via uv run
+    await new Promise<void>((resolve, reject) => {
+      const child = execFile('uv', ['run', ...args], { maxBuffer: 50 * 1024 * 1024 }, (error) => {
+        if (error) {
+          reject(error);
+        } else {
+          resolve();
+        }
+      });
+
+      // Stream stderr (progress messages) to console
+      child.stderr?.on('data', (data: Buffer) => {
+        process.stderr.write(data);
+      });
+
+      // Capture stdout (JSON summary)
+      let stdout = '';
+      child.stdout?.on('data', (data: Buffer) => {
+        stdout += data.toString();
+      });
+
+      child.on('close', (code) => {
+        if (code === 0 && stdout.trim()) {
+          try {
+            const summary = JSON.parse(stdout.trim());
+            console.log(
+              `\nImported ${summary.files_created} eval(s) from ${summary.dataset} → ${summary.output_dir}/`,
+            );
+          } catch {
+            // If JSON parsing fails, just print raw output
+            if (stdout.trim()) console.log(stdout.trim());
+          }
+        }
+      });
+    });
+  },
+});
diff --git a/apps/cli/src/commands/import/index.ts b/apps/cli/src/commands/import/index.ts
index 84435d4c0..b680865c8 100644
--- a/apps/cli/src/commands/import/index.ts
+++ b/apps/cli/src/commands/import/index.ts
@@ -3,13 +3,15 @@
 import { subcommands } from 'cmd-ts';
 
 import { importClaudeCommand } from './claude.js';
 import { importCodexCommand } from './codex.js';
 import { importCopilotCommand } from './copilot.js';
+import { importHuggingFaceCommand } from './huggingface.js';
 
 export const importCommand = subcommands({
   name: 'import',
-  description: 'Import agent session transcripts for offline grading',
+  description: 'Import agent session transcripts and datasets for offline grading',
   cmds: {
     claude: importClaudeCommand,
     codex: importCodexCommand,
     copilot: importCopilotCommand,
+    huggingface: importHuggingFaceCommand,
   },
 });
diff --git a/scripts/import-huggingface.py b/scripts/import-huggingface.py
new file mode 100644
index 000000000..5ef6591c4
--- /dev/null
+++ b/scripts/import-huggingface.py
@@ -0,0 +1,300 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "datasets>=2.14.0",
+#     "pyyaml>=6.0",
+# ]
+# ///
+"""
+HuggingFace dataset importer for AgentV.
+
+Downloads a dataset from HuggingFace Hub and converts each instance to an
+AgentV EVAL.yaml file. Currently supports SWE-bench-style datasets.
+
+Usage (via uv):
+    uv run scripts/import-huggingface.py \
+        --repo SWE-bench/SWE-bench_Verified \
+        --split test \
+        --limit 10 \
+        --output evals/swebench/
+
+SWE-bench field mapping:
+    instance_id        -> test id
+    problem_statement  -> input (user message)
+    repo + base_commit -> workspace.docker metadata
+    FAIL_TO_PASS       -> assertions (code-grader commands)
+    difficulty         -> metadata.difficulty
+
+To support a new dataset schema:
+    1. Add a detect function (like _is_swebench)
+    2. Add a converter function (like _convert_swebench_instance)
+    3. Register it in SCHEMA_CONVERTERS
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+
+# ---------------------------------------------------------------------------
+# SWE-bench schema detection & conversion
+# ---------------------------------------------------------------------------
+
+def _is_swebench(columns: list[str]) -> bool:
+    """Return True if the dataset columns match the SWE-bench schema."""
+    required = {"instance_id", "problem_statement", "repo", "base_commit"}
+    return required.issubset(set(columns))
+
+
+def _sanitize_id(instance_id: str) -> str:
+    """Convert a SWE-bench instance_id to a safe filename component.
+
+    Example: 'django__django-16527' -> 'django__django-16527'
+    Replaces characters that are unsafe in filenames with underscores.
+    """
+    return re.sub(r"[^\w\-.]", "_", instance_id)
+
+
+def _to_eval_name(instance_id: str) -> str:
+    """Convert an instance_id to an AgentV eval name (lowercase, alphanumeric + hyphens).
+
+    The AgentV name field must match /^[a-z0-9-]+$/.
+    Example: 'astropy__astropy-12907' -> 'astropy-astropy-12907'
+    """
+    name = instance_id.lower()
+    # Replace underscores, dots, and other non-alphanumeric chars with hyphens
+    name = re.sub(r"[^a-z0-9-]", "-", name)
+    # Collapse consecutive hyphens
+    name = re.sub(r"-{2,}", "-", name)
+    # Strip leading/trailing hyphens
+    return name.strip("-")
+
+
+def _docker_image_for_repo(repo: str) -> str:
+    """Derive a Docker image name for a SWE-bench repo.
+
+    Uses the swebench Docker image naming convention:
+        swebench/sweb.eval.<owner>__<name>:latest
+    """
+    # repo format: "owner/name" e.g. "django/django"
+    safe = repo.replace("/", "__")
+    return f"swebench/sweb.eval.{safe}:latest"
+
+
+def _parse_test_list(value: Any) -> list[str]:
+    """Parse FAIL_TO_PASS or PASS_TO_PASS from a SWE-bench row.
+
+    The field may be a JSON-encoded list string, a Python list, or absent.
+    """
+    if value is None:
+        return []
+    if isinstance(value, list):
+        return [str(t) for t in value]
+    if isinstance(value, str):
+        try:
+            parsed = json.loads(value)
+            if isinstance(parsed, list):
+                return [str(t) for t in parsed]
+        except (json.JSONDecodeError, TypeError):
+            pass
+        # Fallback: comma-separated
+        return [t.strip() for t in value.split(",") if t.strip()]
+    return []
+
+
+def _convert_swebench_instance(row: dict[str, Any]) -> dict[str, Any]:
+    """Convert a single SWE-bench row to an AgentV EVAL.yaml dict."""
+    instance_id = str(row.get("instance_id", "unknown"))
+    problem_statement = str(row.get("problem_statement", ""))
+    repo = str(row.get("repo", ""))
+    base_commit = str(row.get("base_commit", ""))
+    fail_to_pass = _parse_test_list(row.get("FAIL_TO_PASS"))
+    difficulty = row.get("difficulty")
+
+    # Build assertions from FAIL_TO_PASS test names
+    assertions: list[dict[str, Any]] = []
+    if fail_to_pass:
+        # Single code-grader that runs the failing tests
+        assertions.append({
+            "type": "code-grader",
+            "command": [
+                "python", "-c",
+                (
+                    "import subprocess, json; "
+                    f"result = subprocess.run({json.dumps(['python', '-m', 'pytest'] + fail_to_pass)}, "
+                    "capture_output=True, text=True); "
+                    "passed = result.returncode == 0; "
+                    "print(json.dumps({'score': 1.0 if passed else 0.0, "
+                    "'assertions': [{'text': 'FAIL_TO_PASS tests pass after patch', 'passed': passed, "
+                    "'evidence': result.stdout[-500:] if result.stdout else result.stderr[-500:]}]}))"
+                ),
+            ],
+        })
+
+    # Build the test case
+    test_case: dict[str, Any] = {
+        "id": instance_id,
+        "input": problem_statement,
+    }
+
+    if assertions:
+        test_case["assertions"] = assertions
+
+    # Add metadata
+    metadata: dict[str, Any] = {}
+    if repo:
+        metadata["repo"] = repo
+    if base_commit:
+        metadata["base_commit"] = base_commit
+    if difficulty is not None:
+        metadata["difficulty"] = str(difficulty)
+    if metadata:
+        test_case["metadata"] = metadata
+
+    # Build the eval document
+    eval_doc: dict[str, Any] = {
+        "name": _to_eval_name(instance_id),
+        "description": f"SWE-bench eval for {instance_id}",
+    }
+
+    # Docker workspace config
+    if repo:
+        eval_doc["workspace"] = {
+            "docker": {
+                "image": _docker_image_for_repo(repo),
+                "timeout": 600,
+                "memory": "4g",
+            },
+        }
+
+    eval_doc["tests"] = [test_case]
+
+    return eval_doc
+
+
+# ---------------------------------------------------------------------------
+# Schema converter registry
+# ---------------------------------------------------------------------------
+
+# Each entry: (detect_fn, convert_fn)
+# detect_fn receives column names, convert_fn receives a single row dict.
+SCHEMA_CONVERTERS = [
+    (_is_swebench, _convert_swebench_instance),
+]
+
+
+def _detect_converter(columns: list[str]):
+    """Find the first matching schema converter for the given columns."""
+    for detect_fn, convert_fn in SCHEMA_CONVERTERS:
+        if detect_fn(columns):
+            return convert_fn
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Import a HuggingFace dataset into AgentV EVAL.yaml format",
+    )
+    parser.add_argument(
+        "--repo",
+        required=True,
+        help="HuggingFace dataset repository (e.g. SWE-bench/SWE-bench_Verified)",
+    )
+    parser.add_argument(
+        "--split",
+        default="test",
+        help="Dataset split to load (default: test)",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Maximum number of instances to import",
+    )
+    parser.add_argument(
+        "--output",
+        default="evals/",
+        help="Output directory for EVAL.yaml files (default: evals/)",
+    )
+
+    args = parser.parse_args()
+
+    # Import datasets here so uv can auto-install the dependency
+    from datasets import load_dataset
+
+    print(f"Loading dataset {args.repo} (split={args.split})...", file=sys.stderr)
+
+    try:
+        dataset = load_dataset(args.repo, split=args.split)
+    except Exception as e:
+        print(f"Error loading dataset: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    columns = dataset.column_names
+    converter = _detect_converter(columns)
+
+    if converter is None:
+        print(
+            f"Error: Unsupported dataset schema. Columns: {columns}\n"
+            "Currently supported: SWE-bench (requires instance_id, problem_statement, repo, base_commit)",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    # Apply limit
+    total = len(dataset)
+    if args.limit is not None and args.limit < total:
+        dataset = dataset.select(range(args.limit))
+        total = args.limit
+
+    print(f"Converting {total} instances...", file=sys.stderr)
+
+    output_dir = Path(args.output)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    created = 0
+    for row in dataset:
+        eval_doc = converter(dict(row))
+        # Use the test id (original instance_id) for the filename, not the
+        # sanitized eval name, so filenames remain recognizable.
+        test_id = eval_doc["tests"][0]["id"] if eval_doc.get("tests") else eval_doc.get("name", f"instance-{created}")
+        safe_id = _sanitize_id(test_id)
+        file_path = output_dir / f"{safe_id}.EVAL.yaml"
+
+        with open(file_path, "w") as f:
+            yaml.dump(
+                eval_doc,
+                f,
+                default_flow_style=False,
+                sort_keys=False,
+                allow_unicode=True,
+                width=120,
+            )
+        created += 1
+
+    print(f"Created {created} EVAL.yaml files in {output_dir}/", file=sys.stderr)
+
+    # Print summary to stdout as JSON for programmatic consumption
+    summary = {
+        "dataset": args.repo,
+        "split": args.split,
+        "total_instances": total,
+        "files_created": created,
+        "output_dir": str(output_dir),
+    }
+    print(json.dumps(summary))
+
+
+if __name__ == "__main__":
+    main()

From 954239abec0209ba152ec1ba292101da7c0487cd Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Wed, 8 Apr 2026 22:40:54 +0000
Subject: [PATCH 2/4] fix(cli): address code review findings for HuggingFace import

- Handle missing uv with clear error message
- Surface child process stderr on failure
- Add PASS_TO_PASS regression test assertion
- Wrap datasets import in try/except ImportError
- Validate --limit is positive

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/import/huggingface.ts | 86 ++++++++++++++-------
 scripts/import-huggingface.py               | 36 ++++++++-
 2 files changed, 90 insertions(+), 32 deletions(-)

diff --git a/apps/cli/src/commands/import/huggingface.ts b/apps/cli/src/commands/import/huggingface.ts
index 93f9d8d50..edd9ebe33 100644
--- a/apps/cli/src/commands/import/huggingface.ts
+++ b/apps/cli/src/commands/import/huggingface.ts
@@ -83,39 +83,67 @@ export const importHuggingFaceCommand = command({
     console.log(`Importing from HuggingFace: ${repo} (split=${split ?? 'test'})...`);
 
     // Execute via uv run
-    await new Promise<void>((resolve, reject) => {
-      const child = execFile('uv', ['run', ...args], { maxBuffer: 50 * 1024 * 1024 }, (error) => {
-        if (error) {
-          reject(error);
-        } else {
-          resolve();
-        }
-      });
+    try {
+      await new Promise<void>((resolve, reject) => {
+        const child = execFile(
+          'uv',
+          ['run', ...args],
+          { maxBuffer: 50 * 1024 * 1024 },
+          (error) => {
+            if (error) {
+              reject(error);
+            } else {
+              resolve();
+            }
+          },
+        );
 
-      // Stream stderr (progress messages) to console
-      child.stderr?.on('data', (data: Buffer) => {
-        process.stderr.write(data);
-      });
+        // Collect stderr for error reporting
+        let stderrBuf = '';
+        child.stderr?.on('data', (data: Buffer) => {
+          const chunk = data.toString();
+          stderrBuf += chunk;
+          process.stderr.write(data);
+        });
 
-      // Capture stdout (JSON summary)
-      let stdout = '';
-      child.stdout?.on('data', (data: Buffer) => {
-        stdout += data.toString();
-      });
+        // Capture stdout (JSON summary)
+        let stdout = '';
+        child.stdout?.on('data', (data: Buffer) => {
+          stdout += data.toString();
+        });
 
-      child.on('close', (code) => {
-        if (code === 0 && stdout.trim()) {
-          try {
-            const summary = JSON.parse(stdout.trim());
-            console.log(
-              `\nImported ${summary.files_created} eval(s) from ${summary.dataset} → ${summary.output_dir}/`,
-            );
-          } catch {
-            // If JSON parsing fails, just print raw output
-            if (stdout.trim()) console.log(stdout.trim());
+        child.on('close', (code) => {
+          if (code === 0 && stdout.trim()) {
+            try {
+              const summary = JSON.parse(stdout.trim());
+              console.log(
+                `\nImported ${summary.files_created} eval(s) from ${summary.dataset} → ${summary.output_dir}/`,
+              );
+            } catch {
+              // If JSON parsing fails, just print raw output
+              if (stdout.trim()) console.log(stdout.trim());
+            }
+          } else if (code !== 0) {
+            // Surface a bounded stderr summary so the user sees what went wrong
+            const tail = stderrBuf.trim().slice(-2000);
+            if (tail) {
+              console.error(`\n--- import-huggingface.py stderr (last 2000 chars) ---`);
+              console.error(tail);
+            }
           }
-        }
+        });
       });
-    });
+    } catch (err: unknown) {
+      // Handle missing `uv` binary (ENOENT) with a clear message
+      if (err instanceof Error && (err as NodeJS.ErrnoException).code === 'ENOENT') {
+        console.error(
+          'Error: `uv` is not installed or not found on PATH.\n' +
+            'Install it with: curl -LsSf https://astral.sh/uv/install.sh | sh\n' +
+            'See https://docs.astral.sh/uv/ for details.',
+        );
+        process.exit(1);
+      }
+      throw err;
+    }
   },
 });
diff --git a/scripts/import-huggingface.py b/scripts/import-huggingface.py
index 5ef6591c4..477692f18 100644
--- a/scripts/import-huggingface.py
+++ b/scripts/import-huggingface.py
@@ -116,12 +116,13 @@ def _convert_swebench_instance(row: dict[str, Any]) -> dict[str, Any]:
     repo = str(row.get("repo", ""))
     base_commit = str(row.get("base_commit", ""))
     fail_to_pass = _parse_test_list(row.get("FAIL_TO_PASS"))
+    pass_to_pass = _parse_test_list(row.get("PASS_TO_PASS"))
     difficulty = row.get("difficulty")
 
-    # Build assertions from FAIL_TO_PASS test names
+    # Build assertions from FAIL_TO_PASS and PASS_TO_PASS test names
     assertions: list[dict[str, Any]] = []
     if fail_to_pass:
-        # Single code-grader that runs the failing tests
+        # Code-grader that runs the previously-failing tests (should pass after patch)
         assertions.append({
             "type": "code-grader",
             "command": [
@@ -137,6 +138,23 @@ def _convert_swebench_instance(row: dict[str, Any]) -> dict[str, Any]:
                 ),
             ],
         })
+    if pass_to_pass:
+        # Code-grader that verifies existing passing tests still pass (no regression)
+        assertions.append({
+            "type": "code-grader",
+            "command": [
+                "python", "-c",
+                (
+                    "import subprocess, json; "
+                    f"result = subprocess.run({json.dumps(['python', '-m', 'pytest'] + pass_to_pass)}, "
+                    "capture_output=True, text=True); "
+                    "passed = result.returncode == 0; "
+                    "print(json.dumps({'score': 1.0 if passed else 0.0, "
+                    "'assertions': [{'text': 'PASS_TO_PASS tests still pass (no regression)', 'passed': passed, "
+                    "'evidence': result.stdout[-500:] if result.stdout else result.stderr[-500:]}]}))"
+                ),
+            ],
+        })
 
     # Build the test case
     test_case: dict[str, Any] = {
@@ -230,8 +248,20 @@ def main() -> None:
 
     args = parser.parse_args()
 
+    if args.limit is not None and args.limit <= 0:
+        parser.error("--limit must be a positive integer")
+
     # Import datasets here so uv can auto-install the dependency
-    from datasets import load_dataset
+    try:
+        from datasets import load_dataset
+    except ImportError:
+        print(
+            "Error: the 'datasets' package is not installed.\n"
+            "Run this script via `uv run` (which auto-installs dependencies) or:\n"
+            "    pip install datasets>=2.14.0",
+            file=sys.stderr,
+        )
+        sys.exit(1)
 
     print(f"Loading dataset {args.repo} (split={args.split})...", file=sys.stderr)

From 17d688d2bae6a2b9278af449aa5a633c7d94cd7e Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Wed, 8 Apr 2026 22:42:17 +0000
Subject: [PATCH 3/4] fix(cli): resolve biome lint and format errors

- Replace template literal with string literal (no interpolation needed)
- Fix execFile formatting to match biome style

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/import/huggingface.ts | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/apps/cli/src/commands/import/huggingface.ts b/apps/cli/src/commands/import/huggingface.ts
index edd9ebe33..04898c995 100644
--- a/apps/cli/src/commands/import/huggingface.ts
+++ b/apps/cli/src/commands/import/huggingface.ts
@@ -85,18 +85,13 @@ export const importHuggingFaceCommand = command({
     // Execute via uv run
     try {
       await new Promise<void>((resolve, reject) => {
-        const child = execFile(
-          'uv',
-          ['run', ...args],
-          { maxBuffer: 50 * 1024 * 1024 },
-          (error) => {
-            if (error) {
-              reject(error);
-            } else {
-              resolve();
-            }
-          },
-        );
+        const child = execFile('uv', ['run', ...args], { maxBuffer: 50 * 1024 * 1024 }, (error) => {
+          if (error) {
+            reject(error);
+          } else {
+            resolve();
+          }
+        });
 
         // Collect stderr for error reporting
         let stderrBuf = '';
@@ -127,7 +122,7 @@ export const importHuggingFaceCommand = command({
             // Surface a bounded stderr summary so the user sees what went wrong
             const tail = stderrBuf.trim().slice(-2000);
             if (tail) {
-              console.error(`\n--- import-huggingface.py stderr (last 2000 chars) ---`);
+              console.error('\n--- import-huggingface.py stderr (last 2000 chars) ---');
               console.error(tail);
             }
           }

From aa7072a1bb85c93c2d26c1c0f516bb7c7b8a47ba Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Wed, 8 Apr 2026 22:45:23 +0000
Subject: [PATCH 4/4] refactor(import): move base_commit from metadata to workspace.docker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

base_commit is not informational metadata — it's required to reproduce
the evaluation environment. SWE-bench builds Docker images with
`git reset --hard {base_commit}` and resets test files to this commit
before running tests. Place it in workspace.docker where it belongs.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 scripts/import-huggingface.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/scripts/import-huggingface.py b/scripts/import-huggingface.py
index 477692f18..fd5208230 100644
--- a/scripts/import-huggingface.py
+++ b/scripts/import-huggingface.py
@@ -165,12 +165,10 @@ def _convert_swebench_instance(row: dict[str, Any]) -> dict[str, Any]:
     if assertions:
         test_case["assertions"] = assertions
 
-    # Add metadata
+    # Add metadata (informational only — repo/base_commit are in workspace.docker)
     metadata: dict[str, Any] = {}
     if repo:
         metadata["repo"] = repo
-    if base_commit:
-        metadata["base_commit"] = base_commit
     if difficulty is not None:
         metadata["difficulty"] = str(difficulty)
     if metadata:
@@ -183,14 +181,17 @@ def _convert_swebench_instance(row: dict[str, Any]) -> dict[str, Any]:
     }
 
     # Docker workspace config
+    # base_commit is part of the workspace, not metadata — the container must be
+    # checked out at this commit for the patch to apply and tests to match.
     if repo:
-        eval_doc["workspace"] = {
-            "docker": {
-                "image": _docker_image_for_repo(repo),
-                "timeout": 600,
-                "memory": "4g",
-            },
+        docker_config: dict[str, Any] = {
+            "image": _docker_image_for_repo(repo),
+            "timeout": 600,
+            "memory": "4g",
         }
+        if base_commit:
+            docker_config["base_commit"] = base_commit
+        eval_doc["workspace"] = {"docker": docker_config}
 
     eval_doc["tests"] = [test_case]
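
For reference, this is roughly the EVAL.yaml the importer emits once all four
patches are applied, sketched for a hypothetical instance django__django-16527
run with --output evals/swebench/. The base commit, problem statement, one-liner
command, and difficulty below are illustrative placeholders, not real dataset
values, and exact quoting may differ slightly since the file is rendered by
yaml.dump with sort_keys=False and default_flow_style=False:

    name: django-django-16527
    description: SWE-bench eval for django__django-16527
    workspace:
      docker:
        image: swebench/sweb.eval.django__django:latest
        timeout: 600
        memory: 4g
        base_commit: <base_commit from the dataset row>
    tests:
    - id: django__django-16527
      input: <problem_statement text from the dataset>
      assertions:
      - type: code-grader
        command:
        - python
        - '-c'
        - <generated pytest wrapper one-liner>
      metadata:
        repo: django/django
        difficulty: <difficulty, when the column is present>

The file lands at evals/swebench/django__django-16527.EVAL.yaml: filenames keep
the original instance_id via _sanitize_id, while the name field is normalized to
django-django-16527 by _to_eval_name.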