From 1075704473c1052251e503fc2d65211e019bb003 Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Wed, 8 Apr 2026 22:26:09 +0000
Subject: [PATCH 1/4] feat(cli): add HuggingFace dataset import command

Add `agentv import huggingface` to import datasets from HuggingFace Hub
into AgentV EVAL.yaml format. Supports SWE-bench-style datasets with
automatic field mapping (instance_id -> test id, problem_statement ->
input, FAIL_TO_PASS -> code-grader assertions, repo -> docker workspace).

The command shells out to a Python script via `uv run` (per repo
convention for Python scripts). The script uses inline PEP 723 metadata
so `uv` auto-installs `datasets` and `pyyaml` dependencies.

Closes #978

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/import/huggingface.ts | 121 ++++++++
 apps/cli/src/commands/import/index.ts       |   4 +-
 scripts/import-huggingface.py               | 300 ++++++++++++++++++++
 3 files changed, 424 insertions(+), 1 deletion(-)
 create mode 100644 apps/cli/src/commands/import/huggingface.ts
 create mode 100644 scripts/import-huggingface.py

diff --git a/apps/cli/src/commands/import/huggingface.ts b/apps/cli/src/commands/import/huggingface.ts
new file mode 100644
index 000000000..93f9d8d50
--- /dev/null
+++ b/apps/cli/src/commands/import/huggingface.ts
@@ -0,0 +1,121 @@
+/**
+ * `agentv import huggingface` — Import a HuggingFace dataset into AgentV EVAL.yaml format.
+ *
+ * Wraps the Python script `scripts/import-huggingface.py` which uses the
+ * `datasets` library to load from HuggingFace Hub and converts instances
+ * (e.g. SWE-bench) into individual .EVAL.yaml files.
+ *
+ * The Python script is executed via `uv run` (per repo convention for Python
+ * scripts). The `uv` tool auto-installs script dependencies from the inline
+ * metadata block.
+ *
+ * Usage:
+ *   agentv import huggingface --repo SWE-bench/SWE-bench_Verified --split test --limit 10 --output evals/swebench/
+ */
+
+import { execFile } from 'node:child_process';
+import { existsSync } from 'node:fs';
+import path from 'node:path';
+import { command, number, option, optional, string } from 'cmd-ts';
+
+/**
+ * Resolve the path to the import-huggingface.py script.
+ *
+ * Searches upward from the CLI package directory to find the repo root
+ * (where scripts/ lives). Falls back to cwd-relative path.
+ */
+function findScript(): string {
+  // Try relative to this file's compiled location (apps/cli/dist/ or apps/cli/src/)
+  const candidates = [
+    path.resolve(__dirname, '..', '..', '..', '..', '..', 'scripts', 'import-huggingface.py'),
+    path.resolve(__dirname, '..', '..', '..', '..', 'scripts', 'import-huggingface.py'),
+    path.resolve(process.cwd(), 'scripts', 'import-huggingface.py'),
+  ];
+  for (const candidate of candidates) {
+    if (existsSync(candidate)) return candidate;
+  }
+  return candidates[candidates.length - 1]; // fallback to cwd-relative
+}
+
+export const importHuggingFaceCommand = command({
+  name: 'huggingface',
+  description: 'Import a HuggingFace dataset into AgentV EVAL.yaml format',
+  args: {
+    repo: option({
+      type: string,
+      long: 'repo',
+      description: 'HuggingFace dataset repository (e.g. SWE-bench/SWE-bench_Verified)',
+    }),
+    split: option({
+      type: optional(string),
+      long: 'split',
+      description: 'Dataset split to load (default: test)',
+    }),
+    limit: option({
+      type: optional(number),
+      long: 'limit',
+      description: 'Maximum number of instances to import',
+    }),
+    output: option({
+      type: optional(string),
+      long: 'output',
+      short: 'o',
+      description: 'Output directory for EVAL.yaml files (default: evals/)',
+    }),
+  },
+  handler: async ({ repo, split, limit, output }) => {
+    const scriptPath = findScript();
+
+    if (!existsSync(scriptPath)) {
+      console.error(`Error: Python script not found at ${scriptPath}`);
+      console.error(
+        'Make sure you are running from the agentv repository root, or install agentv from source.',
+      );
+      process.exit(1);
+    }
+
+    // Build arguments for the Python script
+    const args = [scriptPath, '--repo', repo];
+    if (split) args.push('--split', split);
+    if (limit !== undefined) args.push('--limit', String(limit));
+    if (output) args.push('--output', output);
+
+    console.log(`Importing from HuggingFace: ${repo} (split=${split ?? 'test'})...`);
+
+    // Execute via uv run
+    await new Promise<void>((resolve, reject) => {
+      const child = execFile('uv', ['run', ...args], { maxBuffer: 50 * 1024 * 1024 }, (error) => {
+        if (error) {
+          reject(error);
+        } else {
+          resolve();
+        }
+      });
+
+      // Stream stderr (progress messages) to console
+      child.stderr?.on('data', (data: Buffer) => {
+        process.stderr.write(data);
+      });
+
+      // Capture stdout (JSON summary)
+      let stdout = '';
+      child.stdout?.on('data', (data: Buffer) => {
+        stdout += data.toString();
+      });
+
+      child.on('close', (code) => {
+        if (code === 0 && stdout.trim()) {
+          try {
+            const summary = JSON.parse(stdout.trim());
+            console.log(
+              `\nImported ${summary.files_created} eval(s) from ${summary.dataset} → ${summary.output_dir}/`,
+            );
+          } catch {
+            // If JSON parsing fails, just print raw output
+            if (stdout.trim()) console.log(stdout.trim());
+          }
+        }
+      });
+    });
+  },
+});
diff --git a/apps/cli/src/commands/import/index.ts b/apps/cli/src/commands/import/index.ts
index 84435d4c0..b680865c8 100644
--- a/apps/cli/src/commands/import/index.ts
+++ b/apps/cli/src/commands/import/index.ts
@@ -3,13 +3,15 @@
 import { subcommands } from 'cmd-ts';
 
 import { importClaudeCommand } from './claude.js';
 import { importCodexCommand } from './codex.js';
 import { importCopilotCommand } from './copilot.js';
+import { importHuggingFaceCommand } from './huggingface.js';
 
 export const importCommand = subcommands({
   name: 'import',
-  description: 'Import agent session transcripts for offline grading',
+  description: 'Import agent session transcripts and datasets for offline grading',
   cmds: {
     claude: importClaudeCommand,
     codex: importCodexCommand,
     copilot: importCopilotCommand,
+    huggingface: importHuggingFaceCommand,
   },
 });
diff --git a/scripts/import-huggingface.py b/scripts/import-huggingface.py
new file mode 100644
index 000000000..5ef6591c4
--- /dev/null
+++ b/scripts/import-huggingface.py
@@ -0,0 +1,300 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "datasets>=2.14.0",
+#     "pyyaml>=6.0",
+# ]
+# ///
+"""
+HuggingFace dataset importer for AgentV.
+
+Downloads a dataset from HuggingFace Hub and converts each instance to an
+AgentV EVAL.yaml file. Currently supports SWE-bench-style datasets.
+
+Usage (via uv):
+    uv run scripts/import-huggingface.py \
+        --repo SWE-bench/SWE-bench_Verified \
+        --split test \
+        --limit 10 \
+        --output evals/swebench/
+
+SWE-bench field mapping:
+    instance_id        -> test id
+    problem_statement  -> input (user message)
+    repo + base_commit -> workspace.docker metadata
+    FAIL_TO_PASS       -> assertions (code-grader commands)
+    difficulty         -> metadata.difficulty
+
+To support a new dataset schema:
+    1. Add a detect function (like _is_swebench)
+    2. Add a converter function (like _convert_swebench_instance)
+    3. Register it in SCHEMA_CONVERTERS
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+
+# ---------------------------------------------------------------------------
+# SWE-bench schema detection & conversion
+# ---------------------------------------------------------------------------
+
+def _is_swebench(columns: list[str]) -> bool:
+    """Return True if the dataset columns match the SWE-bench schema."""
+    required = {"instance_id", "problem_statement", "repo", "base_commit"}
+    return required.issubset(set(columns))
+
+
+def _sanitize_id(instance_id: str) -> str:
+    """Convert a SWE-bench instance_id to a safe filename component.
+
+    Example: 'django__django-16527' -> 'django__django-16527'
+    Replaces characters that are unsafe in filenames with underscores.
+    """
+    return re.sub(r"[^\w\-.]", "_", instance_id)
+
+
+def _to_eval_name(instance_id: str) -> str:
+    """Convert an instance_id to an AgentV eval name (lowercase, alphanumeric + hyphens).
+
+    The AgentV name field must match /^[a-z0-9-]+$/.
+    Example: 'astropy__astropy-12907' -> 'astropy-astropy-12907'
+    """
+    name = instance_id.lower()
+    # Replace underscores, dots, and other non-alphanumeric chars with hyphens
+    name = re.sub(r"[^a-z0-9-]", "-", name)
+    # Collapse consecutive hyphens
+    name = re.sub(r"-{2,}", "-", name)
+    # Strip leading/trailing hyphens
+    return name.strip("-")
+
+
+def _docker_image_for_repo(repo: str) -> str:
+    """Derive a Docker image name for a SWE-bench repo.
+
+    Uses the swebench Docker image naming convention:
+        swebench/sweb.eval.<owner>__<name>:latest
+    """
+    # repo format: "owner/name" e.g. "django/django"
+    safe = repo.replace("/", "__")
+    return f"swebench/sweb.eval.{safe}:latest"
+
+
+def _parse_test_list(value: Any) -> list[str]:
+    """Parse FAIL_TO_PASS or PASS_TO_PASS from a SWE-bench row.
+
+    The field may be a JSON-encoded list string, a Python list, or absent.
+    """
+    if value is None:
+        return []
+    if isinstance(value, list):
+        return [str(t) for t in value]
+    if isinstance(value, str):
+        try:
+            parsed = json.loads(value)
+            if isinstance(parsed, list):
+                return [str(t) for t in parsed]
+        except (json.JSONDecodeError, TypeError):
+            pass
+        # Fallback: comma-separated
+        return [t.strip() for t in value.split(",") if t.strip()]
+    return []
+
+
+def _convert_swebench_instance(row: dict[str, Any]) -> dict[str, Any]:
+    """Convert a single SWE-bench row to an AgentV EVAL.yaml dict."""
+    instance_id = str(row.get("instance_id", "unknown"))
+    problem_statement = str(row.get("problem_statement", ""))
+    repo = str(row.get("repo", ""))
+    base_commit = str(row.get("base_commit", ""))
+    fail_to_pass = _parse_test_list(row.get("FAIL_TO_PASS"))
+    difficulty = row.get("difficulty")
+
+    # Build assertions from FAIL_TO_PASS test names
+    assertions: list[dict[str, Any]] = []
+    if fail_to_pass:
+        # Single code-grader that runs the failing tests
+        assertions.append({
+            "type": "code-grader",
+            "command": [
+                "python", "-c",
+                (
+                    "import subprocess, json; "
+                    f"result = subprocess.run({json.dumps(['python', '-m', 'pytest'] + fail_to_pass)}, "
+                    "capture_output=True, text=True); "
+                    "passed = result.returncode == 0; "
+                    "print(json.dumps({'score': 1.0 if passed else 0.0, "
+                    "'assertions': [{'text': 'FAIL_TO_PASS tests pass after patch', 'passed': passed, "
+                    "'evidence': result.stdout[-500:] if result.stdout else result.stderr[-500:]}]}))"
+                ),
+            ],
+        })
+
+    # Build the test case
+    test_case: dict[str, Any] = {
+        "id": instance_id,
+        "input": problem_statement,
+    }
+
+    if assertions:
+        test_case["assertions"] = assertions
+
+    # Add metadata
+    metadata: dict[str, Any] = {}
+    if repo:
+        metadata["repo"] = repo
+    if base_commit:
+        metadata["base_commit"] = base_commit
+    if difficulty is not None:
+        metadata["difficulty"] = str(difficulty)
+    if metadata:
+        test_case["metadata"] = metadata
+
+    # Build the eval document
+    eval_doc: dict[str, Any] = {
+        "name": _to_eval_name(instance_id),
+        "description": f"SWE-bench eval for {instance_id}",
+    }
+
+    # Docker workspace config
+    if repo:
+        eval_doc["workspace"] = {
+            "docker": {
+                "image": _docker_image_for_repo(repo),
+                "timeout": 600,
+                "memory": "4g",
+            },
+        }
+
+    eval_doc["tests"] = [test_case]
+
+    return eval_doc
+
+
+# ---------------------------------------------------------------------------
+# Schema converter registry
+# ---------------------------------------------------------------------------
+
+# Each entry: (detect_fn, convert_fn)
+# detect_fn receives column names, convert_fn receives a single row dict.
+SCHEMA_CONVERTERS = [
+    (_is_swebench, _convert_swebench_instance),
+]
+
+
+def _detect_converter(columns: list[str]):
+    """Find the first matching schema converter for the given columns."""
+    for detect_fn, convert_fn in SCHEMA_CONVERTERS:
+        if detect_fn(columns):
+            return convert_fn
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Import a HuggingFace dataset into AgentV EVAL.yaml format",
+    )
+    parser.add_argument(
+        "--repo",
+        required=True,
+        help="HuggingFace dataset repository (e.g. SWE-bench/SWE-bench_Verified)",
+    )
+    parser.add_argument(
+        "--split",
+        default="test",
+        help="Dataset split to load (default: test)",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Maximum number of instances to import",
+    )
+    parser.add_argument(
+        "--output",
+        default="evals/",
+        help="Output directory for EVAL.yaml files (default: evals/)",
+    )
+
+    args = parser.parse_args()
+
+    # Import datasets here so uv can auto-install the dependency
+    from datasets import load_dataset
+
+    print(f"Loading dataset {args.repo} (split={args.split})...", file=sys.stderr)
+
+    try:
+        dataset = load_dataset(args.repo, split=args.split)
+    except Exception as e:
+        print(f"Error loading dataset: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    columns = dataset.column_names
+    converter = _detect_converter(columns)
+
+    if converter is None:
+        print(
+            f"Error: Unsupported dataset schema. Columns: {columns}\n"
+            "Currently supported: SWE-bench (requires instance_id, problem_statement, repo, base_commit)",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    # Apply limit
+    total = len(dataset)
+    if args.limit is not None and args.limit < total:
+        dataset = dataset.select(range(args.limit))
+        total = args.limit
+
+    print(f"Converting {total} instances...", file=sys.stderr)
+
+    output_dir = Path(args.output)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    created = 0
+    for row in dataset:
+        eval_doc = converter(dict(row))
+        # Use the test id (original instance_id) for the filename, not the
+        # sanitized eval name, so filenames remain recognizable.
+        test_id = eval_doc["tests"][0]["id"] if eval_doc.get("tests") else eval_doc.get("name", f"instance-{created}")
+        safe_id = _sanitize_id(test_id)
+        file_path = output_dir / f"{safe_id}.EVAL.yaml"
+
+        with open(file_path, "w") as f:
+            yaml.dump(
+                eval_doc,
+                f,
+                default_flow_style=False,
+                sort_keys=False,
+                allow_unicode=True,
+                width=120,
+            )
+        created += 1
+
+    print(f"Created {created} EVAL.yaml files in {output_dir}/", file=sys.stderr)
+
+    # Print summary to stdout as JSON for programmatic consumption
+    summary = {
+        "dataset": args.repo,
+        "split": args.split,
+        "total_instances": total,
+        "files_created": created,
+        "output_dir": str(output_dir),
+    }
+    print(json.dumps(summary))
+
+
+if __name__ == "__main__":
+    main()

From 954239abec0209ba152ec1ba292101da7c0487cd Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Wed, 8 Apr 2026 22:40:54 +0000
Subject: [PATCH 2/4] fix(cli): address code review findings for HuggingFace import

- Handle missing uv with clear error message
- Surface child process stderr on failure
- Add PASS_TO_PASS regression test assertion
- Wrap datasets import in try/except ImportError
- Validate --limit is positive

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/import/huggingface.ts | 86 ++++++++++++++-------
 scripts/import-huggingface.py               | 36 ++++++++-
 2 files changed, 90 insertions(+), 32 deletions(-)

diff --git a/apps/cli/src/commands/import/huggingface.ts b/apps/cli/src/commands/import/huggingface.ts
index 93f9d8d50..edd9ebe33 100644
--- a/apps/cli/src/commands/import/huggingface.ts
+++ b/apps/cli/src/commands/import/huggingface.ts
@@ -83,39 +83,67 @@ export const importHuggingFaceCommand = command({
     console.log(`Importing from HuggingFace: ${repo} (split=${split ?? 'test'})...`);
 
     // Execute via uv run
-    await new Promise<void>((resolve, reject) => {
-      const child = execFile('uv', ['run', ...args], { maxBuffer: 50 * 1024 * 1024 }, (error) => {
-        if (error) {
-          reject(error);
-        } else {
-          resolve();
-        }
-      });
+    try {
+      await new Promise<void>((resolve, reject) => {
+        const child = execFile(
+          'uv',
+          ['run', ...args],
+          { maxBuffer: 50 * 1024 * 1024 },
+          (error) => {
+            if (error) {
+              reject(error);
+            } else {
+              resolve();
+            }
+          },
+        );
 
-      // Stream stderr (progress messages) to console
-      child.stderr?.on('data', (data: Buffer) => {
-        process.stderr.write(data);
-      });
+        // Collect stderr for error reporting
+        let stderrBuf = '';
+        child.stderr?.on('data', (data: Buffer) => {
+          const chunk = data.toString();
+          stderrBuf += chunk;
+          process.stderr.write(data);
+        });
 
-      // Capture stdout (JSON summary)
-      let stdout = '';
-      child.stdout?.on('data', (data: Buffer) => {
-        stdout += data.toString();
-      });
+        // Capture stdout (JSON summary)
+        let stdout = '';
+        child.stdout?.on('data', (data: Buffer) => {
+          stdout += data.toString();
+        });
 
-      child.on('close', (code) => {
-        if (code === 0 && stdout.trim()) {
-          try {
-            const summary = JSON.parse(stdout.trim());
-            console.log(
-              `\nImported ${summary.files_created} eval(s) from ${summary.dataset} → ${summary.output_dir}/`,
-            );
-          } catch {
-            // If JSON parsing fails, just print raw output
-            if (stdout.trim()) console.log(stdout.trim());
+        child.on('close', (code) => {
+          if (code === 0 && stdout.trim()) {
+            try {
+              const summary = JSON.parse(stdout.trim());
+              console.log(
+                `\nImported ${summary.files_created} eval(s) from ${summary.dataset} → ${summary.output_dir}/`,
+              );
+            } catch {
+              // If JSON parsing fails, just print raw output
+              if (stdout.trim()) console.log(stdout.trim());
+            }
+          } else if (code !== 0) {
+            // Surface a bounded stderr summary so the user sees what went wrong
+            const tail = stderrBuf.trim().slice(-2000);
+            if (tail) {
+              console.error(`\n--- import-huggingface.py stderr (last 2000 chars) ---`);
+              console.error(tail);
+            }
           }
-        }
+        });
       });
-    });
+    } catch (err: unknown) {
+      // Handle missing `uv` binary (ENOENT) with a clear message
+      if (err instanceof Error && (err as NodeJS.ErrnoException).code === 'ENOENT') {
+        console.error(
+          'Error: `uv` is not installed or not found on PATH.\n' +
+            'Install it with: curl -LsSf https://astral.sh/uv/install.sh | sh\n' +
+            'See https://docs.astral.sh/uv/ for details.',
+        );
+        process.exit(1);
+      }
+      throw err;
+    }
   },
 });
diff --git a/scripts/import-huggingface.py b/scripts/import-huggingface.py
index 5ef6591c4..477692f18 100644
--- a/scripts/import-huggingface.py
+++ b/scripts/import-huggingface.py
@@ -116,12 +116,13 @@ def _convert_swebench_instance(row: dict[str, Any]) -> dict[str, Any]:
     repo = str(row.get("repo", ""))
     base_commit = str(row.get("base_commit", ""))
     fail_to_pass = _parse_test_list(row.get("FAIL_TO_PASS"))
+    pass_to_pass = _parse_test_list(row.get("PASS_TO_PASS"))
     difficulty = row.get("difficulty")
 
-    # Build assertions from FAIL_TO_PASS test names
+    # Build assertions from FAIL_TO_PASS and PASS_TO_PASS test names
     assertions: list[dict[str, Any]] = []
     if fail_to_pass:
-        # Single code-grader that runs the failing tests
+        # Code-grader that runs the previously-failing tests (should pass after patch)
         assertions.append({
             "type": "code-grader",
             "command": [
@@ -137,6 +138,23 @@ def _convert_swebench_instance(row: dict[str, Any]) -> dict[str, Any]:
                 ),
             ],
         })
+    if pass_to_pass:
+        # Code-grader that verifies existing passing tests still pass (no regression)
+        assertions.append({
+            "type": "code-grader",
+            "command": [
+                "python", "-c",
+                (
+                    "import subprocess, json; "
+                    f"result = subprocess.run({json.dumps(['python', '-m', 'pytest'] + pass_to_pass)}, "
+                    "capture_output=True, text=True); "
+                    "passed = result.returncode == 0; "
+                    "print(json.dumps({'score': 1.0 if passed else 0.0, "
+                    "'assertions': [{'text': 'PASS_TO_PASS tests still pass (no regression)', 'passed': passed, "
+                    "'evidence': result.stdout[-500:] if result.stdout else result.stderr[-500:]}]}))"
+                ),
+            ],
+        })
 
     # Build the test case
     test_case: dict[str, Any] = {
@@ -230,8 +248,20 @@ def main() -> None:
 
     args = parser.parse_args()
 
+    if args.limit is not None and args.limit <= 0:
+        parser.error("--limit must be a positive integer")
+
     # Import datasets here so uv can auto-install the dependency
-    from datasets import load_dataset
+    try:
+        from datasets import load_dataset
+    except ImportError:
+        print(
+            "Error: the 'datasets' package is not installed.\n"
+            "Run this script via `uv run` (which auto-installs dependencies) or:\n"
+            "    pip install datasets>=2.14.0",
+            file=sys.stderr,
+        )
+        sys.exit(1)
 
     print(f"Loading dataset {args.repo} (split={args.split})...", file=sys.stderr)

From 17d688d2bae6a2b9278af449aa5a633c7d94cd7e Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Wed, 8 Apr 2026 22:42:17 +0000
Subject: [PATCH 3/4] fix(cli): resolve biome lint and format errors

- Replace template literal with string literal (no interpolation needed)
- Fix execFile formatting to match biome style

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 apps/cli/src/commands/import/huggingface.ts | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/apps/cli/src/commands/import/huggingface.ts b/apps/cli/src/commands/import/huggingface.ts
index edd9ebe33..04898c995 100644
--- a/apps/cli/src/commands/import/huggingface.ts
+++ b/apps/cli/src/commands/import/huggingface.ts
@@ -85,18 +85,13 @@ export const importHuggingFaceCommand = command({
     // Execute via uv run
     try {
       await new Promise<void>((resolve, reject) => {
-        const child = execFile(
-          'uv',
-          ['run', ...args],
-          { maxBuffer: 50 * 1024 * 1024 },
-          (error) => {
-            if (error) {
-              reject(error);
-            } else {
-              resolve();
-            }
-          },
-        );
+        const child = execFile('uv', ['run', ...args], { maxBuffer: 50 * 1024 * 1024 }, (error) => {
+          if (error) {
+            reject(error);
+          } else {
+            resolve();
+          }
+        });
 
         // Collect stderr for error reporting
         let stderrBuf = '';
@@ -127,7 +122,7 @@ export const importHuggingFaceCommand = command({
             // Surface a bounded stderr summary so the user sees what went wrong
             const tail = stderrBuf.trim().slice(-2000);
             if (tail) {
-              console.error(`\n--- import-huggingface.py stderr (last 2000 chars) ---`);
+              console.error('\n--- import-huggingface.py stderr (last 2000 chars) ---');
               console.error(tail);
             }
           }

From aa7072a1bb85c93c2d26c1c0f516bb7c7b8a47ba Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Wed, 8 Apr 2026 22:45:23 +0000
Subject: [PATCH 4/4] refactor(import): move base_commit from metadata to workspace.docker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

base_commit is not informational metadata — it's required to reproduce
the evaluation environment. SWE-bench builds Docker images with
`git reset --hard {base_commit}` and resets test files to this commit
before running tests. Place it in workspace.docker where it belongs.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 scripts/import-huggingface.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/scripts/import-huggingface.py b/scripts/import-huggingface.py
index 477692f18..fd5208230 100644
--- a/scripts/import-huggingface.py
+++ b/scripts/import-huggingface.py
@@ -165,12 +165,10 @@ def _convert_swebench_instance(row: dict[str, Any]) -> dict[str, Any]:
     if assertions:
         test_case["assertions"] = assertions
 
-    # Add metadata
+    # Add metadata (informational only — repo/base_commit are in workspace.docker)
     metadata: dict[str, Any] = {}
     if repo:
         metadata["repo"] = repo
-    if base_commit:
-        metadata["base_commit"] = base_commit
     if difficulty is not None:
         metadata["difficulty"] = str(difficulty)
     if metadata:
@@ -183,14 +181,17 @@ def _convert_swebench_instance(row: dict[str, Any]) -> dict[str, Any]:
     }
 
     # Docker workspace config
+    # base_commit is part of the workspace, not metadata — the container must be
+    # checked out at this commit for the patch to apply and tests to match.
     if repo:
-        eval_doc["workspace"] = {
-            "docker": {
-                "image": _docker_image_for_repo(repo),
-                "timeout": 600,
-                "memory": "4g",
-            },
+        docker_config: dict[str, Any] = {
+            "image": _docker_image_for_repo(repo),
+            "timeout": 600,
+            "memory": "4g",
         }
+        if base_commit:
+            docker_config["base_commit"] = base_commit
+        eval_doc["workspace"] = {"docker": docker_config}
 
     eval_doc["tests"] = [test_case]
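
For reference, this is roughly the EVAL.yaml the importer emits once all four
patches are applied, sketched for a hypothetical instance django__django-16527
run with --output evals/swebench/. The base commit, problem statement, one-liner
command, and difficulty below are illustrative placeholders, not real dataset
values, and exact quoting may differ slightly since the file is rendered by
yaml.dump with sort_keys=False and default_flow_style=False:

    name: django-django-16527
    description: SWE-bench eval for django__django-16527
    workspace:
      docker:
        image: swebench/sweb.eval.django__django:latest
        timeout: 600
        memory: 4g
        base_commit: <base_commit from the dataset row>
    tests:
    - id: django__django-16527
      input: <problem_statement text from the dataset>
      assertions:
      - type: code-grader
        command:
        - python
        - '-c'
        - <generated pytest wrapper one-liner>
      metadata:
        repo: django/django
        difficulty: <difficulty, when the column is present>

The file lands at evals/swebench/django__django-16527.EVAL.yaml: filenames keep
the original instance_id via _sanitize_id, while the name field is normalized to
django-django-16527 by _to_eval_name.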