diff --git a/examples/10_Agentic_Inference/README.md b/examples/10_Agentic_Inference/README.md
index ab3673b51..932e42db5 100644
--- a/examples/10_Agentic_Inference/README.md
+++ b/examples/10_Agentic_Inference/README.md
@@ -194,3 +194,29 @@ Update the first `datasets` entry (`name` and `path`), `model_params.name`, and
uv run inference-endpoint benchmark from-config \
--config examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml
```
+
+## SWE-bench Accuracy
+
+`swe_bench_accuracy.yaml` runs the SWE-bench accuracy evaluation alongside a
+minimal performance dataset. The benchmark framework skips its built-in
+accuracy phase for this dataset; instead, `SWEBenchScorer` shells out to
+`mini-swe-agent` and the `swebench` evaluation harness, and that external flow
+drives requests to the configured endpoint.
+
+The isolated `uv` environment for those tools lives in `accuracy/`. Sync it
+once before running:
+
+```bash
+cd examples/10_Agentic_Inference/accuracy
+uv sync
+```
+
+Then run the benchmark from the repo root:
+
+```bash
+uv run inference-endpoint benchmark from-config \
+ --config examples/10_Agentic_Inference/swe_bench_accuracy.yaml
+```
+
+See `accuracy/RUNBOOK.md` for preconditions, sanity checks, and common failure
+modes.
diff --git a/examples/10_Agentic_Inference/accuracy/RUNBOOK.md b/examples/10_Agentic_Inference/accuracy/RUNBOOK.md
new file mode 100644
index 000000000..7ad03c122
--- /dev/null
+++ b/examples/10_Agentic_Inference/accuracy/RUNBOOK.md
@@ -0,0 +1,54 @@
+# SWE-bench Accuracy Smoke-Test Runbook
+
+End-to-end validation for the SWE-bench accuracy pipeline. Unit tests mock all
+subprocesses, so running the real pipeline is the only way to catch Docker,
+HuggingFace access, or mini-swe-agent wiring issues.
+
+## 0. Preconditions
+
+- Docker daemon running (swebench harness spawns one container per instance).
+- Docker Hub auth or a pre-seeded image cache for uncached SWE-bench images.
+- Network egress to PyPI and HuggingFace Hub.
+- `uv` binary on PATH (`curl -LsSf https://astral.sh/uv/install.sh | sh`).
+- Parent endpoints env already synced (`uv sync --extra dev` from repo root).
+
+## 1. Sync the accuracy subproject
+
+From the repo root:
+
+```bash
+cd examples/10_Agentic_Inference/accuracy
+uv sync
+```
+
+Sanity check:
+
+```bash
+uv run mini-extra --help
+uv run python -m swebench.harness.run_evaluation --help
+```
+
+Override the default subproject path via env var if needed:
+
+```bash
+export SWE_BENCH_PROJECT_PATH=/path/to/examples/10_Agentic_Inference/accuracy
+```
+
+## 2. End-to-end test (requires live endpoint; run from the repo root)
+
+```bash
+uv run inference-endpoint benchmark from-config \
+ --config examples/10_Agentic_Inference/swe_bench_accuracy.yaml
+```
+
+Scorer preflight resolves the requested SWE-bench instances and pre-pulls any
+missing Docker images before `mini-extra swebench` starts. Cached images are
+skipped.
+
+## Common failure modes
+
+| Symptom | Likely cause | Fix |
+| ---------------------------------------------------- | ------------------------------------- | --------------------------------------------------------- |
+| `FileNotFoundError: SWE-bench subproject not found` | subproject not synced | Run `uv sync` in `examples/10_Agentic_Inference/accuracy` |
+| Docker error during `run_evaluation` | Docker daemon not running | Start Docker and retry |
+| `Failed to pre-pull required SWE-bench Docker image` | Docker Hub rate limit or missing auth | Run `docker login` or use a local image cache/mirror |
diff --git a/examples/10_Agentic_Inference/accuracy/pyproject.toml b/examples/10_Agentic_Inference/accuracy/pyproject.toml
new file mode 100644
index 000000000..14482a29f
--- /dev/null
+++ b/examples/10_Agentic_Inference/accuracy/pyproject.toml
@@ -0,0 +1,29 @@
+# Isolated uv project for the SWE-bench accuracy evaluator.
+#
+# mini-swe-agent and swebench pin specific versions of litellm, docker,
+# and other packages that are not part of the parent endpoints env. Keeping
+# the swebench env separate means the parent lockfile stays solvable and
+# the evaluation env stays reproducible.
+#
+# `inference_endpoint.evaluation.scoring.SWEBenchScorer` invokes
+# mini-extra and swebench.harness.run_evaluation via `uv run --project`,
+# so the main benchmark process never needs to import these packages.
+#
+# Usage on the accuracy host:
+# cd examples/10_Agentic_Inference/accuracy
+# uv sync
+# # SWEBenchScorer in the parent will shell out automatically.
+
+[project]
+name = "swe-bench-accuracy"
+version = "0.1.0"
+description = "Isolated SWE-bench accuracy environment for the multi-turn agentic benchmark."
+requires-python = ">=3.12"
+dependencies = [
+ "mini-swe-agent==2.3.0",
+ "swebench==4.1.0",
+]
+
+[tool.uv]
+# Script-runner env: no build, no install of this project itself.
+package = false
diff --git a/examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml b/examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml
index 9740aa4c1..b513d0849 100644
--- a/examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml
+++ b/examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml
@@ -23,6 +23,13 @@ datasets:
num_trajectories_to_issue: 990 # Should be integer multiple of 990.
# Required benchmark default; set to true only for faster optimization/debug runs.
stop_issuing_on_first_user_complete: false
+ - name: swe_bench
+ type: "accuracy"
+ accuracy_config:
+ eval_method: "swe_bench_scorer"
+ num_repeats: 1
+ extras:
+ num_instances: 200
settings:
runtime:
diff --git a/examples/10_Agentic_Inference/qwen_agentic_benchmark.yaml b/examples/10_Agentic_Inference/qwen_agentic_benchmark.yaml
new file mode 100644
index 000000000..415b3e68c
--- /dev/null
+++ b/examples/10_Agentic_Inference/qwen_agentic_benchmark.yaml
@@ -0,0 +1,48 @@
+name: "qwen-agentic-benchmark"
+version: "1.0"
+type: "online"
+
+model_params:
+ name: "Qwen/Qwen3.6-35B-A3B"
+ temperature: 1.0
+ top_k: 20
+ top_p: 0.95
+ repetition_penalty: 1.0
+ presence_penalty: 1.5
+ max_new_tokens: 8192
+ chat_template_kwargs:
+ preserve_thinking: true
+
+datasets:
+ - name: agentic_coding
+ type: performance
+ path: /path/to/agentic_combined.jsonl
+ accuracy_config:
+ eval_method: agentic_inference_inline # required benchmark default.
+ agentic_inference:
+ turn_timeout_s: 14400.0
+ enable_salt: true # do not change.
+ inject_tool_delay: true # do not change.
+ - name: swe_bench
+ type: "accuracy"
+ accuracy_config:
+ eval_method: "swe_bench_scorer"
+ num_repeats: 1
+ extras:
+ num_instances: 200
+
+settings:
+ runtime:
+ min_duration_ms: 0
+ max_duration_ms: 36000000
+
+ load_pattern:
+ type: agentic_inference
+ target_concurrency: 8 # Submission-specific concurrency.
+
+endpoint_config:
+ endpoints:
+ - "http://localhost:30000"
+ api_type: openai
+
+report_dir: logs/qwen_agentic
diff --git a/examples/10_Agentic_Inference/swe_bench_accuracy.yaml b/examples/10_Agentic_Inference/swe_bench_accuracy.yaml
new file mode 100644
index 000000000..8508b0129
--- /dev/null
+++ b/examples/10_Agentic_Inference/swe_bench_accuracy.yaml
@@ -0,0 +1,42 @@
+type: "online"
+
+model_params:
+ name: "Qwen/Qwen3.6-35B-A3B"
+ temperature: 1.0
+ top_p: 0.95
+ top_k: 20
+ repetition_penalty: 1.0
+ presence_penalty: 1.5
+ max_new_tokens: 8192
+ chat_template_kwargs:
+ preserve_thinking: true
+
+datasets:
+ # Minimal performance dataset required by the framework.
+ - name: swe_bench_perf
+ type: "performance"
+ path: "tests/assets/datasets/dummy_1k.jsonl"
+ parser:
+ prompt: text_input
+
+ # Accuracy dataset — instance_id rows tell mini-swe-agent which instances to run.
+ # First run downloads ~10 MB from HuggingFace and caches to datasets_dir.
+ - name: swe_bench
+ type: "accuracy"
+ accuracy_config:
+ eval_method: "swe_bench_scorer"
+ num_repeats: 1
+ extras:
+ num_instances: 200
+
+settings:
+ load_pattern:
+ type: "concurrency"
+ target_concurrency: 10 # also forwarded to the swe_bench scorer as extras.workers (mini-extra parallelism) unless workers is set explicitly
+ runtime:
+ n_samples_to_issue: 10
+
+endpoint_config:
+ endpoints:
+ - "http://localhost:30000"
+ api_type: "openai"
diff --git a/examples/10_Agentic_Inference/swebench_template.yaml b/examples/10_Agentic_Inference/swebench_template.yaml
new file mode 100644
index 000000000..9b37ec5ba
--- /dev/null
+++ b/examples/10_Agentic_Inference/swebench_template.yaml
@@ -0,0 +1,186 @@
+agent:
+ system_template: |
+ You are a helpful assistant that can interact with a computer shell to solve programming tasks.
+ instance_template: |
+
+ Consider the following PR description:
+ {{task}}
+
+
+
+ # Task Instructions
+
+ ## Overview
+
+ You're a software engineer interacting continuously with a computer by submitting commands.
+ You'll be helping implement necessary changes to meet requirements in the PR description.
+ Your task is specifically to make changes to non-test files in the current directory in order to fix the issue described in the PR description in a way that is general and consistent with the codebase.
+ This is an interactive process where you will think and issue AT LEAST ONE command, see the result, then think and issue your next command(s).
+
+ For each response:
+
+ 1. Include a THOUGHT section explaining your reasoning and what you're trying to accomplish
+ 2. Provide one or more bash tool calls to execute
+
+ ## Important Boundaries
+
+ - MODIFY: Regular source code files in /testbed (this is the working directory for all your subsequent commands)
+ - DO NOT MODIFY: Tests, configuration files (pyproject.toml, setup.cfg, etc.)
+
+ ## Recommended Workflow
+
+ 1. Analyze the codebase by finding and reading relevant files
+ 2. Create a script to reproduce the issue
+ 3. Edit the source code to resolve the issue
+ 4. Verify your fix works by running your script again
+ 5. Test edge cases to ensure your fix is robust
+
+ ## Command Execution Rules
+
+ You are operating in an environment where
+
+ 1. You issue at least one command
+ 2. The system executes the command(s) in a subshell
+ 3. You see the result(s)
+ 4. You write your next command(s)
+
+ Each response should include:
+
+ 1. **Reasoning text** where you explain your analysis and plan
+ 2. At least one tool call with your command
+
+ **CRITICAL REQUIREMENTS:**
+
+ - Your response SHOULD include reasoning text explaining what you're doing
+ - Your response MUST include AT LEAST ONE bash tool call. You can make MULTIPLE tool calls in a single response when the commands are independent (e.g., searching multiple files, reading different parts of the codebase).
+ - Directory or environment variable changes are not persistent. Every action is executed in a new subshell.
+ - However, you can prefix any action with `MY_ENV_VAR=MY_VALUE cd /path/to/working/dir && ...` or write/load environment variables from files
+
+ Example of a CORRECT response:
+
+ I need to understand the Builder-related code. Let me find relevant files and check the project structure.
+
+ [Makes multiple bash tool calls: {"command": "ls -la"}, {"command": "find src -name '*.java' | grep -i builder"}, {"command": "cat README.md | head -50"}]
+
+
+ ## Environment Details
+
+ - You have a full Linux shell environment
+ - Always use non-interactive flags (-y, -f) for commands
+ - Avoid interactive tools like vi, nano, or any that require user input
+ - You can use bash commands or invoke any tool that is available in the environment
+ - You can also create new tools or scripts to help you with the task
+ - If a tool isn't available, you can also install it
+
+ ## Submission
+
+ When you've completed your work, you MUST submit your changes as a git patch.
+ Follow these steps IN ORDER, with SEPARATE commands:
+
+ Step 1: Create the patch file
+ Run `git diff -- path/to/file1 path/to/file2 > patch.txt` listing only the source files you modified.
+ Do NOT commit your changes.
+
+
+ The patch must only contain changes to the specific source files you modified to fix the issue.
+ Do not submit file creations or changes to any of the following files:
+
+ - test and reproduction files
+ - helper scripts, tests, or tools that you created
+ - installation, build, packaging, configuration, or setup scripts unless they are directly part of the issue you were fixing (you can assume that the environment is already set up for your client)
+ - binary or compiled files
+
+
+ Step 2: Verify your patch
+ Inspect patch.txt to confirm it only contains your intended changes and headers show `--- a/` and `+++ b/` paths.
+
+ Step 3: Submit (EXACT command required)
+ You MUST use this EXACT command to submit:
+
+ ```bash
+ echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt
+ ```
+
+ If the command fails (nonzero exit status), it will not submit.
+
+
+ - Creating/viewing the patch and submitting it MUST be separate commands (not combined with &&).
+ - If you modify patch.txt after verifying, you SHOULD verify again before submitting.
+ - You CANNOT continue working (reading, editing, testing) in any way on this task after submitting.
+
+
+ step_limit: 250
+ cost_limit: 3.
+
+environment:
+ cwd: "/testbed"
+ timeout: 3600
+ interpreter: ["bash", "-c"]
+ env:
+ PAGER: cat
+ MANPAGER: cat
+ LESS: -R
+ PIP_PROGRESS_BAR: "off"
+ TQDM_DISABLE: "1"
+ environment_class: docker
+ pull_timeout: 3600
+ container_timeout: 10h
+
+model:
+ cost_tracking: "ignore_errors"
+ observation_template: |
+ {% if output.exception_info -%}
+ {{output.exception_info}}
+ {% endif -%}
+ {{output.returncode}}
+ {% if output.output | length < 10000 -%}
+
+ {%- else -%}
+
+ The output of your last command was too long.
+ Please try a different command that produces less output.
+ If you're looking at a file you can try use head, tail or sed to view a smaller number of lines selectively.
+ If you're using grep or find and it produced too much output, you can use a more selective search pattern.
+ If you really need to see something from the full command's output, you can redirect output to a file and then search in that file.
+
+ {%- set elided_chars = output.output | length - 10000 -%}
+
+ {{ output.output[:5000] }}
+
+
+ {{ elided_chars }} characters elided
+
+
+ {{ output.output[-5000:] }}
+
+ {%- endif -%}
+ format_error_template: |
+ Tool call error:
+
+
+ {{error}}
+
+
+ Here is general guidance on how to submit correct toolcalls:
+
+ Every response needs to use the 'bash' tool at least once to execute commands.
+
+ Call the bash tool with your command as the argument:
+ - Tool: bash
+ - Arguments: {"command": "your_command_here"}
+
+ If you have completed your assignment, please consult the first message about how to
+ submit your solution (you will not be able to continue working on this task after that).
+ # Patched at runtime by SWEBenchScorer from model_params and endpoint_config
+ model_name: ""
+ model_kwargs:
+ custom_llm_provider: "openai"
+ api_key: "test"
+ drop_params: true
+ parallel_tool_calls: true
+ api_base: ""
+ # Sampling parameters (temperature, top_p, top_k, etc.) are injected at
+ # runtime from the benchmark config's model_params block — absent here so
+ # the model's own defaults apply when not specified in model_params.
diff --git a/pyproject.toml b/pyproject.toml
index 4a7655021..429cdcf4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -73,6 +73,7 @@ dependencies = [
# Fix pytz-2024 import warning
"pytz==2026.1.post1",
"urllib3==2.7.0",
+ "pyyaml==6.0.3",
]
[project.optional-dependencies]
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index a2050bbe3..13ee87632 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -285,6 +285,10 @@ def _load_datasets(
)
assert acc_cfg.accuracy_config is not None
+ extras = acc_cfg.accuracy_config.extras or {}
+
+ scorer_cls.preflight(extras)
+
ds = DataLoaderFactory.create_loader(
acc_cfg, num_repeats=acc_cfg.accuracy_config.num_repeats
)
@@ -299,7 +303,7 @@ def _load_datasets(
report_dir,
acc_cfg.accuracy_config.ground_truth,
acc_cfg.accuracy_config.num_repeats,
- acc_cfg.accuracy_config.extras or {},
+ extras,
)
)
ds.load(
@@ -313,6 +317,14 @@ def _load_datasets(
raise InputValidationError("Multiple performance datasets not supported")
perf_cfg = performance_cfgs[0]
+ perf_cls = Dataset.PREDEFINED.get(perf_cfg.name)
+ if perf_cls is not None and perf_cls.ACCURACY_ONLY:
+ raise InputValidationError(
+ f"Dataset '{perf_cfg.name}' is accuracy-only and cannot be used "
+ "as a performance dataset. Use a different dataset (e.g. 'random') for the "
+ "performance phase."
+ )
+
try:
dataloader = DataLoaderFactory.create_loader(perf_cfg)
dataloader.load(
@@ -320,9 +332,7 @@ def _load_datasets(
)
logger.info(f"Loaded {dataloader.num_samples()} samples")
except FileNotFoundError as e:
- raise InputValidationError(
- f"Dataset file not found: {performance_cfgs[0].path}"
- ) from e
+ raise InputValidationError(f"Dataset file not found: {perf_cfg.path}") from e
except Exception as e:
raise SetupError(f"Failed to load dataset: {e}") from e
@@ -337,6 +347,7 @@ def _load_datasets(
scorer_cls, extractor_cls = _resolve_accuracy_components(
perf_cfg.name, accuracy_config
)
+ scorer_cls.preflight(accuracy_config.extras or {})
eval_configs.append(
AccuracyConfiguration(
@@ -399,8 +410,11 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
# Calculate and display expected sample count
total_samples = rt_settings.total_samples_to_issue()
- if accuracy_datasets:
- total_samples += sum(ds.num_samples() * ds.repeats for ds in accuracy_datasets)
+ total_samples += sum(
+ ec.dataset.num_samples() * ec.dataset.repeats
+ for ec in eval_configs
+ if not ec.scorer.SKIP_ENDPOINT_PHASE and ec.dataset_name != "performance"
+ )
collect_responses = test_mode in (TestMode.ACC, TestMode.BOTH)
logger.info(
@@ -409,6 +423,16 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
logger.info(
f"Min Duration: {rt_settings.min_duration_ms / 1000:.1f}s, Expected samples: {total_samples}"
)
+ for ec in eval_configs:
+ if ec.scorer.SKIP_ENDPOINT_PHASE:
+ n = ec.scorer.external_sample_count(ec.extras)
+ if n is not None:
+ logger.info(
+ "Accuracy dataset '%s' (%s): %d instances evaluated externally",
+ ec.dataset_name,
+ ec.scorer.SCORER_ID,
+ n,
+ )
return BenchmarkContext(
config=config,
@@ -477,6 +501,8 @@ def _build_phases(
# Accuracy phases — use eval_cfg.dataset_name as phase name so it matches
# what Scorer._load_sample_index_map() looks up in sample_idx_map.json
for eval_cfg in ctx.eval_configs:
+ if eval_cfg.scorer.SKIP_ENDPOINT_PHASE:
+ continue
if eval_cfg.dataset_name == "performance":
continue
acc_ds = eval_cfg.dataset
@@ -905,8 +931,12 @@ def finalize_benchmark(ctx: BenchmarkContext, bench: BenchmarkResult) -> None:
**eval_cfg.extras,
)
score, n_repeats = scorer_instance.score()
- assert eval_cfg.dataset.data is not None
- num_samples = len(eval_cfg.dataset.data)
+ if eval_cfg.dataset.data is not None:
+ num_samples = len(eval_cfg.dataset.data)
+ elif eval_cfg.dataset.dataframe is not None:
+ num_samples = len(eval_cfg.dataset.dataframe)
+ else:
+ num_samples = 0
if eval_cfg.dataset_name == "performance":
num_samples = sum(phase.issued_count for phase in result.perf_results)
accuracy_scores[eval_cfg.dataset_name] = {
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 9226d7f85..0b84b2da2 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -101,6 +101,7 @@ class ScorerMethod(str, Enum):
SHOPIFY_CATEGORY_F1 = "shopify_category_f1"
AGENTIC_INFERENCE_INLINE = "agentic_inference_inline"
VBENCH = "vbench"
+ SWE_BENCH = "swe_bench_scorer"
class TestMode(str, Enum):
@@ -860,6 +861,34 @@ def _resolve_and_validate(self) -> Self:
f"got '{lp.type}'"
)
+ # For swe_bench_scorer, forward target_concurrency as workers when the
+ # user has not set it explicitly. mini-swe-agent's parallelism should
+ # match the endpoint's concurrency budget.
+ concurrency = (
+ lp.target_concurrency
+ if lp.type
+ in (LoadPatternType.CONCURRENCY, LoadPatternType.AGENTIC_INFERENCE)
+ and lp.target_concurrency
+ else None
+ )
+ if concurrency is not None and self.datasets:
+ updated_datasets = []
+ changed = False
+ for ds in self.datasets:
+ acc = ds.accuracy_config
+ if (
+ acc is not None
+ and acc.eval_method == ScorerMethod.SWE_BENCH
+ and (acc.extras is None or "workers" not in acc.extras)
+ ):
+ new_extras = {**(acc.extras or {}), "workers": concurrency}
+ new_acc = acc.model_copy(update={"extras": new_extras})
+ ds = ds.model_copy(update={"accuracy_config": new_acc})
+ changed = True
+ updated_datasets.append(ds)
+ if changed:
+ object.__setattr__(self, "datasets", updated_datasets)
+
return self
@model_validator(mode="after")
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 38829f0f5..3239aa08d 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -37,7 +37,7 @@ datasets: # Dataset configs
prompt: question
system: system_prompt
accuracy_config: # Accuracy evaluation settings
- eval_method: pass_at_1 # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench
+ eval_method: pass_at_1 # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench, swe_bench_scorer
ground_truth: ground_truth # Ground truth column name
extractor: boxed_math_extractor # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor)
num_repeats: 1 # Repeat dataset N times for evaluation
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index c3454d5da..476a27ef4 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -37,7 +37,7 @@ datasets: # Dataset configs
prompt: question
system: system_prompt
accuracy_config: # Accuracy evaluation settings
- eval_method: pass_at_1 # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench
+ eval_method: pass_at_1 # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench, swe_bench_scorer
ground_truth: ground_truth # Ground truth column name
extractor: boxed_math_extractor # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor)
num_repeats: 1 # Repeat dataset N times for evaluation
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index 5bea95329..266426f4d 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -37,7 +37,7 @@ datasets: # Dataset configs
prompt: question
system: system_prompt
accuracy_config: # Accuracy evaluation settings
- eval_method: pass_at_1 # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench
+ eval_method: pass_at_1 # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench, swe_bench_scorer
ground_truth: ground_truth # Ground truth column name
extractor: boxed_math_extractor # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor)
num_repeats: 1 # Repeat dataset N times for evaluation
diff --git a/src/inference_endpoint/dataset_manager/__init__.py b/src/inference_endpoint/dataset_manager/__init__.py
index 15525fb50..ac314d3f9 100644
--- a/src/inference_endpoint/dataset_manager/__init__.py
+++ b/src/inference_endpoint/dataset_manager/__init__.py
@@ -32,6 +32,7 @@
ShopifyProductCatalogue,
ShopifyProductCatalogue8k,
)
+from .predefined.swe_bench import SWEBench
from .transforms import (
AddStaticColumns,
ColumnFilter,
@@ -63,5 +64,6 @@
"RandomDataset",
"ShopifyProductCatalogue",
"ShopifyProductCatalogue8k",
+ "SWEBench",
"AgenticInferenceDataset",
]
diff --git a/src/inference_endpoint/dataset_manager/dataset.py b/src/inference_endpoint/dataset_manager/dataset.py
index 963ded391..2281f5184 100644
--- a/src/inference_endpoint/dataset_manager/dataset.py
+++ b/src/inference_endpoint/dataset_manager/dataset.py
@@ -276,6 +276,10 @@ class Dataset:
DATASET_ID: ClassVar[str]
"""The unique identifier for the dataset. Automatically set by __init_subclass__."""
+ ACCURACY_ONLY: ClassVar[bool] = False
+ """If True, this dataset may only be used as an accuracy dataset (type: accuracy).
+ Using it as a performance dataset raises InputValidationError at load time."""
+
def __init_subclass__(
cls,
dataset_id: str | None = None,
diff --git a/src/inference_endpoint/dataset_manager/predefined/swe_bench/__init__.py b/src/inference_endpoint/dataset_manager/predefined/swe_bench/__init__.py
new file mode 100644
index 000000000..72b54383c
--- /dev/null
+++ b/src/inference_endpoint/dataset_manager/predefined/swe_bench/__init__.py
@@ -0,0 +1,116 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from logging import getLogger
+from pathlib import Path
+
+import pandas as pd
+
+from ...dataset import Dataset, load_from_huggingface
+
+logger = getLogger(__name__)
+
+_REPO_MAP = {
+ "verified": "princeton-nlp/SWE-bench_Verified",
+ "lite": "princeton-nlp/SWE-bench_Lite",
+}
+
+
+class SWEBench(
+ Dataset,
+ dataset_id="swe_bench",
+):
+ """SWE-bench: Software Engineering Benchmark for LLM agents.
+
+ Loads instance IDs and problem statements from the SWE-bench Verified or
+ Lite subset. Used as the accuracy dataset for the swe_bench_scorer, which
+ runs mini-swe-agent against a live endpoint and grades patches with the
+ SWE-bench evaluation harness.
+
+ The ``instance_id`` column identifies which instances mini-swe-agent will
+ evaluate. The endpoint phase is skipped entirely for this scorer
+ (``SKIP_ENDPOINT_PHASE=True``); ``SWEBenchScorer`` drives the agent
+ subprocess directly against the configured endpoint.
+
+ Using this dataset as a performance dataset (type: performance) is not
+ meaningful — problem statements sent directly to the model without an
+ agent framework don't reflect real SWE-bench usage. Use a different
+ dataset (e.g. ``random``) for the performance phase.
+ """
+
+ ACCURACY_ONLY = True
+ COLUMN_NAMES = ["instance_id", "prompt"]
+
+ @classmethod
+ def hf_dataset_name(cls, subset: str) -> str:
+ hf_path = _REPO_MAP.get(subset)
+ if hf_path is None:
+ raise ValueError(
+ f"Unknown SWE-bench subset {subset!r}; choose from: {list(_REPO_MAP)}"
+ )
+ return hf_path
+
+ @classmethod
+ def generate(
+ cls,
+ datasets_dir: Path,
+ subset: str = "verified",
+ force: bool = False,
+ ) -> pd.DataFrame:
+ """Download and cache the SWE-bench dataset from HuggingFace.
+
+ Args:
+ datasets_dir: Root cache directory. Parquet is written under
+ ``datasets_dir/swe_bench/{subset}/``.
+ subset: ``"verified"`` (500 instances) or ``"lite"`` (300 instances).
+ force: Re-download even if the local parquet cache exists.
+
+ Returns:
+ DataFrame with columns ``instance_id`` and ``prompt``.
+ """
+ hf_path = cls.hf_dataset_name(subset)
+
+ dst_path = datasets_dir / "swe_bench" / subset / f"swe_bench_{subset}.parquet"
+ if dst_path.exists() and not force:
+ logger.info("Loading SWE-bench %s from cache: %s", subset, dst_path)
+ try:
+ return pd.read_parquet(dst_path)
+ except Exception as e:
+ raise RuntimeError(
+ f"Cached SWE-bench parquet at {dst_path} appears corrupt ({e}). "
+ "Delete it or pass force=True to re-download."
+ ) from e
+
+ try:
+ df = load_from_huggingface(
+ hf_path,
+ split="test",
+ cache_dir=datasets_dir / "hf_cache" / f"swe_bench_{subset}",
+ )
+ except Exception as e:
+ logger.error("Error loading SWE-bench %s from HuggingFace: %s", subset, e)
+ raise
+
+ result = (
+ df[["instance_id", "problem_statement"]]
+ .rename(columns={"problem_statement": "prompt"})
+ .reset_index(drop=True)
+ )
+ dst_path.parent.mkdir(parents=True, exist_ok=True)
+ result.to_parquet(dst_path)
+ logger.info(
+ "Saved %d SWE-bench %s instances to %s", len(result), subset, dst_path
+ )
+ return result
diff --git a/src/inference_endpoint/evaluation/scoring.py b/src/inference_endpoint/evaluation/scoring.py
index f9419703a..54fec7619 100644
--- a/src/inference_endpoint/evaluation/scoring.py
+++ b/src/inference_endpoint/evaluation/scoring.py
@@ -10,7 +10,7 @@
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific permissions and
+# See the License for the specific language governing permissions and
# limitations under the License.
@@ -23,6 +23,7 @@
import subprocess
import sys
import tempfile
+import threading
import uuid
from abc import ABC, abstractmethod
from collections import Counter, defaultdict
@@ -33,6 +34,7 @@
import msgspec.json
import numpy as np
import pandas as pd
+import yaml
from pydantic import ValidationError
from tqdm import tqdm
@@ -53,6 +55,8 @@
from ..dataset_manager.agentic_inference_dataset import AgenticInferenceDataset
from ..dataset_manager.dataset import Dataset
from ..dataset_manager.predefined.shopify_product_catalogue import ProductMetadata
+from ..dataset_manager.predefined.swe_bench import SWEBench
+from ..exceptions import SetupError
from .extractor import Extractor, PythonCodeExtractor
logger = logging.getLogger(__name__)
@@ -67,6 +71,7 @@ class Scorer(ABC):
PREDEFINED: ClassVar[dict[str, type["Scorer"]]] = {}
SCORER_ID: ClassVar[str]
REQUIRES_EXTRACTOR: ClassVar[bool] = True
+ SKIP_ENDPOINT_PHASE: ClassVar[bool] = False
def __init_subclass__(
cls,
@@ -106,6 +111,21 @@ def available_scorers(cls) -> list[str]:
"""Return the list of registered scorer names."""
return list(Scorer.PREDEFINED.keys())
+ @classmethod
+ def external_sample_count(cls, extras: dict[str, Any]) -> int | None:
+ """Return the number of samples the scorer will evaluate externally, or None.
+
+ Used to surface sample counts for scorers that skip the endpoint phase and
+ manage their own evaluation (e.g. `SWEBenchScorer`).
+ The default returns None (scorer uses the endpoint accuracy phase normally).
+ """
+ return None
+
+ @classmethod # noqa: B027 — intentional no-op default; subclasses override when needed
+ def preflight(cls, extras: dict[str, Any]) -> None:
+ """Verify external dependencies before the benchmark starts. No-op by default."""
+ pass
+
def __init__(
self,
dataset_name: str,
@@ -122,7 +142,9 @@ def __init__(
self.ground_truth_column = (
ground_truth_column if ground_truth_column is not None else "ground_truth"
)
- self.sample_index_map = self._load_sample_index_map()
+ self.sample_index_map: dict | None = (
+ None if self.SKIP_ENDPOINT_PHASE else self._load_sample_index_map()
+ )
def _load_sample_index_map(self):
sample_index_map_path = self.report_dir / "sample_idx_map.json"
@@ -163,6 +185,7 @@ def get_outputs(self):
def match_sample_index(self, row: pd.Series) -> pd.Series:
# Pandas Apply function to create a new 'sample_index' column
+ assert self.sample_index_map is not None
row["sample_index"] = self.sample_index_map[row["sample_uuid"]]
return row
@@ -177,6 +200,10 @@ def score(self) -> tuple[float | None, int]:
tuple[float | None, int]: The mean score and the number of repeats.
Returns None as the score if evaluation fails.
"""
+ assert self.sample_index_map is not None, (
+ f"{self.__class__.__name__}.SKIP_ENDPOINT_PHASE is True but score() was not "
+ "overridden; override score() to implement external evaluation."
+ )
df = self.get_outputs()
# Outputs are for all samples, not just the target dataset
@@ -273,6 +300,7 @@ def score(self) -> tuple[float, int]:
df = self.get_outputs()
# Outputs are for all samples, not just the target dataset
+ assert self.sample_index_map is not None
valid_uuids = self.sample_index_map.keys()
df = df[df["sample_uuid"].isin(valid_uuids)]
@@ -1099,6 +1127,7 @@ def score(self) -> tuple[float | None, int]:
df = self.get_outputs()
# Outputs are for all samples, not just the target dataset
+ assert self.sample_index_map is not None
valid_uuids = self.sample_index_map.keys()
df = df[df["sample_uuid"].isin(valid_uuids)]
@@ -1319,6 +1348,7 @@ def score_single_sample(self, value: str, ground_truth: str) -> float:
def score(self) -> tuple[float, int]:
df = self.get_outputs()
+ assert self.sample_index_map is not None
valid_uuids = self.sample_index_map.keys()
df = df[df["sample_uuid"].isin(valid_uuids)]
df = df.apply(self.match_sample_index, axis=1)
@@ -1369,6 +1399,20 @@ def score(self) -> tuple[float, int]:
_VBENCH_PROJECT_PATH_ENV = "VBENCH_PROJECT_PATH"
+
+def _resolve_subproject_path(
+    explicit: str | os.PathLike | None,
+    env_var: str,
+    default: Path,
+) -> Path:
+    """Pick a subproject directory: *explicit* if given, else ``$env_var`` if non-empty, else *default*."""
+    if explicit is not None:
+        return Path(explicit)
+    env_value = os.environ.get(env_var)
+    # An unset or empty environment variable falls through to the default.
+    return Path(env_value) if env_value else default
+
+
# Filenames in `vbench_standard` mode key on the prompt verbatim — VBench looks
# the filename's prompt-prefix up in vbench_full_info.json. We can therefore
# only reshape unsafe characters, not replace the prompt with a UUID. Slashes
@@ -1470,18 +1514,10 @@ def __init__(
def _resolve_project_path(
explicit: os.PathLike | None,
) -> Path:
- """Resolve the VBench subproject path.
-
- Lookup order: explicit ctor arg → ``$VBENCH_PROJECT_PATH`` env var →
- editable-checkout fallback. The env var lets wheel-installed users
- point at a synced subproject without patching source.
- """
- if explicit is not None:
- return Path(explicit)
- from_env = os.environ.get(_VBENCH_PROJECT_PATH_ENV)
- if from_env:
- return Path(from_env)
- return Path(_DEFAULT_VBENCH_PROJECT_PATH)
+ """Lookup order: explicit ctor arg → ``$VBENCH_PROJECT_PATH`` env var → editable-checkout fallback."""
+ return _resolve_subproject_path(
+ explicit, _VBENCH_PROJECT_PATH_ENV, Path(_DEFAULT_VBENCH_PROJECT_PATH)
+ )
def score_single_sample(self, value: str, ground_truth: str) -> float:
raise RuntimeError(
@@ -1542,35 +1578,12 @@ def _run_vbench_subprocess(
cmd += ["--full-info-json", self.full_info_json_path]
log_path = self.report_dir / "vbench_subprocess.log"
- try:
- completed = subprocess.run(
- cmd,
- check=False,
- stdin=subprocess.DEVNULL,
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- text=True,
- timeout=self.subprocess_timeout_s,
- )
- except subprocess.TimeoutExpired as e:
- partial = (
- e.stdout
- if isinstance(e.stdout, str)
- else (e.stdout or b"").decode("utf-8", errors="replace")
- )
- log_path.write_text(partial)
- raise RuntimeError(
- f"VBench subprocess timed out after {self.subprocess_timeout_s}s; "
- f"see {log_path} for partial output."
- ) from e
-
- log_path.write_text(completed.stdout or "")
- if completed.returncode != 0:
- tail = "\n".join((completed.stdout or "").splitlines()[-50:])
- raise RuntimeError(
- f"VBench subprocess exited with code {completed.returncode}; "
- f"full log at {log_path}. Last 50 lines:\n{tail}"
- )
+ _run_subprocess_with_log(
+ cmd,
+ log_path,
+ timeout_s=self.subprocess_timeout_s,
+ label="VBench",
+ )
def _extract_per_dim_scores(self, results: dict[str, Any]) -> list[float]:
"""Pull each requested dim's aggregate score, with clear errors.
@@ -1600,6 +1613,7 @@ def _extract_per_dim_scores(self, results: dict[str, Any]) -> list[float]:
def score(self) -> tuple[float | None, int]:
df = self.get_outputs()
+ assert self.sample_index_map is not None
valid_uuids = self.sample_index_map.keys()
df = df[df["sample_uuid"].isin(valid_uuids)]
# Drop failed queries: Scorer.get_outputs() emits "" when record.data
@@ -1658,3 +1672,623 @@ def score(self) -> tuple[float | None, int]:
per_dim_scores = self._extract_per_dim_scores(results)
mean_score = float(np.mean(per_dim_scores))
return mean_score, n_repeats
+
+
+def _run_subprocess_with_log(
+    cmd: list[str],
+    log_path: Path,
+    *,
+    timeout_s: int | None,
+    label: str,
+    cwd: Path | None = None,
+) -> None:
+    """Execute *cmd*, save merged stdout+stderr to *log_path*, raise RuntimeError on timeout or non-zero exit."""
+    try:
+        completed = subprocess.run(
+            cmd,
+            check=False,
+            stdin=subprocess.DEVNULL,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            timeout=timeout_s,
+            cwd=None if cwd is None else str(cwd),
+        )
+    except subprocess.TimeoutExpired as e:
+        if isinstance(e.stdout, str):
+            partial = e.stdout
+        else:  # bytes or None: decode leniently so the partial log is still written
+            partial = (e.stdout or b"").decode("utf-8", errors="replace")
+        log_path.write_text(partial)
+        raise RuntimeError(
+            f"{label} subprocess timed out after {timeout_s}s; "
+            f"see {log_path} for partial output."
+        ) from e
+    output = completed.stdout or ""
+    log_path.write_text(output)
+    if completed.returncode != 0:
+        tail = "\n".join(output.splitlines()[-50:])
+        raise RuntimeError(
+            f"{label} subprocess exited with code {completed.returncode}; "
+            f"full log at {log_path}. Last 50 lines:\n{tail}"
+        )
+
+
+_DEFAULT_SWE_BENCH_PROJECT_PATH = (
+ Path(__file__).resolve().parents[3]
+ / "examples"
+ / "10_Agentic_Inference"
+ / "accuracy"
+)
+_SWE_BENCH_PROJECT_PATH_ENV = "SWE_BENCH_PROJECT_PATH"
+_DEFAULT_SWE_BENCH_TEMPLATE = (
+ Path(__file__).resolve().parents[3]
+ / "examples"
+ / "10_Agentic_Inference"
+ / "swebench_template.yaml"
+)
+
+
+def _read_swebench_exit_statuses(
+    output_dir: Path, ignore: frozenset[Path]
+) -> dict[str, list[str]]:
+    """Read the last (by sorted path) exit_statuses_*.yaml not in *ignore*; return {} if none is usable."""
+    candidates = sorted(set(output_dir.glob("exit_statuses_*.yaml")) - ignore)
+    if not candidates:
+        return {}
+    newest = candidates[-1]
+    try:
+        data = yaml.safe_load(newest.read_text()) or {}
+        statuses = data.get("instances_by_exit_status") or {}
+        # The file may be read while it is still being written; a null or non-mapping
+        # payload is reported as "no progress" so the caller can always use .values().
+        return statuses if isinstance(statuses, dict) else {}
+    except Exception:
+        logger.debug("Could not read %s for progress reporting", newest, exc_info=True)
+        return {}
+
+
+def _poll_swebench_progress(
+    output_dir: Path, total: int, stop: threading.Event
+) -> None:
+    """Mirror exit_statuses_*.yaml progress onto a tqdm bar until *stop* is set or *total* is reached."""
+    # Status files that already exist come from earlier runs and are never read.
+    stale = frozenset(output_dir.glob("exit_statuses_*.yaml"))
+    with tqdm(total=total, desc="SWE-bench instances", unit="instance") as bar:
+        shown = 0
+
+        def refresh() -> None:
+            nonlocal shown
+            by_status = _read_swebench_exit_statuses(output_dir, stale)
+            finished = sum(len(ids) for ids in by_status.values())
+            if finished > shown:
+                bar.update(finished - shown)
+                shown = finished
+            if by_status:
+                bar.set_postfix({k: len(v) for k, v in sorted(by_status.items())})
+
+        while not stop.is_set():
+            refresh()
+            if shown >= total:
+                break
+            stop.wait(timeout=5.0)
+        refresh()  # one last read after the loop ends
+
+
+def _decode_subprocess_stderr(stderr: bytes | str | None) -> str:
+    """Return *stderr* as stripped text; ``None`` becomes ``""`` and bytes are decoded with replacement."""
+    if stderr is None:
+        return ""
+    text = stderr.decode(errors="replace") if isinstance(stderr, bytes) else str(stderr)
+    return text.strip()
+
+
+class SWEBenchScorer(Scorer, scorer_id="swe_bench_scorer"):
+ """SWE-bench accuracy scorer using the mini-extra CLI (mini-swe-agent package).
+
+ Invokes ``mini-extra swebench`` and ``swebench.harness.run_evaluation`` via
+ ``uv run --project <subproject>`` so the parent process never imports
+ them directly. Run ``uv sync`` in the subproject directory once before use.
+ """
+
+ REQUIRES_EXTRACTOR: ClassVar[bool] = False
+ SKIP_ENDPOINT_PHASE: ClassVar[bool] = True
+ DEFAULT_SUBPROCESS_TIMEOUT_S: ClassVar[int] = 24 * 60 * 60
+ DEFAULT_SUBSET: ClassVar[str] = "verified"
+ DEFAULT_SPLIT: ClassVar[str] = "test"
+ DEFAULT_NUM_INSTANCES: ClassVar[int] = 100
+ PREPULL_TIMEOUT_S: ClassVar[int] = 10 * 60
+
+ def __init__(
+ self,
+ dataset_name: str,
+ dataset: Dataset,
+ report_dir: os.PathLike,
+ extractor: type[Extractor] | None = None,
+ ground_truth_column: str | None = "instance_id",
+ swe_bench_project_path: str | os.PathLike | None = None,
+ swebench_config_template: str | os.PathLike | None = None,
+ subset: str = DEFAULT_SUBSET,
+ split: str = DEFAULT_SPLIT,
+ num_instances: int = DEFAULT_NUM_INSTANCES,
+ workers: int = 10,
+ max_eval_workers: int = 10,
+ subprocess_timeout_s: int | None = None,
+ ):
+ super().__init__(
+ dataset_name=dataset_name,
+ dataset=dataset,
+ report_dir=report_dir,
+ extractor=extractor,
+ ground_truth_column=ground_truth_column,
+ )
+ self.report_dir = self.report_dir.resolve()
+ self.swe_bench_project_path = self._resolve_project_path(swe_bench_project_path)
+ self.swebench_config_template = (
+ Path(swebench_config_template)
+ if swebench_config_template is not None
+ else _DEFAULT_SWE_BENCH_TEMPLATE
+ )
+ SWEBench.hf_dataset_name(subset)
+ self.subset = subset
+ self.split = split
+ self.num_instances = num_instances
+ self.workers = workers
+ self.max_eval_workers = max_eval_workers
+ self.subprocess_timeout_s = (
+ subprocess_timeout_s
+ if subprocess_timeout_s is not None
+ else self.DEFAULT_SUBPROCESS_TIMEOUT_S
+ )
+
+ if not self.swebench_config_template.exists():
+ raise FileNotFoundError(
+ f"swebench template not found: {self.swebench_config_template}. "
+ f"Pass swebench_config_template= in accuracy_config.extras."
+ )
+ with self.swebench_config_template.open() as _f:
+ _tmpl = yaml.safe_load(_f) or {}
+ model_cfg = _tmpl.get("model")
+ if not isinstance(model_cfg, dict) or not isinstance(
+ model_cfg.get("model_kwargs"), dict
+ ):
+ raise ValueError(
+ f"swebench template {self.swebench_config_template} must have a "
+ "'model.model_kwargs' dict; check the template structure."
+ )
+ pyproject = self.swe_bench_project_path / "pyproject.toml"
+ if not pyproject.exists():
+ raise FileNotFoundError(
+ f"SWE-bench subproject not found at {self.swe_bench_project_path}. "
+ f"Set ${_SWE_BENCH_PROJECT_PATH_ENV} to the subproject path, "
+ f"then run: cd {self.swe_bench_project_path} && uv sync"
+ )
+
+ @staticmethod
+ def _resolve_project_path(
+ explicit: str | os.PathLike | None,
+ ) -> Path:
+ """Lookup order: explicit ctor arg → ``$SWE_BENCH_PROJECT_PATH`` env var → in-repo default."""
+ return _resolve_subproject_path(
+ explicit, _SWE_BENCH_PROJECT_PATH_ENV, Path(_DEFAULT_SWE_BENCH_PROJECT_PATH)
+ )
+
+ @classmethod
+ def _get_extra_int(
+ cls, extras: dict[str, Any], key: str, *, default: int, min_value: int = 0
+ ) -> int:
+ value = extras.get(key, default)
+ try:
+ parsed = int(value)
+ except (TypeError, ValueError) as exc:
+ raise SetupError(
+ f"accuracy_config.extras.{key} must be an integer; got {value!r}"
+ ) from exc
+ if parsed < min_value:
+ raise SetupError(
+ f"accuracy_config.extras.{key} must be >= {min_value}; got {parsed}"
+ )
+ return parsed
+
+    @classmethod
+    def _derive_required_images(
+        cls,
+        *,
+        swe_bench_project_path: Path,
+        subset: str,
+        split: str,
+        num_instances: int,
+    ) -> list[str]:
+        """Ask the accuracy subproject which Docker images the selected instances need (deduplicated, in order)."""
+        derive_cmd = [
+            "uv",
+            "run",
+            "--project",
+            str(swe_bench_project_path),
+            "python",
+            "-c",
+            # Joined into ONE line, so only `;`-separated simple statements are legal here.
+            (
+                "import json, sys; from datasets import load_dataset; "
+                "from minisweagent.run.benchmarks.swebench import "
+                "DATASET_MAPPING, filter_instances, get_swebench_docker_image_name; "
+                "subset, split, num_instances = sys.argv[1], sys.argv[2], int(sys.argv[3]); "
+                "dataset_path = DATASET_MAPPING.get(subset, subset); "
+                "instances = list(load_dataset(dataset_path, split=split)); "
+                "slice_spec = f'0:{min(num_instances, len(instances))}'; "
+                "instances = filter_instances("
+                "instances, filter_spec='', slice_spec=slice_spec, shuffle=False"
+                "); "
+                "images = list(dict.fromkeys("
+                "get_swebench_docker_image_name(instance) for instance in instances"
+                ")); "
+                "print(json.dumps(images))"
+            ),
+            subset,
+            split,
+            str(num_instances),
+        ]
+        result = subprocess.run(
+            derive_cmd,
+            check=False,
+            capture_output=True,
+            text=True,
+            timeout=cls.PREPULL_TIMEOUT_S,
+        )
+        if result.returncode != 0:
+            stderr_text = _decode_subprocess_stderr(result.stderr)
+            raise SetupError(
+                "Failed to derive required SWE-bench Docker images from the accuracy "
+                f"subproject at {swe_bench_project_path}"
+                + (f". stderr: {stderr_text}" if stderr_text else "")
+            )
+        try:
+            images = json.loads(result.stdout or "[]")
+        except json.JSONDecodeError as exc:
+            stdout_text = (result.stdout or "").strip()
+            raise SetupError(
+                "Failed to parse the required SWE-bench Docker image list from the "
+                f"accuracy subproject output: {stdout_text!r}"
+            ) from exc
+        if not isinstance(images, list) or not all(
+            isinstance(image, str) for image in images
+        ):
+            raise SetupError(
+                "Accuracy subproject returned an invalid SWE-bench Docker image list."
+            )
+        return images
+
+ @classmethod
+ def _prepull_images(cls, images: list[str]) -> None:
+ for image in images:
+ inspect_result = subprocess.run(
+ ["docker", "image", "inspect", image],
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.PIPE,
+ timeout=30,
+ )
+ if inspect_result.returncode == 0:
+ logger.info("SWE-bench Docker image already cached: %s", image)
+ continue
+
+ logger.info("Pulling SWE-bench Docker image: %s", image)
+ pull_result = subprocess.run(
+ ["docker", "pull", image],
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.PIPE,
+ timeout=cls.PREPULL_TIMEOUT_S,
+ )
+ if pull_result.returncode != 0:
+ stderr_text = _decode_subprocess_stderr(pull_result.stderr)
+ raise SetupError(
+ "Failed to pre-pull required SWE-bench Docker image "
+ f"{image}. Authenticate to Docker Hub with `docker login` "
+ "or use a pre-seeded image cache/mirror before retrying."
+ + (f" stderr: {stderr_text}" if stderr_text else "")
+ )
+
+ @classmethod
+ def external_sample_count(cls, extras: dict[str, Any]) -> int | None:
+ try:
+ return int(extras["num_instances"])
+ except (KeyError, TypeError, ValueError):
+ return None
+
+    @classmethod
+    def preflight(cls, extras: dict[str, Any]) -> None:
+        """Check uv, mini-extra, swebench, and Docker before the benchmark starts.
+
+        The checks run in order and stop at the first failure.
+
+        Args:
+            extras: ``accuracy_config.extras``; reads ``swe_bench_project_path``,
+                ``subset``, ``split`` and ``num_instances`` (each optional).
+
+        Raises:
+            SetupError: a required tool is missing or unusable, a probe command
+                times out, or the required Docker images cannot be derived or pulled.
+        """
+        swe_bench_project_path = cls._resolve_project_path(
+            extras.get("swe_bench_project_path")
+        )
+        subset = str(extras.get("subset", cls.DEFAULT_SUBSET))
+        split = str(extras.get("split", cls.DEFAULT_SPLIT))
+        num_instances = cls._get_extra_int(
+            extras,
+            "num_instances",
+            default=cls.DEFAULT_NUM_INSTANCES,
+        )
+
+        if shutil.which("uv") is None:
+            raise SetupError(
+                "uv is not on PATH; install it with: "
+                "curl -LsSf https://astral.sh/uv/install.sh | sh"
+            )
+
+        # Probe both tools inside the subproject environment. The argv lists are the
+        # same as before; only the error handling is shared between the two probes.
+        uv_run = ["uv", "run", "--project", str(swe_bench_project_path)]
+        probes = (
+            ("mini-extra", ["mini-extra", "--help"]),
+            ("swebench", ["python", "-c", "import swebench"]),
+        )
+        for tool, probe_args in probes:
+            try:
+                probe = subprocess.run(
+                    uv_run + probe_args,
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.PIPE,
+                    timeout=30,
+                )
+            except (OSError, subprocess.TimeoutExpired) as e:
+                # Match the docker probe below: launch failures and timeouts are
+                # reported as SetupError rather than escaping as a raw traceback.
+                raise SetupError(
+                    f"Failed to check {tool} in the SWE-bench subproject at "
+                    f"{swe_bench_project_path}: {e}"
+                ) from e
+            if probe.returncode != 0:
+                stderr_text = _decode_subprocess_stderr(probe.stderr)
+                raise SetupError(
+                    f"{tool} is not available in the SWE-bench subproject at "
+                    f"{swe_bench_project_path}. Run: cd {swe_bench_project_path} && uv sync"
+                    + (f". stderr: {stderr_text}" if stderr_text else "")
+                )
+
+        # The docker CLI must be installed and `docker version` must succeed.
+        if shutil.which("docker") is None:
+            raise SetupError("docker is not on PATH. Install Docker and retry.")
+
+        try:
+            docker_result = subprocess.run(
+                ["docker", "version"],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.PIPE,
+                timeout=10,
+            )
+        except Exception as e:
+            raise SetupError(f"Failed to execute docker command: {e}") from e
+
+        if docker_result.returncode != 0:
+            raise SetupError("Docker daemon is not running. Start Docker and retry.")
+
+        # Resolve the image list and pull anything not cached before the run starts.
+        images = cls._derive_required_images(
+            swe_bench_project_path=swe_bench_project_path,
+            subset=subset,
+            split=split,
+            num_instances=num_instances,
+        )
+        cls._prepull_images(images)
+
+ def score_single_sample(self, value: str, ground_truth: str) -> float:
+ raise RuntimeError(
+ "SWEBenchScorer uses subprocess evaluation; call score() instead."
+ )
+
+    def _patch_config(self, output_dir: Path, benchmark_config_dict: dict) -> Path:
+        """Write ``swebench_patched.yaml``: the template with model fields from the benchmark config.
+
+        Returns:
+            Path of the patched YAML file inside *output_dir*.
+
+        Raises:
+            ValueError: if ``model_params.name`` is missing or empty.
+        """
+        with self.swebench_config_template.open() as f:
+            cfg = yaml.safe_load(f)
+
+        model_params = benchmark_config_dict.get("model_params") or {}
+        endpoint_cfg = benchmark_config_dict.get("endpoint_config") or {}
+        endpoints = endpoint_cfg.get("endpoints", [])
+
+        model_name = model_params.get("name")
+        if not model_name:
+            raise ValueError(
+                "model_params.name is required in the benchmark config but is missing or empty"
+            )
+        cfg["model"]["model_name"] = model_name
+        model_kwargs = cfg["model"]["model_kwargs"]
+        if endpoints:
+            # Drop trailing slashes and one trailing "/v1", then append "/v1".
+            base = endpoints[0].rstrip("/").removesuffix("/v1")
+            model_kwargs["api_base"] = base + "/v1"
+        else:
+            model_kwargs["api_base"] = ""
+
+        api_key = endpoint_cfg.get("api_key")
+        if api_key:
+            model_kwargs["api_key"] = api_key
+
+        # (model_params key, model_kwargs key) in write order. A missing/None value
+        # removes the template's setting instead of overriding it.
+        param_to_kwarg = (
+            ("temperature", "temperature"),
+            ("top_p", "top_p"),
+            ("top_k", "top_k"),
+            ("repetition_penalty", "repetition_penalty"),
+            ("presence_penalty", "presence_penalty"),
+            ("frequency_penalty", "frequency_penalty"),
+            ("max_new_tokens", "max_tokens"),
+            ("chat_template_kwargs", "chat_template_kwargs"),
+        )
+        for source_key, target_key in param_to_kwarg:
+            val = model_params.get(source_key)
+            if val is not None:
+                model_kwargs[target_key] = val
+            else:
+                model_kwargs.pop(target_key, None)
+
+        patched_path = output_dir / "swebench_patched.yaml"
+        with patched_path.open("w") as f:
+            yaml.safe_dump(cfg, f, default_flow_style=False, sort_keys=False)
+        return patched_path
+
+ def _run_subprocess(self, cmd: list[str], log_path: Path, cwd: Path) -> None:
+ """Run a command inside the accuracy subproject via ``uv run --project``."""
+ full_cmd = [
+ "uv",
+ "run",
+ "--project",
+ str(self.swe_bench_project_path),
+ ] + cmd
+ _run_subprocess_with_log(
+ full_cmd,
+ log_path,
+ timeout_s=self.subprocess_timeout_s,
+ label="SWE-bench",
+ cwd=cwd,
+ )
+
+ def score(self) -> tuple[float | None, int]:
+ """Run mini-swe-agent + swebench evaluation. Returns (resolved_rate, 1)."""
+ config_path = self.report_dir / "config.yaml"
+ if not config_path.exists():
+ raise FileNotFoundError(
+ f"config.yaml not found at {config_path}. "
+ "SWEBenchScorer.score() must be called from within a benchmark run "
+ "that has already written its config, or the path must be pre-populated."
+ )
+ with config_path.open() as f:
+ benchmark_cfg = yaml.safe_load(f)
+
+ model_name: str = benchmark_cfg["model_params"]["name"]
+ if self.dataset.dataframe is None:
+ raise RuntimeError(
+ "SWEBench dataset must be loaded before scoring; call dataset.load() first."
+ )
+
+ n_rows = len(self.dataset.dataframe)
+ if self.num_instances > n_rows:
+ logger.warning(
+ "num_instances=%d exceeds dataset size %d; evaluating %d instances",
+ self.num_instances,
+ n_rows,
+ n_rows,
+ )
+ slice_str = f"0:{min(self.num_instances, n_rows)}"
+
+ output_dir = self.report_dir / "swe_bench_output"
+ if output_dir.exists():
+ shutil.rmtree(output_dir)
+ output_dir.mkdir(parents=True)
+
+ patched_config = self._patch_config(output_dir, benchmark_cfg)
+
+ agent_cmd = [
+ "mini-extra",
+ "swebench",
+ "--model",
+ model_name,
+ "--config",
+ str(patched_config),
+ "--subset",
+ self.subset,
+ "--split",
+ self.split,
+ "--slice",
+ slice_str,
+ "--workers",
+ str(self.workers),
+ "--output",
+ str(output_dir),
+ ]
+ logger.info("Running mini-extra swebench: %s", " ".join(agent_cmd))
+ total_instances = min(self.num_instances, n_rows)
+ stop_event = threading.Event()
+ poll_thread = threading.Thread(
+ target=_poll_swebench_progress,
+ args=(output_dir, total_instances, stop_event),
+ daemon=True,
+ )
+ poll_thread.start()
+ try:
+ self._run_subprocess(
+ agent_cmd,
+ self.report_dir / "swe_bench_agent.log",
+ cwd=output_dir,
+ )
+ finally:
+ stop_event.set()
+ poll_thread.join(timeout=10)
+
+ preds_path = output_dir / "preds.json"
+ if not preds_path.exists():
+ logger.error(
+ "preds.json not found after mini-swe-agent run; returning None score"
+ )
+ return None, 1
+
+ hf_dataset_name = SWEBench.hf_dataset_name(self.subset)
+ run_id = f"endpoints_{uuid.uuid4().hex[:8]}"
+ eval_cmd = [
+ "python",
+ "-m",
+ "swebench.harness.run_evaluation",
+ "--dataset_name",
+ hf_dataset_name,
+ "--split",
+ self.split,
+ "--predictions_path",
+ str(preds_path),
+ "--max_workers",
+ str(self.max_eval_workers),
+ "--run_id",
+ run_id,
+ ]
+ logger.info("Running swebench evaluation: %s", " ".join(eval_cmd))
+ self._run_subprocess(
+ eval_cmd,
+ self.report_dir / "swe_bench_eval.log",
+ cwd=output_dir,
+ )
+
+ safe_model = model_name.replace("/", "__")
+ result_path = output_dir / f"{safe_model}.{run_id}.json"
+ if not result_path.exists():
+ candidates = list(output_dir.glob(f"*{run_id}*.json"))
+ if not candidates:
+ logger.error(
+ "SWE-bench result file not found (run_id=%s); returning None",
+ run_id,
+ )
+ return None, 1
+ result_path = candidates[0]
+
+ shutil.copy2(result_path, self.report_dir / "swe_bench_results.json")
+
+ result = msgspec.json.decode(result_path.read_bytes(), type=dict)
+ submitted = result.get("submitted_instances") or 0
+ resolved = result.get("resolved_instances") or 0
+ if submitted == 0:
+ logger.warning("SWE-bench: submitted_instances=0; returning None score")
+ return None, 1
+
+ resolved_rate = resolved / submitted
+ logger.info(
+ "SWE-bench: resolved %d / %d submitted (%.1f%%)",
+ resolved,
+ submitted,
+ resolved_rate * 100,
+ )
+ return resolved_rate, 1
diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
index 1c90554fb..ee7afd75e 100644
--- a/tests/unit/commands/test_benchmark.py
+++ b/tests/unit/commands/test_benchmark.py
@@ -22,6 +22,7 @@
from types import SimpleNamespace
from unittest.mock import MagicMock, patch
+import inference_endpoint.commands.benchmark.execute as execute_mod
import pandas as pd
import pytest
from inference_endpoint.commands.benchmark.cli import (
@@ -34,6 +35,7 @@
BenchmarkContext,
ResponseCollector,
_build_phases,
+ _load_datasets,
_run_benchmark_async,
setup_benchmark,
)
@@ -62,6 +64,7 @@
from inference_endpoint.config.utils import cli_error_formatter as _error_formatter
from inference_endpoint.core.types import QueryResult
from inference_endpoint.dataset_manager.dataset import Dataset
+from inference_endpoint.dataset_manager.predefined.swe_bench import SWEBench
from inference_endpoint.endpoint_client.config import HTTPClientConfig
from inference_endpoint.evaluation.scoring import Scorer
from inference_endpoint.exceptions import InputValidationError, SetupError
@@ -78,6 +81,29 @@
/ "templates"
)
+
+# Test-only scorers registered with leading-underscore IDs so TestScorerMethodSync excludes them.
+
+
+class _SelfContainedScorer(Scorer, scorer_id="_test_skip_endpoint_phase"):
+ SKIP_ENDPOINT_PHASE = True
+
+ def score_single_sample(self, value, ground_truth):
+ return 0.0
+
+ def score(self):
+ return 1.0, 1
+
+
+class _FailingPreflightScorer(Scorer, scorer_id="_test_failing_preflight"):
+ @classmethod
+ def preflight(cls, extras):
+ raise SetupError("mock preflight failure")
+
+ def score_single_sample(self, value, ground_truth):
+ return 0.0
+
+
# Reusable minimal config kwargs
_OFFLINE_KWARGS = {
"endpoint_config": {"endpoints": ["http://test:8000"]},
@@ -132,6 +158,55 @@ def test_missing_model_name_raises(self):
datasets=[{"path": "test.jsonl"}],
)
+ @pytest.mark.unit
+ def test_concurrency_injected_into_swe_bench_extras(self):
+ """target_concurrency is forwarded as workers into swe_bench_scorer extras."""
+ config = OnlineConfig(
+ endpoint_config={"endpoints": ["http://test:8000"]},
+ model_params={"name": "test-model"},
+ datasets=[
+ {
+ "name": "swe_bench",
+ "type": "accuracy",
+ "accuracy_config": {"eval_method": "swe_bench_scorer"},
+ },
+ {"type": "performance", "path": "tests/assets/datasets/dummy_1k.jsonl"},
+ ],
+ settings={
+ "load_pattern": {"type": "concurrency", "target_concurrency": 32}
+ },
+ )
+ acc_ds = next(d for d in config.datasets if d.type == DatasetType.ACCURACY)
+ assert acc_ds.accuracy_config is not None
+ assert acc_ds.accuracy_config.extras is not None
+ assert acc_ds.accuracy_config.extras.get("workers") == 32
+
+ @pytest.mark.unit
+ def test_explicit_workers_not_overridden_by_concurrency(self):
+ """An explicit workers= in extras is not overwritten by target_concurrency."""
+ config = OnlineConfig(
+ endpoint_config={"endpoints": ["http://test:8000"]},
+ model_params={"name": "test-model"},
+ datasets=[
+ {
+ "name": "swe_bench",
+ "type": "accuracy",
+ "accuracy_config": {
+ "eval_method": "swe_bench_scorer",
+ "extras": {"workers": 5},
+ },
+ },
+ {"type": "performance", "path": "tests/assets/datasets/dummy_1k.jsonl"},
+ ],
+ settings={
+ "load_pattern": {"type": "concurrency", "target_concurrency": 32}
+ },
+ )
+ acc_ds = next(d for d in config.datasets if d.type == DatasetType.ACCURACY)
+ assert acc_ds.accuracy_config is not None
+ assert acc_ds.accuracy_config.extras is not None
+ assert acc_ds.accuracy_config.extras.get("workers") == 5
+
class TestDurationSuffix:
"""Test duration suffix parsing (600s, 10m, 600000ms, plain int)."""
@@ -381,6 +456,91 @@ def test_validation_errors(self, overrides, match):
)
+class TestAccuracyOnlyDataset:
+ """Test that datasets with ACCURACY_ONLY=True are rejected as perf datasets."""
+
+ @pytest.mark.unit
+ def test_swe_bench_as_perf_raises(self, tmp_path):
+ fake_df = pd.DataFrame(
+ [{"instance_id": "repo__repo-0", "problem_statement": "Fix bug 0"}]
+ )
+ config = OfflineConfig(
+ endpoint_config={"endpoints": ["http://test:8000"]},
+ model_params={"name": "test-model"},
+ datasets=[{"name": "swe_bench"}],
+ )
+ with (
+ patch.object(SWEBench, "generate", return_value=fake_df),
+ pytest.raises(InputValidationError, match="accuracy-only"),
+ ):
+ _load_datasets(config, tmp_path)
+
+ @pytest.mark.unit
+ def test_preflight_error_propagates(self, tmp_path):
+ """A scorer whose preflight() raises SetupError must stop _load_datasets."""
+ dummy_jsonl = tmp_path / "dummy.jsonl"
+ dummy_jsonl.write_text('{"prompt": "hello"}\n')
+ fake_acc_df = pd.DataFrame(
+ [{"instance_id": "repo__repo-0", "prompt": "Fix bug 0"}]
+ )
+ config = OfflineConfig(
+ endpoint_config={"endpoints": ["http://test:8000"]},
+ model_params={"name": "test-model"},
+ datasets=[
+ {"type": "performance", "path": str(dummy_jsonl)},
+ {
+ "name": "swe_bench",
+ "type": "accuracy",
+ "accuracy_config": {"eval_method": "swe_bench_scorer"},
+ },
+ ],
+ )
+ with (
+ patch.object(SWEBench, "generate", return_value=fake_acc_df),
+ patch.object(
+ execute_mod,
+ "_resolve_accuracy_components",
+ return_value=(_FailingPreflightScorer, None),
+ ),
+ pytest.raises(SetupError, match="mock preflight failure"),
+ ):
+ _load_datasets(config, tmp_path)
+
+ @pytest.mark.unit
+ def test_perf_dataset_with_accuracy_config_does_not_crash_load_datasets(
+ self, tmp_path
+ ):
+ """_load_datasets must not crash when perf dataset carries accuracy_config.
+
+ The perf-with-accuracy-config branch appends to eval_configs but not to
+ accuracy_datasets; a zip(strict=True) over both lists would raise ValueError.
+ """
+ dummy_jsonl = tmp_path / "dummy.jsonl"
+ dummy_jsonl.write_text('{"prompt": "hello"}\n')
+ config = OfflineConfig(
+ endpoint_config={"endpoints": ["http://test:8000"]},
+ model_params={"name": "test-model"},
+ datasets=[
+ {
+ "type": "performance",
+ "path": str(dummy_jsonl),
+ "accuracy_config": {"eval_method": "swe_bench_scorer"},
+ },
+ ],
+ )
+ with patch.object(
+ execute_mod,
+ "_resolve_accuracy_components",
+ return_value=(_SelfContainedScorer, None),
+ ):
+ _, accuracy_datasets, eval_configs = _load_datasets(config, tmp_path)
+
+ # The perf dataset appends to eval_configs only, not accuracy_datasets.
+ assert len(accuracy_datasets) == 0
+ assert len(eval_configs) == 1
+ assert eval_configs[0].dataset_name == "performance"
+
+
class TestYAMLTemplateValidation:
"""Validate all bundled YAML templates parse correctly."""
@@ -555,8 +715,6 @@ class TestAggregatorArgs:
"""Tests that metrics aggregator subprocess args are correctly forwarded."""
def _make_ctx(self, config, tmp_path):
- import random
-
rt = RuntimeSettings(
metric_target=Throughput(10.0),
reported_metrics=[Throughput(10.0)],
@@ -967,6 +1125,27 @@ def test_accuracy_drain_timeout_defaults_to_unbounded(
acc = next(p for p in phases if p.phase_type == PhaseType.ACCURACY)
assert acc.drain_timeout is None
+ @pytest.mark.unit
+ def test_skip_endpoint_phase_omits_accuracy_phase(
+ self, base_rt_settings, simple_dataset
+ ):
+ config = OfflineConfig(**_OFFLINE_KWARGS)
+ ctx = self._make_ctx(config, base_rt_settings, simple_dataset)
+ ctx.eval_configs = [
+ AccuracyConfiguration(
+ scorer=_SelfContainedScorer,
+ extractor=None,
+ dataset_name="acc",
+ dataset=simple_dataset,
+ report_dir=Path("/tmp"),
+ ground_truth_column=None,
+ num_repeats=1,
+ )
+ ]
+ phases = _build_phases(ctx)
+
+ assert all(p.phase_type != PhaseType.ACCURACY for p in phases)
+
@pytest.mark.unit
def test_warmup_uses_independent_rng_instances(
self, base_rt_settings, simple_dataset
@@ -1052,7 +1231,8 @@ class TestScorerMethodSync:
@pytest.mark.unit
def test_scorer_enum_matches_registry(self):
enum_values = {m.value for m in ScorerMethod}
- registry_keys = set(Scorer.PREDEFINED.keys())
+ # Exclude test-only scorers (ids starting with "_")
+ registry_keys = {k for k in Scorer.PREDEFINED if not k.startswith("_")}
assert enum_values == registry_keys, (
f"ScorerMethod enum out of sync with Scorer registry.\n"
f" In enum only: {enum_values - registry_keys}\n"
diff --git a/tests/unit/dataset_manager/test_swe_bench_dataset.py b/tests/unit/dataset_manager/test_swe_bench_dataset.py
new file mode 100644
index 000000000..f82aeec7a
--- /dev/null
+++ b/tests/unit/dataset_manager/test_swe_bench_dataset.py
@@ -0,0 +1,108 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for SWEBench predefined dataset."""
+
+from pathlib import Path
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+from inference_endpoint.dataset_manager.dataset import Dataset
+from inference_endpoint.dataset_manager.predefined.swe_bench import SWEBench
+
+pytestmark = pytest.mark.unit
+
+_FAKE_INSTANCES = [
+ {"instance_id": f"repo__repo-{i}", "problem_statement": f"Fix bug {i}"}
+ for i in range(5)
+]
+
+
+def _make_hf_df() -> pd.DataFrame:
+ return pd.DataFrame(_FAKE_INSTANCES)
+
+
+class TestSWEBenchRegistration:
+ def test_registered(self):
+ assert "swe_bench" in Dataset.PREDEFINED
+ assert Dataset.PREDEFINED["swe_bench"] is SWEBench
+
+ def test_accuracy_only_flag(self):
+ assert SWEBench.ACCURACY_ONLY is True
+
+ @pytest.mark.parametrize(
+ ("subset", "expected"),
+ [
+ ("verified", "princeton-nlp/SWE-bench_Verified"),
+ ("lite", "princeton-nlp/SWE-bench_Lite"),
+ ],
+ )
+ def test_hf_dataset_name(self, subset: str, expected: str):
+ assert SWEBench.hf_dataset_name(subset) == expected
+
+ def test_hf_dataset_name_invalid_subset_raises(self):
+ with pytest.raises(ValueError, match="Unknown SWE-bench subset"):
+ SWEBench.hf_dataset_name("invalid")
+
+
+class TestSWEBenchGenerate:
+ def test_downloads_and_caches(self, tmp_path: Path):
+ with patch(
+ "inference_endpoint.dataset_manager.predefined.swe_bench.load_from_huggingface",
+ return_value=_make_hf_df(),
+ ) as mock_hf:
+ df1 = SWEBench.generate(datasets_dir=tmp_path)
+
+ assert mock_hf.call_count == 1
+ assert list(df1.columns) == ["instance_id", "prompt"]
+ assert len(df1) == 5
+ assert df1["prompt"].iloc[0] == "Fix bug 0"
+
+ # Second call should hit parquet cache, not HF
+ with patch(
+ "inference_endpoint.dataset_manager.predefined.swe_bench.load_from_huggingface",
+ ) as mock_hf2:
+ df2 = SWEBench.generate(datasets_dir=tmp_path)
+
+ mock_hf2.assert_not_called()
+ assert list(df2.columns) == ["instance_id", "prompt"]
+ assert len(df2) == 5
+
+ def test_unknown_subset_raises(self, tmp_path: Path):
+ with pytest.raises(ValueError, match="Unknown SWE-bench subset"):
+ SWEBench.generate(datasets_dir=tmp_path, subset="invalid")
+
+ def test_force_regenerate(self, tmp_path: Path):
+ with patch(
+ "inference_endpoint.dataset_manager.predefined.swe_bench.load_from_huggingface",
+ return_value=_make_hf_df(),
+ ) as mock_hf:
+ SWEBench.generate(datasets_dir=tmp_path)
+ assert mock_hf.call_count == 1
+
+ SWEBench.generate(datasets_dir=tmp_path, force=True)
+ assert mock_hf.call_count == 2
+
+    def test_lite_subset(self, tmp_path: Path):
+        with patch(
+            "inference_endpoint.dataset_manager.predefined.swe_bench.load_from_huggingface",
+            return_value=_make_hf_df(),
+        ) as mock_hf:
+            df = SWEBench.generate(datasets_dir=tmp_path, subset="lite")
+
+        # call_args.args is the positional-argument tuple of the recorded call.
+        assert "princeton-nlp/SWE-bench_Lite" in mock_hf.call_args.args
+        assert len(df) == 5
diff --git a/tests/unit/evaluation/test_swe_bench_scorer.py b/tests/unit/evaluation/test_swe_bench_scorer.py
new file mode 100644
index 000000000..584ec4eed
--- /dev/null
+++ b/tests/unit/evaluation/test_swe_bench_scorer.py
@@ -0,0 +1,699 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for SWEBenchScorer."""
+
+import json
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import msgspec
+import pandas as pd
+import pytest
+import yaml
+from inference_endpoint.evaluation import scoring as scoring_mod
+from inference_endpoint.evaluation.scoring import (
+ Scorer,
+ SWEBenchScorer,
+)
+from inference_endpoint.exceptions import SetupError
+
+pytestmark = pytest.mark.unit
+
+_DATASET_NAME = "swe_bench_acc"
+_MODEL_NAME = "TestOrg/test-model-7b"
+
+
+def _write_benchmark_config(report_dir: Path, model_params: dict | None = None) -> None:
+    # Write a minimal benchmark config.yaml into report_dir; entries in
+    # model_params override or extend the default {"name": _MODEL_NAME}.
+    mp: dict = {"name": _MODEL_NAME, **(model_params or {})}
+    cfg = {
+        "model_params": mp,
+        "endpoint_config": {"endpoints": ["http://localhost:30000"]},
+    }
+    (report_dir / "config.yaml").write_text(yaml.dump(cfg))
+
+
+def _write_sample_idx_map(report_dir: Path, n: int = 3) -> None:
+ idx_map = {_DATASET_NAME: {f"uuid-{i}": i for i in range(n)}}
+ (report_dir / "sample_idx_map.json").write_bytes(msgspec.json.encode(idx_map))
+
+
+def _make_dataset(n: int = 3) -> MagicMock:
+ df = pd.DataFrame(
+ {
+ "instance_id": [f"repo__repo-{i}" for i in range(n)],
+ "prompt": ["placeholder"] * n,
+ }
+ )
+ ds = MagicMock()
+ ds.dataframe = df
+ ds.num_samples.return_value = n
+ return ds
+
+
+@pytest.fixture
+def swe_bench_project(tmp_path: Path) -> Path:
+ """Fake accuracy subproject directory with a minimal pyproject.toml."""
+ d = tmp_path / "accuracy"
+ d.mkdir(parents=True)
+ (d / "pyproject.toml").write_text("[project]\nname = 'swe-bench-accuracy'\n")
+ return d
+
+
+@pytest.fixture
+def template_yaml(tmp_path: Path) -> Path:
+ """Minimal swebench template YAML."""
+ tmpl = {
+ "model": {
+ "model_name": "",
+ "model_kwargs": {
+ "custom_llm_provider": "openai",
+ "api_base": "",
+ },
+ }
+ }
+ p = tmp_path / "swebench_template.yaml"
+ p.write_text(yaml.dump(tmpl))
+ return p
+
+
+@pytest.fixture
+def report_dir(tmp_path: Path) -> Path:
+ d = tmp_path / "report"
+ d.mkdir()
+ _write_benchmark_config(d)
+ _write_sample_idx_map(d)
+ return d
+
+
+def _make_fake_run(cmd, **kwargs):
+ """Return a fake subprocess.run result with returncode=0."""
+ return MagicMock(returncode=0, stdout="")
+
+
+def _make_staged_run(on_eval_cmd):
+ """Return a fake subprocess.run that handles mini-extra successfully, then delegates."""
+
+ def fake_run(cmd, **kwargs):
+ if "mini-extra" in " ".join(cmd):
+ output_dir = Path(cmd[cmd.index("--output") + 1])
+ output_dir.mkdir(parents=True, exist_ok=True)
+ (output_dir / "preds.json").write_text(json.dumps({}))
+ return MagicMock(returncode=0, stdout="")
+ return on_eval_cmd(cmd, **kwargs)
+
+ return fake_run
+
+
+@pytest.fixture
+def patch_subprocess(monkeypatch, report_dir: Path, swe_bench_project: Path):
+ """Patch subprocess.run to write fake preds.json and result JSON."""
+ captured: list[list[str]] = []
+
+ def fake_run(cmd, **kwargs):
+ captured.append(list(cmd))
+ cmd_str = " ".join(cmd)
+ if "mini-extra" in cmd_str:
+ output_dir = Path(cmd[cmd.index("--output") + 1])
+ output_dir.mkdir(parents=True, exist_ok=True)
+ (output_dir / "preds.json").write_text(json.dumps({}))
+ elif "run_evaluation" in cmd_str:
+ cwd = Path(kwargs["cwd"])
+ run_id = cmd[cmd.index("--run_id") + 1]
+ safe_model = _MODEL_NAME.replace("/", "__")
+ (cwd / f"{safe_model}.{run_id}.json").write_text(
+ json.dumps(
+ {
+ "resolved_instances": 3,
+ "submitted_instances": 10,
+ "total_instances": 500,
+ }
+ )
+ )
+ return MagicMock(returncode=0, stdout="")
+
+ monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
+ return captured
+
+
+class TestSWEBenchScorerRegistration:
+ def test_registered(self):
+ assert "swe_bench_scorer" in Scorer.PREDEFINED
+ assert Scorer.get("swe_bench_scorer") is SWEBenchScorer
+
+ def test_skip_endpoint_phase(self):
+ assert SWEBenchScorer.SKIP_ENDPOINT_PHASE is True
+
+ def test_external_sample_count(self):
+ assert SWEBenchScorer.external_sample_count({"num_instances": 100}) == 100
+ assert SWEBenchScorer.external_sample_count({}) is None
+ assert SWEBenchScorer.external_sample_count({"num_instances": "bad"}) is None
+
+
+class TestSWEBenchScorer:
+ def test_score_happy_path(
+ self, report_dir, swe_bench_project, template_yaml, patch_subprocess
+ ):
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_yaml,
+ )
+ score, n_repeats = scorer.score()
+
+ assert score == pytest.approx(0.3)
+ assert n_repeats == 1
+ assert (report_dir / "swe_bench_results.json").exists()
+
+ def test_missing_subproject_raises_at_init(
+ self, report_dir, tmp_path, template_yaml
+ ):
+ empty_dir = tmp_path / "empty_project"
+ empty_dir.mkdir()
+ with pytest.raises(FileNotFoundError, match="SWE-bench subproject not found"):
+ SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=empty_dir,
+ swebench_config_template=template_yaml,
+ )
+
+ def test_missing_template_raises_at_init(
+ self, report_dir, swe_bench_project, tmp_path
+ ):
+ nonexistent = tmp_path / "no_such_template.yaml"
+ with pytest.raises(FileNotFoundError, match="swebench template"):
+ SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=nonexistent,
+ )
+
+ def test_missing_preds_returns_none(
+ self, report_dir, swe_bench_project, template_yaml, monkeypatch
+ ):
+ monkeypatch.setattr(scoring_mod.subprocess, "run", _make_fake_run)
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_yaml,
+ )
+ score, n_repeats = scorer.score()
+ assert score is None
+ assert n_repeats == 1
+
+ def test_config_patching_all_fields(self, report_dir, swe_bench_project, tmp_path):
+ tmpl = {
+ "model": {
+ "model_name": "",
+ "model_kwargs": {
+ "api_base": "",
+ "temperature": None,
+ "top_k": None,
+ },
+ }
+ }
+ template_path = tmp_path / "tmpl.yaml"
+ template_path.write_text(yaml.dump(tmpl))
+
+ _write_benchmark_config(
+ report_dir,
+ model_params={
+ "temperature": 0.8,
+ "top_p": 0.9,
+ "top_k": 15,
+ "chat_template_kwargs": {"preserve_thinking": True},
+ },
+ )
+
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_path,
+ )
+ output_dir = tmp_path / "out"
+ output_dir.mkdir()
+ with (report_dir / "config.yaml").open() as f:
+ benchmark_cfg = yaml.safe_load(f)
+ patched_path = scorer._patch_config(output_dir, benchmark_cfg)
+ patched = yaml.safe_load(patched_path.read_text())
+
+ assert patched["model"]["model_name"] == _MODEL_NAME
+ assert (
+ patched["model"]["model_kwargs"]["api_base"] == "http://localhost:30000/v1"
+ )
+ assert patched["model"]["model_kwargs"]["temperature"] == pytest.approx(0.8)
+ assert patched["model"]["model_kwargs"]["top_p"] == pytest.approx(0.9)
+ assert patched["model"]["model_kwargs"]["top_k"] == 15
+ assert patched["model"]["model_kwargs"]["chat_template_kwargs"] == {
+ "preserve_thinking": True
+ }
+
+ def test_config_patching_omits_none_fields(
+ self, report_dir, swe_bench_project, tmp_path
+ ):
+ tmpl = {
+ "model": {
+ "model_name": "",
+ "model_kwargs": {"api_base": "", "top_k": 20},
+ }
+ }
+ template_path = tmp_path / "tmpl.yaml"
+ template_path.write_text(yaml.dump(tmpl))
+
+ # model_params has no top_k — should be removed from patched config
+ _write_benchmark_config(report_dir)
+
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_path,
+ )
+ output_dir = tmp_path / "out"
+ output_dir.mkdir()
+ with (report_dir / "config.yaml").open() as f:
+ benchmark_cfg = yaml.safe_load(f)
+ patched_path = scorer._patch_config(output_dir, benchmark_cfg)
+ patched = yaml.safe_load(patched_path.read_text())
+
+ assert "top_k" not in patched["model"]["model_kwargs"]
+
+ def test_config_patching_max_new_tokens(
+ self, report_dir, swe_bench_project, tmp_path
+ ):
+ tmpl = {
+ "model": {
+ "model_name": "",
+ "model_kwargs": {"api_base": ""},
+ }
+ }
+ template_path = tmp_path / "tmpl.yaml"
+ template_path.write_text(yaml.dump(tmpl))
+
+ _write_benchmark_config(report_dir, model_params={"max_new_tokens": 4096})
+
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_path,
+ )
+ output_dir = tmp_path / "out"
+ output_dir.mkdir()
+ with (report_dir / "config.yaml").open() as f:
+ benchmark_cfg = yaml.safe_load(f)
+ patched_path = scorer._patch_config(output_dir, benchmark_cfg)
+ patched = yaml.safe_load(patched_path.read_text())
+
+ assert patched["model"]["model_kwargs"]["max_tokens"] == 4096
+
+ def test_config_patching_omits_max_tokens_when_not_set(
+ self, report_dir, swe_bench_project, tmp_path
+ ):
+ tmpl = {
+ "model": {
+ "model_name": "",
+ "model_kwargs": {"api_base": "", "max_tokens": 999},
+ }
+ }
+ template_path = tmp_path / "tmpl.yaml"
+ template_path.write_text(yaml.dump(tmpl))
+
+ # model_params has no max_new_tokens — max_tokens should be removed
+ _write_benchmark_config(report_dir)
+
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_path,
+ )
+ output_dir = tmp_path / "out"
+ output_dir.mkdir()
+ with (report_dir / "config.yaml").open() as f:
+ benchmark_cfg = yaml.safe_load(f)
+ patched_path = scorer._patch_config(output_dir, benchmark_cfg)
+ patched = yaml.safe_load(patched_path.read_text())
+
+ assert "max_tokens" not in patched["model"]["model_kwargs"]
+
+ @pytest.mark.parametrize(
+ "num_instances, expected_slice",
+ [
+ (5, "0:5"),
+ (100, "0:100"),
+ ],
+ )
+ def test_num_instances_produces_correct_slice(
+ self,
+ num_instances,
+ expected_slice,
+ report_dir,
+ swe_bench_project,
+ template_yaml,
+ patch_subprocess,
+ ):
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(n=num_instances),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_yaml,
+ num_instances=num_instances,
+ )
+ scorer.score()
+ agent_cmd = patch_subprocess[0]
+ assert agent_cmd[agent_cmd.index("--slice") + 1] == expected_slice
+
+ @pytest.mark.parametrize(
+ "subset, expected_hf_name",
+ [
+ ("lite", "princeton-nlp/SWE-bench_Lite"),
+ ("verified", "princeton-nlp/SWE-bench_Verified"),
+ ],
+ )
+ def test_subset_maps_to_correct_hf_dataset_name(
+ self,
+ subset,
+ expected_hf_name,
+ report_dir,
+ swe_bench_project,
+ template_yaml,
+ patch_subprocess,
+ ):
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_yaml,
+ subset=subset,
+ )
+ scorer.score()
+ eval_cmd = patch_subprocess[1]
+ assert eval_cmd[eval_cmd.index("--dataset_name") + 1] == expected_hf_name
+
+ def test_unknown_subset_raises_at_init(
+ self, report_dir, swe_bench_project, template_yaml
+ ):
+ with pytest.raises(ValueError, match="Unknown SWE-bench subset"):
+ SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_yaml,
+ subset="full",
+ )
+
+ def test_missing_model_name_raises_clear_error(self, swe_bench_project, tmp_path):
+ tmpl = {
+ "model": {
+ "model_name": "",
+ "model_kwargs": {"api_base": ""},
+ }
+ }
+ template_path = tmp_path / "tmpl.yaml"
+ template_path.write_text(yaml.dump(tmpl))
+
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=tmp_path,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_path,
+ )
+ output_dir = tmp_path / "out"
+ output_dir.mkdir()
+
+ with pytest.raises(ValueError, match="model_params.name is required"):
+ scorer._patch_config(output_dir, {"model_params": {}})
+
+ def test_template_missing_model_kwargs_raises(
+ self, report_dir, swe_bench_project, tmp_path
+ ):
+ bad_template = tmp_path / "bad_template.yaml"
+ bad_template.write_text(yaml.dump({"model": {"model_name": ""}}))
+ with pytest.raises(ValueError, match="model.model_kwargs"):
+ SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=bad_template,
+ )
+
+ def test_subprocess_failure_raises(
+ self, report_dir, swe_bench_project, template_yaml, monkeypatch
+ ):
+ def _fail_eval(cmd, **kwargs):
+ return MagicMock(returncode=2, stdout="docker error: permission denied")
+
+ monkeypatch.setattr(
+ scoring_mod.subprocess,
+ "run",
+ _make_staged_run(_fail_eval),
+ )
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_yaml,
+ )
+ with pytest.raises(RuntimeError, match="exited with code 2"):
+ scorer.score()
+
+ def test_subprocess_timeout_raises(
+ self, report_dir, swe_bench_project, template_yaml, monkeypatch
+ ):
+ def _timeout_eval(cmd, **kwargs):
+ raise scoring_mod.subprocess.TimeoutExpired(cmd=cmd, timeout=300)
+
+ monkeypatch.setattr(
+ scoring_mod.subprocess, "run", _make_staged_run(_timeout_eval)
+ )
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_yaml,
+ )
+ with pytest.raises(RuntimeError, match="timed out after"):
+ scorer.score()
+
+ def test_result_glob_fallback(
+ self, report_dir, swe_bench_project, template_yaml, monkeypatch
+ ):
+ def _write_alt_prefix(cmd, **kwargs):
+ if "run_evaluation" in " ".join(cmd):
+ cwd = Path(kwargs["cwd"])
+ run_id = cmd[cmd.index("--run_id") + 1]
+ # Write under a different prefix so exact name won't match; glob will find it
+ (cwd / f"alt_prefix.{run_id}.json").write_text(
+ json.dumps(
+ {
+ "resolved_instances": 1,
+ "submitted_instances": 5,
+ "total_instances": 500,
+ }
+ )
+ )
+ return MagicMock(returncode=0, stdout="")
+
+ monkeypatch.setattr(
+ scoring_mod.subprocess, "run", _make_staged_run(_write_alt_prefix)
+ )
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_yaml,
+ )
+ score, n_repeats = scorer.score()
+ assert score == pytest.approx(1 / 5)
+ assert n_repeats == 1
+
+ def test_zero_submitted_instances_returns_none(
+ self, report_dir, swe_bench_project, template_yaml, monkeypatch
+ ):
+ def _write_zero_results(cmd, **kwargs):
+ if "run_evaluation" in " ".join(cmd):
+ cwd = Path(kwargs["cwd"])
+ run_id = cmd[cmd.index("--run_id") + 1]
+ safe_model = _MODEL_NAME.replace("/", "__")
+ (cwd / f"{safe_model}.{run_id}.json").write_text(
+ json.dumps(
+ {
+ "resolved_instances": 0,
+ "submitted_instances": 0,
+ "total_instances": 500,
+ }
+ )
+ )
+ return MagicMock(returncode=0, stdout="")
+
+ monkeypatch.setattr(
+ scoring_mod.subprocess, "run", _make_staged_run(_write_zero_results)
+ )
+ scorer = SWEBenchScorer(
+ dataset_name=_DATASET_NAME,
+ dataset=_make_dataset(),
+ report_dir=report_dir,
+ swe_bench_project_path=swe_bench_project,
+ swebench_config_template=template_yaml,
+ )
+ score, n_repeats = scorer.score()
+ assert score is None
+ assert n_repeats == 1
+
+
+class TestSWEBenchScorerPreflight:
+ def _extras(self, swe_bench_project: Path, **overrides) -> dict:
+ return {"swe_bench_project_path": str(swe_bench_project), **overrides}
+
+ def test_preflight_passes(self, swe_bench_project, monkeypatch):
+ monkeypatch.setattr(
+ scoring_mod.shutil, "which", lambda name: f"/usr/bin/{name}"
+ )
+ captured: list[list[str]] = []
+
+ def fake_run(cmd, **kw):
+ captured.append(list(cmd))
+ cmd_str = " ".join(cmd)
+ if "get_swebench_docker_image_name" in cmd_str:
+ return MagicMock(
+ returncode=0,
+ stdout=json.dumps(["docker.io/swebench/test:latest"]),
+ stderr="",
+ )
+ return MagicMock(returncode=0, stdout="", stderr=b"")
+
+ monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
+ SWEBenchScorer.preflight(
+ self._extras(
+ swe_bench_project,
+ subset="lite",
+ split="test",
+ num_instances=2,
+ )
+ )
+
+ derive_cmd = next(
+ cmd for cmd in captured if "get_swebench_docker_image_name" in " ".join(cmd)
+ )
+ assert derive_cmd[-3:] == ["lite", "test", "2"]
+ assert ["docker", "pull", "docker.io/swebench/test:latest"] not in captured
+
+ def test_preflight_fails_uv_missing(self, swe_bench_project, monkeypatch):
+ monkeypatch.setattr(scoring_mod.shutil, "which", lambda name: None)
+ with pytest.raises(SetupError, match="uv is not on PATH"):
+ SWEBenchScorer.preflight(self._extras(swe_bench_project))
+
+ def test_preflight_fails_mini_extra_missing(self, swe_bench_project, monkeypatch):
+ monkeypatch.setattr(
+ scoring_mod.shutil, "which", lambda name: f"/usr/bin/{name}"
+ )
+
+ def fake_run(cmd, **kw):
+ if "mini-extra" in cmd:
+ return MagicMock(returncode=1, stderr=b"not found")
+ return MagicMock(returncode=0)
+
+ monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
+ with pytest.raises(
+ SetupError, match=r"mini-extra is not available.*stderr: not found"
+ ):
+ SWEBenchScorer.preflight(self._extras(swe_bench_project))
+
+ def test_preflight_fails_swebench_missing(self, swe_bench_project, monkeypatch):
+ monkeypatch.setattr(
+ scoring_mod.shutil, "which", lambda name: f"/usr/bin/{name}"
+ )
+
+ def fake_run(cmd, **kw):
+ if "import swebench" in " ".join(cmd):
+ return MagicMock(returncode=1, stderr=b"ModuleNotFoundError")
+ return MagicMock(returncode=0)
+
+ monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
+ with pytest.raises(
+ SetupError,
+ match=r"swebench is not available.*stderr: ModuleNotFoundError",
+ ):
+ SWEBenchScorer.preflight(self._extras(swe_bench_project))
+
+ def test_preflight_fails_docker_not_running(self, swe_bench_project, monkeypatch):
+ monkeypatch.setattr(
+ scoring_mod.shutil, "which", lambda name: f"/usr/bin/{name}"
+ )
+
+ def fake_run(cmd, **kw):
+ if "docker" in cmd:
+ return MagicMock(
+ returncode=1, stderr=b"Cannot connect to Docker daemon"
+ )
+ return MagicMock(returncode=0)
+
+ monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
+ with pytest.raises(SetupError, match="Docker daemon is not running"):
+ SWEBenchScorer.preflight(self._extras(swe_bench_project))
+
+ def test_preflight_fails_when_pull_fails(self, swe_bench_project, monkeypatch):
+ monkeypatch.setattr(
+ scoring_mod.shutil, "which", lambda name: f"/usr/bin/{name}"
+ )
+
+ def fake_run(cmd, **kw):
+ cmd_str = " ".join(cmd)
+ if "get_swebench_docker_image_name" in cmd_str:
+ return MagicMock(
+ returncode=0,
+ stdout=json.dumps(["docker.io/swebench/test:latest"]),
+ stderr="",
+ )
+ if cmd[:3] == ["docker", "image", "inspect"]:
+ return MagicMock(returncode=1, stdout="", stderr=b"missing")
+ if cmd[:2] == ["docker", "pull"]:
+ return MagicMock(
+ returncode=1,
+ stdout="",
+ stderr=b"rate limit exceeded",
+ )
+ return MagicMock(returncode=0, stdout="", stderr=b"")
+
+ monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
+ with pytest.raises(
+ SetupError,
+ match=r"docker\.io/swebench/test:latest.*rate limit exceeded",
+ ):
+ SWEBenchScorer.preflight(self._extras(swe_bench_project))
diff --git a/uv.lock b/uv.lock
index 984581b6b..b079ca900 100644
--- a/uv.lock
+++ b/uv.lock
@@ -810,6 +810,7 @@ dependencies = [
{ name = "pydantic", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
{ name = "pydantic-core", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
{ name = "pytz", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+ { name = "pyyaml", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
{ name = "pyzmq", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
{ name = "rich", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
{ name = "sentencepiece", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
@@ -894,6 +895,7 @@ requires-dist = [
{ name = "pytest-timeout", marker = "extra == 'test'", specifier = "==2.4.0" },
{ name = "pytest-xdist", marker = "extra == 'test'", specifier = "==3.8.0" },
{ name = "pytz", specifier = "==2026.1.post1" },
+ { name = "pyyaml", specifier = "==6.0.3" },
{ name = "pyzmq", specifier = "==27.1.0" },
{ name = "rich", specifier = "==14.3.3" },
{ name = "ruff", marker = "extra == 'dev'", specifier = "==0.15.8" },