diff --git a/examples/10_Agentic_Inference/README.md b/examples/10_Agentic_Inference/README.md
index ab3673b51..932e42db5 100644
--- a/examples/10_Agentic_Inference/README.md
+++ b/examples/10_Agentic_Inference/README.md
@@ -194,3 +194,29 @@ Update the first `datasets` entry (`name` and `path`), `model_params.name`, and
 uv run inference-endpoint benchmark from-config \
   --config examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml
 ```
+
+## SWE-bench Accuracy
+
+`swe_bench_accuracy.yaml` runs the SWE-bench accuracy evaluation alongside a
+minimal performance dataset. The benchmark framework skips its built-in
+accuracy phase for this dataset; instead, `SWEBenchScorer` shells out to
+`mini-swe-agent` and the `swebench` evaluation harness, and that external flow
+drives requests to the configured endpoint.
+
+The isolated `uv` environment for those tools lives in `accuracy/`. Sync it
+once before running:
+
+```bash
+cd examples/10_Agentic_Inference/accuracy
+uv sync
+```
+
+Then run the benchmark from the repo root:
+
+```bash
+uv run inference-endpoint benchmark from-config \
+  --config examples/10_Agentic_Inference/swe_bench_accuracy.yaml
+```
+
+See `accuracy/RUNBOOK.md` for preconditions, sanity checks, and common failure
+modes.
diff --git a/examples/10_Agentic_Inference/accuracy/RUNBOOK.md b/examples/10_Agentic_Inference/accuracy/RUNBOOK.md
new file mode 100644
index 000000000..7ad03c122
--- /dev/null
+++ b/examples/10_Agentic_Inference/accuracy/RUNBOOK.md
@@ -0,0 +1,54 @@
+# SWE-bench Accuracy Smoke-Test Runbook
+
+End-to-end validation for the SWE-bench accuracy pipeline. Unit tests mock all
+subprocesses, so running the real pipeline is the only way to catch Docker,
+HuggingFace access, or mini-swe-agent wiring issues.
+
+## 0. Preconditions
+
+- Docker daemon running (swebench harness spawns one container per instance).
+- Docker Hub auth or a pre-seeded image cache for uncached SWE-bench images.
+- Network egress to PyPI and HuggingFace Hub.
+- `uv` binary on PATH (`curl -LsSf https://astral.sh/uv/install.sh | sh`).
+- Parent endpoints env already synced (`uv sync --extra dev` from repo root).
+
+## 1. Sync the accuracy subproject
+
+From the repo root:
+
+```bash
+cd examples/10_Agentic_Inference/accuracy
+uv sync
+```
+
+Sanity check:
+
+```bash
+uv run mini-extra --help
+uv run python -m swebench.harness.run_evaluation --help
+```
+
+Override the default subproject path via env var if needed:
+
+```bash
+export SWE_BENCH_PROJECT_PATH=/path/to/examples/10_Agentic_Inference/accuracy
+```
+
+## 2. End-to-end test (run from the repo root; requires live endpoint)
+
+```bash
+uv run inference-endpoint benchmark from-config \
+  --config examples/10_Agentic_Inference/swe_bench_accuracy.yaml
+```
+
+Scorer preflight resolves the requested SWE-bench instances and pre-pulls any
+missing Docker images before `mini-extra swebench` starts. Cached images are
+skipped.
+
+## Common failure modes
+
+| Symptom                                              | Likely cause                          | Fix                                                       |
+| ---------------------------------------------------- | ------------------------------------- | --------------------------------------------------------- |
+| `FileNotFoundError: SWE-bench subproject not found`  | subproject not synced                 | Run `uv sync` in `examples/10_Agentic_Inference/accuracy` |
+| Docker error during `run_evaluation`                 | Docker daemon not running             | Start Docker and retry                                    |
+| `Failed to pre-pull required SWE-bench Docker image` | Docker Hub rate limit or missing auth | Run `docker login` or use a local image cache/mirror      |
diff --git a/examples/10_Agentic_Inference/accuracy/pyproject.toml b/examples/10_Agentic_Inference/accuracy/pyproject.toml
new file mode 100644
index 000000000..14482a29f
--- /dev/null
+++ b/examples/10_Agentic_Inference/accuracy/pyproject.toml
@@ -0,0 +1,29 @@
+# Isolated uv project for the SWE-bench accuracy evaluator.
+#
+# mini-swe-agent and swebench pin specific versions of litellm, docker,
+# and other packages that are not part of the parent endpoints env. Keeping
+# the swebench env separate means the parent lockfile stays solvable and
+# the evaluation env stays reproducible.
+#
+# `inference_endpoint.evaluation.scoring.SWEBenchScorer` invokes
+# mini-extra and swebench.harness.run_evaluation via `uv run --project`,
+# so the main benchmark process never needs to import these packages.
+#
+# Usage on the accuracy host:
+#   cd examples/10_Agentic_Inference/accuracy
+#   uv sync
+#   # SWEBenchScorer in the parent will shell out automatically.
+
+[project]
+name = "swe-bench-accuracy"
+version = "0.1.0"
+description = "Isolated SWE-bench accuracy environment for the multi-turn agentic benchmark."
+requires-python = ">=3.12"
+dependencies = [
+    "mini-swe-agent==2.3.0",
+    "swebench==4.1.0",
+]
+
+[tool.uv]
+# Script-runner env: no build, no install of this project itself.
+package = false
diff --git a/examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml b/examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml
index 9740aa4c1..b513d0849 100644
--- a/examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml
+++ b/examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml
@@ -23,6 +23,13 @@ datasets:
       num_trajectories_to_issue: 990 # Should be integer multiple of 990.
       # Required benchmark default; set to true only for faster optimization/debug runs.
       stop_issuing_on_first_user_complete: false
+  - name: swe_bench
+    type: "accuracy"
+    accuracy_config:
+      eval_method: "swe_bench_scorer"
+      num_repeats: 1
+      extras:
+        num_instances: 200
 
 settings:
   runtime:
diff --git a/examples/10_Agentic_Inference/qwen_agentic_benchmark.yaml b/examples/10_Agentic_Inference/qwen_agentic_benchmark.yaml
new file mode 100644
index 000000000..415b3e68c
--- /dev/null
+++ b/examples/10_Agentic_Inference/qwen_agentic_benchmark.yaml
@@ -0,0 +1,48 @@
+name: "qwen-agentic-benchmark"
+version: "1.0"
+type: "online"
+
+model_params:
+  name: "Qwen/Qwen3.6-35B-A3B"
+  temperature: 1.0
+  top_k: 20
+  top_p: 0.95
+  repetition_penalty: 1.0
+  presence_penalty: 1.5
+  max_new_tokens: 8192
+  chat_template_kwargs:
+    preserve_thinking: true
+
+datasets:
+  - name: agentic_coding
+    type: performance
+    path: /path/to/agentic_combined.jsonl
+    accuracy_config:
+      eval_method: agentic_inference_inline # required benchmark default.
+    agentic_inference:
+      turn_timeout_s: 14400.0
+      enable_salt: true # do not change.
+      inject_tool_delay: true # do not change.
+  - name: swe_bench
+    type: "accuracy"
+    accuracy_config:
+      eval_method: "swe_bench_scorer"
+      num_repeats: 1
+      extras:
+        num_instances: 200
+
+settings:
+  runtime:
+    min_duration_ms: 0
+    max_duration_ms: 36000000
+
+  load_pattern:
+    type: agentic_inference
+    target_concurrency: 8 # Submission-specific concurrency.
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:30000"
+  api_type: openai
+
+report_dir: logs/qwen_agentic
diff --git a/examples/10_Agentic_Inference/swe_bench_accuracy.yaml b/examples/10_Agentic_Inference/swe_bench_accuracy.yaml
new file mode 100644
index 000000000..8508b0129
--- /dev/null
+++ b/examples/10_Agentic_Inference/swe_bench_accuracy.yaml
@@ -0,0 +1,42 @@
+type: "online"
+
+model_params:
+  name: "Qwen/Qwen3.6-35B-A3B"
+  temperature: 1.0
+  top_p: 0.95
+  top_k: 20
+  repetition_penalty: 1.0
+  presence_penalty: 1.5
+  max_new_tokens: 8192
+  chat_template_kwargs:
+    preserve_thinking: true
+
+datasets:
+  # Minimal performance dataset required by the framework.
+  - name: swe_bench_perf
+    type: "performance"
+    path: "tests/assets/datasets/dummy_1k.jsonl"
+    parser:
+      prompt: text_input
+
+  # Accuracy dataset — instance_id rows tell mini-swe-agent which instances to run.
+  # First run downloads ~10 MB from HuggingFace and caches to datasets_dir.
+  - name: swe_bench
+    type: "accuracy"
+    accuracy_config:
+      eval_method: "swe_bench_scorer"
+      num_repeats: 1
+      extras:
+        num_instances: 200
+
+settings:
+  load_pattern:
+    type: "concurrency"
+    target_concurrency: 10 # also forwarded to mini-extra as `workers` unless extras.workers is set
+  runtime:
+    n_samples_to_issue: 10
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:30000"
+  api_type: "openai"
diff --git a/examples/10_Agentic_Inference/swebench_template.yaml b/examples/10_Agentic_Inference/swebench_template.yaml
new file mode 100644
index 000000000..9b37ec5ba
--- /dev/null
+++ b/examples/10_Agentic_Inference/swebench_template.yaml
@@ -0,0 +1,186 @@
+agent:
+  system_template: |
+    You are a helpful assistant that can interact with a computer shell to solve programming tasks.
+  instance_template: |
+    <pr_description>
+    Consider the following PR description:
+    {{task}}
+    </pr_description>
+
+    <instructions>
+    # Task Instructions
+
+    ## Overview
+
+    You're a software engineer interacting continuously with a computer by submitting commands.
+    You'll be helping implement necessary changes to meet requirements in the PR description.
+    Your task is specifically to make changes to non-test files in the current directory in order to fix the issue described in the PR description in a way that is general and consistent with the codebase.
+    <IMPORTANT>This is an interactive process where you will think and issue AT LEAST ONE command, see the result, then think and issue your next command(s).</important>
+
+    For each response:
+
+    1. Include a THOUGHT section explaining your reasoning and what you're trying to accomplish
+    2. Provide one or more bash tool calls to execute
+
+    ## Important Boundaries
+
+    - MODIFY: Regular source code files in /testbed (this is the working directory for all your subsequent commands)
+    - DO NOT MODIFY: Tests, configuration files (pyproject.toml, setup.cfg, etc.)
+
+    ## Recommended Workflow
+
+    1. Analyze the codebase by finding and reading relevant files
+    2. Create a script to reproduce the issue
+    3. Edit the source code to resolve the issue
+    4. Verify your fix works by running your script again
+    5. Test edge cases to ensure your fix is robust
+
+    ## Command Execution Rules
+
+    You are operating in an environment where
+
+    1. You issue at least one command
+    2. The system executes the command(s) in a subshell
+    3. You see the result(s)
+    4. You write your next command(s)
+
+    Each response should include:
+
+    1. **Reasoning text** where you explain your analysis and plan
+    2. At least one tool call with your command
+
+    **CRITICAL REQUIREMENTS:**
+
+    - Your response SHOULD include reasoning text explaining what you're doing
+    - Your response MUST include AT LEAST ONE bash tool call. You can make MULTIPLE tool calls in a single response when the commands are independent (e.g., searching multiple files, reading different parts of the codebase).
+    - Directory or environment variable changes are not persistent. Every action is executed in a new subshell.
+    - However, you can prefix any action with `MY_ENV_VAR=MY_VALUE cd /path/to/working/dir && ...` or write/load environment variables from files
+
+    Example of a CORRECT response:
+    <example_response>
+    I need to understand the Builder-related code. Let me find relevant files and check the project structure.
+
+    [Makes multiple bash tool calls: {"command": "ls -la"}, {"command": "find src -name '*.java' | grep -i builder"}, {"command": "cat README.md | head -50"}]
+    </example_response>
+
+    ## Environment Details
+
+    - You have a full Linux shell environment
+    - Always use non-interactive flags (-y, -f) for commands
+    - Avoid interactive tools like vi, nano, or any that require user input
+    - You can use bash commands or invoke any tool that is available in the environment
+    - You can also create new tools or scripts to help you with the task
+    - If a tool isn't available, you can also install it
+
+    ## Submission
+
+    When you've completed your work, you MUST submit your changes as a git patch.
+    Follow these steps IN ORDER, with SEPARATE commands:
+
+    Step 1: Create the patch file
+    Run `git diff -- path/to/file1 path/to/file2 > patch.txt` listing only the source files you modified.
+    Do NOT commit your changes.
+
+    <IMPORTANT>
+    The patch must only contain changes to the specific source files you modified to fix the issue.
+    Do not submit file creations or changes to any of the following files:
+
+    - test and reproduction files
+    - helper scripts, tests, or tools that you created
+    - installation, build, packaging, configuration, or setup scripts unless they are directly part of the issue you were fixing (you can assume that the environment is already set up for your client)
+    - binary or compiled files
+    </IMPORTANT>
+
+    Step 2: Verify your patch
+    Inspect patch.txt to confirm it only contains your intended changes and headers show `--- a/` and `+++ b/` paths.
+
+    Step 3: Submit (EXACT command required)
+    You MUST use this EXACT command to submit:
+
+    ```bash
+    echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt
+    ```
+
+    If the command fails (nonzero exit status), it will not submit.
+
+    <CRITICAL>
+    - Creating/viewing the patch and submitting it MUST be separate commands (not combined with &&).
+    - If you modify patch.txt after verifying, you SHOULD verify again before submitting.
+    - You CANNOT continue working (reading, editing, testing) in any way on this task after submitting.
+    </CRITICAL>
+    </instructions>
+  step_limit: 250
+  cost_limit: 3.
+
+environment:
+  cwd: "/testbed"
+  timeout: 3600
+  interpreter: ["bash", "-c"]
+  env:
+    PAGER: cat
+    MANPAGER: cat
+    LESS: -R
+    PIP_PROGRESS_BAR: "off"
+    TQDM_DISABLE: "1"
+  environment_class: docker
+  pull_timeout: 3600
+  container_timeout: 10h
+
+model:
+  cost_tracking: "ignore_errors"
+  observation_template: |
+    {% if output.exception_info -%}
+    <exception>{{output.exception_info}}</exception>
+    {% endif -%}
+    <returncode>{{output.returncode}}</returncode>
+    {% if output.output | length < 10000 -%}
+    <output>
+    {{ output.output -}}
+    </output>
+    {%- else -%}
+    <warning>
+    The output of your last command was too long.
+    Please try a different command that produces less output.
+    If you're looking at a file you can try use head, tail or sed to view a smaller number of lines selectively.
+    If you're using grep or find and it produced too much output, you can use a more selective search pattern.
+    If you really need to see something from the full command's output, you can redirect output to a file and then search in that file.
+    </warning>
+    {%- set elided_chars = output.output | length - 10000 -%}
+    <output_head>
+    {{ output.output[:5000] }}
+    </output_head>
+    <elided_chars>
+    {{ elided_chars }} characters elided
+    </elided_chars>
+    <output_tail>
+    {{ output.output[-5000:] }}
+    </output_tail>
+    {%- endif -%}
+  format_error_template: |
+    Tool call error:
+
+    <error>
+    {{error}}
+    </error>
+
+    Here is general guidance on how to submit correct toolcalls:
+
+    Every response needs to use the 'bash' tool at least once to execute commands.
+
+    Call the bash tool with your command as the argument:
+    - Tool: bash
+    - Arguments: {"command": "your_command_here"}
+
+    If you have completed your assignment, please consult the first message about how to
+    submit your solution (you will not be able to continue working on this task after that).
+  # Patched at runtime by SWEBenchScorer from model_params and endpoint_config
+  model_name: ""
+  model_kwargs:
+    custom_llm_provider: "openai"
+    api_key: "test"
+    drop_params: true
+    parallel_tool_calls: true
+    api_base: ""
+    # Sampling parameters (temperature, top_p, top_k, etc.) are injected at
+    # runtime from the benchmark config's model_params block — absent here so
+    # the model's own defaults apply when not specified in model_params.
diff --git a/pyproject.toml b/pyproject.toml
index 4a7655021..429cdcf4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -73,6 +73,7 @@ dependencies = [
     # Fix pytz-2024 import warning
     "pytz==2026.1.post1",
     "urllib3==2.7.0",
+    "pyyaml==6.0.3",
 ]
 
 [project.optional-dependencies]
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index a2050bbe3..13ee87632 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -285,6 +285,10 @@ def _load_datasets(
         )
         assert acc_cfg.accuracy_config is not None
 
+        extras = acc_cfg.accuracy_config.extras or {}
+
+        scorer_cls.preflight(extras)
+
         ds = DataLoaderFactory.create_loader(
             acc_cfg, num_repeats=acc_cfg.accuracy_config.num_repeats
         )
@@ -299,7 +303,7 @@ def _load_datasets(
                 report_dir,
                 acc_cfg.accuracy_config.ground_truth,
                 acc_cfg.accuracy_config.num_repeats,
-                acc_cfg.accuracy_config.extras or {},
+                extras,
             )
         )
         ds.load(
@@ -313,6 +317,14 @@ def _load_datasets(
         raise InputValidationError("Multiple performance datasets not supported")
 
     perf_cfg = performance_cfgs[0]
+    perf_cls = Dataset.PREDEFINED.get(perf_cfg.name)
+    if perf_cls is not None and perf_cls.ACCURACY_ONLY:
+        raise InputValidationError(
+            f"Dataset '{perf_cfg.name}' is accuracy-only and cannot be used "
+            "as a performance dataset. Use a different dataset (e.g. 'random') for the "
+            "performance phase."
+        )
+
     try:
         dataloader = DataLoaderFactory.create_loader(perf_cfg)
         dataloader.load(
@@ -320,9 +332,7 @@ def _load_datasets(
         )
         logger.info(f"Loaded {dataloader.num_samples()} samples")
     except FileNotFoundError as e:
-        raise InputValidationError(
-            f"Dataset file not found: {performance_cfgs[0].path}"
-        ) from e
+        raise InputValidationError(f"Dataset file not found: {perf_cfg.path}") from e
     except Exception as e:
         raise SetupError(f"Failed to load dataset: {e}") from e
 
@@ -337,6 +347,7 @@ def _load_datasets(
         scorer_cls, extractor_cls = _resolve_accuracy_components(
             perf_cfg.name, accuracy_config
         )
+        scorer_cls.preflight(accuracy_config.extras or {})
 
         eval_configs.append(
             AccuracyConfiguration(
@@ -399,8 +410,11 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
 
     # Calculate and display expected sample count
     total_samples = rt_settings.total_samples_to_issue()
-    if accuracy_datasets:
-        total_samples += sum(ds.num_samples() * ds.repeats for ds in accuracy_datasets)
+    total_samples += sum(
+        ec.dataset.num_samples() * ec.dataset.repeats
+        for ec in eval_configs
+        if not ec.scorer.SKIP_ENDPOINT_PHASE and ec.dataset_name != "performance"
+    )
 
     collect_responses = test_mode in (TestMode.ACC, TestMode.BOTH)
     logger.info(
@@ -409,6 +423,16 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
     logger.info(
         f"Min Duration: {rt_settings.min_duration_ms / 1000:.1f}s, Expected samples: {total_samples}"
     )
+    for ec in eval_configs:
+        if ec.scorer.SKIP_ENDPOINT_PHASE:
+            n = ec.scorer.external_sample_count(ec.extras)
+            if n is not None:
+                logger.info(
+                    "Accuracy dataset '%s' (%s): %d instances evaluated externally",
+                    ec.dataset_name,
+                    ec.scorer.SCORER_ID,
+                    n,
+                )
 
     return BenchmarkContext(
         config=config,
@@ -477,6 +501,8 @@ def _build_phases(
     # Accuracy phases — use eval_cfg.dataset_name as phase name so it matches
     # what Scorer._load_sample_index_map() looks up in sample_idx_map.json
     for eval_cfg in ctx.eval_configs:
+        if eval_cfg.scorer.SKIP_ENDPOINT_PHASE:
+            continue
         if eval_cfg.dataset_name == "performance":
             continue
         acc_ds = eval_cfg.dataset
@@ -905,8 +931,12 @@ def finalize_benchmark(ctx: BenchmarkContext, bench: BenchmarkResult) -> None:
             **eval_cfg.extras,
         )
         score, n_repeats = scorer_instance.score()
-        assert eval_cfg.dataset.data is not None
-        num_samples = len(eval_cfg.dataset.data)
+        if eval_cfg.dataset.data is not None:
+            num_samples = len(eval_cfg.dataset.data)
+        elif eval_cfg.dataset.dataframe is not None:
+            num_samples = len(eval_cfg.dataset.dataframe)
+        else:
+            num_samples = 0
         if eval_cfg.dataset_name == "performance":
             num_samples = sum(phase.issued_count for phase in result.perf_results)
         accuracy_scores[eval_cfg.dataset_name] = {
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 9226d7f85..0b84b2da2 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -101,6 +101,7 @@ class ScorerMethod(str, Enum):
     SHOPIFY_CATEGORY_F1 = "shopify_category_f1"
     AGENTIC_INFERENCE_INLINE = "agentic_inference_inline"
     VBENCH = "vbench"
+    SWE_BENCH = "swe_bench_scorer"
 
 
 class TestMode(str, Enum):
@@ -860,6 +861,34 @@ def _resolve_and_validate(self) -> Self:
                 f"got '{lp.type}'"
             )
 
+        # For swe_bench_scorer, forward target_concurrency as workers when the
+        # user has not set it explicitly. mini-swe-agent's parallelism should
+        # match the endpoint's concurrency budget.
+        concurrency = (
+            lp.target_concurrency
+            if lp.type
+            in (LoadPatternType.CONCURRENCY, LoadPatternType.AGENTIC_INFERENCE)
+            and lp.target_concurrency
+            else None
+        )
+        if concurrency is not None and self.datasets:
+            updated_datasets = []
+            changed = False
+            for ds in self.datasets:
+                acc = ds.accuracy_config
+                if (
+                    acc is not None
+                    and acc.eval_method == ScorerMethod.SWE_BENCH
+                    and (acc.extras is None or "workers" not in acc.extras)
+                ):
+                    new_extras = {**(acc.extras or {}), "workers": concurrency}
+                    new_acc = acc.model_copy(update={"extras": new_extras})
+                    ds = ds.model_copy(update={"accuracy_config": new_acc})
+                    changed = True
+                updated_datasets.append(ds)
+            if changed:
+                object.__setattr__(self, "datasets", updated_datasets)
+
         return self
 
     @model_validator(mode="after")
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 38829f0f5..3239aa08d 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -37,7 +37,7 @@ datasets:  # Dataset configs
     prompt: question
     system: system_prompt
   accuracy_config:  # Accuracy evaluation settings
-    eval_method: pass_at_1  # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench
+    eval_method: pass_at_1  # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench, swe_bench_scorer
     ground_truth: ground_truth  # Ground truth column name
     extractor: boxed_math_extractor  # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor)
     num_repeats: 1  # Repeat dataset N times for evaluation
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index c3454d5da..476a27ef4 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -37,7 +37,7 @@ datasets:  # Dataset configs
     prompt: question
     system: system_prompt
   accuracy_config:  # Accuracy evaluation settings
-    eval_method: pass_at_1  # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench
+    eval_method: pass_at_1  # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench, swe_bench_scorer
     ground_truth: ground_truth  # Ground truth column name
     extractor: boxed_math_extractor  # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor)
     num_repeats: 1  # Repeat dataset N times for evaluation
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index 5bea95329..266426f4d 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -37,7 +37,7 @@ datasets:  # Dataset configs
     prompt: question
     system: system_prompt
   accuracy_config:  # Accuracy evaluation settings
-    eval_method: pass_at_1  # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench
+    eval_method: pass_at_1  # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench, swe_bench_scorer
     ground_truth: ground_truth  # Ground truth column name
     extractor: boxed_math_extractor  # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor)
     num_repeats: 1  # Repeat dataset N times for evaluation
diff --git a/src/inference_endpoint/dataset_manager/__init__.py b/src/inference_endpoint/dataset_manager/__init__.py
index 15525fb50..ac314d3f9 100644
--- a/src/inference_endpoint/dataset_manager/__init__.py
+++ b/src/inference_endpoint/dataset_manager/__init__.py
@@ -32,6 +32,7 @@
     ShopifyProductCatalogue,
     ShopifyProductCatalogue8k,
 )
+from .predefined.swe_bench import SWEBench
 from .transforms import (
     AddStaticColumns,
     ColumnFilter,
@@ -63,5 +64,6 @@
     "RandomDataset",
     "ShopifyProductCatalogue",
     "ShopifyProductCatalogue8k",
+    "SWEBench",
     "AgenticInferenceDataset",
 ]
diff --git a/src/inference_endpoint/dataset_manager/dataset.py b/src/inference_endpoint/dataset_manager/dataset.py
index 963ded391..2281f5184 100644
--- a/src/inference_endpoint/dataset_manager/dataset.py
+++ b/src/inference_endpoint/dataset_manager/dataset.py
@@ -276,6 +276,10 @@ class Dataset:
     DATASET_ID: ClassVar[str]
     """The unique identifier for the dataset. Automatically set by __init_subclass__."""
 
+    ACCURACY_ONLY: ClassVar[bool] = False
+    """If True, this dataset may only be used as an accuracy dataset (type: accuracy).
+    Using it as a performance dataset raises InputValidationError at load time."""
+
     def __init_subclass__(
         cls,
         dataset_id: str | None = None,
diff --git a/src/inference_endpoint/dataset_manager/predefined/swe_bench/__init__.py b/src/inference_endpoint/dataset_manager/predefined/swe_bench/__init__.py
new file mode 100644
index 000000000..72b54383c
--- /dev/null
+++ b/src/inference_endpoint/dataset_manager/predefined/swe_bench/__init__.py
@@ -0,0 +1,116 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from logging import getLogger
+from pathlib import Path
+
+import pandas as pd
+
+from ...dataset import Dataset, load_from_huggingface
+
+logger = getLogger(__name__)
+
+_REPO_MAP = {
+    "verified": "princeton-nlp/SWE-bench_Verified",
+    "lite": "princeton-nlp/SWE-bench_Lite",
+}
+
+
+class SWEBench(
+    Dataset,
+    dataset_id="swe_bench",
+):
+    """SWE-bench: Software Engineering Benchmark for LLM agents.
+
+    Loads instance IDs and problem statements from the SWE-bench Verified or
+    Lite subset. Used as the accuracy dataset for the swe_bench_scorer, which
+    runs mini-swe-agent against a live endpoint and grades patches with the
+    SWE-bench evaluation harness.
+
+    The ``instance_id`` column identifies which instances mini-swe-agent will
+    evaluate. The endpoint phase is skipped entirely for this scorer
+    (``SKIP_ENDPOINT_PHASE=True``); ``SWEBenchScorer`` drives the agent
+    subprocess directly against the configured endpoint.
+
+    Using this dataset as a performance dataset (type: performance) is not
+    meaningful — problem statements sent directly to the model without an
+    agent framework don't reflect real SWE-bench usage. Use a different
+    dataset (e.g. ``random``) for the performance phase.
+    """
+
+    ACCURACY_ONLY = True
+    COLUMN_NAMES = ["instance_id", "prompt"]
+
+    @classmethod
+    def hf_dataset_name(cls, subset: str) -> str:
+        """Map ``subset`` to its HuggingFace repo id; raises ValueError if unknown."""
+        hf_path = _REPO_MAP.get(subset)
+        if hf_path is None:
+            raise ValueError(
+                f"Unknown SWE-bench subset {subset!r}; choose from: {list(_REPO_MAP)}"
+            )
+        return hf_path
+
+    @classmethod
+    def generate(
+        cls,
+        datasets_dir: Path,
+        subset: str = "verified",
+        force: bool = False,
+    ) -> pd.DataFrame:
+        """Download and cache the SWE-bench dataset from HuggingFace.
+
+        Args:
+            datasets_dir: Root cache directory. Parquet is written under
+                ``datasets_dir/swe_bench/{subset}/``.
+            subset: ``"verified"`` (500 instances) or ``"lite"`` (300 instances).
+            force: Re-download even if the local parquet cache exists.
+
+        Returns:
+            DataFrame with columns ``instance_id`` and ``prompt``.
+        """
+        hf_path = cls.hf_dataset_name(subset)
+        dst_path = datasets_dir / "swe_bench" / subset / f"swe_bench_{subset}.parquet"
+        if dst_path.exists() and not force:
+            logger.info("Loading SWE-bench %s from cache: %s", subset, dst_path)
+            try:
+                return pd.read_parquet(dst_path)
+            except Exception as e:
+                raise RuntimeError(
+                    f"Cached SWE-bench parquet at {dst_path} appears corrupt ({e}). "
+                    "Delete it or pass force=True to re-download."
+                ) from e
+        # No usable cache (or force=True): load the "test" split via load_from_huggingface.
+        try:
+            df = load_from_huggingface(
+                hf_path,
+                split="test",
+                cache_dir=datasets_dir / "hf_cache" / f"swe_bench_{subset}",
+            )
+        except Exception as e:
+            logger.error("Error loading SWE-bench %s from HuggingFace: %s", subset, e)
+            raise
+        # Keep only the COLUMN_NAMES columns; ``problem_statement`` becomes ``prompt``.
+        result = (
+            df[["instance_id", "problem_statement"]]
+            .rename(columns={"problem_statement": "prompt"})
+            .reset_index(drop=True)
+        )
+        dst_path.parent.mkdir(parents=True, exist_ok=True)
+        result.to_parquet(dst_path)
+        logger.info(
+            "Saved %d SWE-bench %s instances to %s", len(result), subset, dst_path
+        )
+        return result
diff --git a/src/inference_endpoint/evaluation/scoring.py b/src/inference_endpoint/evaluation/scoring.py
index f9419703a..54fec7619 100644
--- a/src/inference_endpoint/evaluation/scoring.py
+++ b/src/inference_endpoint/evaluation/scoring.py
@@ -10,7 +10,7 @@
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific permissions and
+# See the License for the specific language governing permissions and
 # limitations under the License.
 
 
@@ -23,6 +23,7 @@
 import subprocess
 import sys
 import tempfile
+import threading
 import uuid
 from abc import ABC, abstractmethod
 from collections import Counter, defaultdict
@@ -33,6 +34,7 @@
 import msgspec.json
 import numpy as np
 import pandas as pd
+import yaml
 from pydantic import ValidationError
 from tqdm import tqdm
 
@@ -53,6 +55,8 @@
 from ..dataset_manager.agentic_inference_dataset import AgenticInferenceDataset
 from ..dataset_manager.dataset import Dataset
 from ..dataset_manager.predefined.shopify_product_catalogue import ProductMetadata
+from ..dataset_manager.predefined.swe_bench import SWEBench
+from ..exceptions import SetupError
 from .extractor import Extractor, PythonCodeExtractor
 
 logger = logging.getLogger(__name__)
@@ -67,6 +71,7 @@ class Scorer(ABC):
     PREDEFINED: ClassVar[dict[str, type["Scorer"]]] = {}
     SCORER_ID: ClassVar[str]
     REQUIRES_EXTRACTOR: ClassVar[bool] = True
+    SKIP_ENDPOINT_PHASE: ClassVar[bool] = False
 
     def __init_subclass__(
         cls,
@@ -106,6 +111,21 @@ def available_scorers(cls) -> list[str]:
         """Return the list of registered scorer names."""
         return list(Scorer.PREDEFINED.keys())
 
+    @classmethod
+    def external_sample_count(cls, extras: dict[str, Any]) -> int | None:
+        """Return the number of samples the scorer will evaluate externally, or None.
+
+        Used to surface sample counts for scorers that skip the endpoint phase and
+        manage their own evaluation (e.g. `SWEBenchScorer`).
+        The default returns None (scorer uses the endpoint accuracy phase normally).
+        """
+        return None
+
+    @classmethod  # noqa: B027 — intentional no-op default; subclasses override when needed
+    def preflight(cls, extras: dict[str, Any]) -> None:
+        """Verify external dependencies before the benchmark starts. No-op by default."""
+        pass
+
     def __init__(
         self,
         dataset_name: str,
@@ -122,7 +142,9 @@ def __init__(
         self.ground_truth_column = (
             ground_truth_column if ground_truth_column is not None else "ground_truth"
         )
-        self.sample_index_map = self._load_sample_index_map()
+        self.sample_index_map: dict | None = (
+            None if self.SKIP_ENDPOINT_PHASE else self._load_sample_index_map()
+        )
 
     def _load_sample_index_map(self):
         sample_index_map_path = self.report_dir / "sample_idx_map.json"
@@ -163,6 +185,7 @@ def get_outputs(self):
 
     def match_sample_index(self, row: pd.Series) -> pd.Series:
         # Pandas Apply function to create a new 'sample_index' column
+        assert self.sample_index_map is not None
         row["sample_index"] = self.sample_index_map[row["sample_uuid"]]
         return row
 
@@ -177,6 +200,10 @@ def score(self) -> tuple[float | None, int]:
             tuple[float | None, int]: The mean score and the number of repeats.
                 Returns None as the score if evaluation fails.
         """
+        assert self.sample_index_map is not None, (
+            f"{self.__class__.__name__}.SKIP_ENDPOINT_PHASE is True but score() was not "
+            "overridden; override score() to implement external evaluation."
+        )
         df = self.get_outputs()
 
         # Outputs are for all samples, not just the target dataset
@@ -273,6 +300,7 @@ def score(self) -> tuple[float, int]:
         df = self.get_outputs()
 
         # Outputs are for all samples, not just the target dataset
+        assert self.sample_index_map is not None
         valid_uuids = self.sample_index_map.keys()
         df = df[df["sample_uuid"].isin(valid_uuids)]
 
@@ -1099,6 +1127,7 @@ def score(self) -> tuple[float | None, int]:
         df = self.get_outputs()
 
         # Outputs are for all samples, not just the target dataset
+        assert self.sample_index_map is not None
         valid_uuids = self.sample_index_map.keys()
         df = df[df["sample_uuid"].isin(valid_uuids)]
 
@@ -1319,6 +1348,7 @@ def score_single_sample(self, value: str, ground_truth: str) -> float:
     def score(self) -> tuple[float, int]:
         df = self.get_outputs()
 
+        assert self.sample_index_map is not None
         valid_uuids = self.sample_index_map.keys()
         df = df[df["sample_uuid"].isin(valid_uuids)]
         df = df.apply(self.match_sample_index, axis=1)
@@ -1369,6 +1399,20 @@ def score(self) -> tuple[float, int]:
 
 _VBENCH_PROJECT_PATH_ENV = "VBENCH_PROJECT_PATH"
 
+
+def _resolve_subproject_path(
+    explicit: str | os.PathLike | None,
+    env_var: str,
+    default: Path,
+) -> Path:
+    """Pick a subproject dir: *explicit* arg, else non-empty ``$env_var``, else *default*."""
+    if explicit is not None:
+        return Path(explicit)
+    # An unset or empty environment variable falls through to the default.
+    env_value = os.environ.get(env_var)
+    return Path(env_value) if env_value else default
+
+
 # Filenames in `vbench_standard` mode key on the prompt verbatim — VBench looks
 # the filename's prompt-prefix up in vbench_full_info.json. We can therefore
 # only reshape unsafe characters, not replace the prompt with a UUID. Slashes
@@ -1470,18 +1514,10 @@ def __init__(
     def _resolve_project_path(
         explicit: os.PathLike | None,
     ) -> Path:
-        """Resolve the VBench subproject path.
-
-        Lookup order: explicit ctor arg → ``$VBENCH_PROJECT_PATH`` env var →
-        editable-checkout fallback. The env var lets wheel-installed users
-        point at a synced subproject without patching source.
-        """
-        if explicit is not None:
-            return Path(explicit)
-        from_env = os.environ.get(_VBENCH_PROJECT_PATH_ENV)
-        if from_env:
-            return Path(from_env)
-        return Path(_DEFAULT_VBENCH_PROJECT_PATH)
+        """Lookup order: explicit ctor arg → ``$VBENCH_PROJECT_PATH`` env var → editable-checkout fallback."""
+        return _resolve_subproject_path(
+            explicit, _VBENCH_PROJECT_PATH_ENV, Path(_DEFAULT_VBENCH_PROJECT_PATH)
+        )
 
     def score_single_sample(self, value: str, ground_truth: str) -> float:
         raise RuntimeError(
@@ -1542,35 +1578,12 @@ def _run_vbench_subprocess(
             cmd += ["--full-info-json", self.full_info_json_path]
 
         log_path = self.report_dir / "vbench_subprocess.log"
-        try:
-            completed = subprocess.run(
-                cmd,
-                check=False,
-                stdin=subprocess.DEVNULL,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                text=True,
-                timeout=self.subprocess_timeout_s,
-            )
-        except subprocess.TimeoutExpired as e:
-            partial = (
-                e.stdout
-                if isinstance(e.stdout, str)
-                else (e.stdout or b"").decode("utf-8", errors="replace")
-            )
-            log_path.write_text(partial)
-            raise RuntimeError(
-                f"VBench subprocess timed out after {self.subprocess_timeout_s}s; "
-                f"see {log_path} for partial output."
-            ) from e
-
-        log_path.write_text(completed.stdout or "")
-        if completed.returncode != 0:
-            tail = "\n".join((completed.stdout or "").splitlines()[-50:])
-            raise RuntimeError(
-                f"VBench subprocess exited with code {completed.returncode}; "
-                f"full log at {log_path}. Last 50 lines:\n{tail}"
-            )
+        _run_subprocess_with_log(
+            cmd,
+            log_path,
+            timeout_s=self.subprocess_timeout_s,
+            label="VBench",
+        )
 
     def _extract_per_dim_scores(self, results: dict[str, Any]) -> list[float]:
         """Pull each requested dim's aggregate score, with clear errors.
@@ -1600,6 +1613,7 @@ def _extract_per_dim_scores(self, results: dict[str, Any]) -> list[float]:
 
     def score(self) -> tuple[float | None, int]:
         df = self.get_outputs()
+        assert self.sample_index_map is not None
         valid_uuids = self.sample_index_map.keys()
         df = df[df["sample_uuid"].isin(valid_uuids)]
         # Drop failed queries: Scorer.get_outputs() emits "" when record.data
@@ -1658,3 +1672,623 @@ def score(self) -> tuple[float | None, int]:
         per_dim_scores = self._extract_per_dim_scores(results)
         mean_score = float(np.mean(per_dim_scores))
         return mean_score, n_repeats
+
+
+def _run_subprocess_with_log(
+    cmd: list[str],
+    log_path: Path,
+    *,
+    timeout_s: int | None,
+    label: str,
+    cwd: Path | None = None,
+) -> None:
+    """Run *cmd*, log merged stdout/stderr to *log_path*; RuntimeError on timeout or failure."""
+    try:
+        proc = subprocess.run(
+            cmd,
+            check=False,
+            stdin=subprocess.DEVNULL,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            timeout=timeout_s,
+            cwd=None if cwd is None else str(cwd),
+        )
+    except subprocess.TimeoutExpired as e:
+        # e.stdout may be str, bytes or None here; normalize it before logging.
+        captured = e.stdout
+        if not isinstance(captured, str):
+            captured = (captured or b"").decode("utf-8", errors="replace")
+        log_path.write_text(captured)
+        raise RuntimeError(
+            f"{label} subprocess timed out after {timeout_s}s; "
+            f"see {log_path} for partial output."
+        ) from e
+    output = proc.stdout or ""
+    log_path.write_text(output)
+    if proc.returncode != 0:
+        tail = "\n".join(output.splitlines()[-50:])
+        raise RuntimeError(
+            f"{label} subprocess exited with code {proc.returncode}; "
+            f"full log at {log_path}. Last 50 lines:\n{tail}"
+        )
+
+
+_DEFAULT_SWE_BENCH_PROJECT_PATH = (
+    Path(__file__).resolve().parents[3]
+    / "examples"
+    / "10_Agentic_Inference"
+    / "accuracy"
+)
+_SWE_BENCH_PROJECT_PATH_ENV = "SWE_BENCH_PROJECT_PATH"
+_DEFAULT_SWE_BENCH_TEMPLATE = (
+    Path(__file__).resolve().parents[3]
+    / "examples"
+    / "10_Agentic_Inference"
+    / "swebench_template.yaml"
+)
+
+
+def _read_swebench_exit_statuses(
+    output_dir: Path, ignore: frozenset[Path]
+) -> dict[str, list[str]]:
+    """Return status -> instance ids from the last (by name) new exit_statuses_*.yaml, else {}."""
+    files = sorted(set(output_dir.glob("exit_statuses_*.yaml")) - ignore)
+    if not files:
+        return {}
+    try:
+        data = yaml.safe_load(files[-1].read_text()) or {}
+        statuses = data.get("instances_by_exit_status")
+        if isinstance(statuses, dict):  # null/garbled mapping must not reach the poller
+            return {str(k): v for k, v in statuses.items() if isinstance(v, list)}
+    except Exception:
+        logger.debug(
+            "Could not read %s for progress reporting", files[-1], exc_info=True
+        )
+    return {}
+
+
+def _poll_swebench_progress(
+    output_dir: Path, total: int, stop: threading.Event
+) -> None:
+    """Mirror exit_statuses_*.yaml into a tqdm bar until *stop* is set or *total* is hit."""
+    # Files already present come from earlier runs; never count them.
+    stale = frozenset(output_dir.glob("exit_statuses_*.yaml"))
+    with tqdm(total=total, desc="SWE-bench instances", unit="instance") as bar:
+        reported = 0
+
+        def refresh() -> None:
+            nonlocal reported
+            by_status = _read_swebench_exit_statuses(output_dir, stale)
+            finished = sum(len(ids) for ids in by_status.values())
+            if finished > reported:
+                bar.update(finished - reported)
+                reported = finished
+            if by_status:
+                bar.set_postfix({k: len(v) for k, v in sorted(by_status.items())})
+
+        while not stop.is_set():
+            refresh()
+            if reported >= total:
+                break
+            stop.wait(timeout=5.0)
+        refresh()  # one last read after the loop, as statuses may have changed
+
+
+def _decode_subprocess_stderr(stderr: bytes | str | None) -> str:
+    """Normalize captured stderr to stripped text; None becomes "" and bad bytes are replaced."""
+    if stderr is None:
+        return ""
+    text = stderr.decode(errors="replace") if isinstance(stderr, bytes) else str(stderr)
+    return text.strip()
+
+
+class SWEBenchScorer(Scorer, scorer_id="swe_bench_scorer"):
+    """SWE-bench accuracy scorer using the mini-extra CLI (mini-swe-agent package).
+
+    Invokes ``mini-extra swebench`` and ``swebench.harness.run_evaluation`` via
+    ``uv run --project <swe_bench_project_path>`` so the parent process never imports
+    them directly. Run ``uv sync`` in the subproject directory once before use.
+    """
+
+    REQUIRES_EXTRACTOR: ClassVar[bool] = False
+    SKIP_ENDPOINT_PHASE: ClassVar[bool] = True
+    DEFAULT_SUBPROCESS_TIMEOUT_S: ClassVar[int] = 24 * 60 * 60
+    DEFAULT_SUBSET: ClassVar[str] = "verified"
+    DEFAULT_SPLIT: ClassVar[str] = "test"
+    DEFAULT_NUM_INSTANCES: ClassVar[int] = 100
+    PREPULL_TIMEOUT_S: ClassVar[int] = 10 * 60
+
+    def __init__(
+        self,
+        dataset_name: str,
+        dataset: Dataset,
+        report_dir: os.PathLike,
+        extractor: type[Extractor] | None = None,
+        ground_truth_column: str | None = "instance_id",
+        swe_bench_project_path: str | os.PathLike | None = None,
+        swebench_config_template: str | os.PathLike | None = None,
+        subset: str = DEFAULT_SUBSET,
+        split: str = DEFAULT_SPLIT,
+        num_instances: int = DEFAULT_NUM_INSTANCES,
+        workers: int = 10,
+        max_eval_workers: int = 10,
+        subprocess_timeout_s: int | None = None,
+    ):
+        super().__init__(
+            dataset_name=dataset_name,
+            dataset=dataset,
+            report_dir=report_dir,
+            extractor=extractor,
+            ground_truth_column=ground_truth_column,
+        )
+        self.report_dir = self.report_dir.resolve()
+        self.swe_bench_project_path = self._resolve_project_path(swe_bench_project_path)
+        self.swebench_config_template = (
+            Path(swebench_config_template)
+            if swebench_config_template is not None
+            else _DEFAULT_SWE_BENCH_TEMPLATE
+        )
+        SWEBench.hf_dataset_name(subset)
+        self.subset = subset
+        self.split = split
+        self.num_instances = num_instances
+        self.workers = workers
+        self.max_eval_workers = max_eval_workers
+        self.subprocess_timeout_s = (
+            subprocess_timeout_s
+            if subprocess_timeout_s is not None
+            else self.DEFAULT_SUBPROCESS_TIMEOUT_S
+        )
+
+        if not self.swebench_config_template.exists():
+            raise FileNotFoundError(
+                f"swebench template not found: {self.swebench_config_template}. "
+                f"Pass swebench_config_template= in accuracy_config.extras."
+            )
+        with self.swebench_config_template.open() as _f:
+            _tmpl = yaml.safe_load(_f) or {}
+        model_cfg = _tmpl.get("model")
+        if not isinstance(model_cfg, dict) or not isinstance(
+            model_cfg.get("model_kwargs"), dict
+        ):
+            raise ValueError(
+                f"swebench template {self.swebench_config_template} must have a "
+                "'model.model_kwargs' dict; check the template structure."
+            )
+        pyproject = self.swe_bench_project_path / "pyproject.toml"
+        if not pyproject.exists():
+            raise FileNotFoundError(
+                f"SWE-bench subproject not found at {self.swe_bench_project_path}. "
+                f"Set ${_SWE_BENCH_PROJECT_PATH_ENV} to the subproject path, "
+                f"then run: cd {self.swe_bench_project_path} && uv sync"
+            )
+
+    @staticmethod
+    def _resolve_project_path(
+        explicit: str | os.PathLike | None,
+    ) -> Path:
+        """Lookup order: explicit ctor arg → ``$SWE_BENCH_PROJECT_PATH`` env var → in-repo default."""
+        return _resolve_subproject_path(
+            explicit, _SWE_BENCH_PROJECT_PATH_ENV, Path(_DEFAULT_SWE_BENCH_PROJECT_PATH)
+        )
+
+    @classmethod
+    def _get_extra_int(
+        cls, extras: dict[str, Any], key: str, *, default: int, min_value: int = 0
+    ) -> int:
+        """Return ``extras[key]`` (or *default*) as an int >= *min_value*, else SetupError."""
+        raw = extras.get(key, default)
+        # Shared prefix so both messages name the offending config field.
+        field = f"accuracy_config.extras.{key}"
+        try:
+            # int() coerces whatever the config supplied; failures are reported below.
+            number = int(raw)
+        except (TypeError, ValueError) as exc:
+            raise SetupError(f"{field} must be an integer; got {raw!r}") from exc
+        if number < min_value:
+            raise SetupError(f"{field} must be >= {min_value}; got {number}")
+        return number
+
+    @classmethod
+    def _derive_required_images(
+        cls,
+        *,
+        swe_bench_project_path: Path,
+        subset: str,
+        split: str,
+        num_instances: int,
+    ) -> list[str]:
+        """Ask the accuracy subproject which Docker images the first *num_instances* need.
+
+        Raises:
+            SetupError: if the helper cannot start, times out, exits non-zero, or
+                prints something other than a JSON list of strings.
+        """
+        # ``python -c`` rejects a ``for`` statement after ``;`` on one line, so the script
+        # uses real newlines and de-duplicates (order preserved) with dict.fromkeys.
+        script = (
+            "import json, sys\n"
+            "from datasets import load_dataset\n"
+            "from minisweagent.run.benchmarks.swebench import "
+            "DATASET_MAPPING, filter_instances, get_swebench_docker_image_name\n"
+            "subset, split, num_instances = sys.argv[1], sys.argv[2], int(sys.argv[3])\n"
+            "dataset_path = DATASET_MAPPING.get(subset, subset)\n"
+            "instances = list(load_dataset(dataset_path, split=split))\n"
+            "slice_spec = f'0:{min(num_instances, len(instances))}'\n"
+            "instances = filter_instances("
+            "instances, filter_spec='', slice_spec=slice_spec, shuffle=False)\n"
+            "names = (get_swebench_docker_image_name(i) for i in instances)\n"
+            "print(json.dumps(list(dict.fromkeys(names))))\n"
+        )
+        derive_cmd = ["uv", "run", "--project", str(swe_bench_project_path)]
+        derive_cmd += ["python", "-c", script, subset, split, str(num_instances)]
+        try:
+            result = subprocess.run(
+                derive_cmd,
+                check=False,
+                capture_output=True,
+                text=True,
+                timeout=cls.PREPULL_TIMEOUT_S,
+            )
+        except (OSError, subprocess.TimeoutExpired) as exc:
+            raise SetupError(
+                f"SWE-bench Docker image lookup did not complete: {exc}"
+            ) from exc
+        if result.returncode != 0:
+            stderr_text = _decode_subprocess_stderr(result.stderr)
+            raise SetupError(
+                "Failed to derive required SWE-bench Docker images from the accuracy "
+                f"subproject at {swe_bench_project_path}"
+                + (f". stderr: {stderr_text}" if stderr_text else "")
+            )
+        # Parse only the last stdout line so stray library output cannot break JSON decoding.
+        stdout_lines = (result.stdout or "").strip().splitlines()
+        try:
+            images = json.loads(stdout_lines[-1] if stdout_lines else "[]")
+        except json.JSONDecodeError as exc:
+            stdout_text = (result.stdout or "").strip()
+            raise SetupError(
+                "Failed to parse the required SWE-bench Docker image list from the "
+                f"accuracy subproject output: {stdout_text!r}"
+            ) from exc
+        if not (isinstance(images, list) and all(isinstance(i, str) for i in images)):
+            raise SetupError(
+                "Accuracy subproject returned an invalid SWE-bench Docker image list."
+            )
+        return images
+
+    @classmethod
+    def _prepull_images(cls, images: list[str]) -> None:
+        """Make sure each image is cached locally, pulling missing ones; SetupError on failure."""
+        kw: dict[str, Any] = {"stdout": subprocess.DEVNULL, "stderr": subprocess.PIPE}
+        for image in images:
+            # A hung or unlaunchable docker command is reported as SetupError, like
+            # every other preflight problem, instead of escaping as a raw exception.
+            try:
+                cached = subprocess.run(
+                    ["docker", "image", "inspect", image], timeout=30, **kw
+                )
+                if cached.returncode == 0:
+                    logger.info("SWE-bench Docker image already cached: %s", image)
+                    continue
+                logger.info("Pulling SWE-bench Docker image: %s", image)
+                pulled = subprocess.run(
+                    ["docker", "pull", image], timeout=cls.PREPULL_TIMEOUT_S, **kw
+                )
+            except (OSError, subprocess.TimeoutExpired) as exc:
+                raise SetupError(f"docker could not prepare {image}: {exc}") from exc
+            if pulled.returncode != 0:
+                stderr_text = _decode_subprocess_stderr(pulled.stderr)
+                raise SetupError(
+                    "Failed to pre-pull required SWE-bench Docker image "
+                    f"{image}. Authenticate to Docker Hub with `docker login` "
+                    "or use a pre-seeded image cache/mirror before retrying."
+                    + (f" stderr: {stderr_text}" if stderr_text else "")
+                )
+
+    @classmethod
+    def external_sample_count(cls, extras: dict[str, Any]) -> int | None:
+        try:
+            return int(extras["num_instances"])
+        except (KeyError, TypeError, ValueError):
+            return None
+
+    @classmethod
+    def preflight(cls, extras: dict[str, Any]) -> None:
+        """Check uv, mini-extra, swebench, and Docker before the benchmark starts.
+
+        Checks run in a fixed order and stop at the first failure; when all pass, the
+        Docker images required by the selected instances are derived and pre-pulled.
+
+        Args:
+            extras: scorer extras; ``swe_bench_project_path``, ``subset``, ``split``
+                and ``num_instances`` are read, with class defaults for the last three.
+
+        Raises:
+            SetupError: if a tool is missing, a probe fails or times out, the Docker
+                daemon does not respond successfully, or an image cannot be prepared.
+        """
+        swe_bench_project_path = cls._resolve_project_path(
+            extras.get("swe_bench_project_path")
+        )
+        subset = str(extras.get("subset", cls.DEFAULT_SUBSET))
+        split = str(extras.get("split", cls.DEFAULT_SPLIT))
+        num_instances = cls._get_extra_int(
+            extras,
+            "num_instances",
+            default=cls.DEFAULT_NUM_INSTANCES,
+        )
+
+        if shutil.which("uv") is None:
+            raise SetupError(
+                "uv is not on PATH; install it with: "
+                "curl -LsSf https://astral.sh/uv/install.sh | sh"
+            )
+
+        # Both probes execute inside the subproject environment via ``uv run``;
+        # dict order keeps the mini-extra check ahead of the swebench import check.
+        uv_run = ["uv", "run", "--project", str(swe_bench_project_path)]
+        probes = {
+            "mini-extra": ["mini-extra", "--help"],
+            "swebench": ["python", "-c", "import swebench"],
+        }
+        for tool, probe in probes.items():
+            try:
+                result = subprocess.run(
+                    uv_run + probe,
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.PIPE,
+                    timeout=30,
+                )
+            except (OSError, subprocess.TimeoutExpired) as e:
+                # A hung or unlaunchable probe is a setup problem, not a crash.
+                raise SetupError(
+                    f"Could not check for {tool} in the SWE-bench subproject at "
+                    f"{swe_bench_project_path}: {e}"
+                ) from e
+            if result.returncode != 0:
+                stderr_text = _decode_subprocess_stderr(result.stderr)
+                raise SetupError(
+                    f"{tool} is not available in the SWE-bench subproject at "
+                    f"{swe_bench_project_path}. Run: cd {swe_bench_project_path} && uv sync"
+                    + (f". stderr: {stderr_text}" if stderr_text else "")
+                )
+
+        # The docker binary must exist before the daemon can be queried.
+        if shutil.which("docker") is None:
+            raise SetupError("docker is not on PATH. Install Docker and retry.")
+
+        try:
+            docker_result = subprocess.run(
+                ["docker", "version"],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.PIPE,
+                timeout=10,
+            )
+        except Exception as e:
+            raise SetupError(f"Failed to execute docker command: {e}") from e
+
+        if docker_result.returncode != 0:
+            raise SetupError("Docker daemon is not running. Start Docker and retry.")
+
+        # Pull images up front so registry or network problems surface now.
+        images = cls._derive_required_images(
+            swe_bench_project_path=swe_bench_project_path,
+            subset=subset,
+            split=split,
+            num_instances=num_instances,
+        )
+        cls._prepull_images(images)
+
+    def score_single_sample(self, value: str, ground_truth: str) -> float:
+        raise RuntimeError(
+            "SWEBenchScorer uses subprocess evaluation; call score() instead."
+        )
+
+    def _patch_config(self, output_dir: Path, benchmark_config_dict: dict) -> Path:
+        """Write ``swebench_patched.yaml``: the template with model settings from the run.
+
+        The model name, ``api_base`` (first endpoint, normalized to end in ``/v1``),
+        optional ``api_key`` and sampling parameters are copied from
+        *benchmark_config_dict*; sampling keys that are absent or None there are
+        removed from the template's ``model_kwargs``.
+
+        Raises:
+            ValueError: if ``model_params.name`` is missing or empty.
+        """
+        with self.swebench_config_template.open() as f:
+            cfg = yaml.safe_load(f)
+
+        model_params = benchmark_config_dict.get("model_params") or {}
+        endpoint_cfg = benchmark_config_dict.get("endpoint_config") or {}
+        endpoints = endpoint_cfg.get("endpoints", [])
+
+        model_name = model_params.get("name")
+        if not model_name:
+            raise ValueError(
+                "model_params.name is required in the benchmark config but is missing or empty"
+            )
+        cfg["model"]["model_name"] = model_name
+        kwargs = cfg["model"]["model_kwargs"]
+
+        api_base = ""
+        if endpoints:
+            api_base = endpoints[0].rstrip("/").removesuffix("/v1") + "/v1"
+        kwargs["api_base"] = api_base
+
+        api_key = endpoint_cfg.get("api_key")
+        if api_key:
+            kwargs["api_key"] = api_key
+
+        # benchmark model_params key -> model_kwargs key; order fixes the YAML key order.
+        forwarded = {
+            "temperature": "temperature",
+            "top_p": "top_p",
+            "top_k": "top_k",
+            "repetition_penalty": "repetition_penalty",
+            "presence_penalty": "presence_penalty",
+            "frequency_penalty": "frequency_penalty",
+            "max_new_tokens": "max_tokens",
+            "chat_template_kwargs": "chat_template_kwargs",
+        }
+        for source_key, target_key in forwarded.items():
+            setting = model_params.get(source_key)
+            if setting is None:
+                kwargs.pop(target_key, None)
+            else:
+                kwargs[target_key] = setting
+
+        patched_path = output_dir / "swebench_patched.yaml"
+        with patched_path.open("w") as f:
+            yaml.safe_dump(cfg, f, default_flow_style=False, sort_keys=False)
+        return patched_path
+
+    def _run_subprocess(self, cmd: list[str], log_path: Path, cwd: Path) -> None:
+        """Execute *cmd* through ``uv run --project <swe_bench_project_path>``.
+
+        Output is logged to *log_path* and the command runs in *cwd*; timeout and
+        non-zero exit surface as the RuntimeError raised by
+        ``_run_subprocess_with_log``.
+        """
+        uv_prefix = ["uv", "run", "--project", str(self.swe_bench_project_path)]
+        _run_subprocess_with_log(
+            uv_prefix + cmd,
+            log_path,
+            timeout_s=self.subprocess_timeout_s,
+            label="SWE-bench",
+            cwd=cwd,
+        )
+
+    def score(self) -> tuple[float | None, int]:
+        """Run mini-swe-agent + swebench evaluation. Returns (resolved_rate, 1)."""
+        config_path = self.report_dir / "config.yaml"
+        if not config_path.exists():
+            raise FileNotFoundError(
+                f"config.yaml not found at {config_path}. "
+                "SWEBenchScorer.score() must be called from within a benchmark run "
+                "that has already written its config, or the path must be pre-populated."
+            )
+        with config_path.open() as f:
+            benchmark_cfg = yaml.safe_load(f)
+
+        model_name: str = benchmark_cfg["model_params"]["name"]
+        if self.dataset.dataframe is None:
+            raise RuntimeError(
+                "SWEBench dataset must be loaded before scoring; call dataset.load() first."
+            )
+
+        n_rows = len(self.dataset.dataframe)
+        if self.num_instances > n_rows:
+            logger.warning(
+                "num_instances=%d exceeds dataset size %d; evaluating %d instances",
+                self.num_instances,
+                n_rows,
+                n_rows,
+            )
+        slice_str = f"0:{min(self.num_instances, n_rows)}"
+
+        output_dir = self.report_dir / "swe_bench_output"
+        if output_dir.exists():
+            shutil.rmtree(output_dir)
+        output_dir.mkdir(parents=True)
+
+        patched_config = self._patch_config(output_dir, benchmark_cfg)
+
+        agent_cmd = [
+            "mini-extra",
+            "swebench",
+            "--model",
+            model_name,
+            "--config",
+            str(patched_config),
+            "--subset",
+            self.subset,
+            "--split",
+            self.split,
+            "--slice",
+            slice_str,
+            "--workers",
+            str(self.workers),
+            "--output",
+            str(output_dir),
+        ]
+        logger.info("Running mini-extra swebench: %s", " ".join(agent_cmd))
+        total_instances = min(self.num_instances, n_rows)
+        stop_event = threading.Event()
+        poll_thread = threading.Thread(
+            target=_poll_swebench_progress,
+            args=(output_dir, total_instances, stop_event),
+            daemon=True,
+        )
+        poll_thread.start()
+        try:
+            self._run_subprocess(
+                agent_cmd,
+                self.report_dir / "swe_bench_agent.log",
+                cwd=output_dir,
+            )
+        finally:
+            stop_event.set()
+            poll_thread.join(timeout=10)
+
+        preds_path = output_dir / "preds.json"
+        if not preds_path.exists():
+            logger.error(
+                "preds.json not found after mini-swe-agent run; returning None score"
+            )
+            return None, 1
+
+        hf_dataset_name = SWEBench.hf_dataset_name(self.subset)
+        run_id = f"endpoints_{uuid.uuid4().hex[:8]}"
+        eval_cmd = [
+            "python",
+            "-m",
+            "swebench.harness.run_evaluation",
+            "--dataset_name",
+            hf_dataset_name,
+            "--split",
+            self.split,
+            "--predictions_path",
+            str(preds_path),
+            "--max_workers",
+            str(self.max_eval_workers),
+            "--run_id",
+            run_id,
+        ]
+        logger.info("Running swebench evaluation: %s", " ".join(eval_cmd))
+        self._run_subprocess(
+            eval_cmd,
+            self.report_dir / "swe_bench_eval.log",
+            cwd=output_dir,
+        )
+
+        safe_model = model_name.replace("/", "__")
+        result_path = output_dir / f"{safe_model}.{run_id}.json"
+        if not result_path.exists():
+            candidates = list(output_dir.glob(f"*{run_id}*.json"))
+            if not candidates:
+                logger.error(
+                    "SWE-bench result file not found (run_id=%s); returning None",
+                    run_id,
+                )
+                return None, 1
+            result_path = candidates[0]
+
+        shutil.copy2(result_path, self.report_dir / "swe_bench_results.json")
+
+        result = msgspec.json.decode(result_path.read_bytes(), type=dict)
+        submitted = result.get("submitted_instances") or 0
+        resolved = result.get("resolved_instances") or 0
+        if submitted == 0:
+            logger.warning("SWE-bench: submitted_instances=0; returning None score")
+            return None, 1
+
+        resolved_rate = resolved / submitted
+        logger.info(
+            "SWE-bench: resolved %d / %d submitted (%.1f%%)",
+            resolved,
+            submitted,
+            resolved_rate * 100,
+        )
+        return resolved_rate, 1
diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
index 1c90554fb..ee7afd75e 100644
--- a/tests/unit/commands/test_benchmark.py
+++ b/tests/unit/commands/test_benchmark.py
@@ -22,6 +22,7 @@
 from types import SimpleNamespace
 from unittest.mock import MagicMock, patch
 
+import inference_endpoint.commands.benchmark.execute as execute_mod
 import pandas as pd
 import pytest
 from inference_endpoint.commands.benchmark.cli import (
@@ -34,6 +35,7 @@
     BenchmarkContext,
     ResponseCollector,
     _build_phases,
+    _load_datasets,
     _run_benchmark_async,
     setup_benchmark,
 )
@@ -62,6 +64,7 @@
 from inference_endpoint.config.utils import cli_error_formatter as _error_formatter
 from inference_endpoint.core.types import QueryResult
 from inference_endpoint.dataset_manager.dataset import Dataset
+from inference_endpoint.dataset_manager.predefined.swe_bench import SWEBench
 from inference_endpoint.endpoint_client.config import HTTPClientConfig
 from inference_endpoint.evaluation.scoring import Scorer
 from inference_endpoint.exceptions import InputValidationError, SetupError
@@ -78,6 +81,29 @@
     / "templates"
 )
 
+
+# Test-only scorers registered with leading-underscore IDs so TestScorerMethodSync excludes them.
+
+
+class _SelfContainedScorer(Scorer, scorer_id="_test_skip_endpoint_phase"):
+    SKIP_ENDPOINT_PHASE = True
+
+    def score_single_sample(self, value, ground_truth):
+        return 0.0
+
+    def score(self):
+        return 1.0, 1
+
+
+class _FailingPreflightScorer(Scorer, scorer_id="_test_failing_preflight"):
+    @classmethod
+    def preflight(cls, extras):
+        raise SetupError("mock preflight failure")
+
+    def score_single_sample(self, value, ground_truth):
+        return 0.0
+
+
 # Reusable minimal config kwargs
 _OFFLINE_KWARGS = {
     "endpoint_config": {"endpoints": ["http://test:8000"]},
@@ -132,6 +158,55 @@ def test_missing_model_name_raises(self):
                 datasets=[{"path": "test.jsonl"}],
             )
 
+    @pytest.mark.unit
+    def test_concurrency_injected_into_swe_bench_extras(self):
+        """target_concurrency is forwarded as workers into swe_bench_scorer extras."""
+        config = OnlineConfig(
+            endpoint_config={"endpoints": ["http://test:8000"]},
+            model_params={"name": "test-model"},
+            datasets=[
+                {
+                    "name": "swe_bench",
+                    "type": "accuracy",
+                    "accuracy_config": {"eval_method": "swe_bench_scorer"},
+                },
+                {"type": "performance", "path": "tests/assets/datasets/dummy_1k.jsonl"},
+            ],
+            settings={
+                "load_pattern": {"type": "concurrency", "target_concurrency": 32}
+            },
+        )
+        acc_ds = next(d for d in config.datasets if d.type == DatasetType.ACCURACY)
+        assert acc_ds.accuracy_config is not None
+        assert acc_ds.accuracy_config.extras is not None
+        assert acc_ds.accuracy_config.extras.get("workers") == 32
+
+    @pytest.mark.unit
+    def test_explicit_workers_not_overridden_by_concurrency(self):
+        """An explicit workers= in extras is not overwritten by target_concurrency."""
+        config = OnlineConfig(
+            endpoint_config={"endpoints": ["http://test:8000"]},
+            model_params={"name": "test-model"},
+            datasets=[
+                {
+                    "name": "swe_bench",
+                    "type": "accuracy",
+                    "accuracy_config": {
+                        "eval_method": "swe_bench_scorer",
+                        "extras": {"workers": 5},
+                    },
+                },
+                {"type": "performance", "path": "tests/assets/datasets/dummy_1k.jsonl"},
+            ],
+            settings={
+                "load_pattern": {"type": "concurrency", "target_concurrency": 32}
+            },
+        )
+        acc_ds = next(d for d in config.datasets if d.type == DatasetType.ACCURACY)
+        assert acc_ds.accuracy_config is not None
+        assert acc_ds.accuracy_config.extras is not None
+        assert acc_ds.accuracy_config.extras.get("workers") == 5
+
 
 class TestDurationSuffix:
     """Test duration suffix parsing (600s, 10m, 600000ms, plain int)."""
@@ -381,6 +456,91 @@ def test_validation_errors(self, overrides, match):
             )
 
 
+class TestAccuracyOnlyDataset:
+    """Test _load_datasets handling of accuracy-only datasets and accuracy configs."""
+
+    @pytest.mark.unit
+    def test_swe_bench_as_perf_raises(self, tmp_path):
+        fake_df = pd.DataFrame(
+            [{"instance_id": "repo__repo-0", "problem_statement": "Fix bug 0"}]
+        )
+        config = OfflineConfig(
+            endpoint_config={"endpoints": ["http://test:8000"]},
+            model_params={"name": "test-model"},
+            datasets=[{"name": "swe_bench"}],
+        )
+        with (
+            patch.object(SWEBench, "generate", return_value=fake_df),
+            pytest.raises(InputValidationError, match="accuracy-only"),
+        ):
+            _load_datasets(config, tmp_path)
+
+    @pytest.mark.unit
+    def test_preflight_error_propagates(self, tmp_path):
+        """A scorer whose preflight() raises SetupError must stop _load_datasets."""
+        dummy_jsonl = tmp_path / "dummy.jsonl"
+        dummy_jsonl.write_text('{"prompt": "hello"}\n')
+        fake_acc_df = pd.DataFrame(
+            [{"instance_id": "repo__repo-0", "prompt": "Fix bug 0"}]
+        )
+        config = OfflineConfig(
+            endpoint_config={"endpoints": ["http://test:8000"]},
+            model_params={"name": "test-model"},
+            datasets=[
+                {"type": "performance", "path": str(dummy_jsonl)},
+                {
+                    "name": "swe_bench",
+                    "type": "accuracy",
+                    "accuracy_config": {"eval_method": "swe_bench_scorer"},
+                },
+            ],
+        )
+        with (
+            patch.object(SWEBench, "generate", return_value=fake_acc_df),
+            patch.object(
+                execute_mod,
+                "_resolve_accuracy_components",
+                return_value=(_FailingPreflightScorer, None),
+            ),
+            pytest.raises(SetupError, match="mock preflight failure"),
+        ):
+            _load_datasets(config, tmp_path)
+
+    @pytest.mark.unit
+    def test_perf_dataset_with_accuracy_config_does_not_crash_load_datasets(
+        self, tmp_path
+    ):
+        """_load_datasets must not crash when perf dataset carries accuracy_config.
+
+        The perf-with-accuracy-config branch appends to eval_configs but not to
+        accuracy_datasets; a zip(strict=True) over both lists would raise ValueError.
+        """
+        dummy_jsonl = tmp_path / "dummy.jsonl"
+        dummy_jsonl.write_text('{"prompt": "hello"}\n')
+        config = OfflineConfig(
+            endpoint_config={"endpoints": ["http://test:8000"]},
+            model_params={"name": "test-model"},
+            datasets=[
+                {
+                    "type": "performance",
+                    "path": str(dummy_jsonl),
+                    "accuracy_config": {"eval_method": "swe_bench_scorer"},
+                },
+            ],
+        )
+        with patch.object(
+            execute_mod,
+            "_resolve_accuracy_components",
+            return_value=(_SelfContainedScorer, None),
+        ):
+            _, accuracy_datasets, eval_configs = _load_datasets(config, tmp_path)
+
+        # The perf dataset appends to eval_configs only, not accuracy_datasets.
+        assert len(accuracy_datasets) == 0
+        assert len(eval_configs) == 1
+        assert eval_configs[0].dataset_name == "performance"
+
+
 class TestYAMLTemplateValidation:
     """Validate all bundled YAML templates parse correctly."""
 
@@ -555,8 +715,6 @@ class TestAggregatorArgs:
     """Tests that metrics aggregator subprocess args are correctly forwarded."""
 
     def _make_ctx(self, config, tmp_path):
-        import random
-
         rt = RuntimeSettings(
             metric_target=Throughput(10.0),
             reported_metrics=[Throughput(10.0)],
@@ -967,6 +1125,27 @@ def test_accuracy_drain_timeout_defaults_to_unbounded(
         acc = next(p for p in phases if p.phase_type == PhaseType.ACCURACY)
         assert acc.drain_timeout is None
 
+    @pytest.mark.unit
+    def test_skip_endpoint_phase_omits_accuracy_phase(
+        self, base_rt_settings, simple_dataset, tmp_path
+    ):
+        """A scorer with SKIP_ENDPOINT_PHASE = True must not add an ACCURACY phase."""
+        config = OfflineConfig(**_OFFLINE_KWARGS)
+        ctx = self._make_ctx(config, base_rt_settings, simple_dataset)
+        ctx.eval_configs = [
+            AccuracyConfiguration(
+                scorer=_SelfContainedScorer,
+                extractor=None,
+                dataset_name="acc",
+                dataset=simple_dataset,
+                report_dir=tmp_path,
+                ground_truth_column=None,
+                num_repeats=1,
+            )
+        ]
+        phases = _build_phases(ctx)
+        assert all(p.phase_type != PhaseType.ACCURACY for p in phases)
+
     @pytest.mark.unit
     def test_warmup_uses_independent_rng_instances(
         self, base_rt_settings, simple_dataset
@@ -1052,7 +1231,8 @@ class TestScorerMethodSync:
     @pytest.mark.unit
     def test_scorer_enum_matches_registry(self):
         enum_values = {m.value for m in ScorerMethod}
-        registry_keys = set(Scorer.PREDEFINED.keys())
+        # Exclude test-only scorers (ids starting with "_")
+        registry_keys = {k for k in Scorer.PREDEFINED if not k.startswith("_")}
         assert enum_values == registry_keys, (
             f"ScorerMethod enum out of sync with Scorer registry.\n"
             f"  In enum only: {enum_values - registry_keys}\n"
diff --git a/tests/unit/dataset_manager/test_swe_bench_dataset.py b/tests/unit/dataset_manager/test_swe_bench_dataset.py
new file mode 100644
index 000000000..f82aeec7a
--- /dev/null
+++ b/tests/unit/dataset_manager/test_swe_bench_dataset.py
@@ -0,0 +1,108 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for SWEBench predefined dataset."""
+
+from pathlib import Path
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+from inference_endpoint.dataset_manager.dataset import Dataset
+from inference_endpoint.dataset_manager.predefined.swe_bench import SWEBench
+
+pytestmark = pytest.mark.unit
+
+_FAKE_INSTANCES = [
+    {"instance_id": f"repo__repo-{i}", "problem_statement": f"Fix bug {i}"}
+    for i in range(5)
+]
+
+
+def _make_hf_df() -> pd.DataFrame:
+    return pd.DataFrame(_FAKE_INSTANCES)
+
+
+class TestSWEBenchRegistration:
+    def test_registered(self):
+        assert "swe_bench" in Dataset.PREDEFINED
+        assert Dataset.PREDEFINED["swe_bench"] is SWEBench
+
+    def test_accuracy_only_flag(self):
+        assert SWEBench.ACCURACY_ONLY is True
+
+    @pytest.mark.parametrize(
+        ("subset", "expected"),
+        [
+            ("verified", "princeton-nlp/SWE-bench_Verified"),
+            ("lite", "princeton-nlp/SWE-bench_Lite"),
+        ],
+    )
+    def test_hf_dataset_name(self, subset: str, expected: str):
+        assert SWEBench.hf_dataset_name(subset) == expected
+
+    def test_hf_dataset_name_invalid_subset_raises(self):
+        with pytest.raises(ValueError, match="Unknown SWE-bench subset"):
+            SWEBench.hf_dataset_name("invalid")
+
+
+class TestSWEBenchGenerate:
+    def test_downloads_and_caches(self, tmp_path: Path):
+        with patch(
+            "inference_endpoint.dataset_manager.predefined.swe_bench.load_from_huggingface",
+            return_value=_make_hf_df(),
+        ) as mock_hf:
+            df1 = SWEBench.generate(datasets_dir=tmp_path)
+
+        assert mock_hf.call_count == 1
+        assert list(df1.columns) == ["instance_id", "prompt"]
+        assert len(df1) == 5
+        assert df1["prompt"].iloc[0] == "Fix bug 0"
+
+        # Second call should hit parquet cache, not HF
+        with patch(
+            "inference_endpoint.dataset_manager.predefined.swe_bench.load_from_huggingface",
+        ) as mock_hf2:
+            df2 = SWEBench.generate(datasets_dir=tmp_path)
+
+        mock_hf2.assert_not_called()
+        assert list(df2.columns) == ["instance_id", "prompt"]
+        assert len(df2) == 5
+
+    def test_unknown_subset_raises(self, tmp_path: Path):
+        with pytest.raises(ValueError, match="Unknown SWE-bench subset"):
+            SWEBench.generate(datasets_dir=tmp_path, subset="invalid")
+
+    def test_force_regenerate(self, tmp_path: Path):
+        with patch(
+            "inference_endpoint.dataset_manager.predefined.swe_bench.load_from_huggingface",
+            return_value=_make_hf_df(),
+        ) as mock_hf:
+            SWEBench.generate(datasets_dir=tmp_path)
+            assert mock_hf.call_count == 1
+
+            SWEBench.generate(datasets_dir=tmp_path, force=True)
+            assert mock_hf.call_count == 2
+
+    def test_lite_subset(self, tmp_path: Path):
+        with patch(
+            "inference_endpoint.dataset_manager.predefined.swe_bench.load_from_huggingface",
+            return_value=_make_hf_df(),
+        ) as mock_hf:
+            df = SWEBench.generate(datasets_dir=tmp_path, subset="lite")
+
+        # call_args.args is the positional-argument tuple of the most recent HF load call.
+        assert "princeton-nlp/SWE-bench_Lite" in mock_hf.call_args.args
+        assert len(df) == 5
diff --git a/tests/unit/evaluation/test_swe_bench_scorer.py b/tests/unit/evaluation/test_swe_bench_scorer.py
new file mode 100644
index 000000000..584ec4eed
--- /dev/null
+++ b/tests/unit/evaluation/test_swe_bench_scorer.py
@@ -0,0 +1,699 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for SWEBenchScorer."""
+
+import json
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import msgspec
+import pandas as pd
+import pytest
+import yaml
+from inference_endpoint.evaluation import scoring as scoring_mod
+from inference_endpoint.evaluation.scoring import (
+    Scorer,
+    SWEBenchScorer,
+)
+from inference_endpoint.exceptions import SetupError
+
+pytestmark = pytest.mark.unit
+
+_DATASET_NAME = "swe_bench_acc"
+_MODEL_NAME = "TestOrg/test-model-7b"
+
+
+def _write_benchmark_config(report_dir: Path, model_params: dict | None = None) -> None:
+    """Write report_dir/config.yaml; model_params entries override the defaults."""
+    # "name" defaults to _MODEL_NAME unless model_params supplies its own value.
+    merged_params: dict = {"name": _MODEL_NAME, **(model_params or {})}
+    benchmark_cfg = {
+        "model_params": merged_params,
+        "endpoint_config": {"endpoints": ["http://localhost:30000"]},
+    }
+    (report_dir / "config.yaml").write_text(yaml.dump(benchmark_cfg))
+
+
+def _write_sample_idx_map(report_dir: Path, n: int = 3) -> None:
+    idx_map = {_DATASET_NAME: {f"uuid-{i}": i for i in range(n)}}
+    (report_dir / "sample_idx_map.json").write_bytes(msgspec.json.encode(idx_map))
+
+
+def _make_dataset(n: int = 3) -> MagicMock:
+    """Return a MagicMock dataset with an n-row dataframe; num_samples() returns n."""
+    columns = {
+        "instance_id": [f"repo__repo-{idx}" for idx in range(n)],
+        "prompt": ["placeholder"] * n,
+    }
+    frame = pd.DataFrame(columns)
+    mock_ds = MagicMock()
+    mock_ds.dataframe = frame
+    mock_ds.num_samples.return_value = n
+    return mock_ds
+
+
+@pytest.fixture
+def swe_bench_project(tmp_path: Path) -> Path:
+    """Fake accuracy subproject directory with a minimal pyproject.toml."""
+    d = tmp_path / "accuracy"
+    d.mkdir(parents=True)
+    (d / "pyproject.toml").write_text("[project]\nname = 'swe-bench-accuracy'\n")
+    return d
+
+
+@pytest.fixture
+def template_yaml(tmp_path: Path) -> Path:
+    """Minimal swebench template YAML."""
+    tmpl = {
+        "model": {
+            "model_name": "",
+            "model_kwargs": {
+                "custom_llm_provider": "openai",
+                "api_base": "",
+            },
+        }
+    }
+    p = tmp_path / "swebench_template.yaml"
+    p.write_text(yaml.dump(tmpl))
+    return p
+
+
+@pytest.fixture
+def report_dir(tmp_path: Path) -> Path:
+    d = tmp_path / "report"
+    d.mkdir()
+    _write_benchmark_config(d)
+    _write_sample_idx_map(d)
+    return d
+
+
+def _make_fake_run(cmd, **kwargs):
+    """Fake subprocess.run: ignores its arguments; returncode=0 and empty stdout."""
+    return MagicMock(returncode=0, stdout="")
+
+
+def _make_staged_run(on_eval_cmd):
+    """Build a fake subprocess.run: mini-extra succeeds, the rest go to on_eval_cmd."""
+
+    def fake_run(cmd, **kwargs):
+        if "mini-extra" not in " ".join(cmd):
+            return on_eval_cmd(cmd, **kwargs)
+        preds_dir = Path(cmd[cmd.index("--output") + 1])
+        preds_dir.mkdir(parents=True, exist_ok=True)
+        (preds_dir / "preds.json").write_text(json.dumps({}))
+        return MagicMock(returncode=0, stdout="")
+
+    return fake_run
+
+
+@pytest.fixture
+def patch_subprocess(monkeypatch, report_dir: Path, swe_bench_project: Path):
+    """Patch subprocess.run to write fake preds.json and result JSON."""
+    captured: list[list[str]] = []
+
+    def fake_run(cmd, **kwargs):
+        captured.append(list(cmd))
+        cmd_str = " ".join(cmd)
+        if "mini-extra" in cmd_str:
+            output_dir = Path(cmd[cmd.index("--output") + 1])
+            output_dir.mkdir(parents=True, exist_ok=True)
+            (output_dir / "preds.json").write_text(json.dumps({}))
+        elif "run_evaluation" in cmd_str:
+            cwd = Path(kwargs["cwd"])
+            run_id = cmd[cmd.index("--run_id") + 1]
+            safe_model = _MODEL_NAME.replace("/", "__")
+            (cwd / f"{safe_model}.{run_id}.json").write_text(
+                json.dumps(
+                    {
+                        "resolved_instances": 3,
+                        "submitted_instances": 10,
+                        "total_instances": 500,
+                    }
+                )
+            )
+        return MagicMock(returncode=0, stdout="")
+
+    monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
+    return captured
+
+
+class TestSWEBenchScorerRegistration:
+    def test_registered(self):
+        assert "swe_bench_scorer" in Scorer.PREDEFINED
+        assert Scorer.get("swe_bench_scorer") is SWEBenchScorer
+
+    def test_skip_endpoint_phase(self):
+        assert SWEBenchScorer.SKIP_ENDPOINT_PHASE is True
+
+    def test_external_sample_count(self):
+        assert SWEBenchScorer.external_sample_count({"num_instances": 100}) == 100
+        assert SWEBenchScorer.external_sample_count({}) is None
+        assert SWEBenchScorer.external_sample_count({"num_instances": "bad"}) is None
+
+
+class TestSWEBenchScorer:
+    def test_score_happy_path(
+        self, report_dir, swe_bench_project, template_yaml, patch_subprocess
+    ):
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(),
+            report_dir=report_dir,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_yaml,
+        )
+        score, n_repeats = scorer.score()
+
+        assert score == pytest.approx(0.3)
+        assert n_repeats == 1
+        assert (report_dir / "swe_bench_results.json").exists()
+
+    def test_missing_subproject_raises_at_init(
+        self, report_dir, tmp_path, template_yaml
+    ):
+        empty_dir = tmp_path / "empty_project"
+        empty_dir.mkdir()
+        with pytest.raises(FileNotFoundError, match="SWE-bench subproject not found"):
+            SWEBenchScorer(
+                dataset_name=_DATASET_NAME,
+                dataset=_make_dataset(),
+                report_dir=report_dir,
+                swe_bench_project_path=empty_dir,
+                swebench_config_template=template_yaml,
+            )
+
+    def test_missing_template_raises_at_init(
+        self, report_dir, swe_bench_project, tmp_path
+    ):
+        nonexistent = tmp_path / "no_such_template.yaml"
+        with pytest.raises(FileNotFoundError, match="swebench template"):
+            SWEBenchScorer(
+                dataset_name=_DATASET_NAME,
+                dataset=_make_dataset(),
+                report_dir=report_dir,
+                swe_bench_project_path=swe_bench_project,
+                swebench_config_template=nonexistent,
+            )
+
+    def test_missing_preds_returns_none(
+        self, report_dir, swe_bench_project, template_yaml, monkeypatch
+    ):
+        monkeypatch.setattr(scoring_mod.subprocess, "run", _make_fake_run)
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(),
+            report_dir=report_dir,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_yaml,
+        )
+        score, n_repeats = scorer.score()
+        assert score is None
+        assert n_repeats == 1
+
+    def test_config_patching_all_fields(self, report_dir, swe_bench_project, tmp_path):
+        tmpl = {
+            "model": {
+                "model_name": "",
+                "model_kwargs": {
+                    "api_base": "",
+                    "temperature": None,
+                    "top_k": None,
+                },
+            }
+        }
+        template_path = tmp_path / "tmpl.yaml"
+        template_path.write_text(yaml.dump(tmpl))
+
+        _write_benchmark_config(
+            report_dir,
+            model_params={
+                "temperature": 0.8,
+                "top_p": 0.9,
+                "top_k": 15,
+                "chat_template_kwargs": {"preserve_thinking": True},
+            },
+        )
+
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(),
+            report_dir=report_dir,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_path,
+        )
+        output_dir = tmp_path / "out"
+        output_dir.mkdir()
+        with (report_dir / "config.yaml").open() as f:
+            benchmark_cfg = yaml.safe_load(f)
+        patched_path = scorer._patch_config(output_dir, benchmark_cfg)
+        patched = yaml.safe_load(patched_path.read_text())
+
+        assert patched["model"]["model_name"] == _MODEL_NAME
+        assert (
+            patched["model"]["model_kwargs"]["api_base"] == "http://localhost:30000/v1"
+        )
+        assert patched["model"]["model_kwargs"]["temperature"] == pytest.approx(0.8)
+        assert patched["model"]["model_kwargs"]["top_p"] == pytest.approx(0.9)
+        assert patched["model"]["model_kwargs"]["top_k"] == 15
+        assert patched["model"]["model_kwargs"]["chat_template_kwargs"] == {
+            "preserve_thinking": True
+        }
+
+    def test_config_patching_omits_none_fields(
+        self, report_dir, swe_bench_project, tmp_path
+    ):
+        tmpl = {
+            "model": {
+                "model_name": "",
+                "model_kwargs": {"api_base": "", "top_k": 20},
+            }
+        }
+        template_path = tmp_path / "tmpl.yaml"
+        template_path.write_text(yaml.dump(tmpl))
+
+        # model_params has no top_k — should be removed from patched config
+        _write_benchmark_config(report_dir)
+
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(),
+            report_dir=report_dir,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_path,
+        )
+        output_dir = tmp_path / "out"
+        output_dir.mkdir()
+        with (report_dir / "config.yaml").open() as f:
+            benchmark_cfg = yaml.safe_load(f)
+        patched_path = scorer._patch_config(output_dir, benchmark_cfg)
+        patched = yaml.safe_load(patched_path.read_text())
+
+        assert "top_k" not in patched["model"]["model_kwargs"]
+
+    def test_config_patching_max_new_tokens(
+        self, report_dir, swe_bench_project, tmp_path
+    ):
+        tmpl = {
+            "model": {
+                "model_name": "",
+                "model_kwargs": {"api_base": ""},
+            }
+        }
+        template_path = tmp_path / "tmpl.yaml"
+        template_path.write_text(yaml.dump(tmpl))
+
+        _write_benchmark_config(report_dir, model_params={"max_new_tokens": 4096})
+
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(),
+            report_dir=report_dir,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_path,
+        )
+        output_dir = tmp_path / "out"
+        output_dir.mkdir()
+        with (report_dir / "config.yaml").open() as f:
+            benchmark_cfg = yaml.safe_load(f)
+        patched_path = scorer._patch_config(output_dir, benchmark_cfg)
+        patched = yaml.safe_load(patched_path.read_text())
+
+        assert patched["model"]["model_kwargs"]["max_tokens"] == 4096
+
+    def test_config_patching_omits_max_tokens_when_not_set(
+        self, report_dir, swe_bench_project, tmp_path
+    ):
+        tmpl = {
+            "model": {
+                "model_name": "",
+                "model_kwargs": {"api_base": "", "max_tokens": 999},
+            }
+        }
+        template_path = tmp_path / "tmpl.yaml"
+        template_path.write_text(yaml.dump(tmpl))
+
+        # model_params has no max_new_tokens — max_tokens should be removed
+        _write_benchmark_config(report_dir)
+
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(),
+            report_dir=report_dir,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_path,
+        )
+        output_dir = tmp_path / "out"
+        output_dir.mkdir()
+        with (report_dir / "config.yaml").open() as f:
+            benchmark_cfg = yaml.safe_load(f)
+        patched_path = scorer._patch_config(output_dir, benchmark_cfg)
+        patched = yaml.safe_load(patched_path.read_text())
+
+        assert "max_tokens" not in patched["model"]["model_kwargs"]
+
+    @pytest.mark.parametrize(
+        ("num_instances", "expected_slice"),
+        [
+            (5, "0:5"),
+            (100, "0:100"),
+        ],
+    )
+    def test_num_instances_produces_correct_slice(
+        self,
+        num_instances,
+        expected_slice,
+        report_dir,
+        swe_bench_project,
+        template_yaml,
+        patch_subprocess,
+    ):
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(n=num_instances),
+            report_dir=report_dir,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_yaml,
+            num_instances=num_instances,
+        )
+        scorer.score()
+        agent_cmd = patch_subprocess[0]  # first captured command carries --slice
+        assert agent_cmd[agent_cmd.index("--slice") + 1] == expected_slice
+
+    @pytest.mark.parametrize(
+        ("subset", "expected_hf_name"),
+        [
+            ("lite", "princeton-nlp/SWE-bench_Lite"),
+            ("verified", "princeton-nlp/SWE-bench_Verified"),
+        ],
+    )
+    def test_subset_maps_to_correct_hf_dataset_name(
+        self,
+        subset,
+        expected_hf_name,
+        report_dir,
+        swe_bench_project,
+        template_yaml,
+        patch_subprocess,
+    ):
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(),
+            report_dir=report_dir,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_yaml,
+            subset=subset,
+        )
+        scorer.score()
+        eval_cmd = patch_subprocess[1]  # second captured command carries --dataset_name
+        assert eval_cmd[eval_cmd.index("--dataset_name") + 1] == expected_hf_name
+
+    def test_unknown_subset_raises_at_init(
+        self, report_dir, swe_bench_project, template_yaml
+    ):
+        with pytest.raises(ValueError, match="Unknown SWE-bench subset"):
+            SWEBenchScorer(
+                dataset_name=_DATASET_NAME,
+                dataset=_make_dataset(),
+                report_dir=report_dir,
+                swe_bench_project_path=swe_bench_project,
+                swebench_config_template=template_yaml,
+                subset="full",
+            )
+
+    def test_missing_model_name_raises_clear_error(self, swe_bench_project, tmp_path):
+        tmpl = {
+            "model": {
+                "model_name": "",
+                "model_kwargs": {"api_base": ""},
+            }
+        }
+        template_path = tmp_path / "tmpl.yaml"
+        template_path.write_text(yaml.dump(tmpl))
+
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(),
+            report_dir=tmp_path,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_path,
+        )
+        output_dir = tmp_path / "out"
+        output_dir.mkdir()
+
+        with pytest.raises(ValueError, match="model_params.name is required"):
+            scorer._patch_config(output_dir, {"model_params": {}})
+
+    def test_template_missing_model_kwargs_raises(
+        self, report_dir, swe_bench_project, tmp_path
+    ):
+        bad_template = tmp_path / "bad_template.yaml"
+        bad_template.write_text(yaml.dump({"model": {"model_name": ""}}))
+        with pytest.raises(ValueError, match="model.model_kwargs"):
+            SWEBenchScorer(
+                dataset_name=_DATASET_NAME,
+                dataset=_make_dataset(),
+                report_dir=report_dir,
+                swe_bench_project_path=swe_bench_project,
+                swebench_config_template=bad_template,
+            )
+
+    def test_subprocess_failure_raises(
+        self, report_dir, swe_bench_project, template_yaml, monkeypatch
+    ):
+        def _fail_eval(cmd, **kwargs):
+            return MagicMock(returncode=2, stdout="docker error: permission denied")
+
+        monkeypatch.setattr(
+            scoring_mod.subprocess,
+            "run",
+            _make_staged_run(_fail_eval),
+        )
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(),
+            report_dir=report_dir,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_yaml,
+        )
+        with pytest.raises(RuntimeError, match="exited with code 2"):
+            scorer.score()
+
+    def test_subprocess_timeout_raises(
+        self, report_dir, swe_bench_project, template_yaml, monkeypatch
+    ):
+        # Handler given to _make_staged_run: raises instead of returning.
+        def _raise_timeout(cmd, **kwargs):
+            raise scoring_mod.subprocess.TimeoutExpired(cmd=cmd, timeout=300)
+
+        staged_run = _make_staged_run(_raise_timeout)
+        monkeypatch.setattr(scoring_mod.subprocess, "run", staged_run)
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(),
+            report_dir=report_dir,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_yaml,
+        )
+        with pytest.raises(RuntimeError, match="timed out after"):
+            scorer.score()
+
+    def test_result_glob_fallback(
+        self, report_dir, swe_bench_project, template_yaml, monkeypatch
+    ):
+        # Handler given to _make_staged_run: writes a report for run_evaluation.
+        def _emit_alt_prefix_report(cmd, **kwargs):
+            if "run_evaluation" in " ".join(cmd):
+                run_dir = Path(kwargs["cwd"])
+                run_id = cmd[cmd.index("--run_id") + 1]
+                # The file name carries a different prefix, so an exact-name
+                # lookup won't match and only a glob will find it.
+                report_path = run_dir / f"alt_prefix.{run_id}.json"
+                summary = {
+                    "resolved_instances": 1,
+                    "submitted_instances": 5,
+                    "total_instances": 500,
+                }
+                report_path.write_text(json.dumps(summary))
+            return MagicMock(returncode=0, stdout="")
+
+        staged_run = _make_staged_run(_emit_alt_prefix_report)
+        monkeypatch.setattr(scoring_mod.subprocess, "run", staged_run)
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(),
+            report_dir=report_dir,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_yaml,
+        )
+        # The report lists 1 resolved out of 5 submitted instances.
+        resolved_rate, repeat_count = scorer.score()
+        assert resolved_rate == pytest.approx(1 / 5)
+        assert repeat_count == 1
+
+    def test_zero_submitted_instances_returns_none(
+        self, report_dir, swe_bench_project, template_yaml, monkeypatch
+    ):
+        # Handler given to _make_staged_run: writes a report for run_evaluation.
+        def _emit_empty_report(cmd, **kwargs):
+            if "run_evaluation" in " ".join(cmd):
+                run_dir = Path(kwargs["cwd"])
+                run_id = cmd[cmd.index("--run_id") + 1]
+                model_slug = _MODEL_NAME.replace("/", "__")
+                report_path = run_dir / f"{model_slug}.{run_id}.json"
+                summary = {
+                    "resolved_instances": 0,
+                    "submitted_instances": 0,
+                    "total_instances": 500,
+                }
+                report_path.write_text(json.dumps(summary))
+            return MagicMock(returncode=0, stdout="")
+
+        staged_run = _make_staged_run(_emit_empty_report)
+        monkeypatch.setattr(scoring_mod.subprocess, "run", staged_run)
+        scorer = SWEBenchScorer(
+            dataset_name=_DATASET_NAME,
+            dataset=_make_dataset(),
+            report_dir=report_dir,
+            swe_bench_project_path=swe_bench_project,
+            swebench_config_template=template_yaml,
+        )
+        # The report lists zero submitted instances: score() must hand back
+        # None as the score, with 1 as the second element of the pair.
+        outcome, repeat_count = scorer.score()
+        assert outcome is None
+        assert repeat_count == 1
+
+
+class TestSWEBenchScorerPreflight:
+    def _extras(self, swe_bench_project: Path, **overrides) -> dict:
+        # Overrides are unpacked last, so they take precedence on key clashes.
+        return {"swe_bench_project_path": str(swe_bench_project), **overrides}
+
+    def _tools_on_path(self, monkeypatch) -> None:
+        # Patch shutil.which (reached via scoring_mod) to resolve every name.
+        monkeypatch.setattr(
+            scoring_mod.shutil, "which", lambda name: f"/usr/bin/{name}"
+        )
+
+    def test_preflight_passes(self, swe_bench_project, monkeypatch):
+        # With every stubbed command succeeding, preflight must not raise.
+        self._tools_on_path(monkeypatch)
+        image = "docker.io/swebench/test:latest"
+        calls: list[list[str]] = []
+
+        def fake_run(cmd, **kw):
+            calls.append(list(cmd))
+            if "get_swebench_docker_image_name" in " ".join(cmd):
+                # The image-name lookup gets a JSON list of images on stdout.
+                return MagicMock(
+                    returncode=0,
+                    stdout=json.dumps([image]),
+                    stderr="",
+                )
+            # Every other command succeeds with empty output.
+            return MagicMock(returncode=0, stdout="", stderr=b"")
+
+        monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
+        extras = self._extras(
+            swe_bench_project, subset="lite", split="test", num_instances=2
+        )
+        SWEBenchScorer.preflight(extras)
+
+        derive_cmd = next(
+            cmd for cmd in calls if "get_swebench_docker_image_name" in " ".join(cmd)
+        )
+        # The overrides must arrive as the last three arguments, as strings.
+        assert derive_cmd[-3:] == ["lite", "test", "2"]
+        # No pull of the derived image may have been issued.
+        assert ["docker", "pull", image] not in calls
+
+    def test_preflight_fails_uv_missing(self, swe_bench_project, monkeypatch):
+        # shutil.which resolves nothing here, so uv cannot be located.
+        monkeypatch.setattr(scoring_mod.shutil, "which", lambda name: None)
+        extras = self._extras(swe_bench_project)
+        with pytest.raises(SetupError, match="uv is not on PATH"):
+            SWEBenchScorer.preflight(extras)
+
+    def test_preflight_fails_mini_extra_missing(self, swe_bench_project, monkeypatch):
+        # The SetupError must quote the stderr of the failing mini-extra call.
+        self._tools_on_path(monkeypatch)
+
+        def fake_run(cmd, **kw):
+            # Only a command carrying a "mini-extra" argument fails.
+            if "mini-extra" not in cmd:
+                return MagicMock(returncode=0)
+            return MagicMock(returncode=1, stderr=b"not found")
+
+        monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
+        expected = r"mini-extra is not available.*stderr: not found"
+        with pytest.raises(SetupError, match=expected):
+            SWEBenchScorer.preflight(self._extras(swe_bench_project))
+
+    def test_preflight_fails_swebench_missing(self, swe_bench_project, monkeypatch):
+        # The SetupError must quote the stderr of the failing import check.
+        self._tools_on_path(monkeypatch)
+
+        def fake_run(cmd, **kw):
+            # Only the command whose text contains "import swebench" fails.
+            if "import swebench" not in " ".join(cmd):
+                return MagicMock(returncode=0)
+            return MagicMock(returncode=1, stderr=b"ModuleNotFoundError")
+
+        monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
+        expected = r"swebench is not available.*stderr: ModuleNotFoundError"
+        with pytest.raises(SetupError, match=expected):
+            SWEBenchScorer.preflight(self._extras(swe_bench_project))
+
+    def test_preflight_fails_docker_not_running(self, swe_bench_project, monkeypatch):
+        # A failing docker command must be reported as a daemon that is down.
+        self._tools_on_path(monkeypatch)
+
+        def fake_run(cmd, **kw):
+            # Any command with a "docker" argument fails; the rest succeed.
+            if "docker" not in cmd:
+                return MagicMock(returncode=0)
+            return MagicMock(returncode=1, stderr=b"Cannot connect to Docker daemon")
+
+        monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
+        with pytest.raises(SetupError, match="Docker daemon is not running"):
+            SWEBenchScorer.preflight(self._extras(swe_bench_project))
+
+    def test_preflight_fails_when_pull_fails(self, swe_bench_project, monkeypatch):
+        # A failed pull must abort preflight with a SetupError.
+        self._tools_on_path(monkeypatch)
+        image = "docker.io/swebench/test:latest"
+
+        def fake_run(cmd, **kw):
+            if "get_swebench_docker_image_name" in " ".join(cmd):
+                # The image-name lookup succeeds and reports a single image.
+                return MagicMock(returncode=0, stdout=json.dumps([image]), stderr="")
+            if cmd[:3] == ["docker", "image", "inspect"]:
+                # Inspect fails; presumably read as "image not cached locally".
+                return MagicMock(returncode=1, stdout="", stderr=b"missing")
+            if cmd[:2] == ["docker", "pull"]:
+                # The pull fails as well.
+                return MagicMock(
+                    returncode=1,
+                    stdout="",
+                    stderr=b"rate limit exceeded",
+                )
+            return MagicMock(returncode=0, stdout="", stderr=b"")
+
+        monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
+        # Both the image reference and the pull's stderr must be in the message.
+        expected = r"docker\.io/swebench/test:latest.*rate limit exceeded"
+        with pytest.raises(SetupError, match=expected):
+            SWEBenchScorer.preflight(self._extras(swe_bench_project))
diff --git a/uv.lock b/uv.lock
index 984581b6b..b079ca900 100644
--- a/uv.lock
+++ b/uv.lock
@@ -810,6 +810,7 @@ dependencies = [
     { name = "pydantic", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "pydantic-core", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "pytz", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "pyyaml", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "pyzmq", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "rich", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "sentencepiece", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
@@ -894,6 +895,7 @@ requires-dist = [
     { name = "pytest-timeout", marker = "extra == 'test'", specifier = "==2.4.0" },
     { name = "pytest-xdist", marker = "extra == 'test'", specifier = "==3.8.0" },
     { name = "pytz", specifier = "==2026.1.post1" },
+    { name = "pyyaml", specifier = "==6.0.3" },
     { name = "pyzmq", specifier = "==27.1.0" },
     { name = "rich", specifier = "==14.3.3" },
     { name = "ruff", marker = "extra == 'dev'", specifier = "==0.15.8" },