mlcommons · tianmu-li · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
@@ -194,3 +194,29 @@ Update the first `datasets` entry (`name` and `path`), `model_params.name`, and
 uv run inference-endpoint benchmark from-config \
   --config examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml
 ```
+
+## SWE-bench Accuracy
+
+`swe_bench_accuracy.yaml` runs the SWE-bench accuracy evaluation alongside a
+minimal performance dataset. The benchmark framework skips its built-in
+accuracy phase for this dataset; instead, `SWEBenchScorer` shells out to
+`mini-swe-agent` and the `swebench` evaluation harness, and that external flow
+drives requests to the configured endpoint.
+
+The isolated `uv` environment for those tools lives in `accuracy/`. Sync it
+once before running:
+
+```bash
+cd examples/10_Agentic_Inference/accuracy
+uv sync
+```
+
+Then run the benchmark from the repo root:
+
+```bash
+uv run inference-endpoint benchmark from-config \
+  --config examples/10_Agentic_Inference/swe_bench_accuracy.yaml
+```
+
+See `accuracy/RUNBOOK.md` for preconditions, sanity checks, and common failure
+modes.
@@ -0,0 +1,54 @@
+# SWE-bench Accuracy Smoke-Test Runbook
+
+End-to-end validation for the SWE-bench accuracy pipeline. Unit tests mock all
+subprocesses, so running the real pipeline is the only way to catch Docker,
+HuggingFace access, or mini-swe-agent wiring issues.
+
+## 0. Preconditions
+
+- Docker daemon running (swebench harness spawns one container per instance).
+- Docker Hub auth or a pre-seeded image cache for uncached SWE-bench images.
+- Network egress to PyPI and HuggingFace Hub.
+- `uv` binary on PATH (`curl -LsSf https://astral.sh/uv/install.sh | sh`).
+- Parent endpoints env already synced (`uv sync --extra dev` from repo root).
+
+## 1. Sync the accuracy subproject
+
+From the repo root:
+
+```bash
+cd examples/10_Agentic_Inference/accuracy
+uv sync
+```
+
+Sanity check:
+
+```bash
+uv run mini-extra --help
+uv run python -m swebench.harness.run_evaluation --help
+```
+
+Override the default subproject path via env var if needed:
+
+```bash
+export SWE_BENCH_PROJECT_PATH=/path/to/examples/10_Agentic_Inference/accuracy
+```
+
+## 2. End-to-end test (requires live endpoint; run from the repo root)
+
+```bash
+uv run inference-endpoint benchmark from-config \
+  --config examples/10_Agentic_Inference/swe_bench_accuracy.yaml
+```
+
+Before `mini-extra swebench` starts, the scorer preflight resolves the requested
+SWE-bench instances and pre-pulls any missing Docker images. Images that are
+already cached locally are skipped.
+
+## 3. Common failure modes
+
+| Symptom                                              | Likely cause                          | Fix                                                       |
+| ---------------------------------------------------- | ------------------------------------- | --------------------------------------------------------- |
+| `FileNotFoundError: SWE-bench subproject not found`  | subproject not synced                 | Run `uv sync` in `examples/10_Agentic_Inference/accuracy` |
+| Docker error during `run_evaluation`                 | Docker daemon not running             | Start Docker and retry                                    |
+| `Failed to pre-pull required SWE-bench Docker image` | Docker Hub rate limit or missing auth | Run `docker login` or use a local image cache/mirror      |
@@ -0,0 +1,29 @@
+# Isolated uv project for the SWE-bench accuracy evaluator.
+#
+# mini-swe-agent and swebench pin specific versions of litellm, docker,
+# and other packages that are not part of the parent endpoints env. Keeping
+# the swebench env separate means the parent lockfile stays solvable and
+# the evaluation env stays reproducible.
+#
+# `inference_endpoint.evaluation.scoring.SWEBenchScorer` invokes
+# mini-extra and swebench.harness.run_evaluation via `uv run --project`,
+# so the main benchmark process never needs to import these packages.
+#
+# Usage on the accuracy host:
+#   cd examples/10_Agentic_Inference/accuracy
+#   uv sync
+#   # SWEBenchScorer in the parent will shell out automatically.
+
+[project]
+name = "swe-bench-accuracy"
+version = "0.1.0"
+description = "Isolated SWE-bench accuracy environment for the multi-turn agentic benchmark."
+requires-python = ">=3.12"
+dependencies = [ # Exact pins keep the evaluation env reproducible.
+    "mini-swe-agent==2.3.0", # Agent runner; invoked through the `mini-extra` command.
+    "swebench==4.1.0", # Evaluation harness (`swebench.harness.run_evaluation`).
+]
+
+[tool.uv]
+# This project only hosts an environment for `uv run`; uv must not build or install it as a package.
+package = false
@@ -23,6 +23,13 @@ datasets:
       num_trajectories_to_issue: 990 # Should be integer multiple of 990.
       # Required benchmark default; set to true only for faster optimization/debug runs.
       stop_issuing_on_first_user_complete: false
+  - name: swe_bench # Accuracy-only dataset entry; configured via accuracy_config below.
+    type: "accuracy"
+    accuracy_config:
+      eval_method: "swe_bench_scorer" # Presumably selects SWEBenchScorer (mini-swe-agent + swebench harness) - confirm.
+      num_repeats: 1
+      extras:
+        num_instances: 200 # Scorer-specific option; presumably the number of SWE-bench instances to run - confirm.
 
 settings:
   runtime:

@@ -0,0 +1,48 @@
+name: qwen-agentic-benchmark
+version: "1.0"
+type: online
+
+model_params:
+  name: Qwen/Qwen3.6-35B-A3B
+  temperature: 1.0
+  top_k: 20
+  top_p: 0.95
+  repetition_penalty: 1.0
+  presence_penalty: 1.5
+  max_new_tokens: 8192
+  chat_template_kwargs:
+    preserve_thinking: true
+
+datasets:
+  - name: agentic_coding
+    type: performance
+    path: /path/to/agentic_combined.jsonl
+    accuracy_config:
+      eval_method: agentic_inference_inline # Required benchmark default.
+    agentic_inference:
+      turn_timeout_s: 14400.0 # 4 hours.
+      enable_salt: true # Do not change.
+      inject_tool_delay: true # Do not change.
+  - name: swe_bench
+    type: accuracy
+    accuracy_config:
+      eval_method: swe_bench_scorer
+      num_repeats: 1
+      extras:
+        num_instances: 200
+
+settings:
+  runtime:
+    min_duration_ms: 0
+    max_duration_ms: 36000000 # 10 hours.
+
+  load_pattern:
+    type: agentic_inference
+    target_concurrency: 8 # Concurrency is submission-specific.
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:30000"
+  api_type: openai
+
+report_dir: logs/qwen_agentic
@@ -0,0 +1,42 @@
+type: online
+
+model_params:
+  name: Qwen/Qwen3.6-35B-A3B
+  temperature: 1.0
+  top_p: 0.95
+  top_k: 20
+  repetition_penalty: 1.0
+  presence_penalty: 1.5
+  max_new_tokens: 8192
+  chat_template_kwargs:
+    preserve_thinking: true
+
+datasets:
+  # The framework requires a performance dataset, so a minimal one is supplied.
+  - name: swe_bench_perf
+    type: performance
+    path: tests/assets/datasets/dummy_1k.jsonl
+    parser:
+      prompt: text_input
+
+  # Accuracy dataset: its instance_id rows select the instances mini-swe-agent runs.
+  # The first run fetches ~10 MB from HuggingFace and caches it in datasets_dir.
+  - name: swe_bench
+    type: accuracy
+    accuracy_config:
+      eval_method: swe_bench_scorer
+      num_repeats: 1
+      extras:
+        num_instances: 200
+
+settings:
+  load_pattern:
+    type: concurrency
+    target_concurrency: 10 # mini-extra inherits target_concurrency from the performance dataset.
+  runtime:
+    n_samples_to_issue: 10
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:30000"
+  api_type: openai