diff --git a/examples/10_Agentic_Inference/README.md b/examples/10_Agentic_Inference/README.md index ab3673b51..932e42db5 100644 --- a/examples/10_Agentic_Inference/README.md +++ b/examples/10_Agentic_Inference/README.md @@ -194,3 +194,29 @@ Update the first `datasets` entry (`name` and `path`), `model_params.name`, and uv run inference-endpoint benchmark from-config \ --config examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml ``` + +## SWE-bench Accuracy + +`swe_bench_accuracy.yaml` runs the SWE-bench accuracy evaluation alongside a +minimal performance dataset. The benchmark framework skips its built-in +accuracy phase for this dataset; instead, `SWEBenchScorer` shells out to +`mini-swe-agent` and the `swebench` evaluation harness, and that external flow +drives requests to the configured endpoint. + +The isolated `uv` environment for those tools lives in `accuracy/`. Sync it +once before running: + +```bash +cd examples/10_Agentic_Inference/accuracy +uv sync +``` + +Then run the benchmark from the repo root: + +```bash +uv run inference-endpoint benchmark from-config \ + --config examples/10_Agentic_Inference/swe_bench_accuracy.yaml +``` + +See `accuracy/RUNBOOK.md` for preconditions, sanity checks, and common failure +modes. diff --git a/examples/10_Agentic_Inference/accuracy/RUNBOOK.md b/examples/10_Agentic_Inference/accuracy/RUNBOOK.md new file mode 100644 index 000000000..7ad03c122 --- /dev/null +++ b/examples/10_Agentic_Inference/accuracy/RUNBOOK.md @@ -0,0 +1,54 @@ +# SWE-bench Accuracy Smoke-Test Runbook + +End-to-end validation for the SWE-bench accuracy pipeline. Unit tests mock all +subprocesses, so running the real pipeline is the only way to catch Docker, +HuggingFace access, or mini-swe-agent wiring issues. + +## 0. Preconditions + +- Docker daemon running (swebench harness spawns one container per instance). +- Docker Hub auth or a pre-seeded image cache for uncached SWE-bench images. +- Network egress to PyPI and HuggingFace Hub. 
+- `uv` binary on PATH (`curl -LsSf https://astral.sh/uv/install.sh | sh`). +- Parent endpoints env already synced (`uv sync --extra dev` from repo root). + +## 1. Sync the accuracy subproject + +From the repo root: + +```bash +cd examples/10_Agentic_Inference/accuracy +uv sync +``` + +Sanity check: + +```bash +uv run mini-extra --help +uv run python -m swebench.harness.run_evaluation --help +``` + +Override the default subproject path via env var if needed: + +```bash +export SWE_BENCH_PROJECT_PATH=/path/to/examples/10_Agentic_Inference/accuracy +``` + +## 2. End-to-end test (run from the repo root; requires live endpoint) + +```bash +uv run inference-endpoint benchmark from-config \ + --config examples/10_Agentic_Inference/swe_bench_accuracy.yaml +``` + +Scorer preflight now resolves the requested SWE-bench instances and pre-pulls any +missing Docker images before `mini-extra swebench` starts. Cached images are +skipped. + +## Common failure modes + +| Symptom | Likely cause | Fix | +| ---------------------------------------------------- | ------------------------------------- | --------------------------------------------------------- | +| `FileNotFoundError: SWE-bench subproject not found` | subproject not synced | Run `uv sync` in `examples/10_Agentic_Inference/accuracy` | +| Docker error during `run_evaluation` | Docker daemon not running | Start Docker and retry | +| `Failed to pre-pull required SWE-bench Docker image` | Docker Hub rate limit or missing auth | Run `docker login` or use a local image cache/mirror | diff --git a/examples/10_Agentic_Inference/accuracy/pyproject.toml b/examples/10_Agentic_Inference/accuracy/pyproject.toml new file mode 100644 index 000000000..14482a29f --- /dev/null +++ b/examples/10_Agentic_Inference/accuracy/pyproject.toml @@ -0,0 +1,29 @@ +# Isolated uv project for the SWE-bench accuracy evaluator. +# +# mini-swe-agent and swebench pin specific versions of litellm, docker, +# and other packages that are not part of the parent endpoints env. 
Keeping +# the swebench env separate means the parent lockfile stays solvable and +# the evaluation env stays reproducible. +# +# `inference_endpoint.evaluation.scoring.SWEBenchScorer` invokes +# mini-extra and swebench.harness.run_evaluation via `uv run --project`, +# so the main benchmark process never needs to import these packages. +# +# Usage on the accuracy host: +# cd examples/10_Agentic_Inference/accuracy +# uv sync +# # SWEBenchScorer in the parent will shell out automatically. + +[project] +name = "swe-bench-accuracy" +version = "0.1.0" +description = "Isolated SWE-bench accuracy environment for the multi-turn agentic benchmark." +requires-python = ">=3.12" +dependencies = [ + "mini-swe-agent==2.3.0", + "swebench==4.1.0", +] + +[tool.uv] +# Script-runner env: no build, no install of this project itself. +package = false diff --git a/examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml b/examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml index 9740aa4c1..b513d0849 100644 --- a/examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml +++ b/examples/10_Agentic_Inference/kimi_agentic_benchmark.yaml @@ -23,6 +23,13 @@ datasets: num_trajectories_to_issue: 990 # Should be integer multiple of 990. # Required benchmark default; set to true only for faster optimization/debug runs. 
stop_issuing_on_first_user_complete: false + - name: swe_bench + type: "accuracy" + accuracy_config: + eval_method: "swe_bench_scorer" + num_repeats: 1 + extras: + num_instances: 200 settings: runtime: diff --git a/examples/10_Agentic_Inference/qwen_agentic_benchmark.yaml b/examples/10_Agentic_Inference/qwen_agentic_benchmark.yaml new file mode 100644 index 000000000..415b3e68c --- /dev/null +++ b/examples/10_Agentic_Inference/qwen_agentic_benchmark.yaml @@ -0,0 +1,48 @@ +name: "qwen-agentic-benchmark" +version: "1.0" +type: "online" + +model_params: + name: "Qwen/Qwen3.6-35B-A3B" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + repetition_penalty: 1.0 + presence_penalty: 1.5 + max_new_tokens: 8192 + chat_template_kwargs: + preserve_thinking: true + +datasets: + - name: agentic_coding + type: performance + path: /path/to/agentic_combined.jsonl + accuracy_config: + eval_method: agentic_inference_inline # required benchmark default. + agentic_inference: + turn_timeout_s: 14400.0 + enable_salt: true # do not change. + inject_tool_delay: true # do not change. + - name: swe_bench + type: "accuracy" + accuracy_config: + eval_method: "swe_bench_scorer" + num_repeats: 1 + extras: + num_instances: 200 + +settings: + runtime: + min_duration_ms: 0 + max_duration_ms: 36000000 + + load_pattern: + type: agentic_inference + target_concurrency: 8 # Submission-specific concurrency. 
+ +endpoint_config: + endpoints: + - "http://localhost:30000" + api_type: openai + +report_dir: logs/qwen_agentic diff --git a/examples/10_Agentic_Inference/swe_bench_accuracy.yaml b/examples/10_Agentic_Inference/swe_bench_accuracy.yaml new file mode 100644 index 000000000..8508b0129 --- /dev/null +++ b/examples/10_Agentic_Inference/swe_bench_accuracy.yaml @@ -0,0 +1,42 @@ +type: "online" + +model_params: + name: "Qwen/Qwen3.6-35B-A3B" + temperature: 1.0 + top_p: 0.95 + top_k: 20 + repetition_penalty: 1.0 + presence_penalty: 1.5 + max_new_tokens: 8192 + chat_template_kwargs: + preserve_thinking: true + +datasets: + # Minimal performance dataset required by the framework. + - name: swe_bench_perf + type: "performance" + path: "tests/assets/datasets/dummy_1k.jsonl" + parser: + prompt: text_input + + # Accuracy dataset — instance_id rows tell mini-swe-agent which instances to run. + # First run downloads ~10 MB from HuggingFace and caches to datasets_dir. + - name: swe_bench + type: "accuracy" + accuracy_config: + eval_method: "swe_bench_scorer" + num_repeats: 1 + extras: + num_instances: 200 + +settings: + load_pattern: + type: "concurrency" + target_concurrency: 10 # also forwarded to mini-extra as `workers` unless accuracy_config.extras.workers is set + runtime: + n_samples_to_issue: 10 + +endpoint_config: + endpoints: + - "http://localhost:30000" + api_type: "openai" diff --git a/examples/10_Agentic_Inference/swebench_template.yaml b/examples/10_Agentic_Inference/swebench_template.yaml new file mode 100644 index 000000000..9b37ec5ba --- /dev/null +++ b/examples/10_Agentic_Inference/swebench_template.yaml @@ -0,0 +1,186 @@ +agent: + system_template: | + You are a helpful assistant that can interact with a computer shell to solve programming tasks. + instance_template: | + + Consider the following PR description: + {{task}} + + + + # Task Instructions + + ## Overview + + You're a software engineer interacting continuously with a computer by submitting commands. 
+ You'll be helping implement necessary changes to meet requirements in the PR description. + Your task is specifically to make changes to non-test files in the current directory in order to fix the issue described in the PR description in a way that is general and consistent with the codebase. + This is an interactive process where you will think and issue AT LEAST ONE command, see the result, then think and issue your next command(s). + + For each response: + + 1. Include a THOUGHT section explaining your reasoning and what you're trying to accomplish + 2. Provide one or more bash tool calls to execute + + ## Important Boundaries + + - MODIFY: Regular source code files in /testbed (this is the working directory for all your subsequent commands) + - DO NOT MODIFY: Tests, configuration files (pyproject.toml, setup.cfg, etc.) + + ## Recommended Workflow + + 1. Analyze the codebase by finding and reading relevant files + 2. Create a script to reproduce the issue + 3. Edit the source code to resolve the issue + 4. Verify your fix works by running your script again + 5. Test edge cases to ensure your fix is robust + + ## Command Execution Rules + + You are operating in an environment where + + 1. You issue at least one command + 2. The system executes the command(s) in a subshell + 3. You see the result(s) + 4. You write your next command(s) + + Each response should include: + + 1. **Reasoning text** where you explain your analysis and plan + 2. At least one tool call with your command + + **CRITICAL REQUIREMENTS:** + + - Your response SHOULD include reasoning text explaining what you're doing + - Your response MUST include AT LEAST ONE bash tool call. You can make MULTIPLE tool calls in a single response when the commands are independent (e.g., searching multiple files, reading different parts of the codebase). + - Directory or environment variable changes are not persistent. Every action is executed in a new subshell. 
+ - However, you can prefix any action with `MY_ENV_VAR=MY_VALUE cd /path/to/working/dir && ...` or write/load environment variables from files + + Example of a CORRECT response: + + I need to understand the Builder-related code. Let me find relevant files and check the project structure. + + [Makes multiple bash tool calls: {"command": "ls -la"}, {"command": "find src -name '*.java' | grep -i builder"}, {"command": "cat README.md | head -50"}] + + + ## Environment Details + + - You have a full Linux shell environment + - Always use non-interactive flags (-y, -f) for commands + - Avoid interactive tools like vi, nano, or any that require user input + - You can use bash commands or invoke any tool that is available in the environment + - You can also create new tools or scripts to help you with the task + - If a tool isn't available, you can also install it + + ## Submission + + When you've completed your work, you MUST submit your changes as a git patch. + Follow these steps IN ORDER, with SEPARATE commands: + + Step 1: Create the patch file + Run `git diff -- path/to/file1 path/to/file2 > patch.txt` listing only the source files you modified. + Do NOT commit your changes. + + + The patch must only contain changes to the specific source files you modified to fix the issue. + Do not submit file creations or changes to any of the following files: + + - test and reproduction files + - helper scripts, tests, or tools that you created + - installation, build, packaging, configuration, or setup scripts unless they are directly part of the issue you were fixing (you can assume that the environment is already set up for your client) + - binary or compiled files + + + Step 2: Verify your patch + Inspect patch.txt to confirm it only contains your intended changes and headers show `--- a/` and `+++ b/` paths. 
+ + Step 3: Submit (EXACT command required) + You MUST use this EXACT command to submit: + + ```bash + echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt + ``` + + If the command fails (nonzero exit status), it will not submit. + + + - Creating/viewing the patch and submitting it MUST be separate commands (not combined with &&). + - If you modify patch.txt after verifying, you SHOULD verify again before submitting. + - You CANNOT continue working (reading, editing, testing) in any way on this task after submitting. + + + step_limit: 250 + cost_limit: 3. + +environment: + cwd: "/testbed" + timeout: 3600 + interpreter: ["bash", "-c"] + env: + PAGER: cat + MANPAGER: cat + LESS: -R + PIP_PROGRESS_BAR: "off" + TQDM_DISABLE: "1" + environment_class: docker + pull_timeout: 3600 + container_timeout: 10h + +model: + cost_tracking: "ignore_errors" + observation_template: | + {% if output.exception_info -%} + {{output.exception_info}} + {% endif -%} + {{output.returncode}} + {% if output.output | length < 10000 -%} + + {{ output.output -}} + + {%- else -%} + + The output of your last command was too long. + Please try a different command that produces less output. + If you're looking at a file you can try use head, tail or sed to view a smaller number of lines selectively. + If you're using grep or find and it produced too much output, you can use a more selective search pattern. + If you really need to see something from the full command's output, you can redirect output to a file and then search in that file. + + {%- set elided_chars = output.output | length - 10000 -%} + + {{ output.output[:5000] }} + + + {{ elided_chars }} characters elided + + + {{ output.output[-5000:] }} + + {%- endif -%} + format_error_template: | + Tool call error: + + + {{error}} + + + Here is general guidance on how to submit correct toolcalls: + + Every response needs to use the 'bash' tool at least once to execute commands. 
+ + Call the bash tool with your command as the argument: + - Tool: bash + - Arguments: {"command": "your_command_here"} + + If you have completed your assignment, please consult the first message about how to + submit your solution (you will not be able to continue working on this task after that). + # Patched at runtime by SWEBenchScorer from model_params and endpoint_config + model_name: "" + model_kwargs: + custom_llm_provider: "openai" + api_key: "test" + drop_params: true + parallel_tool_calls: true + api_base: "" + # Sampling parameters (temperature, top_p, top_k, etc.) are injected at + # runtime from the benchmark config's model_params block — absent here so + # the model's own defaults apply when not specified in model_params. diff --git a/pyproject.toml b/pyproject.toml index 4a7655021..429cdcf4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ dependencies = [ # Fix pytz-2024 import warning "pytz==2026.1.post1", "urllib3==2.7.0", + "pyyaml==6.0.3", ] [project.optional-dependencies] diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py index a2050bbe3..13ee87632 100644 --- a/src/inference_endpoint/commands/benchmark/execute.py +++ b/src/inference_endpoint/commands/benchmark/execute.py @@ -285,6 +285,10 @@ def _load_datasets( ) assert acc_cfg.accuracy_config is not None + extras = acc_cfg.accuracy_config.extras or {} + + scorer_cls.preflight(extras) + ds = DataLoaderFactory.create_loader( acc_cfg, num_repeats=acc_cfg.accuracy_config.num_repeats ) @@ -299,7 +303,7 @@ def _load_datasets( report_dir, acc_cfg.accuracy_config.ground_truth, acc_cfg.accuracy_config.num_repeats, - acc_cfg.accuracy_config.extras or {}, + extras, ) ) ds.load( @@ -313,6 +317,14 @@ def _load_datasets( raise InputValidationError("Multiple performance datasets not supported") perf_cfg = performance_cfgs[0] + perf_cls = Dataset.PREDEFINED.get(perf_cfg.name) + if perf_cls is not None and 
perf_cls.ACCURACY_ONLY: + raise InputValidationError( + f"Dataset '{perf_cfg.name}' is accuracy-only and cannot be used " + "as a performance dataset. Use a different dataset (e.g. 'random') for the " + "performance phase." + ) + try: dataloader = DataLoaderFactory.create_loader(perf_cfg) dataloader.load( @@ -320,9 +332,7 @@ def _load_datasets( ) logger.info(f"Loaded {dataloader.num_samples()} samples") except FileNotFoundError as e: - raise InputValidationError( - f"Dataset file not found: {performance_cfgs[0].path}" - ) from e + raise InputValidationError(f"Dataset file not found: {perf_cfg.path}") from e except Exception as e: raise SetupError(f"Failed to load dataset: {e}") from e @@ -337,6 +347,7 @@ def _load_datasets( scorer_cls, extractor_cls = _resolve_accuracy_components( perf_cfg.name, accuracy_config ) + scorer_cls.preflight(accuracy_config.extras or {}) eval_configs.append( AccuracyConfiguration( @@ -399,8 +410,11 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo # Calculate and display expected sample count total_samples = rt_settings.total_samples_to_issue() - if accuracy_datasets: - total_samples += sum(ds.num_samples() * ds.repeats for ds in accuracy_datasets) + total_samples += sum( + ec.dataset.num_samples() * ec.dataset.repeats + for ec in eval_configs + if not ec.scorer.SKIP_ENDPOINT_PHASE and ec.dataset_name != "performance" + ) collect_responses = test_mode in (TestMode.ACC, TestMode.BOTH) logger.info( @@ -409,6 +423,16 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo logger.info( f"Min Duration: {rt_settings.min_duration_ms / 1000:.1f}s, Expected samples: {total_samples}" ) + for ec in eval_configs: + if ec.scorer.SKIP_ENDPOINT_PHASE: + n = ec.scorer.external_sample_count(ec.extras) + if n is not None: + logger.info( + "Accuracy dataset '%s' (%s): %d instances evaluated externally", + ec.dataset_name, + ec.scorer.SCORER_ID, + n, + ) return BenchmarkContext( config=config, @@ 
-477,6 +501,8 @@ def _build_phases( # Accuracy phases — use eval_cfg.dataset_name as phase name so it matches # what Scorer._load_sample_index_map() looks up in sample_idx_map.json for eval_cfg in ctx.eval_configs: + if eval_cfg.scorer.SKIP_ENDPOINT_PHASE: + continue if eval_cfg.dataset_name == "performance": continue acc_ds = eval_cfg.dataset @@ -905,8 +931,12 @@ def finalize_benchmark(ctx: BenchmarkContext, bench: BenchmarkResult) -> None: **eval_cfg.extras, ) score, n_repeats = scorer_instance.score() - assert eval_cfg.dataset.data is not None - num_samples = len(eval_cfg.dataset.data) + if eval_cfg.dataset.data is not None: + num_samples = len(eval_cfg.dataset.data) + elif eval_cfg.dataset.dataframe is not None: + num_samples = len(eval_cfg.dataset.dataframe) + else: + num_samples = 0 if eval_cfg.dataset_name == "performance": num_samples = sum(phase.issued_count for phase in result.perf_results) accuracy_scores[eval_cfg.dataset_name] = { diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index 9226d7f85..0b84b2da2 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -101,6 +101,7 @@ class ScorerMethod(str, Enum): SHOPIFY_CATEGORY_F1 = "shopify_category_f1" AGENTIC_INFERENCE_INLINE = "agentic_inference_inline" VBENCH = "vbench" + SWE_BENCH = "swe_bench_scorer" class TestMode(str, Enum): @@ -860,6 +861,34 @@ def _resolve_and_validate(self) -> Self: f"got '{lp.type}'" ) + # For swe_bench_scorer, forward target_concurrency as workers when the + # user has not set it explicitly. mini-swe-agent's parallelism should + # match the endpoint's concurrency budget. 
+ concurrency = ( + lp.target_concurrency + if lp.type + in (LoadPatternType.CONCURRENCY, LoadPatternType.AGENTIC_INFERENCE) + and lp.target_concurrency + else None + ) + if concurrency is not None and self.datasets: + updated_datasets = [] + changed = False + for ds in self.datasets: + acc = ds.accuracy_config + if ( + acc is not None + and acc.eval_method == ScorerMethod.SWE_BENCH + and (acc.extras is None or "workers" not in acc.extras) + ): + new_extras = {**(acc.extras or {}), "workers": concurrency} + new_acc = acc.model_copy(update={"extras": new_extras}) + ds = ds.model_copy(update={"accuracy_config": new_acc}) + changed = True + updated_datasets.append(ds) + if changed: + object.__setattr__(self, "datasets", updated_datasets) + return self @model_validator(mode="after") diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml index 38829f0f5..3239aa08d 100644 --- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml +++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml @@ -37,7 +37,7 @@ datasets: # Dataset configs prompt: question system: system_prompt accuracy_config: # Accuracy evaluation settings - eval_method: pass_at_1 # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench + eval_method: pass_at_1 # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench, swe_bench_scorer ground_truth: ground_truth # Ground truth column name extractor: boxed_math_extractor # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor) num_repeats: 1 # Repeat dataset N times for evaluation diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml index 
c3454d5da..476a27ef4 100644 --- a/src/inference_endpoint/config/templates/offline_template_full.yaml +++ b/src/inference_endpoint/config/templates/offline_template_full.yaml @@ -37,7 +37,7 @@ datasets: # Dataset configs prompt: question system: system_prompt accuracy_config: # Accuracy evaluation settings - eval_method: pass_at_1 # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench + eval_method: pass_at_1 # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench, swe_bench_scorer ground_truth: ground_truth # Ground truth column name extractor: boxed_math_extractor # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor) num_repeats: 1 # Repeat dataset N times for evaluation diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml index 5bea95329..266426f4d 100644 --- a/src/inference_endpoint/config/templates/online_template_full.yaml +++ b/src/inference_endpoint/config/templates/online_template_full.yaml @@ -37,7 +37,7 @@ datasets: # Dataset configs prompt: question system: system_prompt accuracy_config: # Accuracy evaluation settings - eval_method: pass_at_1 # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench + eval_method: pass_at_1 # Scorer method | options: pass_at_1, string_match, rouge, code_bench_scorer, shopify_category_f1, agentic_inference_inline, vbench, swe_bench_scorer ground_truth: ground_truth # Ground truth column name extractor: boxed_math_extractor # Answer extractor (abcd_extractor, boxed_math_extractor, identity_extractor, python_code_extractor) num_repeats: 1 # Repeat dataset N times for evaluation diff --git a/src/inference_endpoint/dataset_manager/__init__.py 
b/src/inference_endpoint/dataset_manager/__init__.py index 15525fb50..ac314d3f9 100644 --- a/src/inference_endpoint/dataset_manager/__init__.py +++ b/src/inference_endpoint/dataset_manager/__init__.py @@ -32,6 +32,7 @@ ShopifyProductCatalogue, ShopifyProductCatalogue8k, ) +from .predefined.swe_bench import SWEBench from .transforms import ( AddStaticColumns, ColumnFilter, @@ -63,5 +64,6 @@ "RandomDataset", "ShopifyProductCatalogue", "ShopifyProductCatalogue8k", + "SWEBench", "AgenticInferenceDataset", ] diff --git a/src/inference_endpoint/dataset_manager/dataset.py b/src/inference_endpoint/dataset_manager/dataset.py index 963ded391..2281f5184 100644 --- a/src/inference_endpoint/dataset_manager/dataset.py +++ b/src/inference_endpoint/dataset_manager/dataset.py @@ -276,6 +276,10 @@ class Dataset: DATASET_ID: ClassVar[str] """The unique identifier for the dataset. Automatically set by __init_subclass__.""" + ACCURACY_ONLY: ClassVar[bool] = False + """If True, this dataset may only be used as an accuracy dataset (type: accuracy). + Using it as a performance dataset raises InputValidationError at load time.""" + def __init_subclass__( cls, dataset_id: str | None = None, diff --git a/src/inference_endpoint/dataset_manager/predefined/swe_bench/__init__.py b/src/inference_endpoint/dataset_manager/predefined/swe_bench/__init__.py new file mode 100644 index 000000000..72b54383c --- /dev/null +++ b/src/inference_endpoint/dataset_manager/predefined/swe_bench/__init__.py @@ -0,0 +1,116 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from logging import getLogger +from pathlib import Path + +import pandas as pd + +from ...dataset import Dataset, load_from_huggingface + +logger = getLogger(__name__) + +_REPO_MAP = { + "verified": "princeton-nlp/SWE-bench_Verified", + "lite": "princeton-nlp/SWE-bench_Lite", +} + + +class SWEBench( + Dataset, + dataset_id="swe_bench", +): + """SWE-bench: Software Engineering Benchmark for LLM agents. + + Loads instance IDs and problem statements from the SWE-bench Verified or + Lite subset. Used as the accuracy dataset for the swe_bench_scorer, which + runs mini-swe-agent against a live endpoint and grades patches with the + SWE-bench evaluation harness. + + The ``instance_id`` column identifies which instances mini-swe-agent will + evaluate. The endpoint phase is skipped entirely for this scorer + (``SKIP_ENDPOINT_PHASE=True``); ``SWEBenchScorer`` drives the agent + subprocess directly against the configured endpoint. + + Using this dataset as a performance dataset (type: performance) is not + meaningful — problem statements sent directly to the model without an + agent framework don't reflect real SWE-bench usage. Use a different + dataset (e.g. ``random``) for the performance phase. 
+ """ + + ACCURACY_ONLY = True + COLUMN_NAMES = ["instance_id", "prompt"] + + @classmethod + def hf_dataset_name(cls, subset: str) -> str: + hf_path = _REPO_MAP.get(subset) + if hf_path is None: + raise ValueError( + f"Unknown SWE-bench subset {subset!r}; choose from: {list(_REPO_MAP)}" + ) + return hf_path + + @classmethod + def generate( + cls, + datasets_dir: Path, + subset: str = "verified", + force: bool = False, + ) -> pd.DataFrame: + """Download and cache the SWE-bench dataset from HuggingFace. + + Args: + datasets_dir: Root cache directory. Parquet is written under + ``datasets_dir/swe_bench/{subset}/``. + subset: ``"verified"`` (500 instances) or ``"lite"`` (300 instances). + force: Re-download even if the local parquet cache exists. + + Returns: + DataFrame with columns ``instance_id`` and ``prompt``. + """ + hf_path = cls.hf_dataset_name(subset) + + dst_path = datasets_dir / "swe_bench" / subset / f"swe_bench_{subset}.parquet" + if dst_path.exists() and not force: + logger.info("Loading SWE-bench %s from cache: %s", subset, dst_path) + try: + return pd.read_parquet(dst_path) + except Exception as e: + raise RuntimeError( + f"Cached SWE-bench parquet at {dst_path} appears corrupt ({e}). " + "Delete it or pass force=True to re-download." 
+ ) from e + + try: + df = load_from_huggingface( + hf_path, + split="test", + cache_dir=datasets_dir / "hf_cache" / f"swe_bench_{subset}", + ) + except Exception as e: + logger.error("Error loading SWE-bench %s from HuggingFace: %s", subset, e) + raise + + result = ( + df[["instance_id", "problem_statement"]] + .rename(columns={"problem_statement": "prompt"}) + .reset_index(drop=True) + ) + dst_path.parent.mkdir(parents=True, exist_ok=True) + result.to_parquet(dst_path) + logger.info( + "Saved %d SWE-bench %s instances to %s", len(result), subset, dst_path + ) + return result diff --git a/src/inference_endpoint/evaluation/scoring.py b/src/inference_endpoint/evaluation/scoring.py index f9419703a..54fec7619 100644 --- a/src/inference_endpoint/evaluation/scoring.py +++ b/src/inference_endpoint/evaluation/scoring.py @@ -10,7 +10,7 @@ # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific permissions and +# See the License for the specific language governing permissions and # limitations under the License. 
@@ -23,6 +23,7 @@ import subprocess import sys import tempfile +import threading import uuid from abc import ABC, abstractmethod from collections import Counter, defaultdict @@ -33,6 +34,7 @@ import msgspec.json import numpy as np import pandas as pd +import yaml from pydantic import ValidationError from tqdm import tqdm @@ -53,6 +55,8 @@ from ..dataset_manager.agentic_inference_dataset import AgenticInferenceDataset from ..dataset_manager.dataset import Dataset from ..dataset_manager.predefined.shopify_product_catalogue import ProductMetadata +from ..dataset_manager.predefined.swe_bench import SWEBench +from ..exceptions import SetupError from .extractor import Extractor, PythonCodeExtractor logger = logging.getLogger(__name__) @@ -67,6 +71,7 @@ class Scorer(ABC): PREDEFINED: ClassVar[dict[str, type["Scorer"]]] = {} SCORER_ID: ClassVar[str] REQUIRES_EXTRACTOR: ClassVar[bool] = True + SKIP_ENDPOINT_PHASE: ClassVar[bool] = False def __init_subclass__( cls, @@ -106,6 +111,21 @@ def available_scorers(cls) -> list[str]: """Return the list of registered scorer names.""" return list(Scorer.PREDEFINED.keys()) + @classmethod + def external_sample_count(cls, extras: dict[str, Any]) -> int | None: + """Return the number of samples the scorer will evaluate externally, or None. + + Used to surface sample counts for scorers that skip the endpoint phase and + manage their own evaluation (e.g. `SWEBenchScorer`). + The default returns None (scorer uses the endpoint accuracy phase normally). + """ + return None + + @classmethod # noqa: B027 — intentional no-op default; subclasses override when needed + def preflight(cls, extras: dict[str, Any]) -> None: + """Verify external dependencies before the benchmark starts. 
No-op by default.""" + pass + def __init__( self, dataset_name: str, @@ -122,7 +142,9 @@ def __init__( self.ground_truth_column = ( ground_truth_column if ground_truth_column is not None else "ground_truth" ) - self.sample_index_map = self._load_sample_index_map() + self.sample_index_map: dict | None = ( + None if self.SKIP_ENDPOINT_PHASE else self._load_sample_index_map() + ) def _load_sample_index_map(self): sample_index_map_path = self.report_dir / "sample_idx_map.json" @@ -163,6 +185,7 @@ def get_outputs(self): def match_sample_index(self, row: pd.Series) -> pd.Series: # Pandas Apply function to create a new 'sample_index' column + assert self.sample_index_map is not None row["sample_index"] = self.sample_index_map[row["sample_uuid"]] return row @@ -177,6 +200,10 @@ def score(self) -> tuple[float | None, int]: tuple[float | None, int]: The mean score and the number of repeats. Returns None as the score if evaluation fails. """ + assert self.sample_index_map is not None, ( + f"{self.__class__.__name__}.SKIP_ENDPOINT_PHASE is True but score() was not " + "overridden; override score() to implement external evaluation." 
+ ) df = self.get_outputs() # Outputs are for all samples, not just the target dataset @@ -273,6 +300,7 @@ def score(self) -> tuple[float, int]: df = self.get_outputs() # Outputs are for all samples, not just the target dataset + assert self.sample_index_map is not None valid_uuids = self.sample_index_map.keys() df = df[df["sample_uuid"].isin(valid_uuids)] @@ -1099,6 +1127,7 @@ def score(self) -> tuple[float | None, int]: df = self.get_outputs() # Outputs are for all samples, not just the target dataset + assert self.sample_index_map is not None valid_uuids = self.sample_index_map.keys() df = df[df["sample_uuid"].isin(valid_uuids)] @@ -1319,6 +1348,7 @@ def score_single_sample(self, value: str, ground_truth: str) -> float: def score(self) -> tuple[float, int]: df = self.get_outputs() + assert self.sample_index_map is not None valid_uuids = self.sample_index_map.keys() df = df[df["sample_uuid"].isin(valid_uuids)] df = df.apply(self.match_sample_index, axis=1) @@ -1369,6 +1399,20 @@ def score(self) -> tuple[float, int]: _VBENCH_PROJECT_PATH_ENV = "VBENCH_PROJECT_PATH" + +def _resolve_subproject_path( + explicit: str | os.PathLike | None, + env_var: str, + default: Path, +) -> Path: + if explicit is not None: + return Path(explicit) + from_env = os.environ.get(env_var) + if from_env: + return Path(from_env) + return default + + # Filenames in `vbench_standard` mode key on the prompt verbatim — VBench looks # the filename's prompt-prefix up in vbench_full_info.json. We can therefore # only reshape unsafe characters, not replace the prompt with a UUID. Slashes @@ -1470,18 +1514,10 @@ def __init__( def _resolve_project_path( explicit: os.PathLike | None, ) -> Path: - """Resolve the VBench subproject path. - - Lookup order: explicit ctor arg → ``$VBENCH_PROJECT_PATH`` env var → - editable-checkout fallback. The env var lets wheel-installed users - point at a synced subproject without patching source. 
- """ - if explicit is not None: - return Path(explicit) - from_env = os.environ.get(_VBENCH_PROJECT_PATH_ENV) - if from_env: - return Path(from_env) - return Path(_DEFAULT_VBENCH_PROJECT_PATH) + """Lookup order: explicit ctor arg → ``$VBENCH_PROJECT_PATH`` env var → editable-checkout fallback.""" + return _resolve_subproject_path( + explicit, _VBENCH_PROJECT_PATH_ENV, Path(_DEFAULT_VBENCH_PROJECT_PATH) + ) def score_single_sample(self, value: str, ground_truth: str) -> float: raise RuntimeError( @@ -1542,35 +1578,12 @@ def _run_vbench_subprocess( cmd += ["--full-info-json", self.full_info_json_path] log_path = self.report_dir / "vbench_subprocess.log" - try: - completed = subprocess.run( - cmd, - check=False, - stdin=subprocess.DEVNULL, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - timeout=self.subprocess_timeout_s, - ) - except subprocess.TimeoutExpired as e: - partial = ( - e.stdout - if isinstance(e.stdout, str) - else (e.stdout or b"").decode("utf-8", errors="replace") - ) - log_path.write_text(partial) - raise RuntimeError( - f"VBench subprocess timed out after {self.subprocess_timeout_s}s; " - f"see {log_path} for partial output." - ) from e - - log_path.write_text(completed.stdout or "") - if completed.returncode != 0: - tail = "\n".join((completed.stdout or "").splitlines()[-50:]) - raise RuntimeError( - f"VBench subprocess exited with code {completed.returncode}; " - f"full log at {log_path}. Last 50 lines:\n{tail}" - ) + _run_subprocess_with_log( + cmd, + log_path, + timeout_s=self.subprocess_timeout_s, + label="VBench", + ) def _extract_per_dim_scores(self, results: dict[str, Any]) -> list[float]: """Pull each requested dim's aggregate score, with clear errors. 
@@ -1600,6 +1613,7 @@ def _extract_per_dim_scores(self, results: dict[str, Any]) -> list[float]: def score(self) -> tuple[float | None, int]: df = self.get_outputs() + assert self.sample_index_map is not None valid_uuids = self.sample_index_map.keys() df = df[df["sample_uuid"].isin(valid_uuids)] # Drop failed queries: Scorer.get_outputs() emits "" when record.data @@ -1658,3 +1672,623 @@ def score(self) -> tuple[float | None, int]: per_dim_scores = self._extract_per_dim_scores(results) mean_score = float(np.mean(per_dim_scores)) return mean_score, n_repeats + + +def _run_subprocess_with_log( + cmd: list[str], + log_path: Path, + *, + timeout_s: int | None, + label: str, + cwd: Path | None = None, +) -> None: + """Run *cmd*, capture stdout+stderr to *log_path*, raise on timeout or non-zero exit.""" + try: + completed = subprocess.run( + cmd, + check=False, + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + timeout=timeout_s, + cwd=str(cwd) if cwd is not None else None, + ) + except subprocess.TimeoutExpired as e: + partial = ( + e.stdout + if isinstance(e.stdout, str) + else (e.stdout or b"").decode("utf-8", errors="replace") + ) + log_path.write_text(partial) + raise RuntimeError( + f"{label} subprocess timed out after {timeout_s}s; " + f"see {log_path} for partial output." + ) from e + log_path.write_text(completed.stdout or "") + if completed.returncode != 0: + tail = "\n".join((completed.stdout or "").splitlines()[-50:]) + raise RuntimeError( + f"{label} subprocess exited with code {completed.returncode}; " + f"full log at {log_path}. 
Last 50 lines:\n{tail}" + ) + + +_DEFAULT_SWE_BENCH_PROJECT_PATH = ( + Path(__file__).resolve().parents[3] + / "examples" + / "10_Agentic_Inference" + / "accuracy" +) +_SWE_BENCH_PROJECT_PATH_ENV = "SWE_BENCH_PROJECT_PATH" +_DEFAULT_SWE_BENCH_TEMPLATE = ( + Path(__file__).resolve().parents[3] + / "examples" + / "10_Agentic_Inference" + / "swebench_template.yaml" +) + + +def _read_swebench_exit_statuses( + output_dir: Path, ignore: frozenset[Path] +) -> dict[str, list[str]]: + """Read the newest exit_statuses_*.yaml not in *ignore*; return {} if none present.""" + files = [ + f for f in sorted(output_dir.glob("exit_statuses_*.yaml")) if f not in ignore + ] + if not files: + return {} + try: + data = yaml.safe_load(files[-1].read_text()) or {} + return data.get("instances_by_exit_status", {}) + except Exception: + logger.debug( + "Could not read %s for progress reporting", files[-1], exc_info=True + ) + return {} + + +def _poll_swebench_progress( + output_dir: Path, total: int, stop: threading.Event +) -> None: + """Poll exit_statuses_*.yaml and update a tqdm bar until stop is set.""" + # Snapshot pre-existing status files so stale data from prior runs is ignored. 
+ existing = frozenset(output_dir.glob("exit_statuses_*.yaml")) + with tqdm(total=total, desc="SWE-bench instances", unit="instance") as bar: + last = 0 + while not stop.is_set(): + statuses = _read_swebench_exit_statuses(output_dir, existing) + done = sum(len(v) for v in statuses.values()) + if done > last: + bar.update(done - last) + last = done + if statuses: + bar.set_postfix({k: len(v) for k, v in sorted(statuses.items())}) + if last >= total: + break + stop.wait(timeout=5.0) + statuses = _read_swebench_exit_statuses(output_dir, existing) + done = sum(len(v) for v in statuses.values()) + if done > last: + bar.update(done - last) + if statuses: + bar.set_postfix({k: len(v) for k, v in sorted(statuses.items())}) + + +def _decode_subprocess_stderr(stderr: bytes | str | None) -> str: + if stderr is None: + return "" + if isinstance(stderr, bytes): + return stderr.decode(errors="replace").strip() + return str(stderr).strip() + + +class SWEBenchScorer(Scorer, scorer_id="swe_bench_scorer"): + """SWE-bench accuracy scorer using the mini-extra CLI (mini-swe-agent package). + + Invokes ``mini-extra swebench`` and ``swebench.harness.run_evaluation`` via + ``uv run --project `` so the parent process never imports + them directly. Run ``uv sync`` in the subproject directory once before use. 
+ """ + + REQUIRES_EXTRACTOR: ClassVar[bool] = False + SKIP_ENDPOINT_PHASE: ClassVar[bool] = True + DEFAULT_SUBPROCESS_TIMEOUT_S: ClassVar[int] = 24 * 60 * 60 + DEFAULT_SUBSET: ClassVar[str] = "verified" + DEFAULT_SPLIT: ClassVar[str] = "test" + DEFAULT_NUM_INSTANCES: ClassVar[int] = 100 + PREPULL_TIMEOUT_S: ClassVar[int] = 10 * 60 + + def __init__( + self, + dataset_name: str, + dataset: Dataset, + report_dir: os.PathLike, + extractor: type[Extractor] | None = None, + ground_truth_column: str | None = "instance_id", + swe_bench_project_path: str | os.PathLike | None = None, + swebench_config_template: str | os.PathLike | None = None, + subset: str = DEFAULT_SUBSET, + split: str = DEFAULT_SPLIT, + num_instances: int = DEFAULT_NUM_INSTANCES, + workers: int = 10, + max_eval_workers: int = 10, + subprocess_timeout_s: int | None = None, + ): + super().__init__( + dataset_name=dataset_name, + dataset=dataset, + report_dir=report_dir, + extractor=extractor, + ground_truth_column=ground_truth_column, + ) + self.report_dir = self.report_dir.resolve() + self.swe_bench_project_path = self._resolve_project_path(swe_bench_project_path) + self.swebench_config_template = ( + Path(swebench_config_template) + if swebench_config_template is not None + else _DEFAULT_SWE_BENCH_TEMPLATE + ) + SWEBench.hf_dataset_name(subset) + self.subset = subset + self.split = split + self.num_instances = num_instances + self.workers = workers + self.max_eval_workers = max_eval_workers + self.subprocess_timeout_s = ( + subprocess_timeout_s + if subprocess_timeout_s is not None + else self.DEFAULT_SUBPROCESS_TIMEOUT_S + ) + + if not self.swebench_config_template.exists(): + raise FileNotFoundError( + f"swebench template not found: {self.swebench_config_template}. " + f"Pass swebench_config_template= in accuracy_config.extras." 
@classmethod
def _derive_required_images(
    cls,
    *,
    swe_bench_project_path: Path,
    subset: str,
    split: str,
    num_instances: int,
) -> list[str]:
    """Ask the accuracy subproject which Docker images the selected instances need.

    A small helper script is executed with ``uv run --project <subproject> python -c``
    so that ``datasets`` and ``minisweagent`` are imported in the subproject's
    environment rather than in this process. The script loads the dataset,
    keeps the first ``num_instances`` instances (via ``filter_instances`` with a
    ``0:N`` slice and no shuffling), maps each instance to its Docker image
    name, and prints the unique names, in first-seen order, as a JSON list.

    Args:
        swe_bench_project_path: Directory of the uv subproject to run inside.
        subset: SWE-bench subset name; passed to the script as ``argv[1]``.
        split: Dataset split; passed to the script as ``argv[2]``.
        num_instances: Upper bound on instances considered; ``argv[3]``.

    Returns:
        De-duplicated list of Docker image names.

    Raises:
        SetupError: If the helper exits non-zero, prints something that is not
            valid JSON, or prints JSON that is not a list of strings.
        subprocess.TimeoutExpired: If the helper exceeds ``PREPULL_TIMEOUT_S``
            (not caught here).
    """
    # The script must be newline-separated: a compound statement such as ``for``
    # cannot follow ``;`` on the same logical line, so a one-line
    # ``a; b; for ...: ...`` payload is rejected by Python with a SyntaxError
    # before anything runs. ``dict.fromkeys`` de-duplicates while keeping
    # first-seen order, which removes the need for an explicit loop.
    derive_script = "\n".join(
        (
            "import json, sys",
            "from datasets import load_dataset",
            "from minisweagent.run.benchmarks.swebench import (",
            "    DATASET_MAPPING,",
            "    filter_instances,",
            "    get_swebench_docker_image_name,",
            ")",
            "subset, split, num_instances = sys.argv[1], sys.argv[2], int(sys.argv[3])",
            "dataset_path = DATASET_MAPPING.get(subset, subset)",
            "instances = list(load_dataset(dataset_path, split=split))",
            "slice_spec = f'0:{min(num_instances, len(instances))}'",
            "instances = filter_instances(",
            "    instances, filter_spec='', slice_spec=slice_spec, shuffle=False",
            ")",
            "images = list(dict.fromkeys(",
            "    get_swebench_docker_image_name(instance) for instance in instances",
            "))",
            "print(json.dumps(images))",
        )
    )
    derive_cmd = [
        "uv",
        "run",
        "--project",
        str(swe_bench_project_path),
        "python",
        "-c",
        derive_script,
        subset,
        split,
        str(num_instances),
    ]
    result = subprocess.run(
        derive_cmd,
        check=False,
        capture_output=True,
        text=True,
        timeout=cls.PREPULL_TIMEOUT_S,
    )
    if result.returncode != 0:
        stderr_text = _decode_subprocess_stderr(result.stderr)
        raise SetupError(
            "Failed to derive required SWE-bench Docker images from the accuracy "
            f"subproject at {swe_bench_project_path}"
            + (f". stderr: {stderr_text}" if stderr_text else "")
        )
    try:
        images = json.loads(result.stdout or "[]")
    except json.JSONDecodeError as exc:
        stdout_text = (result.stdout or "").strip()
        raise SetupError(
            "Failed to parse the required SWE-bench Docker image list from the "
            f"accuracy subproject output: {stdout_text!r}"
        ) from exc
    # Guard against well-formed JSON of the wrong shape (e.g. an object or null).
    if not isinstance(images, list) or not all(
        isinstance(image, str) for image in images
    ):
        raise SetupError(
            "Accuracy subproject returned an invalid SWE-bench Docker image list."
        )
    return images
stderr_text = _decode_subprocess_stderr(result.stderr) + raise SetupError( + f"mini-extra is not available in the SWE-bench subproject at " + f"{swe_bench_project_path}. Run: cd {swe_bench_project_path} && uv sync" + + (f". stderr: {stderr_text}" if stderr_text else "") + ) + + swebench_result = subprocess.run( + [ + "uv", + "run", + "--project", + str(swe_bench_project_path), + "python", + "-c", + "import swebench", + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + timeout=30, + ) + if swebench_result.returncode != 0: + stderr_text = _decode_subprocess_stderr(swebench_result.stderr) + raise SetupError( + f"swebench is not available in the SWE-bench subproject at " + f"{swe_bench_project_path}. Run: cd {swe_bench_project_path} && uv sync" + + (f". stderr: {stderr_text}" if stderr_text else "") + ) + + if shutil.which("docker") is None: + raise SetupError("docker is not on PATH. Install Docker and retry.") + + try: + docker_result = subprocess.run( + ["docker", "version"], + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + timeout=10, + ) + except Exception as e: + raise SetupError(f"Failed to execute docker command: {e}") from e + + if docker_result.returncode != 0: + raise SetupError("Docker daemon is not running. Start Docker and retry.") + + images = cls._derive_required_images( + swe_bench_project_path=swe_bench_project_path, + subset=subset, + split=split, + num_instances=num_instances, + ) + cls._prepull_images(images) + + def score_single_sample(self, value: str, ground_truth: str) -> float: + raise RuntimeError( + "SWEBenchScorer uses subprocess evaluation; call score() instead." 
def _patch_config(self, output_dir: Path, benchmark_config_dict: dict) -> Path:
    """Render the mini-swe-agent config for this run and return its path.

    The template YAML is loaded, its ``model`` section is overwritten with the
    model name, endpoint URL, API key and sampling parameters taken from the
    benchmark config, and the result is written to
    ``output_dir/swebench_patched.yaml``.

    Sampling parameters that are absent (or None) in the benchmark config are
    removed from the template's ``model_kwargs`` rather than left at the
    template's value.

    Raises:
        ValueError: If ``model_params.name`` is missing or empty.
    """
    with self.swebench_config_template.open() as template_file:
        patched = yaml.safe_load(template_file)

    params = benchmark_config_dict.get("model_params") or {}
    endpoint_section = benchmark_config_dict.get("endpoint_config") or {}
    urls = endpoint_section.get("endpoints", [])

    name = params.get("name")
    if not name:
        raise ValueError(
            "model_params.name is required in the benchmark config but is missing or empty"
        )
    patched["model"]["model_name"] = name
    kwargs = patched["model"]["model_kwargs"]

    if not urls:
        kwargs["api_base"] = ""
    else:
        # Normalise so the base always ends in exactly one "/v1".
        root = urls[0].rstrip("/")
        if root.endswith("/v1"):
            root = root[: -len("/v1")]
        kwargs["api_base"] = root + "/v1"

    key = endpoint_section.get("api_key")
    if key:
        kwargs["api_key"] = key

    # (benchmark-config key, model_kwargs key), applied in this order; the order
    # matters because the YAML is dumped without key sorting.
    passthrough = (
        ("temperature", "temperature"),
        ("top_p", "top_p"),
        ("top_k", "top_k"),
        ("repetition_penalty", "repetition_penalty"),
        ("presence_penalty", "presence_penalty"),
        ("frequency_penalty", "frequency_penalty"),
        ("max_new_tokens", "max_tokens"),
        ("chat_template_kwargs", "chat_template_kwargs"),
    )
    for source_key, target_key in passthrough:
        setting = params.get(source_key)
        if setting is None:
            kwargs.pop(target_key, None)
        else:
            kwargs[target_key] = setting

    destination = output_dir / "swebench_patched.yaml"
    with destination.open("w") as out:
        yaml.safe_dump(patched, out, default_flow_style=False, sort_keys=False)
    return destination
def score(self) -> tuple[float | None, int]:
    """Run mini-swe-agent + swebench evaluation. Returns (resolved_rate, 1).

    Steps, in order:
      1. Read ``config.yaml`` from ``report_dir`` to obtain the model name and
         the settings used to patch the agent config.
      2. Recreate ``report_dir/swe_bench_output`` and write the patched agent
         config into it.
      3. Run ``mini-extra swebench`` while a background thread reports progress.
      4. Run ``swebench.harness.run_evaluation`` on the resulting ``preds.json``.
      5. Read the harness result file, copy it to
         ``report_dir/swe_bench_results.json`` and compute
         ``resolved_instances / submitted_instances``.

    Returns:
        ``(resolved_rate, 1)``; the score is ``None`` when ``preds.json`` or the
        result file is missing, or when no instances were submitted.

    Raises:
        FileNotFoundError: If ``config.yaml`` is not present in ``report_dir``.
        RuntimeError: If the dataset has not been loaded, or a subprocess
            times out / exits non-zero (raised by ``_run_subprocess``).
    """
    config_path = self.report_dir / "config.yaml"
    if not config_path.exists():
        raise FileNotFoundError(
            f"config.yaml not found at {config_path}. "
            "SWEBenchScorer.score() must be called from within a benchmark run "
            "that has already written its config, or the path must be pre-populated."
        )
    with config_path.open() as f:
        benchmark_cfg = yaml.safe_load(f)

    # NOTE(review): a config without model_params.name raises KeyError/TypeError
    # here, before _patch_config gets to raise its more descriptive ValueError.
    model_name: str = benchmark_cfg["model_params"]["name"]
    if self.dataset.dataframe is None:
        raise RuntimeError(
            "SWEBench dataset must be loaded before scoring; call dataset.load() first."
        )

    # Clamp the requested instance count to what the loaded dataset contains.
    n_rows = len(self.dataset.dataframe)
    if self.num_instances > n_rows:
        logger.warning(
            "num_instances=%d exceeds dataset size %d; evaluating %d instances",
            self.num_instances,
            n_rows,
            n_rows,
        )
    slice_str = f"0:{min(self.num_instances, n_rows)}"

    # Start from an empty output directory so artifacts of a previous run in the
    # same report_dir cannot be picked up below.
    output_dir = self.report_dir / "swe_bench_output"
    if output_dir.exists():
        shutil.rmtree(output_dir)
    output_dir.mkdir(parents=True)

    patched_config = self._patch_config(output_dir, benchmark_cfg)

    agent_cmd = [
        "mini-extra",
        "swebench",
        "--model",
        model_name,
        "--config",
        str(patched_config),
        "--subset",
        self.subset,
        "--split",
        self.split,
        "--slice",
        slice_str,
        "--workers",
        str(self.workers),
        "--output",
        str(output_dir),
    ]
    logger.info("Running mini-extra swebench: %s", " ".join(agent_cmd))
    total_instances = min(self.num_instances, n_rows)
    # Progress is reported from a daemon thread that polls output_dir; it is
    # signalled to stop (and briefly joined) whether or not the agent run succeeds.
    stop_event = threading.Event()
    poll_thread = threading.Thread(
        target=_poll_swebench_progress,
        args=(output_dir, total_instances, stop_event),
        daemon=True,
    )
    poll_thread.start()
    try:
        self._run_subprocess(
            agent_cmd,
            self.report_dir / "swe_bench_agent.log",
            cwd=output_dir,
        )
    finally:
        stop_event.set()
        poll_thread.join(timeout=10)

    # Without predictions there is nothing to evaluate; report "no score".
    preds_path = output_dir / "preds.json"
    if not preds_path.exists():
        logger.error(
            "preds.json not found after mini-swe-agent run; returning None score"
        )
        return None, 1

    hf_dataset_name = SWEBench.hf_dataset_name(self.subset)
    # Random run id so this evaluation's result file can be located afterwards.
    run_id = f"endpoints_{uuid.uuid4().hex[:8]}"
    eval_cmd = [
        "python",
        "-m",
        "swebench.harness.run_evaluation",
        "--dataset_name",
        hf_dataset_name,
        "--split",
        self.split,
        "--predictions_path",
        str(preds_path),
        "--max_workers",
        str(self.max_eval_workers),
        "--run_id",
        run_id,
    ]
    logger.info("Running swebench evaluation: %s", " ".join(eval_cmd))
    self._run_subprocess(
        eval_cmd,
        self.report_dir / "swe_bench_eval.log",
        cwd=output_dir,
    )

    # Look for "<model with / replaced by __>.<run_id>.json" first, then fall
    # back to any JSON file in output_dir whose name contains the run id.
    safe_model = model_name.replace("/", "__")
    result_path = output_dir / f"{safe_model}.{run_id}.json"
    if not result_path.exists():
        candidates = list(output_dir.glob(f"*{run_id}*.json"))
        if not candidates:
            logger.error(
                "SWE-bench result file not found (run_id=%s); returning None",
                run_id,
            )
            return None, 1
        # NOTE(review): glob order is not sorted; if several files match, which
        # one is used is arbitrary — confirm only one is ever produced.
        result_path = candidates[0]

    shutil.copy2(result_path, self.report_dir / "swe_bench_results.json")

    result = msgspec.json.decode(result_path.read_bytes(), type=dict)
    # Missing or null counters are treated as zero.
    submitted = result.get("submitted_instances") or 0
    resolved = result.get("resolved_instances") or 0
    if submitted == 0:
        logger.warning("SWE-bench: submitted_instances=0; returning None score")
        return None, 1

    # The rate is relative to submitted instances, not to the requested count.
    resolved_rate = resolved / submitted
    logger.info(
        "SWE-bench: resolved %d / %d submitted (%.1f%%)",
        resolved,
        submitted,
        resolved_rate * 100,
    )
    return resolved_rate, 1
b/tests/unit/commands/test_benchmark.py @@ -22,6 +22,7 @@ from types import SimpleNamespace from unittest.mock import MagicMock, patch +import inference_endpoint.commands.benchmark.execute as execute_mod import pandas as pd import pytest from inference_endpoint.commands.benchmark.cli import ( @@ -34,6 +35,7 @@ BenchmarkContext, ResponseCollector, _build_phases, + _load_datasets, _run_benchmark_async, setup_benchmark, ) @@ -62,6 +64,7 @@ from inference_endpoint.config.utils import cli_error_formatter as _error_formatter from inference_endpoint.core.types import QueryResult from inference_endpoint.dataset_manager.dataset import Dataset +from inference_endpoint.dataset_manager.predefined.swe_bench import SWEBench from inference_endpoint.endpoint_client.config import HTTPClientConfig from inference_endpoint.evaluation.scoring import Scorer from inference_endpoint.exceptions import InputValidationError, SetupError @@ -78,6 +81,29 @@ / "templates" ) + +# Test-only scorers registered with leading-underscore IDs so TestScorerMethodSync excludes them. 
+ + +class _SelfContainedScorer(Scorer, scorer_id="_test_skip_endpoint_phase"): + SKIP_ENDPOINT_PHASE = True + + def score_single_sample(self, value, ground_truth): + return 0.0 + + def score(self): + return 1.0, 1 + + +class _FailingPreflightScorer(Scorer, scorer_id="_test_failing_preflight"): + @classmethod + def preflight(cls, extras): + raise SetupError("mock preflight failure") + + def score_single_sample(self, value, ground_truth): + return 0.0 + + # Reusable minimal config kwargs _OFFLINE_KWARGS = { "endpoint_config": {"endpoints": ["http://test:8000"]}, @@ -132,6 +158,55 @@ def test_missing_model_name_raises(self): datasets=[{"path": "test.jsonl"}], ) + @pytest.mark.unit + def test_concurrency_injected_into_swe_bench_extras(self): + """target_concurrency is forwarded as workers into swe_bench_scorer extras.""" + config = OnlineConfig( + endpoint_config={"endpoints": ["http://test:8000"]}, + model_params={"name": "test-model"}, + datasets=[ + { + "name": "swe_bench", + "type": "accuracy", + "accuracy_config": {"eval_method": "swe_bench_scorer"}, + }, + {"type": "performance", "path": "tests/assets/datasets/dummy_1k.jsonl"}, + ], + settings={ + "load_pattern": {"type": "concurrency", "target_concurrency": 32} + }, + ) + acc_ds = next(d for d in config.datasets if d.type == DatasetType.ACCURACY) + assert acc_ds.accuracy_config is not None + assert acc_ds.accuracy_config.extras is not None + assert acc_ds.accuracy_config.extras.get("workers") == 32 + + @pytest.mark.unit + def test_explicit_workers_not_overridden_by_concurrency(self): + """An explicit workers= in extras is not overwritten by target_concurrency.""" + config = OnlineConfig( + endpoint_config={"endpoints": ["http://test:8000"]}, + model_params={"name": "test-model"}, + datasets=[ + { + "name": "swe_bench", + "type": "accuracy", + "accuracy_config": { + "eval_method": "swe_bench_scorer", + "extras": {"workers": 5}, + }, + }, + {"type": "performance", "path": "tests/assets/datasets/dummy_1k.jsonl"}, 
+ ], + settings={ + "load_pattern": {"type": "concurrency", "target_concurrency": 32} + }, + ) + acc_ds = next(d for d in config.datasets if d.type == DatasetType.ACCURACY) + assert acc_ds.accuracy_config is not None + assert acc_ds.accuracy_config.extras is not None + assert acc_ds.accuracy_config.extras.get("workers") == 5 + class TestDurationSuffix: """Test duration suffix parsing (600s, 10m, 600000ms, plain int).""" @@ -381,6 +456,91 @@ def test_validation_errors(self, overrides, match): ) +class TestAccuracyOnlyDataset: + """Test that datasets with ACCURACY_ONLY=True are rejected as perf datasets.""" + + @pytest.mark.unit + def test_swe_bench_as_perf_raises(self, tmp_path): + fake_df = pd.DataFrame( + [{"instance_id": "repo__repo-0", "problem_statement": "Fix bug 0"}] + ) + config = OfflineConfig( + endpoint_config={"endpoints": ["http://test:8000"]}, + model_params={"name": "test-model"}, + datasets=[{"name": "swe_bench"}], + ) + with ( + patch.object(SWEBench, "generate", return_value=fake_df), + pytest.raises(InputValidationError, match="accuracy-only"), + ): + _load_datasets(config, tmp_path) + + @pytest.mark.unit + def test_preflight_error_propagates(self, tmp_path): + """A scorer whose preflight() raises SetupError must stop _load_datasets.""" + dummy_jsonl = tmp_path / "dummy.jsonl" + dummy_jsonl.write_text('{"prompt": "hello"}\n') + fake_acc_df = pd.DataFrame( + [{"instance_id": "repo__repo-0", "prompt": "Fix bug 0"}] + ) + config = OfflineConfig( + endpoint_config={"endpoints": ["http://test:8000"]}, + model_params={"name": "test-model"}, + datasets=[ + {"type": "performance", "path": str(dummy_jsonl)}, + { + "name": "swe_bench", + "type": "accuracy", + "accuracy_config": {"eval_method": "swe_bench_scorer"}, + }, + ], + ) + with ( + patch.object(SWEBench, "generate", return_value=fake_acc_df), + patch.object( + execute_mod, + "_resolve_accuracy_components", + return_value=(_FailingPreflightScorer, None), + ), + pytest.raises(SetupError, match="mock 
preflight failure"), + ): + _load_datasets(config, tmp_path) + + @pytest.mark.unit + def test_perf_dataset_with_accuracy_config_does_not_crash_load_datasets( + self, tmp_path + ): + """_load_datasets must not crash when perf dataset carries accuracy_config. + + The perf-with-accuracy-config branch appends to eval_configs but not to + accuracy_datasets; a zip(strict=True) over both lists would raise ValueError. + """ + dummy_jsonl = tmp_path / "dummy.jsonl" + dummy_jsonl.write_text('{"prompt": "hello"}\n') + config = OfflineConfig( + endpoint_config={"endpoints": ["http://test:8000"]}, + model_params={"name": "test-model"}, + datasets=[ + { + "type": "performance", + "path": str(dummy_jsonl), + "accuracy_config": {"eval_method": "swe_bench_scorer"}, + }, + ], + ) + with patch.object( + execute_mod, + "_resolve_accuracy_components", + return_value=(_SelfContainedScorer, None), + ): + _, accuracy_datasets, eval_configs = _load_datasets(config, tmp_path) + + # The perf dataset appends to eval_configs only, not accuracy_datasets. 
+ assert len(accuracy_datasets) == 0 + assert len(eval_configs) == 1 + assert eval_configs[0].dataset_name == "performance" + + class TestYAMLTemplateValidation: """Validate all bundled YAML templates parse correctly.""" @@ -555,8 +715,6 @@ class TestAggregatorArgs: """Tests that metrics aggregator subprocess args are correctly forwarded.""" def _make_ctx(self, config, tmp_path): - import random - rt = RuntimeSettings( metric_target=Throughput(10.0), reported_metrics=[Throughput(10.0)], @@ -967,6 +1125,27 @@ def test_accuracy_drain_timeout_defaults_to_unbounded( acc = next(p for p in phases if p.phase_type == PhaseType.ACCURACY) assert acc.drain_timeout is None + @pytest.mark.unit + def test_skip_endpoint_phase_omits_accuracy_phase( + self, base_rt_settings, simple_dataset + ): + config = OfflineConfig(**_OFFLINE_KWARGS) + ctx = self._make_ctx(config, base_rt_settings, simple_dataset) + ctx.eval_configs = [ + AccuracyConfiguration( + scorer=_SelfContainedScorer, + extractor=None, + dataset_name="acc", + dataset=simple_dataset, + report_dir=Path("/tmp"), + ground_truth_column=None, + num_repeats=1, + ) + ] + phases = _build_phases(ctx) + + assert all(p.phase_type != PhaseType.ACCURACY for p in phases) + @pytest.mark.unit def test_warmup_uses_independent_rng_instances( self, base_rt_settings, simple_dataset @@ -1052,7 +1231,8 @@ class TestScorerMethodSync: @pytest.mark.unit def test_scorer_enum_matches_registry(self): enum_values = {m.value for m in ScorerMethod} - registry_keys = set(Scorer.PREDEFINED.keys()) + # Exclude test-only scorers (ids starting with "_") + registry_keys = {k for k in Scorer.PREDEFINED if not k.startswith("_")} assert enum_values == registry_keys, ( f"ScorerMethod enum out of sync with Scorer registry.\n" f" In enum only: {enum_values - registry_keys}\n" diff --git a/tests/unit/dataset_manager/test_swe_bench_dataset.py b/tests/unit/dataset_manager/test_swe_bench_dataset.py new file mode 100644 index 000000000..f82aeec7a --- /dev/null +++ 
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unit tests for SWEBench predefined dataset."""

from pathlib import Path
from unittest.mock import patch

import pandas as pd
import pytest
from inference_endpoint.dataset_manager.dataset import Dataset
from inference_endpoint.dataset_manager.predefined.swe_bench import SWEBench

pytestmark = pytest.mark.unit

# Dotted path of the HuggingFace loader as referenced by the module under test;
# every test patches this name so no real download is attempted.
_HF_LOADER_TARGET = (
    "inference_endpoint.dataset_manager.predefined.swe_bench.load_from_huggingface"
)

# Column layout the tests expect ``SWEBench.generate`` to return.
_EXPECTED_COLUMNS = ["instance_id", "prompt"]

_FAKE_INSTANCES = [
    {"instance_id": f"repo__repo-{i}", "problem_statement": f"Fix bug {i}"}
    for i in range(5)
]


def _make_hf_df() -> pd.DataFrame:
    """Build the five-row frame the patched HuggingFace loader hands back."""
    return pd.DataFrame(_FAKE_INSTANCES)


class TestSWEBenchRegistration:
    """Registry wiring and subset-name mapping of the ``SWEBench`` dataset."""

    def test_registered(self):
        """``swe_bench`` is a key of ``Dataset.PREDEFINED`` and maps to ``SWEBench``."""
        assert "swe_bench" in Dataset.PREDEFINED
        assert Dataset.PREDEFINED["swe_bench"] is SWEBench

    def test_accuracy_only_flag(self):
        """The class-level ``ACCURACY_ONLY`` marker is set."""
        assert SWEBench.ACCURACY_ONLY is True

    @pytest.mark.parametrize(
        ("subset", "expected"),
        [
            ("verified", "princeton-nlp/SWE-bench_Verified"),
            ("lite", "princeton-nlp/SWE-bench_Lite"),
        ],
    )
    def test_hf_dataset_name(self, subset: str, expected: str):
        """Each supported subset resolves to its HuggingFace dataset id."""
        resolved = SWEBench.hf_dataset_name(subset)
        assert resolved == expected

    def test_hf_dataset_name_invalid_subset_raises(self):
        """An unsupported subset name is rejected with ``ValueError``."""
        with pytest.raises(ValueError, match="Unknown SWE-bench subset"):
            SWEBench.hf_dataset_name("invalid")


class TestSWEBenchGenerate:
    """Behaviour of ``SWEBench.generate`` with the HuggingFace loader patched."""

    def test_downloads_and_caches(self, tmp_path: Path):
        """First call invokes the loader; a second call on the same dir does not."""
        with patch(_HF_LOADER_TARGET, return_value=_make_hf_df()) as first_loader:
            fresh = SWEBench.generate(datasets_dir=tmp_path)

        assert first_loader.call_count == 1
        assert list(fresh.columns) == _EXPECTED_COLUMNS
        assert len(fresh) == 5
        assert fresh["prompt"].iloc[0] == "Fix bug 0"

        # Second call should hit parquet cache, not HF
        with patch(_HF_LOADER_TARGET) as second_loader:
            cached = SWEBench.generate(datasets_dir=tmp_path)

        second_loader.assert_not_called()
        assert list(cached.columns) == _EXPECTED_COLUMNS
        assert len(cached) == 5

    def test_unknown_subset_raises(self, tmp_path: Path):
        """``generate`` rejects an unsupported subset with ``ValueError``."""
        with pytest.raises(ValueError, match="Unknown SWE-bench subset"):
            SWEBench.generate(datasets_dir=tmp_path, subset="invalid")

    def test_force_regenerate(self, tmp_path: Path):
        """``force=True`` calls the loader again on an already-populated dir."""
        with patch(_HF_LOADER_TARGET, return_value=_make_hf_df()) as loader:
            SWEBench.generate(datasets_dir=tmp_path)
            assert loader.call_count == 1

            SWEBench.generate(datasets_dir=tmp_path, force=True)
            assert loader.call_count == 2

    def test_lite_subset(self, tmp_path: Path):
        """The ``lite`` subset passes the Lite dataset id positionally to the loader."""
        with patch(_HF_LOADER_TARGET, return_value=_make_hf_df()) as loader:
            frame = SWEBench.generate(datasets_dir=tmp_path, subset="lite")

        positional_args = loader.call_args.args
        assert "princeton-nlp/SWE-bench_Lite" in positional_args
        assert len(frame) == 5
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unit tests for SWEBenchScorer."""

import json
from pathlib import Path
from unittest.mock import MagicMock

import msgspec
import pandas as pd
import pytest
import yaml
from inference_endpoint.evaluation import scoring as scoring_mod
from inference_endpoint.evaluation.scoring import (
    Scorer,
    SWEBenchScorer,
)
from inference_endpoint.exceptions import SetupError

pytestmark = pytest.mark.unit

_DATASET_NAME = "swe_bench_acc"
_MODEL_NAME = "TestOrg/test-model-7b"


# ---------------------------------------------------------------------------
# Fixture-data writers
# ---------------------------------------------------------------------------


def _write_benchmark_config(report_dir: Path, model_params: dict | None = None) -> None:
    """Write ``config.yaml`` into ``report_dir``.

    ``model_params`` entries are layered on top of the default
    ``{"name": _MODEL_NAME}`` mapping; the endpoint list is fixed.
    """
    merged_params: dict = {"name": _MODEL_NAME, **(model_params or {})}
    payload = {
        "model_params": merged_params,
        "endpoint_config": {"endpoints": ["http://localhost:30000"]},
    }
    report_dir.joinpath("config.yaml").write_text(yaml.dump(payload))


def _write_sample_idx_map(report_dir: Path, n: int = 3) -> None:
    """Write ``sample_idx_map.json`` mapping ``uuid-<i>`` to ``i`` for ``n`` samples."""
    per_dataset = {f"uuid-{i}": i for i in range(n)}
    encoded = msgspec.json.encode({_DATASET_NAME: per_dataset})
    report_dir.joinpath("sample_idx_map.json").write_bytes(encoded)


def _make_dataset(n: int = 3) -> MagicMock:
    """Return a mock dataset exposing ``dataframe`` and ``num_samples()`` for ``n`` rows."""
    frame = pd.DataFrame(
        {
            "instance_id": [f"repo__repo-{i}" for i in range(n)],
            "prompt": ["placeholder"] * n,
        }
    )
    dataset = MagicMock()
    dataset.dataframe = frame
    dataset.num_samples.return_value = n
    return dataset


def _write_template(path: Path, model_kwargs: dict) -> Path:
    """Dump a swebench template with the given ``model_kwargs`` to ``path`` and return it."""
    template = {"model": {"model_name": "", "model_kwargs": model_kwargs}}
    path.write_text(yaml.dump(template))
    return path


def _new_scorer(report_dir, project_path, template_path, dataset=None, **extra):
    """Construct a ``SWEBenchScorer`` with the arguments shared by every test.

    ``dataset`` defaults to ``_make_dataset()``; ``extra`` is forwarded verbatim
    (e.g. ``subset=``, ``num_instances=``).
    """
    return SWEBenchScorer(
        dataset_name=_DATASET_NAME,
        dataset=_make_dataset() if dataset is None else dataset,
        report_dir=report_dir,
        swe_bench_project_path=project_path,
        swebench_config_template=template_path,
        **extra,
    )


def _render_patched_config(scorer, report_dir: Path, tmp_path: Path) -> dict:
    """Run ``scorer._patch_config`` against ``report_dir/config.yaml`` and load the result."""
    output_dir = tmp_path / "out"
    output_dir.mkdir()
    with (report_dir / "config.yaml").open() as handle:
        benchmark_cfg = yaml.safe_load(handle)
    patched_path = scorer._patch_config(output_dir, benchmark_cfg)
    return yaml.safe_load(patched_path.read_text())


# ---------------------------------------------------------------------------
# Fake subprocess building blocks
# ---------------------------------------------------------------------------


def _emit_empty_preds(cmd) -> None:
    """Create ``preds.json`` (an empty JSON object) in the dir following ``--output``."""
    preds_dir = Path(cmd[cmd.index("--output") + 1])
    preds_dir.mkdir(parents=True, exist_ok=True)
    (preds_dir / "preds.json").write_text(json.dumps({}))


def _emit_eval_report(cmd, run_kwargs, *, prefix: str, resolved: int, submitted: int):
    """Write ``<prefix>.<run_id>.json`` into the subprocess ``cwd`` with the given counts."""
    workdir = Path(run_kwargs["cwd"])
    run_id = cmd[cmd.index("--run_id") + 1]
    report = {
        "resolved_instances": resolved,
        "submitted_instances": submitted,
        "total_instances": 500,
    }
    (workdir / f"{prefix}.{run_id}.json").write_text(json.dumps(report))


def _put_tools_on_path(monkeypatch) -> None:
    """Make ``shutil.which`` (as seen by the scoring module) resolve every name."""
    monkeypatch.setattr(scoring_mod.shutil, "which", lambda name: f"/usr/bin/{name}")


def _make_fake_run(cmd, **kwargs):
    """Return a fake subprocess.run result with returncode=0."""
    return MagicMock(returncode=0, stdout="")


def _make_staged_run(on_eval_cmd):
    """Return a fake subprocess.run that handles mini-extra successfully, then delegates."""

    def fake_run(cmd, **kwargs):
        if "mini-extra" not in " ".join(cmd):
            return on_eval_cmd(cmd, **kwargs)
        _emit_empty_preds(cmd)
        return MagicMock(returncode=0, stdout="")

    return fake_run


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


@pytest.fixture
def swe_bench_project(tmp_path: Path) -> Path:
    """Fake accuracy subproject directory with a minimal pyproject.toml."""
    project_dir = tmp_path / "accuracy"
    project_dir.mkdir(parents=True)
    manifest = project_dir / "pyproject.toml"
    manifest.write_text("[project]\nname = 'swe-bench-accuracy'\n")
    return project_dir


@pytest.fixture
def template_yaml(tmp_path: Path) -> Path:
    """Minimal swebench template YAML."""
    return _write_template(
        tmp_path / "swebench_template.yaml",
        {
            "custom_llm_provider": "openai",
            "api_base": "",
        },
    )


@pytest.fixture
def report_dir(tmp_path: Path) -> Path:
    """Report directory pre-populated with ``config.yaml`` and ``sample_idx_map.json``."""
    directory = tmp_path / "report"
    directory.mkdir()
    _write_benchmark_config(directory)
    _write_sample_idx_map(directory)
    return directory


@pytest.fixture
def patch_subprocess(monkeypatch, report_dir: Path, swe_bench_project: Path):
    """Patch subprocess.run to write fake preds.json and result JSON."""
    recorded: list[list[str]] = []

    def fake_run(cmd, **kwargs):
        recorded.append(list(cmd))
        joined = " ".join(cmd)
        if "mini-extra" in joined:
            _emit_empty_preds(cmd)
        elif "run_evaluation" in joined:
            _emit_eval_report(
                cmd,
                kwargs,
                prefix=_MODEL_NAME.replace("/", "__"),
                resolved=3,
                submitted=10,
            )
        return MagicMock(returncode=0, stdout="")

    monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
    return recorded


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------


class TestSWEBenchScorerRegistration:
    """Registry entry and class-level hooks of ``SWEBenchScorer``."""

    def test_registered(self):
        """The scorer is reachable under ``swe_bench_scorer`` via key and ``Scorer.get``."""
        assert "swe_bench_scorer" in Scorer.PREDEFINED
        assert Scorer.get("swe_bench_scorer") is SWEBenchScorer

    def test_skip_endpoint_phase(self):
        """The class-level ``SKIP_ENDPOINT_PHASE`` marker is set."""
        assert SWEBenchScorer.SKIP_ENDPOINT_PHASE is True

    def test_external_sample_count(self):
        """``external_sample_count`` returns the int, or ``None`` when absent/invalid."""
        count = SWEBenchScorer.external_sample_count
        assert count({"num_instances": 100}) == 100
        assert count({}) is None
        assert count({"num_instances": "bad"}) is None


class TestSWEBenchScorer:
    """Scoring flow and config patching, with ``subprocess.run`` replaced by fakes."""

    def test_score_happy_path(
        self, report_dir, swe_bench_project, template_yaml, patch_subprocess
    ):
        """3 resolved of 10 submitted yields 0.3 and a results file in the report dir."""
        scorer = _new_scorer(report_dir, swe_bench_project, template_yaml)
        score, n_repeats = scorer.score()

        assert score == pytest.approx(0.3)
        assert n_repeats == 1
        assert (report_dir / "swe_bench_results.json").exists()

    def test_missing_subproject_raises_at_init(
        self, report_dir, tmp_path, template_yaml
    ):
        """A project dir without the expected contents fails construction."""
        empty_dir = tmp_path / "empty_project"
        empty_dir.mkdir()
        with pytest.raises(FileNotFoundError, match="SWE-bench subproject not found"):
            _new_scorer(report_dir, empty_dir, template_yaml)

    def test_missing_template_raises_at_init(
        self, report_dir, swe_bench_project, tmp_path
    ):
        """A nonexistent template path fails construction."""
        nonexistent = tmp_path / "no_such_template.yaml"
        with pytest.raises(FileNotFoundError, match="swebench template"):
            _new_scorer(report_dir, swe_bench_project, nonexistent)

    def test_missing_preds_returns_none(
        self, report_dir, swe_bench_project, template_yaml, monkeypatch
    ):
        """When the fake run writes no ``preds.json`` the score is ``None``."""
        monkeypatch.setattr(scoring_mod.subprocess, "run", _make_fake_run)
        scorer = _new_scorer(report_dir, swe_bench_project, template_yaml)
        score, n_repeats = scorer.score()
        assert score is None
        assert n_repeats == 1

    def test_config_patching_all_fields(self, report_dir, swe_bench_project, tmp_path):
        """Model name, api_base and sampling params all land in the patched config."""
        template_path = _write_template(
            tmp_path / "tmpl.yaml",
            {
                "api_base": "",
                "temperature": None,
                "top_k": None,
            },
        )

        _write_benchmark_config(
            report_dir,
            model_params={
                "temperature": 0.8,
                "top_p": 0.9,
                "top_k": 15,
                "chat_template_kwargs": {"preserve_thinking": True},
            },
        )

        scorer = _new_scorer(report_dir, swe_bench_project, template_path)
        patched = _render_patched_config(scorer, report_dir, tmp_path)
        model_section = patched["model"]
        model_kwargs = model_section["model_kwargs"]

        assert model_section["model_name"] == _MODEL_NAME
        assert model_kwargs["api_base"] == "http://localhost:30000/v1"
        assert model_kwargs["temperature"] == pytest.approx(0.8)
        assert model_kwargs["top_p"] == pytest.approx(0.9)
        assert model_kwargs["top_k"] == 15
        assert model_kwargs["chat_template_kwargs"] == {"preserve_thinking": True}

    def test_config_patching_omits_none_fields(
        self, report_dir, swe_bench_project, tmp_path
    ):
        """A template ``top_k`` is dropped when the benchmark config does not set one."""
        template_path = _write_template(
            tmp_path / "tmpl.yaml", {"api_base": "", "top_k": 20}
        )

        # model_params has no top_k — should be removed from patched config
        _write_benchmark_config(report_dir)

        scorer = _new_scorer(report_dir, swe_bench_project, template_path)
        patched = _render_patched_config(scorer, report_dir, tmp_path)

        assert "top_k" not in patched["model"]["model_kwargs"]

    def test_config_patching_max_new_tokens(
        self, report_dir, swe_bench_project, tmp_path
    ):
        """``max_new_tokens`` in the benchmark config becomes ``max_tokens``."""
        template_path = _write_template(tmp_path / "tmpl.yaml", {"api_base": ""})

        _write_benchmark_config(report_dir, model_params={"max_new_tokens": 4096})

        scorer = _new_scorer(report_dir, swe_bench_project, template_path)
        patched = _render_patched_config(scorer, report_dir, tmp_path)

        assert patched["model"]["model_kwargs"]["max_tokens"] == 4096

    def test_config_patching_omits_max_tokens_when_not_set(
        self, report_dir, swe_bench_project, tmp_path
    ):
        """A template ``max_tokens`` is dropped when ``max_new_tokens`` is not configured."""
        template_path = _write_template(
            tmp_path / "tmpl.yaml", {"api_base": "", "max_tokens": 999}
        )

        # model_params has no max_new_tokens — max_tokens should be removed
        _write_benchmark_config(report_dir)

        scorer = _new_scorer(report_dir, swe_bench_project, template_path)
        patched = _render_patched_config(scorer, report_dir, tmp_path)

        assert "max_tokens" not in patched["model"]["model_kwargs"]

    @pytest.mark.parametrize(
        "num_instances, expected_slice",
        [
            (5, "0:5"),
            (100, "0:100"),
        ],
    )
    def test_num_instances_produces_correct_slice(
        self,
        num_instances,
        expected_slice,
        report_dir,
        swe_bench_project,
        template_yaml,
        patch_subprocess,
    ):
        """``num_instances=N`` is passed to the first command as ``--slice 0:N``."""
        scorer = _new_scorer(
            report_dir,
            swe_bench_project,
            template_yaml,
            dataset=_make_dataset(n=num_instances),
            num_instances=num_instances,
        )
        scorer.score()
        agent_cmd = patch_subprocess[0]
        slice_value = agent_cmd[agent_cmd.index("--slice") + 1]
        assert slice_value == expected_slice

    @pytest.mark.parametrize(
        "subset, expected_hf_name",
        [
            ("lite", "princeton-nlp/SWE-bench_Lite"),
            ("verified", "princeton-nlp/SWE-bench_Verified"),
        ],
    )
    def test_subset_maps_to_correct_hf_dataset_name(
        self,
        subset,
        expected_hf_name,
        report_dir,
        swe_bench_project,
        template_yaml,
        patch_subprocess,
    ):
        """The second command receives the subset's HuggingFace id via ``--dataset_name``."""
        scorer = _new_scorer(
            report_dir, swe_bench_project, template_yaml, subset=subset
        )
        scorer.score()
        eval_cmd = patch_subprocess[1]
        dataset_arg = eval_cmd[eval_cmd.index("--dataset_name") + 1]
        assert dataset_arg == expected_hf_name

    def test_unknown_subset_raises_at_init(
        self, report_dir, swe_bench_project, template_yaml
    ):
        """An unsupported subset fails construction with ``ValueError``."""
        with pytest.raises(ValueError, match="Unknown SWE-bench subset"):
            _new_scorer(report_dir, swe_bench_project, template_yaml, subset="full")

    def test_missing_model_name_raises_clear_error(self, swe_bench_project, tmp_path):
        """``_patch_config`` rejects a benchmark config without ``model_params.name``."""
        template_path = _write_template(tmp_path / "tmpl.yaml", {"api_base": ""})

        scorer = _new_scorer(tmp_path, swe_bench_project, template_path)
        output_dir = tmp_path / "out"
        output_dir.mkdir()

        with pytest.raises(ValueError, match="model_params.name is required"):
            scorer._patch_config(output_dir, {"model_params": {}})

    def test_template_missing_model_kwargs_raises(
        self, report_dir, swe_bench_project, tmp_path
    ):
        """A template lacking ``model.model_kwargs`` fails construction."""
        bad_template = tmp_path / "bad_template.yaml"
        bad_template.write_text(yaml.dump({"model": {"model_name": ""}}))
        with pytest.raises(ValueError, match="model.model_kwargs"):
            _new_scorer(report_dir, swe_bench_project, bad_template)

    def test_subprocess_failure_raises(
        self, report_dir, swe_bench_project, template_yaml, monkeypatch
    ):
        """A non-zero exit from the post-agent command surfaces as ``RuntimeError``."""

        def _fail_eval(cmd, **kwargs):
            return MagicMock(returncode=2, stdout="docker error: permission denied")

        monkeypatch.setattr(
            scoring_mod.subprocess, "run", _make_staged_run(_fail_eval)
        )
        scorer = _new_scorer(report_dir, swe_bench_project, template_yaml)
        with pytest.raises(RuntimeError, match="exited with code 2"):
            scorer.score()

    def test_subprocess_timeout_raises(
        self, report_dir, swe_bench_project, template_yaml, monkeypatch
    ):
        """``TimeoutExpired`` from the post-agent command surfaces as ``RuntimeError``."""

        def _timeout_eval(cmd, **kwargs):
            raise scoring_mod.subprocess.TimeoutExpired(cmd=cmd, timeout=300)

        monkeypatch.setattr(
            scoring_mod.subprocess, "run", _make_staged_run(_timeout_eval)
        )
        scorer = _new_scorer(report_dir, swe_bench_project, template_yaml)
        with pytest.raises(RuntimeError, match="timed out after"):
            scorer.score()

    def test_result_glob_fallback(
        self, report_dir, swe_bench_project, template_yaml, monkeypatch
    ):
        """A result file under an unexpected prefix is still picked up (1 of 5)."""

        def _write_alt_prefix(cmd, **kwargs):
            if "run_evaluation" in " ".join(cmd):
                # Write under a different prefix so exact name won't match; glob will find it
                _emit_eval_report(
                    cmd, kwargs, prefix="alt_prefix", resolved=1, submitted=5
                )
            return MagicMock(returncode=0, stdout="")

        monkeypatch.setattr(
            scoring_mod.subprocess, "run", _make_staged_run(_write_alt_prefix)
        )
        scorer = _new_scorer(report_dir, swe_bench_project, template_yaml)
        score, n_repeats = scorer.score()
        assert score == pytest.approx(1 / 5)
        assert n_repeats == 1

    def test_zero_submitted_instances_returns_none(
        self, report_dir, swe_bench_project, template_yaml, monkeypatch
    ):
        """Zero submitted instances yields a ``None`` score."""

        def _write_zero_results(cmd, **kwargs):
            if "run_evaluation" in " ".join(cmd):
                _emit_eval_report(
                    cmd,
                    kwargs,
                    prefix=_MODEL_NAME.replace("/", "__"),
                    resolved=0,
                    submitted=0,
                )
            return MagicMock(returncode=0, stdout="")

        monkeypatch.setattr(
            scoring_mod.subprocess, "run", _make_staged_run(_write_zero_results)
        )
        scorer = _new_scorer(report_dir, swe_bench_project, template_yaml)
        score, n_repeats = scorer.score()
        assert score is None
        assert n_repeats == 1


class TestSWEBenchScorerPreflight:
    """``SWEBenchScorer.preflight`` with ``shutil.which`` and ``subprocess.run`` faked."""

    def _extras(self, swe_bench_project: Path, **overrides) -> dict:
        """Build the extras mapping handed to ``preflight``; ``overrides`` win."""
        return {"swe_bench_project_path": str(swe_bench_project), **overrides}

    def test_preflight_passes(self, swe_bench_project, monkeypatch):
        """All checks succeed; subset/split/count are forwarded and no pull is issued."""
        _put_tools_on_path(monkeypatch)
        recorded: list[list[str]] = []

        def fake_run(cmd, **kw):
            recorded.append(list(cmd))
            if "get_swebench_docker_image_name" in " ".join(cmd):
                return MagicMock(
                    returncode=0,
                    stdout=json.dumps(["docker.io/swebench/test:latest"]),
                    stderr="",
                )
            return MagicMock(returncode=0, stdout="", stderr=b"")

        monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
        SWEBenchScorer.preflight(
            self._extras(
                swe_bench_project,
                subset="lite",
                split="test",
                num_instances=2,
            )
        )

        derive_cmd = next(
            cmd
            for cmd in recorded
            if "get_swebench_docker_image_name" in " ".join(cmd)
        )
        assert derive_cmd[-3:] == ["lite", "test", "2"]
        assert ["docker", "pull", "docker.io/swebench/test:latest"] not in recorded

    def test_preflight_fails_uv_missing(self, swe_bench_project, monkeypatch):
        """``shutil.which`` finding nothing raises ``SetupError`` about ``uv``."""
        monkeypatch.setattr(scoring_mod.shutil, "which", lambda name: None)
        with pytest.raises(SetupError, match="uv is not on PATH"):
            SWEBenchScorer.preflight(self._extras(swe_bench_project))

    def test_preflight_fails_mini_extra_missing(self, swe_bench_project, monkeypatch):
        """A failing ``mini-extra`` probe raises ``SetupError`` that includes its stderr."""
        _put_tools_on_path(monkeypatch)

        def fake_run(cmd, **kw):
            if "mini-extra" in cmd:
                return MagicMock(returncode=1, stderr=b"not found")
            return MagicMock(returncode=0)

        monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
        with pytest.raises(
            SetupError, match=r"mini-extra is not available.*stderr: not found"
        ):
            SWEBenchScorer.preflight(self._extras(swe_bench_project))

    def test_preflight_fails_swebench_missing(self, swe_bench_project, monkeypatch):
        """A failing ``import swebench`` probe raises ``SetupError`` that includes its stderr."""
        _put_tools_on_path(monkeypatch)

        def fake_run(cmd, **kw):
            if "import swebench" in " ".join(cmd):
                return MagicMock(returncode=1, stderr=b"ModuleNotFoundError")
            return MagicMock(returncode=0)

        monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
        with pytest.raises(
            SetupError,
            match=r"swebench is not available.*stderr: ModuleNotFoundError",
        ):
            SWEBenchScorer.preflight(self._extras(swe_bench_project))

    def test_preflight_fails_docker_not_running(self, swe_bench_project, monkeypatch):
        """A failing ``docker`` command raises ``SetupError`` about the daemon."""
        _put_tools_on_path(monkeypatch)

        def fake_run(cmd, **kw):
            if "docker" in cmd:
                return MagicMock(
                    returncode=1, stderr=b"Cannot connect to Docker daemon"
                )
            return MagicMock(returncode=0)

        monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
        with pytest.raises(SetupError, match="Docker daemon is not running"):
            SWEBenchScorer.preflight(self._extras(swe_bench_project))

    def test_preflight_fails_when_pull_fails(self, swe_bench_project, monkeypatch):
        """Image not present locally and ``docker pull`` failing raises ``SetupError``."""
        _put_tools_on_path(monkeypatch)

        def fake_run(cmd, **kw):
            if "get_swebench_docker_image_name" in " ".join(cmd):
                return MagicMock(
                    returncode=0,
                    stdout=json.dumps(["docker.io/swebench/test:latest"]),
                    stderr="",
                )
            if cmd[:3] == ["docker", "image", "inspect"]:
                return MagicMock(returncode=1, stdout="", stderr=b"missing")
            if cmd[:2] == ["docker", "pull"]:
                return MagicMock(
                    returncode=1,
                    stdout="",
                    stderr=b"rate limit exceeded",
                )
            return MagicMock(returncode=0, stdout="", stderr=b"")

        monkeypatch.setattr(scoring_mod.subprocess, "run", fake_run)
        with pytest.raises(
            SetupError,
            match=r"docker\.io/swebench/test:latest.*rate limit exceeded",
        ):
            SWEBenchScorer.preflight(self._extras(swe_bench_project))
'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "rich", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "sentencepiece", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -894,6 +895,7 @@ requires-dist = [ { name = "pytest-timeout", marker = "extra == 'test'", specifier = "==2.4.0" }, { name = "pytest-xdist", marker = "extra == 'test'", specifier = "==3.8.0" }, { name = "pytz", specifier = "==2026.1.post1" }, + { name = "pyyaml", specifier = "==6.0.3" }, { name = "pyzmq", specifier = "==27.1.0" }, { name = "rich", specifier = "==14.3.3" }, { name = "ruff", marker = "extra == 'dev'", specifier = "==0.15.8" },