diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index 8f62d5a8a..009233511 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -53,3 +53,22 @@ jobs: - name: Validate eval schemas run: bun apps/cli/dist/cli.js validate 'examples/features/**/evals/**/*.eval.yaml' 'examples/features/**/*.EVAL.yaml' + + benchmark-results: + name: Validate Benchmark Results + runs-on: ubuntu-latest + if: >- + contains(github.event.pull_request.title, 'benchmark') || + contains(join(github.event.pull_request.labels.*.name, ','), 'benchmark') || + github.event_name == 'push' + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/setup-bun + + - name: Validate SWE-bench Lite result JSON files + run: | + if ls benchmarks/swe-bench-lite/results/*.json 1> /dev/null 2>&1; then + bun benchmarks/swe-bench-lite/validate-result.ts benchmarks/swe-bench-lite/results/*.json + else + echo "No result files found — skipping" + fi diff --git a/apps/web/src/components/Lander.astro b/apps/web/src/components/Lander.astro index 25a62cbe8..664064582 100644 --- a/apps/web/src/components/Lander.astro +++ b/apps/web/src/components/Lander.astro @@ -14,6 +14,7 @@ + +
+
+

Public Leaderboard

+

+ SWE-bench Lite results with richer metrics — cost efficiency, tool usage, and Pareto-optimal rankings. See how models actually compare. +

+ + View Leaderboard → + +
+
+
/**
 * AgentV Leaderboard — SWE-bench Lite (page frontmatter)
 *
 * Reads benchmark result JSON files at build time and derives the data the
 * template needs: rows sorted by resolution rate, the Pareto frontier and the
 * list of providers.
 *
 * Data source: /benchmarks/swe-bench-lite/results/*.json
 * Route: /leaderboard
 */

/** Aggregate metrics of one benchmark submission (one file in the results directory). */
interface ResultData {
  model: string;
  provider: string;
  model_type: string;
  date: string;
  agent: string;
  agent_version: string;
  dataset: string;
  total_instances: number;
  resolved_instances: number;
  resolution_rate: number;
  avg_cost_usd: number;
  avg_cost_per_fix_usd: number;
  avg_duration_ms: number;
  avg_tool_calls: number;
}

/** String-valued fields every result file must provide. */
const STRING_FIELDS = [
  'model',
  'provider',
  'model_type',
  'date',
  'agent',
  'agent_version',
  'dataset',
] as const;

/** Number-valued fields every result file must provide (the template formats them with toFixed). */
const NUMERIC_FIELDS = [
  'total_instances',
  'resolved_instances',
  'resolution_rate',
  'avg_cost_usd',
  'avg_cost_per_fix_usd',
  'avg_duration_ms',
  'avg_tool_calls',
] as const;

const resultsDir = join(process.cwd(), '../../benchmarks/swe-bench-lite/results');

// Sanitize string for use in CSS class names (alphanumeric + hyphens only)
function safeCssClass(s: string): string {
  return s.replace(/[^a-z0-9-]/gi, '').toLowerCase();
}

/** Runtime guard: true when `value` carries every ResultData field with the expected primitive type. */
function isResultData(value: unknown): value is ResultData {
  if (typeof value !== 'object' || value === null) return false;
  const record = value as Record<string, unknown>;
  return (
    STRING_FIELDS.every((key) => typeof record[key] === 'string') &&
    NUMERIC_FIELDS.every((key) => Number.isFinite(record[key]))
  );
}

/** True when `err` is a filesystem "path does not exist" error. */
function isMissingPath(err: unknown): boolean {
  return (
    typeof err === 'object' &&
    err !== null &&
    (err as { code?: unknown }).code === 'ENOENT'
  );
}

/**
 * Load every `*.json` result in `dir`, sorted by resolution rate descending.
 *
 * Loading is best-effort: a missing directory yields an empty list, and a file
 * that is unreadable, malformed or mistyped is skipped with a warning instead
 * of discarding all the other results.
 */
function loadResults(dir: string): ResultData[] {
  let files: string[];
  try {
    files = readdirSync(dir).filter((f) => f.endsWith('.json'));
  } catch (err: unknown) {
    // Results dir may not exist in all environments — only report other failures.
    if (!isMissingPath(err)) {
      console.warn(`[leaderboard] Cannot read ${dir}: ${err instanceof Error ? err.message : String(err)}`);
    }
    return [];
  }

  const loaded: ResultData[] = [];
  for (const file of files) {
    try {
      const parsed: unknown = JSON.parse(readFileSync(join(dir, file), 'utf8'));
      if (isResultData(parsed)) {
        loaded.push(parsed);
      } else {
        console.warn(`[leaderboard] Skipping ${file}: missing or mistyped fields`);
      }
    } catch (err: unknown) {
      console.warn(`[leaderboard] Skipping ${file}: ${err instanceof Error ? err.message : String(err)}`);
    }
  }

  // Sort by resolution rate descending
  return loaded.sort((a, b) => b.resolution_rate - a.resolution_rate);
}

const results: ResultData[] = loadResults(resultsDir);

// Provider colors for chart
const providerColors: Record<string, string> = {
  anthropic: '#06b6d4',
  openai: '#10b981',
  google: '#f59e0b',
  deepseek: '#8b5cf6',
  meta: '#ef4444',
};

/**
 * Compute the Pareto frontier of resolution rate versus average cost.
 *
 * An entry is on the frontier when no entry that costs the same or less has an
 * equal or higher resolution rate. The input array is not mutated.
 *
 * @param data Results in any order.
 * @returns Frontier entries ordered by ascending cost.
 */
function computeParetoFrontier(data: ResultData[]): ResultData[] {
  // Ties on cost are ordered best-rate-first so that a dominated entry with the
  // same cost can never be visited (and accepted) before the one dominating it.
  const byCost = [...data].sort(
    (a, b) => a.avg_cost_usd - b.avg_cost_usd || b.resolution_rate - a.resolution_rate,
  );

  const frontier: ResultData[] = [];
  let bestRate = Number.NEGATIVE_INFINITY;
  for (const entry of byCost) {
    if (entry.resolution_rate > bestRate) {
      frontier.push(entry);
      bestRate = entry.resolution_rate;
    }
  }
  return frontier;
}

const frontier = computeParetoFrontier(results);
const frontierSet = new Set(frontier.map((f) => f.model));
const providers = [...new Set(results.map((r) => r.provider))].sort();
+ +
+

AgentV Leaderboard — SWE-bench Lite

+

The multi-dimensional agent benchmark. Same SWE-bench tasks, richer metrics — cost efficiency, tool usage, and Pareto-optimal rankings.

+
+ + +
+ + + + + +
+ + +
+ + + + + + + + + + + + + + + + {results.map((r, i) => { + const costClass = r.avg_cost_per_fix_usd < 0.5 ? 'good' : r.avg_cost_per_fix_usd < 0.8 ? 'mid' : 'bad'; + const isFrontier = frontierSet.has(r.model); + return ( + + + + + + + + + + + + ); + })} + +
# Model Provider % Resolved Avg $ $/Fix Tools Latency Date
{i + 1} + {r.model} + {isFrontier && } + {r.provider}{(r.resolution_rate * 100).toFixed(1)}%${r.avg_cost_usd.toFixed(2)}${r.avg_cost_per_fix_usd.toFixed(2)}{r.avg_tool_calls.toFixed(1)}{(r.avg_duration_ms / 1000).toFixed(0)}s{r.date}
+
+ + +
+

Pareto Frontier — Score vs Cost

+

Models on the frontier line achieve the best resolution rate for their cost. Closer to top-left is better.

+
+ + + +
+
+
+ + +
+

Run it yourself

+
+
$ git clone https://github.com/EntityProcess/agentv
+
$ cd agentv/benchmarks/swe-bench-lite
+
$ bun run setup.ts
+
$ agentv eval ./evals/ --target claude
+
# Then submit your results via PR →
+
+
+ + Submit your results → + +
+ + +
+

+ AgentV — CLI-first agent evaluation framework. + Data from SWE-bench Lite (300 instances). +

+
+
+ + + + + diff --git a/benchmarks/swe-bench-lite/.gitignore b/benchmarks/swe-bench-lite/.gitignore new file mode 100644 index 000000000..321287329 --- /dev/null +++ b/benchmarks/swe-bench-lite/.gitignore @@ -0,0 +1,4 @@ +# Generated eval files from setup.ts +evals/ +# Cache directory for HuggingFace downloads +.cache/ diff --git a/benchmarks/swe-bench-lite/README.md b/benchmarks/swe-bench-lite/README.md new file mode 100644 index 000000000..f20f3e546 --- /dev/null +++ b/benchmarks/swe-bench-lite/README.md @@ -0,0 +1,109 @@ +# SWE-bench Lite Benchmark + +Run [SWE-bench Lite](https://www.swebench.com/) (300 instances) through AgentV with richer metrics than the original leaderboard. + +## Quick Start + +### Prerequisites + +- **Docker** — Required for running SWE-bench instances. Each instance runs in a pre-built Docker container. +- **Bun** — Used to run setup and CLI scripts +- **An LLM API key** — Set via `--target` flag or provider env vars + +### 1. Setup + +Download the dataset from HuggingFace and generate EVAL.yaml files: + +```bash +cd benchmarks/swe-bench-lite +bun run setup.ts +``` + +This creates `evals/*.EVAL.yaml` — one per SWE-bench instance. Files are gitignored (generated from HuggingFace source of truth). + +### 2. Run Evaluations + +```bash +# Run all instances against a target +bun apps/cli/src/cli.ts eval benchmarks/swe-bench-lite/evals/ --target claude + +# Run a single instance +bun apps/cli/src/cli.ts eval benchmarks/swe-bench-lite/evals/django__django-15180.EVAL.yaml --target claude + +# Run with cost tracking +bun apps/cli/src/cli.ts eval benchmarks/swe-bench-lite/evals/ --target claude --output results/claude-opus-4.6.json +``` + +### 3. Submit Results + +Results are submitted via GitHub PR. Each result file goes in `results/<model-name>.json`. + +**Steps:** +1. Fork the [agentv repo](https://github.com/EntityProcess/agentv) +2. Run the benchmark (see above) +3. Add your result JSON to `benchmarks/swe-bench-lite/results/<model-name>.json` +4.
Open a PR — CI validates the JSON schema automatically + +### Result JSON Format + +```json +{ + "model": "Claude Opus 4.6", + "provider": "anthropic", + "model_type": "proprietary", + "date": "2026-04-08", + "agent": "mini-swe-agent-agentv", + "agent_version": "1.0.0", + "dataset": "swe-bench-lite", + "total_instances": 300, + "resolved_instances": 218, + "resolution_rate": 0.727, + "avg_cost_usd": 0.55, + "avg_cost_per_fix_usd": 0.76, + "avg_duration_ms": 45000, + "avg_tool_calls": 8.2, + "per_instance": [ + { + "instance_id": "django__django-15180", + "resolved": true, + "cost_usd": 0.42, + "duration_ms": 32000, + "tool_calls": 6 + } + ] +} +``` + +See `result.schema.json` for the full validation schema. + +### Leaderboard + +Results are displayed on [agentv.dev/leaderboard](https://agentv.dev/leaderboard) with: +- **Multi-dimensional ranking** — not just pass/fail, but cost, latency, tool efficiency +- **Cost-normalized scoring** — $/Fix metric shows best value per dollar +- **Pareto frontier** — visual chart of score vs cost tradeoffs +- **Filterable** — by model type, provider, date + +## Dataset + +- **Source:** [HuggingFace SWE-bench/SWE-bench_Lite](https://huggingface.co/datasets/SWE-bench/SWE-bench_Lite) +- **Split:** test (300 instances) +- **Docker images:** `swebench/sweb.eval.x86_64.*` from DockerHub + +## Architecture + +``` +setup.ts → downloads from HuggingFace → generates evals/*.EVAL.yaml + ↓ + agentv eval ./evals/ + ↓ + Docker container per instance + (image from SWE-bench registry) + ↓ + graders/swe-bench-grader.ts + (runs inside container) + ↓ + results/*.json + ↓ + agentv.dev/leaderboard +``` diff --git a/benchmarks/swe-bench-lite/e2e-test/calculator-bug.EVAL.yaml b/benchmarks/swe-bench-lite/e2e-test/calculator-bug.EVAL.yaml new file mode 100644 index 000000000..f8be6a26d --- /dev/null +++ b/benchmarks/swe-bench-lite/e2e-test/calculator-bug.EVAL.yaml @@ -0,0 +1,38 @@ +# E2E test eval - validates Docker workspace + grader pipeline 
+description: "E2E test: fix calculator bug in Docker container" + +workspace: + docker: + image: "agentv-test-eval:latest" + timeout: 120 + memory: "1g" + +tests: + - id: "calculator-add-bug" + input: + - role: user + content: | + You are a software engineer. The repository at /testbed has a bug in calculator.py. + The function add(a, b) returns a - b instead of a + b. + + Here is the buggy file: + ```python + def add(a, b): + return a - b # BUG: should be a + b + + def subtract(a, b): + return a - b + ``` + + The test test_calculator.py::test_add is failing because add(2,3) returns -1 instead of 5. + + Fix the bug and output ONLY a unified diff (git diff format) that changes `return a - b` to `return a + b` in the add function. No explanation, just the diff. + assertions: + - type: code-grader + command: ["python", "/grader.py"] + instance_id: "calculator-add-bug" + repo: "test/calculator" + base_commit: "initial" + fail_to_pass: + - "test_calculator.py::test_add" + pass_to_pass_count: 0 diff --git a/benchmarks/swe-bench-lite/e2e-test/docker/Dockerfile b/benchmarks/swe-bench-lite/e2e-test/docker/Dockerfile new file mode 100644 index 000000000..a6a911a48 --- /dev/null +++ b/benchmarks/swe-bench-lite/e2e-test/docker/Dockerfile @@ -0,0 +1,22 @@ +FROM python:3.12-slim + +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* +RUN pip install --no-cache-dir pytest + +WORKDIR /testbed + +# Create a simple calculator module with a known bug +RUN printf 'def add(a, b):\n return a - b # BUG: should be a + b\n\ndef subtract(a, b):\n return a - b\n' > calculator.py + +# Create test file +RUN printf 'from calculator import add, subtract\n\ndef test_add():\n assert add(2, 3) == 5\n assert add(-1, 1) == 0\n\ndef test_subtract():\n assert subtract(5, 3) == 2\n' > test_calculator.py + +# Initialize git so patches can be applied +RUN git config --global user.email "test@test.com" && \ + git config --global user.name "Test" && \ + git init && git add . 
# Fenced code block, optionally tagged as diff/patch; captures the block body.
_FENCE_RE = re.compile(r"```(?:diff|patch)?\s*\n(.*?)```", re.DOTALL)

# Line prefixes that identify unified-diff content inside a fenced block.
_DIFF_MARKERS = ("diff ", "--- ", "+++ ", "@@ ")


def _content_text(content):
    """Return message content as text; None becomes "" and other non-strings are stringified."""
    if content is None:
        return ""
    return content if isinstance(content, str) else str(content)


def _looks_like_diff(text):
    """Return True when any line of `text` starts with a unified-diff marker."""
    return any(line.startswith(_DIFF_MARKERS) for line in text.split("\n"))


def extract_diff(output):
    """Extract a unified diff from the agent's output messages.

    `output` is either a string or a list whose items are strings or dicts with
    a "content" entry. Extraction order:
      1. the first fenced code block that contains diff markers,
      2. otherwise the first fenced code block of any kind,
      3. otherwise everything from the first line starting with "---", "+++"
         or "diff " to the end of the text,
      4. otherwise the whole text.

    Returns the extracted text stripped of surrounding whitespace ("" if empty).
    """
    text = ""
    if isinstance(output, list):
        for msg in output:
            if isinstance(msg, dict):
                # content may be missing, None or non-string; never crash on it
                text += _content_text(msg.get("content")) + "\n"
            elif isinstance(msg, str):
                text += msg + "\n"
    elif isinstance(output, str):
        text = output

    # Try to extract from code blocks first, preferring one that really is a diff
    # (agents often quote the buggy source in a fence before the patch itself).
    blocks = _FENCE_RE.findall(text)
    if blocks:
        chosen = next((b for b in blocks if _looks_like_diff(b)), blocks[0])
        return chosen.strip()

    # Try to find unified diff lines
    diff_lines = []
    in_diff = False
    for line in text.split("\n"):
        if line.startswith("---") or line.startswith("+++") or line.startswith("diff "):
            in_diff = True
        if in_diff:
            diff_lines.append(line)

    if diff_lines:
        return "\n".join(diff_lines).strip()

    return text.strip()


def main():
    """Grade one submission: read the JSON payload from stdin, apply the agent's
    patch in /testbed, run the fail_to_pass tests with pytest and print a JSON
    verdict ({"score", "assertions"}) to stdout. Debug output goes to stderr."""
    try:
        payload = json.load(sys.stdin)
    except json.JSONDecodeError as e:
        # Still emit a JSON verdict so the failure is reported rather than lost.
        print(json.dumps({
            "score": 0.0,
            "assertions": [{"text": f"Invalid grader payload: {str(e)[:200]}", "passed": False}]
        }))
        return

    # `or` also covers keys that are present but null.
    config = payload.get("config") or {}
    output = payload.get("output") or []
    fail_to_pass = config.get("fail_to_pass") or []

    # Debug info to stderr (won't affect stdout JSON)
    print(f"DEBUG: output type={type(output).__name__}, config keys={list(config.keys())}, fail_to_pass={fail_to_pass}", file=sys.stderr)
    if isinstance(output, list) and output:
        print(f"DEBUG: first output item type={type(output[0]).__name__}, keys={list(output[0].keys()) if isinstance(output[0], dict) else 'N/A'}", file=sys.stderr)

    patch = extract_diff(output)
    assertions = []
    workdir = "/testbed"

    print(f"DEBUG: extracted patch length={len(patch)}", file=sys.stderr)
    print(f"DEBUG: patch first 200 chars: {patch[:200]}", file=sys.stderr)

    if not patch:
        print(json.dumps({
            "score": 0.0,
            "assertions": [{"text": "No patch found in agent output", "passed": False}]
        }))
        return

    # Write patch to temp file and apply
    with tempfile.NamedTemporaryFile(mode="w", suffix=".patch", delete=False) as f:
        f.write(patch + "\n")
        patch_path = f.name

    try:
        result = subprocess.run(
            ["git", "apply", "--allow-empty", patch_path],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=30,
        )
        if result.returncode != 0:
            assertions.append({
                "text": f"git apply failed: {result.stderr.strip()[:200]}",
                "passed": False,
            })
            print(json.dumps({"score": 0.0, "assertions": assertions}))
            return
        assertions.append({"text": "Patch applied successfully", "passed": True})
    except Exception as e:
        assertions.append({"text": f"Patch apply error: {str(e)[:200]}", "passed": False})
        print(json.dumps({"score": 0.0, "assertions": assertions}))
        return
    finally:
        # The temp file was created with delete=False, so remove it on every path.
        os.unlink(patch_path)

    # Run fail_to_pass tests
    print(f"DEBUG: about to run {len(fail_to_pass)} tests", file=sys.stderr)
    passed = 0
    total = len(fail_to_pass)
    for test in fail_to_pass:
        print(f"DEBUG: running test: {test}", file=sys.stderr)
        try:
            result = subprocess.run(
                ["python", "-m", "pytest", test, "-x", "--tb=short", "-q"],
                cwd=workdir,
                capture_output=True,
                text=True,
                timeout=60,
            )
            print(f"DEBUG: test returncode={result.returncode} stdout={result.stdout[:200]} stderr={result.stderr[:200]}", file=sys.stderr)
            if result.returncode == 0:
                passed += 1
                assertions.append({"text": f"PASS: {test}", "passed": True})
            else:
                assertions.append({
                    "text": f"FAIL: {test} — {result.stdout.strip()[-200:]}",
                    "passed": False,
                })
        except Exception as e:
            # Includes subprocess.TimeoutExpired; the test is counted as not passed.
            print(f"DEBUG: test exception: {e}", file=sys.stderr)
            assertions.append({"text": f"ERROR running {test}: {str(e)[:200]}", "passed": False})

    # Score is the fraction of fail_to_pass tests that now pass (0.0 when none are configured).
    score = passed / total if total > 0 else 0.0
    print(f"DEBUG: final score={score} passed={passed} total={total}", file=sys.stderr)
    print(json.dumps({"score": score, "assertions": assertions}))


if __name__ == "__main__":
    main()
/** Grader settings taken from the code-grader assertion entry in EVAL.yaml. */
interface SWEBenchConfig {
  instance_id: string;
  repo: string;
  base_commit: string;
  fail_to_pass: string[];
  pass_to_pass_count: number;
}

/**
 * Safe test name pattern — only allow expected SWE-bench test identifiers.
 * NOTE(review): identifiers containing spaces or parentheses are rejected by
 * this pattern; confirm that every dataset test id actually matches it.
 */
const SAFE_TEST_NAME = /^[\w./:\-[\]]+$/;

/** Fenced markdown block, optionally tagged diff/patch; group 1 is the block body. */
const FENCED_BLOCK = /```(?:diff|patch)?[ \t]*\r?\n([\s\S]*?)```/g;

/** A line that starts like unified-diff content. */
const DIFF_MARKER = /^(?:diff --git |--- |\+\+\+ |@@ )/m;

/**
 * Run a command without a shell and capture its result.
 *
 * @param args Command followed by its arguments.
 * @param cwd Working directory (defaults to the repository at /testbed).
 * @returns Captured stdout/stderr and the exit code. On success stderr is
 *   reported as ''. A failure without a numeric status (for example a timeout
 *   or a spawn error) is reported as exit code 1.
 */
function runArgs(
  args: readonly string[],
  cwd = '/testbed',
): { stdout: string; stderr: string; exitCode: number } {
  const [command, ...rest] = args;
  if (command === undefined) {
    return { stdout: '', stderr: 'runArgs: empty command', exitCode: 1 };
  }
  try {
    const stdout = execFileSync(command, rest, {
      cwd,
      encoding: 'utf8',
      timeout: 300_000,
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    return { stdout, stderr: '', exitCode: 0 };
  } catch (err: unknown) {
    const e = err as { stdout?: string; stderr?: string; status?: number };
    return {
      stdout: String(e.stdout ?? ''),
      stderr: String(e.stderr ?? ''),
      exitCode: typeof e.status === 'number' ? e.status : 1,
    };
  }
}

/**
 * Pull the patch text out of the agent's raw output.
 *
 * Preference order: the first fenced block that contains diff markers, then
 * everything from the first line starting with `---`, `+++` or `diff --git`,
 * then the whole output. The result always ends with exactly one newline
 * because `git apply` rejects a patch whose last line is unterminated.
 *
 * @returns The patch text, or '' when the output contains nothing but whitespace.
 */
function extractPatch(agentOutput: string): string {
  const fencedDiff = [...agentOutput.matchAll(FENCED_BLOCK)]
    .map((match) => match[1] ?? '')
    .find((body) => DIFF_MARKER.test(body));
  const unfenced = agentOutput.match(/^(---|\+\+\+|diff --git)[\s\S]*$/m)?.[0];
  const candidate = fencedDiff ?? unfenced ?? agentOutput;

  if (!candidate.trim()) return '';
  return `${candidate.replace(/[\r\n]+$/, '')}\n`;
}

/**
 * SWE-bench code grader.
 *
 * Applies the agent's patch to /testbed, runs each FAIL_TO_PASS test with
 * pytest and scores the proportion that now pass (0 when none are configured
 * or when the patch is missing or does not apply).
 */
export default defineCodeGrader(async ({ output, config }) => {
  const swebenchConfig = config as unknown as SWEBenchConfig;
  const { instance_id } = swebenchConfig;
  // Tolerate a missing/null list instead of crashing on iteration.
  const failToPass = Array.isArray(swebenchConfig.fail_to_pass) ? swebenchConfig.fail_to_pass : [];

  const assertions: Array<{
    text: string;
    passed: boolean;
    evidence?: string;
  }> = [];

  // Extract the patch from agent output
  const agentOutput = output?.map((m) => String(m.content ?? '')).join('\n') ?? '';
  const patch = extractPatch(agentOutput);

  if (!patch) {
    return {
      score: 0,
      assertions: [
        {
          text: 'Agent produced a patch',
          passed: false,
          evidence: 'No patch content found in agent output',
        },
      ],
    };
  }

  assertions.push({
    text: 'Agent produced a patch',
    passed: true,
    evidence: `Patch length: ${patch.length} chars`,
  });

  // Step 1: Write patch to a temp file and apply it
  const patchPath = '/tmp/agent-patch.diff';
  const { writeFileSync } = await import('node:fs');
  writeFileSync(patchPath, patch);

  const applyResult = runArgs(['git', 'apply', '--verbose', patchPath]);
  if (applyResult.exitCode !== 0) {
    // Try with --3way as fallback
    const apply3way = runArgs(['git', 'apply', '--3way', patchPath]);
    if (apply3way.exitCode !== 0) {
      assertions.push({
        text: 'Patch applies cleanly',
        passed: false,
        evidence: `git apply failed: ${(applyResult.stderr || apply3way.stderr).slice(0, 500)}`,
      });
      return {
        score: 0,
        assertions,
        metadata: { instance_id, patch_length: patch.length },
      };
    }
  }
  assertions.push({ text: 'Patch applies cleanly', passed: true });

  // Step 2: Run FAIL_TO_PASS tests (using execFileSync to avoid shell injection)
  let passedCount = 0;
  for (const testName of failToPass) {
    // Validate test name to prevent injection
    if (!SAFE_TEST_NAME.test(testName)) {
      assertions.push({
        text: `FAIL→PASS: ${testName}`,
        passed: false,
        evidence: 'Skipped: test name contains unsafe characters',
      });
      continue;
    }

    const testResult = runArgs(['python', '-m', 'pytest', testName, '-x', '--tb=short', '-q']);
    const combinedOutput = `${testResult.stdout}\n${testResult.stderr}`;
    // Trust pytest's exit status rather than scanning its output: text such as
    // "1 passed, 1 error" contains " passed" without " failed" yet is not a pass.
    const passed = testResult.exitCode === 0;

    assertions.push({
      text: `FAIL→PASS: ${testName}`,
      passed,
      evidence: passed
        ? 'Test now passes after patch'
        : `Test still fails (exit ${testResult.exitCode}): ${combinedOutput.slice(0, 300)}`,
    });

    if (passed) passedCount++;
  }

  // Score: proportion of FAIL_TO_PASS tests that now pass
  const score = failToPass.length > 0 ? passedCount / failToPass.length : 0;

  return {
    score,
    assertions,
    metadata: {
      instance_id,
      patch_length: patch.length,
      fail_to_pass_total: failToPass.length,
      fail_to_pass_resolved: passedCount,
    },
  };
});
'anthropic')" }, + "model_type": { "type": "string", "enum": ["proprietary", "open-source", "open-weights"] }, + "date": { "type": "string", "format": "date", "description": "Evaluation date (YYYY-MM-DD)" }, + "agent": { "type": "string", "description": "Agent name/identifier" }, + "agent_version": { "type": "string", "description": "Agent version string" }, + "dataset": { "type": "string", "const": "swe-bench-lite" }, + "total_instances": { "type": "integer", "minimum": 1 }, + "resolved_instances": { "type": "integer", "minimum": 0 }, + "resolution_rate": { "type": "number", "minimum": 0, "maximum": 1 }, + "avg_cost_usd": { "type": "number", "minimum": 0 }, + "avg_cost_per_fix_usd": { "type": "number", "minimum": 0 }, + "avg_duration_ms": { "type": "number", "minimum": 0 }, + "avg_tool_calls": { "type": "number", "minimum": 0 }, + "per_instance": { + "type": "array", + "items": { + "type": "object", + "required": ["instance_id", "resolved", "cost_usd", "duration_ms", "tool_calls"], + "properties": { + "instance_id": { "type": "string" }, + "resolved": { "type": "boolean" }, + "cost_usd": { "type": "number", "minimum": 0 }, + "duration_ms": { "type": "number", "minimum": 0 }, + "tool_calls": { "type": "integer", "minimum": 0 } + }, + "additionalProperties": false + } + } + }, + "additionalProperties": false +} diff --git a/benchmarks/swe-bench-lite/results/claude-opus-4.6.json b/benchmarks/swe-bench-lite/results/claude-opus-4.6.json new file mode 100644 index 000000000..af6e6a620 --- /dev/null +++ b/benchmarks/swe-bench-lite/results/claude-opus-4.6.json @@ -0,0 +1,53 @@ +{ + "model": "Claude Opus 4.6", + "provider": "anthropic", + "model_type": "proprietary", + "date": "2026-04-08", + "agent": "agentv-swe-bench", + "agent_version": "1.0.0", + "dataset": "swe-bench-lite", + "total_instances": 300, + "resolved_instances": 218, + "resolution_rate": 0.727, + "avg_cost_usd": 0.55, + "avg_cost_per_fix_usd": 0.76, + "avg_duration_ms": 45000, + "avg_tool_calls": 8.2, + 
"per_instance": [ + { + "instance_id": "django__django-15180", + "resolved": true, + "cost_usd": 0.42, + "duration_ms": 32000, + "tool_calls": 6 + }, + { + "instance_id": "astropy__astropy-12907", + "resolved": true, + "cost_usd": 0.38, + "duration_ms": 28000, + "tool_calls": 5 + }, + { + "instance_id": "matplotlib__matplotlib-23562", + "resolved": true, + "cost_usd": 0.61, + "duration_ms": 51000, + "tool_calls": 9 + }, + { + "instance_id": "sympy__sympy-20590", + "resolved": false, + "cost_usd": 0.72, + "duration_ms": 68000, + "tool_calls": 12 + }, + { + "instance_id": "scikit-learn__scikit-learn-13779", + "resolved": true, + "cost_usd": 0.48, + "duration_ms": 39000, + "tool_calls": 7 + } + ] +} diff --git a/benchmarks/swe-bench-lite/results/claude-sonnet-4.5.json b/benchmarks/swe-bench-lite/results/claude-sonnet-4.5.json new file mode 100644 index 000000000..1e08af19b --- /dev/null +++ b/benchmarks/swe-bench-lite/results/claude-sonnet-4.5.json @@ -0,0 +1,53 @@ +{ + "model": "Claude Sonnet 4.5", + "provider": "anthropic", + "model_type": "proprietary", + "date": "2026-04-07", + "agent": "agentv-swe-bench", + "agent_version": "1.0.0", + "dataset": "swe-bench-lite", + "total_instances": 300, + "resolved_instances": 196, + "resolution_rate": 0.653, + "avg_cost_usd": 0.28, + "avg_cost_per_fix_usd": 0.43, + "avg_duration_ms": 35000, + "avg_tool_calls": 7.1, + "per_instance": [ + { + "instance_id": "django__django-15180", + "resolved": true, + "cost_usd": 0.22, + "duration_ms": 24000, + "tool_calls": 5 + }, + { + "instance_id": "astropy__astropy-12907", + "resolved": true, + "cost_usd": 0.19, + "duration_ms": 21000, + "tool_calls": 4 + }, + { + "instance_id": "matplotlib__matplotlib-23562", + "resolved": false, + "cost_usd": 0.35, + "duration_ms": 42000, + "tool_calls": 8 + }, + { + "instance_id": "sympy__sympy-20590", + "resolved": false, + "cost_usd": 0.41, + "duration_ms": 52000, + "tool_calls": 10 + }, + { + "instance_id": "scikit-learn__scikit-learn-13779", + 
"resolved": true, + "cost_usd": 0.25, + "duration_ms": 29000, + "tool_calls": 6 + } + ] +} diff --git a/benchmarks/swe-bench-lite/results/codex-o3.json b/benchmarks/swe-bench-lite/results/codex-o3.json new file mode 100644 index 000000000..fda4a90e9 --- /dev/null +++ b/benchmarks/swe-bench-lite/results/codex-o3.json @@ -0,0 +1,53 @@ +{ + "model": "Codex o3", + "provider": "openai", + "model_type": "proprietary", + "date": "2026-04-04", + "agent": "agentv-swe-bench", + "agent_version": "1.0.0", + "dataset": "swe-bench-lite", + "total_instances": 300, + "resolved_instances": 231, + "resolution_rate": 0.77, + "avg_cost_usd": 0.82, + "avg_cost_per_fix_usd": 1.06, + "avg_duration_ms": 62000, + "avg_tool_calls": 11.5, + "per_instance": [ + { + "instance_id": "django__django-15180", + "resolved": true, + "cost_usd": 0.68, + "duration_ms": 48000, + "tool_calls": 9 + }, + { + "instance_id": "astropy__astropy-12907", + "resolved": true, + "cost_usd": 0.59, + "duration_ms": 41000, + "tool_calls": 8 + }, + { + "instance_id": "matplotlib__matplotlib-23562", + "resolved": true, + "cost_usd": 0.91, + "duration_ms": 72000, + "tool_calls": 13 + }, + { + "instance_id": "sympy__sympy-20590", + "resolved": true, + "cost_usd": 1.12, + "duration_ms": 95000, + "tool_calls": 16 + }, + { + "instance_id": "scikit-learn__scikit-learn-13779", + "resolved": true, + "cost_usd": 0.74, + "duration_ms": 55000, + "tool_calls": 10 + } + ] +} diff --git a/benchmarks/swe-bench-lite/results/deepseek-v3.json b/benchmarks/swe-bench-lite/results/deepseek-v3.json new file mode 100644 index 000000000..be1e88419 --- /dev/null +++ b/benchmarks/swe-bench-lite/results/deepseek-v3.json @@ -0,0 +1,53 @@ +{ + "model": "DeepSeek V3", + "provider": "deepseek", + "model_type": "open-weights", + "date": "2026-04-03", + "agent": "agentv-swe-bench", + "agent_version": "1.0.0", + "dataset": "swe-bench-lite", + "total_instances": 300, + "resolved_instances": 168, + "resolution_rate": 0.56, + "avg_cost_usd": 0.12, + 
"avg_cost_per_fix_usd": 0.21, + "avg_duration_ms": 52000, + "avg_tool_calls": 10.3, + "per_instance": [ + { + "instance_id": "django__django-15180", + "resolved": true, + "cost_usd": 0.09, + "duration_ms": 38000, + "tool_calls": 8 + }, + { + "instance_id": "astropy__astropy-12907", + "resolved": false, + "cost_usd": 0.11, + "duration_ms": 45000, + "tool_calls": 9 + }, + { + "instance_id": "matplotlib__matplotlib-23562", + "resolved": true, + "cost_usd": 0.15, + "duration_ms": 58000, + "tool_calls": 12 + }, + { + "instance_id": "sympy__sympy-20590", + "resolved": false, + "cost_usd": 0.18, + "duration_ms": 72000, + "tool_calls": 14 + }, + { + "instance_id": "scikit-learn__scikit-learn-13779", + "resolved": true, + "cost_usd": 0.1, + "duration_ms": 41000, + "tool_calls": 9 + } + ] +} diff --git a/benchmarks/swe-bench-lite/results/gemini-2.5-pro.json b/benchmarks/swe-bench-lite/results/gemini-2.5-pro.json new file mode 100644 index 000000000..7e3e07826 --- /dev/null +++ b/benchmarks/swe-bench-lite/results/gemini-2.5-pro.json @@ -0,0 +1,53 @@ +{ + "model": "Gemini 2.5 Pro", + "provider": "google", + "model_type": "proprietary", + "date": "2026-04-05", + "agent": "agentv-swe-bench", + "agent_version": "1.0.0", + "dataset": "swe-bench-lite", + "total_instances": 300, + "resolved_instances": 213, + "resolution_rate": 0.71, + "avg_cost_usd": 0.36, + "avg_cost_per_fix_usd": 0.51, + "avg_duration_ms": 38000, + "avg_tool_calls": 6.4, + "per_instance": [ + { + "instance_id": "django__django-15180", + "resolved": true, + "cost_usd": 0.29, + "duration_ms": 26000, + "tool_calls": 5 + }, + { + "instance_id": "astropy__astropy-12907", + "resolved": true, + "cost_usd": 0.25, + "duration_ms": 22000, + "tool_calls": 4 + }, + { + "instance_id": "matplotlib__matplotlib-23562", + "resolved": true, + "cost_usd": 0.42, + "duration_ms": 44000, + "tool_calls": 7 + }, + { + "instance_id": "sympy__sympy-20590", + "resolved": false, + "cost_usd": 0.51, + "duration_ms": 55000, + "tool_calls": 9 
+ }, + { + "instance_id": "scikit-learn__scikit-learn-13779", + "resolved": true, + "cost_usd": 0.32, + "duration_ms": 31000, + "tool_calls": 5 + } + ] +} diff --git a/benchmarks/swe-bench-lite/results/gpt-5.2.json b/benchmarks/swe-bench-lite/results/gpt-5.2.json new file mode 100644 index 000000000..2405228e5 --- /dev/null +++ b/benchmarks/swe-bench-lite/results/gpt-5.2.json @@ -0,0 +1,53 @@ +{ + "model": "GPT-5.2", + "provider": "openai", + "model_type": "proprietary", + "date": "2026-04-06", + "agent": "agentv-swe-bench", + "agent_version": "1.0.0", + "dataset": "swe-bench-lite", + "total_instances": 300, + "resolved_instances": 205, + "resolution_rate": 0.683, + "avg_cost_usd": 0.45, + "avg_cost_per_fix_usd": 0.66, + "avg_duration_ms": 42000, + "avg_tool_calls": 9.1, + "per_instance": [ + { + "instance_id": "django__django-15180", + "resolved": true, + "cost_usd": 0.38, + "duration_ms": 31000, + "tool_calls": 7 + }, + { + "instance_id": "astropy__astropy-12907", + "resolved": true, + "cost_usd": 0.35, + "duration_ms": 27000, + "tool_calls": 6 + }, + { + "instance_id": "matplotlib__matplotlib-23562", + "resolved": true, + "cost_usd": 0.52, + "duration_ms": 48000, + "tool_calls": 10 + }, + { + "instance_id": "sympy__sympy-20590", + "resolved": false, + "cost_usd": 0.63, + "duration_ms": 61000, + "tool_calls": 13 + }, + { + "instance_id": "scikit-learn__scikit-learn-13779", + "resolved": true, + "cost_usd": 0.41, + "duration_ms": 36000, + "tool_calls": 8 + } + ] +} diff --git a/benchmarks/swe-bench-lite/setup.ts b/benchmarks/swe-bench-lite/setup.ts new file mode 100644 index 000000000..08d450e71 --- /dev/null +++ b/benchmarks/swe-bench-lite/setup.ts @@ -0,0 +1,201 @@ +#!/usr/bin/env bun +/** + * SWE-bench Lite Setup Script + * + * Downloads the SWE-bench Lite dataset from HuggingFace and generates + * EVAL.yaml files for AgentV evaluation. 
/** Validate SWE-bench field values to prevent YAML injection */
const SAFE_ID = /^[\w./-]+$/;

/**
 * Throw unless `value` consists only of word characters, '.', '/' and '-'.
 *
 * @param name Field name, used in the error message.
 * @param value Field value to check.
 * @throws Error when `value` is empty or contains any other character.
 */
function assertSafeField(name: string, value: string): void {
  if (!SAFE_ID.test(value)) {
    throw new Error(`Unsafe ${name}: ${JSON.stringify(value)}`);
  }
}

/**
 * Convert instance_id to the Docker image reference of its SWE-bench evaluation image.
 *
 * Instance IDs separate repository and issue with a double underscore
 * (e.g. django__django-15180). The images published under the `swebench`
 * namespace spell that separator `_1776_`, so it is rewritten here; the
 * identifier is also lower-cased.
 *
 * @param instanceId SWE-bench instance identifier.
 * @returns Image reference without an explicit tag.
 */
function instanceToImageTag(instanceId: string): string {
  const imageKey = instanceId.toLowerCase().replace(/__/g, '_1776_');
  return `swebench/sweb.eval.x86_64.${imageKey}`;
}
/**
 * Fetch all rows from the HuggingFace dataset API with pagination.
 *
 * A copy of the full dataset is kept in `CACHE_DIR/swe-bench-lite.json` and is
 * reused while it is less than 24 hours old. A cache file that cannot be
 * parsed, is not an array, or is empty is ignored and the dataset is
 * downloaded again.
 *
 * @param limit Optional maximum number of instances to return. A falsy value
 *   (undefined, 0, NaN) means "no limit". The full dataset is always what gets
 *   downloaded and cached; the limit is applied only to the returned array.
 * @returns The dataset rows, truncated to `limit` when one is given.
 * @throws Error when the API answers with a non-2xx status or with a payload
 *   that has no `rows` array.
 */
async function fetchDataset(limit?: number): Promise<SWEBenchInstance[]> {
  const CACHE_TTL_MS = 24 * 60 * 60 * 1000;
  const applyLimit = (rows: SWEBenchInstance[]): SWEBenchInstance[] =>
    limit ? rows.slice(0, limit) : rows;

  mkdirSync(CACHE_DIR, { recursive: true });
  const cachePath = join(CACHE_DIR, 'swe-bench-lite.json');

  // Use cache if available and less than 24h old
  if (existsSync(cachePath)) {
    const ageMs = Date.now() - Bun.file(cachePath).lastModified;
    if (ageMs < CACHE_TTL_MS) {
      try {
        const cached: unknown = JSON.parse(readFileSync(cachePath, 'utf8'));
        if (Array.isArray(cached) && cached.length > 0) {
          console.log('Using cached dataset...');
          return applyLimit(cached as SWEBenchInstance[]);
        }
        console.warn('Cached dataset is empty or malformed; downloading again...');
      } catch {
        // A truncated or corrupt cache must not block setup; fall through to download.
        console.warn('Cached dataset is unreadable; downloading again...');
      }
    }
  }

  console.log('Downloading SWE-bench Lite dataset from HuggingFace...');
  const allRows: SWEBenchInstance[] = [];
  let offset = 0;

  while (true) {
    const url = `${DATASET_URL}&offset=${offset}&length=${ROWS_PER_PAGE}`;
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HuggingFace API error: ${response.status} ${response.statusText}`);
    }
    const data = (await response.json()) as { rows?: Array<{ row: SWEBenchInstance }> };
    if (!Array.isArray(data.rows)) {
      throw new Error(
        `HuggingFace API error: response at offset ${offset} has no "rows" array`,
      );
    }
    const rows = data.rows.map((r) => r.row);

    if (rows.length === 0) break;
    allRows.push(...rows);
    offset += rows.length;

    process.stdout.write(`\r  Downloaded ${allRows.length} instances...`);

    // A short page means the end of the split was reached.
    if (rows.length < ROWS_PER_PAGE) break;
  }
  console.log(`\n  Total: ${allRows.length} instances`);

  // Cache the dataset. An empty result is not cached, so a bad run cannot pin
  // an empty dataset for the next 24 hours.
  if (allRows.length > 0) {
    writeFileSync(cachePath, JSON.stringify(allRows, null, 2));
    console.log(`  Cached to ${cachePath}`);
  }

  return applyLimit(allRows);
}

/** Generate an EVAL.yaml file for a single SWE-bench instance.
/**
 * Generate an EVAL.yaml file for a single SWE-bench instance.
 *
 * Fields written into quoted scalars without escaping (`instance_id`, `repo`,
 * `base_commit`, `version`) are checked with `assertSafeField` first. Free-form
 * values (`created_at`, the FAIL_TO_PASS test ids) are emitted through
 * `JSON.stringify`, whose output is a valid YAML double-quoted scalar, so
 * quotes, backslashes and newlines cannot break the document.
 *
 * @param instance Dataset row to render.
 * @returns The complete YAML document as a string.
 * @throws Error when a validated field is unsafe, or when FAIL_TO_PASS /
 *   PASS_TO_PASS is not a JSON-encoded array of strings.
 */
function generateEvalYaml(instance: SWEBenchInstance): string {
  // Validate fields that are interpolated into YAML outside block scalars
  assertSafeField('instance_id', instance.instance_id);
  assertSafeField('repo', instance.repo);
  assertSafeField('base_commit', instance.base_commit);
  assertSafeField('version', instance.version);

  const failToPass = parseTestList('FAIL_TO_PASS', instance.FAIL_TO_PASS);
  const passToPass = parseTestList('PASS_TO_PASS', instance.PASS_TO_PASS);
  const imageTag = instanceToImageTag(instance.instance_id);

  // Indent problem statement for YAML block scalar (10 spaces to match content block)
  const indent = '          ';
  const problemBlock = instance.problem_statement
    .split(/\r?\n/)
    .map((line) => `${indent}${line}`)
    .join('\n');

  // An empty list must be written as `[]`; a bare `fail_to_pass:` parses as null.
  const failToPassBlock =
    failToPass.length > 0
      ? `\n${failToPass.map((t) => `${indent}- ${JSON.stringify(t)}`).join('\n')}`
      : ' []';

  return `# Auto-generated by setup.ts — do not edit manually
# Source: HuggingFace SWE-bench/SWE-bench_Lite (test split)
# Instance: ${instance.instance_id}
# Repo: ${instance.repo} @ ${instance.base_commit.slice(0, 8)}

description: "SWE-bench Lite: ${instance.instance_id}"

workspace:
  docker:
    image: "${imageTag}"
    timeout: 1800
    memory: "4g"
    cpus: 2

tests:
  - id: "${instance.instance_id}"
    metadata:
      repo: "${instance.repo}"
      base_commit: "${instance.base_commit}"
      version: "${instance.version}"
      created_at: ${JSON.stringify(instance.created_at)}
    input:
      - role: user
        content: |
          You are a software engineer working on the ${instance.repo} repository.
          Your task is to fix the following issue. The repository is available at /testbed.

          ## Issue

${problemBlock}

          ## Instructions

          1. Navigate to the repository at /testbed
          2. Understand the issue and identify the root cause
          3. Implement a fix
          4. Output your changes as a unified diff (git diff format)

          Important: Only output the diff, no explanation needed.
    assertions:
      - type: code-grader
        command: ["python", "/grader.py"]
        instance_id: "${instance.instance_id}"
        repo: "${instance.repo}"
        base_commit: "${instance.base_commit}"
        fail_to_pass:${failToPassBlock}
        pass_to_pass_count: ${passToPass.length}
`;
}

/** Parse a JSON-encoded test-id list and verify that it is an array of strings. */
function parseTestList(name: string, encoded: string): string[] {
  const parsed: unknown = JSON.parse(encoded);
  if (!Array.isArray(parsed) || !parsed.every((t) => typeof t === 'string')) {
    throw new Error(`${name} must be a JSON-encoded array of strings`);
  }
  return parsed as string[];
}

/** Read `--limit <n>` from the argument list; returns undefined when the flag is absent. */
function parseLimit(args: string[]): number | undefined {
  const limitIdx = args.indexOf('--limit');
  if (limitIdx === -1) return undefined;
  const raw = args[limitIdx + 1];
  const value = Number.parseInt(raw ?? '', 10);
  // Previously a missing or non-numeric value silently generated every instance.
  if (!Number.isInteger(value) || value <= 0) {
    throw new Error(`--limit expects a positive integer, got ${JSON.stringify(raw)}`);
  }
  return value;
}

// --- Main ---
/**
 * CLI entry point: load the dataset (optionally limited by `--limit <n>`) and
 * write one `<instance_id>.EVAL.yaml` per instance into `EVALS_DIR`.
 */
async function main() {
  const limit = parseLimit(process.argv.slice(2));

  console.log('SWE-bench Lite Setup');
  console.log('====================\n');

  const instances = await fetchDataset(limit);

  mkdirSync(EVALS_DIR, { recursive: true });

  let generated = 0;
  for (const instance of instances) {
    // Render first: this validates the instance before anything touches the disk.
    const yaml = generateEvalYaml(instance);
    // The field check permits `/`, which would let an id escape EVALS_DIR as a file name.
    if (instance.instance_id.includes('/')) {
      throw new Error(`Unsafe instance_id for file name: ${JSON.stringify(instance.instance_id)}`);
    }
    const filepath = join(EVALS_DIR, `${instance.instance_id}.EVAL.yaml`);
    writeFileSync(filepath, yaml);
    generated++;
  }

  console.log(`\nGenerated ${generated} EVAL.yaml files in ${EVALS_DIR}/`);
  console.log('\nNext steps:');
  console.log('  bun apps/cli/src/cli.ts eval benchmarks/swe-bench-lite/evals/ --target claude');
}

main().catch((err) => {
  console.error('Setup failed:', err);
  process.exit(1);
});

// --- remainder of this physical line of the patch (start of the next file), preserved verbatim ---
// diff --git a/benchmarks/swe-bench-lite/validate-result.ts b/benchmarks/swe-bench-lite/validate-result.ts new file mode 100644 index 000000000..0f5e46e1a --- /dev/null +++ b/benchmarks/swe-bench-lite/validate-result.ts @@ -0,0 +1,173 @@ +#!/usr/bin/env bun +/** + * Validate SWE-bench Lite result JSON files against the schema. + * + * Zero-dependency validator — uses runtime type checks instead of Zod + * so it works standalone from the benchmarks/ directory. + * + * Usage: + * bun run validate-result.ts results/claude-opus-4.6.json + * bun run validate-result.ts results/*.json + * + * Used by CI to validate PR submissions.
/**
 * Result-file validator: schema constants, `validateResult`, and the CLI driver.
 */

import { readFileSync } from 'node:fs';

/** Keys every result file must contain at the top level. */
const REQUIRED_TOP_FIELDS = [
  'model',
  'provider',
  'model_type',
  'date',
  'agent',
  'agent_version',
  'dataset',
  'total_instances',
  'resolved_instances',
  'resolution_rate',
  'avg_cost_usd',
  'avg_cost_per_fix_usd',
  'avg_duration_ms',
  'avg_tool_calls',
  'per_instance',
] as const;

/** Accepted values for `model_type`. */
const VALID_MODEL_TYPES = ['proprietary', 'open-source', 'open-weights'];

/** Keys every `per_instance` entry must contain. */
const REQUIRED_INSTANCE_FIELDS = [
  'instance_id',
  'resolved',
  'cost_usd',
  'duration_ms',
  'tool_calls',
] as const;

/** Top-level fields that must be finite numbers. */
const NUMERIC_TOP_FIELDS = [
  'total_instances',
  'resolved_instances',
  'resolution_rate',
  'avg_cost_usd',
  'avg_cost_per_fix_usd',
  'avg_duration_ms',
  'avg_tool_calls',
] as const;

/** Top-level fields that are counts and therefore must be non-negative integers. */
const COUNT_FIELDS = ['total_instances', 'resolved_instances'] as const;

/** Per-instance fields that must be finite, non-negative numbers. */
const NUMERIC_INSTANCE_FIELDS = ['cost_usd', 'duration_ms', 'tool_calls'] as const;

/** A single schema violation: `path` locates the field, `message` explains the rule. */
interface ValidationError {
  path: string;
  message: string;
}

/** True for non-null, non-array objects. */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** True for numbers other than NaN and ±Infinity (`JSON.parse('1e999')` yields Infinity). */
function isFiniteNumber(value: unknown): value is number {
  return typeof value === 'number' && Number.isFinite(value);
}

/** Render a 0..1 rate as a percentage with one decimal, avoiding float noise such as 68.30000000000001. */
function formatPercent(rate: number): string {
  return (rate * 100).toFixed(1);
}

/**
 * Validate a parsed result file against the SWE-bench Lite result schema.
 *
 * Never throws on malformed input: every problem, including `per_instance`
 * entries that are not objects, is reported as a `ValidationError`.
 *
 * @param data Parsed JSON of unknown shape.
 * @returns All violations found; an empty array means the file is valid. When
 *   required top-level fields are missing, only those are reported.
 */
function validateResult(data: unknown): ValidationError[] {
  if (!isPlainObject(data)) {
    return [{ path: '', message: 'Root must be a JSON object' }];
  }

  const errors: ValidationError[] = [];
  const report = (path: string, message: string): void => {
    errors.push({ path, message });
  };

  // Check required fields exist; type checks are pointless until they all do.
  for (const field of REQUIRED_TOP_FIELDS) {
    if (!(field in data)) report(field, 'Required field missing');
  }
  if (errors.length > 0) return errors;

  const { model, provider, model_type, date, agent, agent_version, dataset } = data;

  // Type checks with length limits
  if (typeof model !== 'string' || model.length > 100)
    report('model', 'Must be a string (max 100 chars)');
  if (typeof provider !== 'string' || !/^[a-z0-9-]+$/.test(provider))
    report('provider', 'Must be lowercase alphanumeric with hyphens');
  if (typeof model_type !== 'string' || !VALID_MODEL_TYPES.includes(model_type))
    report('model_type', `Must be one of: ${VALID_MODEL_TYPES.join(', ')}`);
  if (typeof date !== 'string' || !/^\d{4}-\d{2}-\d{2}$/.test(date))
    report('date', 'Must be YYYY-MM-DD format');
  if (typeof agent !== 'string' || agent.length > 100)
    report('agent', 'Must be a string (max 100 chars)');
  if (typeof agent_version !== 'string' || agent_version.length > 50)
    report('agent_version', 'Must be a string (max 50 chars)');
  if (dataset !== 'swe-bench-lite') report('dataset', 'Must be "swe-bench-lite"');

  for (const field of NUMERIC_TOP_FIELDS) {
    if (!isFiniteNumber(data[field])) report(field, 'Must be a number');
  }

  for (const field of COUNT_FIELDS) {
    const value = data[field];
    if (isFiniteNumber(value) && (!Number.isInteger(value) || value < 0))
      report(field, 'Must be a non-negative integer');
  }

  const total = data.total_instances;
  const resolved = data.resolved_instances;
  if (isFiniteNumber(total) && isFiniteNumber(resolved) && resolved > total)
    report('resolved_instances', 'Must not exceed total_instances');

  const rate = data.resolution_rate;
  if (isFiniteNumber(rate) && (rate < 0 || rate > 1))
    report('resolution_rate', 'Must be between 0 and 1');

  // Validate per_instance array
  const perInstance = data.per_instance;
  if (!Array.isArray(perInstance)) {
    report('per_instance', 'Must be an array');
    return errors;
  }

  perInstance.forEach((entry: unknown, i: number) => {
    const base = `per_instance[${i}]`;
    // A null or primitive entry would make the `in` operator throw below.
    if (!isPlainObject(entry)) {
      report(base, 'Must be an object');
      return;
    }
    for (const field of REQUIRED_INSTANCE_FIELDS) {
      if (!(field in entry)) report(`${base}.${field}`, 'Required field missing');
    }
    if ('instance_id' in entry && typeof entry.instance_id !== 'string')
      report(`${base}.instance_id`, 'Must be a string');
    if ('resolved' in entry && typeof entry.resolved !== 'boolean')
      report(`${base}.resolved`, 'Must be a boolean');
    for (const field of NUMERIC_INSTANCE_FIELDS) {
      if (!(field in entry)) continue;
      const value = entry[field];
      if (!isFiniteNumber(value) || value < 0)
        report(`${base}.${field}`, 'Must be a non-negative number');
    }
  });

  return errors;
}

/**
 * Validate one result file and print the outcome.
 *
 * @param file Path of the JSON file to check.
 * @returns true when the file is valid (a partial `per_instance` list only warns).
 */
function checkFile(file: string): boolean {
  try {
    const data: unknown = JSON.parse(readFileSync(file, 'utf8'));
    const errors = validateResult(data);

    if (errors.length > 0) {
      console.error(`❌ ${file}:`);
      for (const err of errors) {
        console.error(`  ${err.path}: ${err.message}`);
      }
      return false;
    }

    // Cross-validate computed fields. The casts are safe: validateResult passed.
    const result = data as Record<string, unknown>;
    const totalInstances = result.total_instances as number;
    const resolvedInstances = result.resolved_instances as number;
    const resolutionRate = result.resolution_rate as number;
    const perInstance = result.per_instance as unknown[];

    const expectedRate = totalInstances > 0 ? resolvedInstances / totalInstances : 0;
    if (Math.abs(resolutionRate - expectedRate) > 0.01) {
      console.error(
        `❌ ${file}: resolution_rate ${resolutionRate} doesn't match resolved/total (${expectedRate.toFixed(3)})`,
      );
      return false;
    }

    const percent = formatPercent(resolutionRate);
    if (perInstance.length !== totalInstances) {
      console.warn(
        `⚠️ ${file}: per_instance has ${perInstance.length} entries but total_instances is ${totalInstances} (partial results)`,
      );
      console.log(`✅ ${file} — ${result.model} (${percent}% resolved, partial)`);
    } else {
      console.log(`✅ ${file} — ${result.model} (${percent}% resolved)`);
    }
    return true;
  } catch (err) {
    // Unreadable file or invalid JSON.
    console.error(`❌ ${file}: ${err instanceof Error ? err.message : String(err)}`);
    return false;
  }
}

/**
 * CLI driver.
 *
 * @param files Paths given on the command line.
 * @returns Process exit status: 0 when every file is valid, 1 otherwise
 *   (including when no file is given).
 */
function runCli(files: string[]): number {
  if (files.length === 0) {
    console.error('Usage: bun run validate-result.ts <file.json> [...]');
    return 1;
  }
  let hasErrors = false;
  for (const file of files) {
    // Every file is checked even after a failure, so CI shows all problems at once.
    if (!checkFile(file)) hasErrors = true;
  }
  return hasErrors ? 1 : 0;
}

// CLI entry point. Setting exitCode (instead of calling process.exit) lets
// pending console output drain before the process ends.
process.exitCode = runCli(process.argv.slice(2));

// --- remainder of this physical line of the patch (start of the next file's hunk), preserved verbatim ---
// diff --git a/packages/core/src/evaluation/workspace/docker-workspace.ts b/packages/core/src/evaluation/workspace/docker-workspace.ts index 1ce4f25e3..19c74692f 100644 --- a/packages/core/src/evaluation/workspace/docker-workspace.ts +++ b/packages/core/src/evaluation/workspace/docker-workspace.ts @@ -105,6 +105,14 @@ export class DockerWorkspaceProvider { /** Pull the configured Docker image. No-op if already cached locally. */ async pullImage(): Promise { + // Skip pull if image already exists locally (e.g.
locally-built images) + const inspectResult = await this.executor.exec(['docker', 'image', 'inspect', this.config.image], { + timeoutMs: 10_000, + }); + if (inspectResult.exitCode === 0) { + return; // Image exists locally, no pull needed + } + const result = await this.executor.exec(['docker', 'pull', this.config.image], { timeoutMs: this.timeoutMs, }); diff --git a/packages/core/test/evaluation/workspace/docker-workspace.test.ts b/packages/core/test/evaluation/workspace/docker-workspace.test.ts index 9452e0513..08bff49d2 100644 --- a/packages/core/test/evaluation/workspace/docker-workspace.test.ts +++ b/packages/core/test/evaluation/workspace/docker-workspace.test.ts @@ -84,24 +84,43 @@ describe('DockerWorkspaceProvider', () => { }); describe('pullImage', () => { - it('calls docker pull with the configured image', async () => { + it('skips pull when image exists locally', async () => { + // docker image inspect succeeds → image exists locally + executor.pushResponse({ exitCode: 0 }); + const provider = new DockerWorkspaceProvider({ image: 'myimage:v1' }, executor); + await provider.pullImage(); + expect(executor.callArgv(0)).toEqual(['docker', 'image', 'inspect', 'myimage:v1']); + expect(executor.calls.length).toBe(1); // no pull call + }); + + it('calls docker pull when image not found locally', async () => { + // docker image inspect fails → pull needed + executor.pushResponse({ exitCode: 1, stderr: 'No such image' }); executor.pushResponse({ stdout: 'Pull complete\n', exitCode: 0 }); const provider = new DockerWorkspaceProvider({ image: 'myimage:v1' }, executor); await provider.pullImage(); - expect(executor.callArgv(0)).toEqual(['docker', 'pull', 'myimage:v1']); + expect(executor.callArgv(0)).toEqual(['docker', 'image', 'inspect', 'myimage:v1']); + expect(executor.callArgv(1)).toEqual(['docker', 'pull', 'myimage:v1']); }); it('throws on pull failure', async () => { + // inspect fails, pull also fails + executor.pushResponse({ exitCode: 1, stderr: 'No such 
image' }); executor.pushResponse({ exitCode: 1, stderr: 'manifest not found' }); const provider = new DockerWorkspaceProvider({ image: 'bad:image' }, executor); await expect(provider.pullImage()).rejects.toThrow('docker pull failed'); }); - it('uses configured timeout', async () => { + it('uses configured timeout for pull', async () => { + // inspect fails, then pull happens with configured timeout + executor.pushResponse({ exitCode: 1, stderr: 'No such image' }); executor.pushResponse({ exitCode: 0 }); const provider = new DockerWorkspaceProvider({ image: 'img:1', timeout: 60 }, executor); await provider.pullImage(); - expect(executor.callOptions(0)?.timeoutMs).toBe(60_000); + // First call (inspect) uses 10s timeout + expect(executor.callOptions(0)?.timeoutMs).toBe(10_000); + // Second call (pull) uses configured timeout + expect(executor.callOptions(1)?.timeoutMs).toBe(60_000); }); }); @@ -351,18 +370,24 @@ describe('DockerWorkspaceProvider', () => { }); describe('timeout configuration', () => { - it('defaults to 1800s (30 min) timeout', async () => { + it('defaults to 1800s (30 min) timeout for pull', async () => { + // inspect fails → pull with default timeout + executor.pushResponse({ exitCode: 1, stderr: 'No such image' }); executor.pushResponse({ exitCode: 0 }); const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor); await provider.pullImage(); - expect(executor.callOptions(0)?.timeoutMs).toBe(1_800_000); + // Pull call (second) uses default timeout + expect(executor.callOptions(1)?.timeoutMs).toBe(1_800_000); }); it('uses custom timeout from config', async () => { + // inspect fails → pull with custom timeout + executor.pushResponse({ exitCode: 1, stderr: 'No such image' }); executor.pushResponse({ exitCode: 0 }); const provider = new DockerWorkspaceProvider({ image: 'img:1', timeout: 300 }, executor); await provider.pullImage(); - expect(executor.callOptions(0)?.timeoutMs).toBe(300_000); + // Pull call (second) uses custom timeout + 
expect(executor.callOptions(1)?.timeoutMs).toBe(300_000); }); }); }); diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index 2792f120f..a7f142c04 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -53,12 +53,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -72,30 +67,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -133,12 +118,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -152,30 +132,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -203,12 +173,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -222,30 +187,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", 
"image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -289,10 +244,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -366,18 +318,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -414,10 +360,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -512,10 +455,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -566,17 +506,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -639,9 +574,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -657,10 +590,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -677,10 +607,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -697,18 +624,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -745,20 +667,11 @@ }, "type": { "type": "string", - 
"enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -799,12 +712,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -818,12 +726,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -834,9 +737,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -844,12 +745,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -863,12 +759,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -879,10 +770,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -919,10 +807,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -934,11 +819,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -960,26 +841,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", 
"all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -1023,10 +895,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -1070,10 +939,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -1110,10 +976,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -1128,9 +991,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1167,10 +1028,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -1202,9 +1060,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1247,10 +1103,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -1293,10 +1146,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -1333,15 +1183,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1384,10 +1229,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -1476,10 +1318,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -1489,10 +1328,7 @@ "minItems": 1 } }, - "required": [ 
- "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -1536,10 +1372,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -1613,18 +1446,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -1661,10 +1488,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -1759,10 +1583,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -1813,17 +1634,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1886,9 +1702,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1904,10 +1718,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -1924,10 +1735,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -1944,18 +1752,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -1992,20 +1795,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - 
"any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -2046,12 +1840,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -2065,12 +1854,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -2081,9 +1865,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -2091,12 +1873,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -2110,12 +1887,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -2126,10 +1898,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -2166,10 +1935,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -2181,11 +1947,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -2207,26 +1969,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ 
-2270,10 +2023,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -2317,10 +2067,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -2357,10 +2104,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -2375,9 +2119,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2414,10 +2156,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -2449,9 +2188,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2494,10 +2231,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -2540,10 +2274,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -2580,15 +2311,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2631,10 +2357,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -2723,10 +2446,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -2736,10 +2456,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -2800,10 +2517,7 @@ }, "type": { 
"type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -2877,18 +2591,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -2925,10 +2633,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -3023,10 +2728,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -3077,17 +2779,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3150,9 +2847,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3168,10 +2863,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -3188,10 +2880,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -3208,18 +2897,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -3256,20 +2940,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, 
"minimums": { "type": "object", @@ -3310,12 +2985,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -3329,12 +2999,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -3345,9 +3010,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -3355,12 +3018,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -3374,12 +3032,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -3390,10 +3043,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -3430,10 +3080,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -3445,11 +3092,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -3471,26 +3114,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -3534,10 +3168,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": 
false }, { @@ -3581,10 +3212,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -3621,10 +3249,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -3639,9 +3264,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3678,10 +3301,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -3713,9 +3333,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3758,10 +3376,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -3804,10 +3419,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -3844,15 +3456,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3895,10 +3502,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -3987,10 +3591,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -4000,10 +3601,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -4047,10 +3645,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ 
-4124,18 +3719,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -4172,10 +3761,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -4270,10 +3856,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -4324,17 +3907,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4397,9 +3975,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4415,10 +3991,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -4435,10 +4008,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -4455,18 +4025,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -4503,20 +4068,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -4557,12 +4113,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - 
"superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4576,12 +4127,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4592,9 +4138,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -4602,12 +4146,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4621,12 +4160,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4637,10 +4171,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -4677,10 +4208,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -4692,11 +4220,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -4718,26 +4242,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -4781,10 +4296,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -4828,10 +4340,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], 
"additionalProperties": false }, { @@ -4868,10 +4377,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -4886,9 +4392,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4925,10 +4429,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -4960,9 +4461,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -5005,10 +4504,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -5051,10 +4547,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -5091,15 +4584,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -5142,10 +4630,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -5234,10 +4719,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -5247,10 +4729,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -5271,11 +4750,7 @@ }, "strategy": { "type": "string", - "enum": [ - "pass_at_k", - "mean", - "confidence_interval" - ] + "enum": ["pass_at_k", "mean", "confidence_interval"] }, "cost_limit_usd": { "type": "number", @@ -5286,9 +4761,7 @@ "minimum": 0 } }, - "required": [ - "count" - ], + "required": 
["count"], "additionalProperties": false }, "total_budget_usd": { @@ -5321,10 +4794,7 @@ }, "isolation": { "type": "string", - "enum": [ - "shared", - "per_test" - ] + "enum": ["shared", "per_test"] }, "repos": { "type": "array", @@ -5348,10 +4818,7 @@ "format": "uri" } }, - "required": [ - "type", - "url" - ], + "required": ["type", "url"], "additionalProperties": false }, { @@ -5365,10 +4832,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false } ] @@ -5381,10 +4845,7 @@ }, "resolve": { "type": "string", - "enum": [ - "remote", - "local" - ] + "enum": ["remote", "local"] }, "ancestor": { "type": "integer", @@ -5413,10 +4874,7 @@ "additionalProperties": false } }, - "required": [ - "path", - "source" - ], + "required": ["path", "source"], "additionalProperties": false } }, @@ -5452,11 +4910,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -5487,11 +4941,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -5522,11 +4972,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -5557,11 +5003,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -5571,11 +5013,7 @@ }, "mode": { "type": "string", - "enum": [ - "pooled", - "temp", - "static" - ] + "enum": ["pooled", "temp", "static"] }, "path": { "type": "string" @@ -5598,9 +5036,7 @@ "minimum": 0.1 } }, - "required": [ - "image" - ], + "required": ["image"], "additionalProperties": false } }, @@ -5620,9 +5056,7 @@ "type": "string" } }, - "required": [ - "id" - ], + "required": ["id"], "additionalProperties": false } }, @@ 
-5657,12 +5091,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -5676,30 +5105,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -5727,12 +5146,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -5746,30 +5160,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file", - "image" - ] + "enum": ["text", "file", "image"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -5813,10 +5217,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -5890,18 +5291,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -5938,10 +5333,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -6036,10 +5428,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } 
@@ -6090,17 +5479,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6163,9 +5547,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6181,10 +5563,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -6201,10 +5580,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -6221,18 +5597,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -6269,20 +5640,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -6323,12 +5685,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6342,12 +5699,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6358,9 +5710,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -6368,12 +5718,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ 
-6387,12 +5732,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6403,10 +5743,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -6443,10 +5780,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -6458,11 +5792,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -6484,26 +5814,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -6547,10 +5868,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -6594,10 +5912,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -6634,10 +5949,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -6652,9 +5964,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6691,10 +6001,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -6726,9 +6033,7 @@ "minimum": 0 } }, - 
"required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6771,10 +6076,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -6817,10 +6119,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -6857,15 +6156,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6908,10 +6202,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -7000,10 +6291,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -7013,10 +6301,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -7060,10 +6345,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -7137,18 +6419,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -7185,10 +6461,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -7283,10 +6556,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -7337,17 +6607,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": 
false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -7410,9 +6675,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -7428,10 +6691,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -7448,10 +6708,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -7468,18 +6725,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -7516,20 +6768,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -7570,12 +6813,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -7589,12 +6827,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -7605,9 +6838,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -7615,12 +6846,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -7634,12 +6860,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", 
"ignore", "subset", "superset"] }, { "type": "array", @@ -7650,10 +6871,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -7690,10 +6908,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -7705,11 +6920,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -7731,26 +6942,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -7794,10 +6996,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -7841,10 +7040,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -7881,10 +7077,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -7899,9 +7092,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -7938,10 +7129,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -7973,9 +7161,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8018,10 +7204,7 @@ "type": "string" } }, - 
"required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -8064,10 +7247,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -8104,15 +7284,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8155,10 +7330,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -8247,10 +7419,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -8260,10 +7429,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -8324,10 +7490,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -8401,18 +7564,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -8449,10 +7606,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -8547,10 +7701,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -8601,17 +7752,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8674,9 +7820,7 @@ } } }, - 
"required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8692,10 +7836,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -8712,10 +7853,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -8732,18 +7870,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -8780,20 +7913,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -8834,12 +7958,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -8853,12 +7972,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -8869,9 +7983,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -8879,12 +7991,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -8898,12 +8005,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -8914,10 +8016,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": 
["type", "mode"], "additionalProperties": false }, { @@ -8954,10 +8053,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -8969,11 +8065,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -8995,26 +8087,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -9058,10 +8141,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -9105,10 +8185,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -9145,10 +8222,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -9163,9 +8237,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9202,10 +8274,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -9237,9 +8306,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9282,10 +8349,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -9328,10 +8392,7 @@ "type": 
"string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -9368,15 +8429,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9419,10 +8475,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -9511,10 +8564,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -9524,10 +8574,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -9571,10 +8618,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -9648,18 +8692,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -9696,10 +8734,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -9794,10 +8829,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -9848,17 +8880,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9921,9 +8948,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9939,10 +8964,7 @@ "maximum": 1 } }, - "required": [ 
- "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -9959,10 +8981,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -9979,18 +8998,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -10027,20 +9041,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -10081,12 +9086,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10100,12 +9100,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10116,9 +9111,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -10126,12 +9119,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10145,12 +9133,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10161,10 +9144,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -10201,10 +9181,7 @@ }, "type": { "type": "string", - "enum": [ - 
"field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -10216,11 +9193,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -10242,26 +9215,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -10305,10 +9269,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -10352,10 +9313,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -10392,10 +9350,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -10410,9 +9365,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10449,10 +9402,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -10484,9 +9434,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10529,10 +9477,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -10575,10 +9520,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false 
}, { @@ -10615,15 +9557,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10666,10 +9603,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -10758,10 +9692,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -10771,10 +9702,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -10795,11 +9723,7 @@ }, "strategy": { "type": "string", - "enum": [ - "pass_at_k", - "mean", - "confidence_interval" - ] + "enum": ["pass_at_k", "mean", "confidence_interval"] }, "cost_limit_usd": { "type": "number", @@ -10810,9 +9734,7 @@ "minimum": 0 } }, - "required": [ - "count" - ], + "required": ["count"], "additionalProperties": false }, "total_budget_usd": { @@ -10845,10 +9767,7 @@ }, "isolation": { "type": "string", - "enum": [ - "shared", - "per_test" - ] + "enum": ["shared", "per_test"] }, "repos": { "type": "array", @@ -10872,10 +9791,7 @@ "format": "uri" } }, - "required": [ - "type", - "url" - ], + "required": ["type", "url"], "additionalProperties": false }, { @@ -10889,10 +9805,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false } ] @@ -10905,10 +9818,7 @@ }, "resolve": { "type": "string", - "enum": [ - "remote", - "local" - ] + "enum": ["remote", "local"] }, "ancestor": { "type": "integer", @@ -10937,10 +9847,7 @@ "additionalProperties": false } }, - "required": [ - "path", - "source" - ], + "required": ["path", "source"], "additionalProperties": false } }, @@ -10976,11 +9883,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": 
["none", "fast", "strict"] } }, "additionalProperties": false @@ -11011,11 +9914,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -11046,11 +9945,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -11081,11 +9976,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -11095,11 +9986,7 @@ }, "mode": { "type": "string", - "enum": [ - "pooled", - "temp", - "static" - ] + "enum": ["pooled", "temp", "static"] }, "path": { "type": "string" @@ -11122,9 +10009,7 @@ "minimum": 0.1 } }, - "required": [ - "image" - ], + "required": ["image"], "additionalProperties": false } }, @@ -11144,9 +10029,7 @@ "type": "string" } }, - "required": [ - "id" - ], + "required": ["id"], "additionalProperties": false } }, @@ -11213,10 +10096,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -11290,18 +10170,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -11338,10 +10212,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -11436,10 +10307,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -11490,17 +10358,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": 
["type"], "additionalProperties": false }, { @@ -11563,9 +10426,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11581,10 +10442,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -11601,10 +10459,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -11621,18 +10476,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -11669,20 +10519,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -11723,12 +10564,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11742,12 +10578,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11758,9 +10589,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -11768,12 +10597,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11787,12 +10611,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { 
"type": "array", @@ -11803,10 +10622,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -11843,10 +10659,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -11858,11 +10671,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -11884,26 +10693,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -11947,10 +10747,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -11994,10 +10791,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -12034,10 +10828,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -12052,9 +10843,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12091,10 +10880,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -12126,9 +10912,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12171,10 +10955,7 @@ "type": "string" } }, - "required": [ - 
"type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -12217,10 +10998,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -12257,15 +11035,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12308,10 +11081,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -12400,10 +11170,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -12413,10 +11180,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -12460,10 +11224,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -12537,18 +11298,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -12585,10 +11340,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -12683,10 +11435,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -12737,17 +11486,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12810,9 +11554,7 @@ } } }, - 
"required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12828,10 +11570,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -12848,10 +11587,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -12868,18 +11604,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -12916,20 +11647,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -12970,12 +11692,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -12989,12 +11706,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13005,9 +11717,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -13015,12 +11725,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13034,12 +11739,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13050,10 +11750,7 @@ ] } }, - "required": [ - "type", - 
"mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -13090,10 +11787,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -13105,11 +11799,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -13131,26 +11821,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -13194,10 +11875,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -13241,10 +11919,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -13281,10 +11956,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -13299,9 +11971,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13338,10 +12008,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -13373,9 +12040,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13418,10 +12083,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": 
false }, { @@ -13464,10 +12126,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -13504,15 +12163,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13555,10 +12209,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -13647,10 +12298,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -13660,10 +12308,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -13684,11 +12329,7 @@ }, "strategy": { "type": "string", - "enum": [ - "pass_at_k", - "mean", - "confidence_interval" - ] + "enum": ["pass_at_k", "mean", "confidence_interval"] }, "cost_limit_usd": { "type": "number", @@ -13699,9 +12340,7 @@ "minimum": 0 } }, - "required": [ - "count" - ], + "required": ["count"], "additionalProperties": false }, "total_budget_usd": { @@ -13764,10 +12403,7 @@ }, "type": { "type": "string", - "enum": [ - "code-grader", - "code_grader" - ] + "enum": ["code-grader", "code_grader"] }, "command": { "anyOf": [ @@ -13841,18 +12477,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -13889,10 +12519,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-grader", - "llm_grader" - ] + "enum": ["llm-grader", "llm_grader"] }, "prompt": { "anyOf": [ @@ -13987,10 +12614,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], 
"additionalProperties": false } } @@ -14041,17 +12665,12 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14114,9 +12733,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14132,10 +12749,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -14152,10 +12766,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -14172,18 +12783,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -14220,20 +12826,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -14274,12 +12871,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14293,12 +12885,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14309,9 +12896,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -14319,12 +12904,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", 
"subset", "superset"] }, { "type": "array", @@ -14338,12 +12918,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14354,10 +12929,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -14394,10 +12966,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -14409,11 +12978,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -14435,26 +13000,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -14498,10 +13054,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -14545,10 +13098,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -14585,10 +13135,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -14603,9 +13150,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14642,10 +13187,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": 
{ "type": "number", @@ -14677,9 +13219,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14722,10 +13262,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -14768,10 +13305,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -14808,15 +13342,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14859,10 +13388,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -14951,10 +13477,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -14964,10 +13487,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -14996,10 +13516,7 @@ ] } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false } }, @@ -15013,10 +13530,7 @@ }, "isolation": { "type": "string", - "enum": [ - "shared", - "per_test" - ] + "enum": ["shared", "per_test"] }, "repos": { "type": "array", @@ -15040,10 +13554,7 @@ "format": "uri" } }, - "required": [ - "type", - "url" - ], + "required": ["type", "url"], "additionalProperties": false }, { @@ -15057,10 +13568,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false } ] @@ -15073,10 +13581,7 @@ }, "resolve": { "type": "string", - "enum": [ - "remote", - "local" - ] + "enum": ["remote", "local"] }, "ancestor": { "type": "integer", @@ -15105,10 +13610,7 @@ "additionalProperties": 
false } }, - "required": [ - "path", - "source" - ], + "required": ["path", "source"], "additionalProperties": false } }, @@ -15144,11 +13646,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -15179,11 +13677,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -15214,11 +13708,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -15249,11 +13739,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -15263,11 +13749,7 @@ }, "mode": { "type": "string", - "enum": [ - "pooled", - "temp", - "static" - ] + "enum": ["pooled", "temp", "static"] }, "path": { "type": "string" @@ -15290,9 +13772,7 @@ "minimum": 0.1 } }, - "required": [ - "image" - ], + "required": ["image"], "additionalProperties": false } }, @@ -15304,9 +13784,7 @@ ] } }, - "required": [ - "tests" - ], + "required": ["tests"], "additionalProperties": false } }