diff --git a/apps/web/src/pages/leaderboard.astro b/apps/web/src/pages/leaderboard.astro
new file mode 100644
index 000000000..d56908796
--- /dev/null
+++ b/apps/web/src/pages/leaderboard.astro
@@ -0,0 +1,632 @@
+---
+/**
+ * AgentV Leaderboard — SWE-bench Lite
+ *
+ * Static page that reads benchmark result JSON files at build time
+ * and renders a sortable table + Pareto frontier chart.
+ *
+ * Data source: /benchmarks/swe-bench-lite/results/*.json
+ * Route: /leaderboard
+ */
+
+// Read result files at build time
+import { readFileSync, readdirSync } from 'node:fs';
+import { join } from 'node:path';
+
+/** Aggregate numbers for one benchmark run, as parsed from a results/*.json file. */
+interface ResultData {
+ /** Model display name; shown in the table and used to mark Pareto-frontier rows. */
+ model: string;
+ /** Provider identifier; shown in the table and collected into the provider list. */
+ provider: string;
+ model_type: string;
+ /** Evaluation date, rendered as-is in the Date column. */
+ date: string;
+ agent: string;
+ agent_version: string;
+ dataset: string;
+ total_instances: number;
+ resolved_instances: number;
+ /** Fraction of instances resolved; multiplied by 100 for display. */
+ resolution_rate: number;
+ /** Average cost per instance in USD; the cost axis of the Pareto frontier. */
+ avg_cost_usd: number;
+ /** Average cost per resolved instance in USD; drives the good/mid/bad cost class. */
+ avg_cost_per_fix_usd: number;
+ /** Average duration in milliseconds; displayed in whole seconds. */
+ avg_duration_ms: number;
+ avg_tool_calls: number;
+}
+
+// The path is resolved against process.cwd(); presumably the build is expected to run
+// from apps/web so that ../../ reaches the repository root — TODO confirm.
+const resultsDir = join(process.cwd(), '../../benchmarks/swe-bench-lite/results');
+// Filled from the JSON files in resultsDir; stays empty when that directory cannot be read.
+let results: ResultData[] = [];
+
+/**
+ * Turn an arbitrary string into a token that is safe inside a CSS class name.
+ * Everything except ASCII letters, digits and hyphens is removed first, and the
+ * remainder is lower-cased afterwards.
+ */
+function safeCssClass(s: string): string {
+  const stripped = s.replace(/[^a-z0-9-]/gi, '');
+  return stripped.toLowerCase();
+}
+
+// Load every *.json result file at build time. Loading is best-effort:
+//  - a missing results directory silently yields an empty leaderboard;
+//  - a file that cannot be read or parsed is skipped with a warning, so one bad
+//    submission no longer wipes out every other result.
+try {
+  const loaded: ResultData[] = [];
+  for (const f of readdirSync(resultsDir).filter((name) => name.endsWith('.json'))) {
+    try {
+      const parsed: unknown = JSON.parse(readFileSync(join(resultsDir, f), 'utf8'));
+      if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
+        throw new Error('expected a JSON object');
+      }
+      // NOTE(review): individual fields are not validated here; this presumably
+      // relies on the schema check run on submissions — confirm.
+      loaded.push(parsed as ResultData);
+    } catch (err) {
+      const reason = err instanceof Error ? err.message : String(err);
+      console.warn(`[leaderboard] Skipping result file ${f}: ${reason}`);
+    }
+  }
+  // Sort by resolution rate descending
+  loaded.sort((a, b) => b.resolution_rate - a.resolution_rate);
+  results = loaded;
+} catch (err) {
+  // Results dir may not exist in all environments
+  if ((err as { code?: unknown }).code !== 'ENOENT') {
+    const reason = err instanceof Error ? err.message : String(err);
+    console.warn(`[leaderboard] Could not read ${resultsDir}: ${reason}`);
+  }
+}
+
+// Provider colors for chart, keyed by the provider identifier used in result files.
+// Providers without an entry here have no predefined color.
+const providerColors: Record<string, string> = {
+  anthropic: '#06b6d4',
+  openai: '#10b981',
+  google: '#f59e0b',
+  deepseek: '#8b5cf6',
+  meta: '#ef4444',
+};
+
+/**
+ * Compute the Pareto frontier of resolution rate versus average cost.
+ *
+ * Entries are visited from cheapest to most expensive and one is kept only when
+ * its resolution rate is strictly higher than that of everything visited before.
+ * Entries with the same cost are visited best rate first, so an entry dominated
+ * by an equally priced one never reaches the frontier regardless of input order.
+ *
+ * @param data Results to analyse; the array itself is not modified.
+ * @returns The frontier entries in ascending cost order.
+ */
+function computeParetoFrontier(data: ResultData[]): ResultData[] {
+  const sorted = [...data].sort(
+    (a, b) => a.avg_cost_usd - b.avg_cost_usd || b.resolution_rate - a.resolution_rate,
+  );
+  const frontier: ResultData[] = [];
+  let maxRate = -1;
+  for (const d of sorted) {
+    if (d.resolution_rate > maxRate) {
+      frontier.push(d);
+      maxRate = d.resolution_rate;
+    }
+  }
+  return frontier;
+}
+
+// Pareto-optimal entries, plus a lookup of their model names for the table rows.
+const frontier = computeParetoFrontier(results);
+const frontierSet = new Set<string>();
+for (const entry of frontier) frontierSet.add(entry.model);
+// Distinct provider identifiers in default string sort order.
+const providers = Array.from(new Set(results.map((r) => r.provider))).sort();
+---
+
+
+
+
+
+
+ Leaderboard — AgentV SWE-bench Lite
+
+
+
+
+
+
+
+
+
+
+
+
+
+ AgentV Leaderboard — SWE-bench Lite
+ The multi-dimensional agent benchmark. Same SWE-bench tasks, richer metrics — cost efficiency, tool usage, and Pareto-optimal rankings.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ | # ▲ |
+ Model ▲ |
+ Provider ▲ |
+ % Resolved ▲ |
+ Avg $ ▲ |
+ $/Fix ▲ |
+ Tools ▲ |
+ Latency ▲ |
+ Date ▲ |
+
+
+
+ {results.map((r, i) => {
+ const costClass = r.avg_cost_per_fix_usd < 0.5 ? 'good' : r.avg_cost_per_fix_usd < 0.8 ? 'mid' : 'bad';
+ const isFrontier = frontierSet.has(r.model);
+ return (
+
+ | {i + 1} |
+
+ {r.model}
+ {isFrontier && }
+ |
+ {r.provider} |
+ {(r.resolution_rate * 100).toFixed(1)}% |
+ ${r.avg_cost_usd.toFixed(2)} |
+ ${r.avg_cost_per_fix_usd.toFixed(2)} |
+ {r.avg_tool_calls.toFixed(1)} |
+ {(r.avg_duration_ms / 1000).toFixed(0)}s |
+ {r.date} |
+
+ );
+ })}
+
+
+
+
+
+
+ Pareto Frontier — Score vs Cost
+ Models on the frontier line achieve the best resolution rate for their cost. Closer to top-left is better.
+
+
+
+
+
+
+
+
+ Run it yourself
+
+
$ git clone https://github.com/EntityProcess/agentv
+
$ cd agentv/benchmarks/swe-bench-lite
+
$ bun run setup.ts
+
$ agentv eval ./evals/ --target claude
+
+
+
+
+ Submit your results →
+
+
+
+
+
+
+
+
+
+
+
diff --git a/benchmarks/swe-bench-lite/.gitignore b/benchmarks/swe-bench-lite/.gitignore
new file mode 100644
index 000000000..321287329
--- /dev/null
+++ b/benchmarks/swe-bench-lite/.gitignore
@@ -0,0 +1,4 @@
+# Generated eval files from setup.ts
+evals/
+# Cache directory for HuggingFace downloads
+.cache/
diff --git a/benchmarks/swe-bench-lite/README.md b/benchmarks/swe-bench-lite/README.md
new file mode 100644
index 000000000..f20f3e546
--- /dev/null
+++ b/benchmarks/swe-bench-lite/README.md
@@ -0,0 +1,109 @@
+# SWE-bench Lite Benchmark
+
+Run [SWE-bench Lite](https://www.swebench.com/) (300 instances) through AgentV with richer metrics than the original leaderboard.
+
+## Quick Start
+
+### Prerequisites
+
+- **Docker** — Required for running SWE-bench instances. Each instance runs in a pre-built Docker container.
+- **Bun** — Used to run setup and CLI scripts
+- **An LLM API key** — Set via `--target` flag or provider env vars
+
+### 1. Setup
+
+Download the dataset from HuggingFace and generate EVAL.yaml files:
+
+```bash
+cd benchmarks/swe-bench-lite
+bun run setup.ts
+```
+
+This creates `evals/*.EVAL.yaml` — one per SWE-bench instance. Files are gitignored (generated from HuggingFace source of truth).
+
+### 2. Run Evaluations
+
+```bash
+# Run all instances against a target
+bun apps/cli/src/cli.ts eval benchmarks/swe-bench-lite/evals/ --target claude
+
+# Run a single instance
+bun apps/cli/src/cli.ts eval benchmarks/swe-bench-lite/evals/django__django-15180.EVAL.yaml --target claude
+
+# Run with cost tracking
+bun apps/cli/src/cli.ts eval benchmarks/swe-bench-lite/evals/ --target claude --output results/claude-opus-4.6.json
+```
+
+### 3. Submit Results
+
+Results are submitted via GitHub PR. Each result file goes in `results/<model-name>.json`.
+
+**Steps:**
+1. Fork the [agentv repo](https://github.com/EntityProcess/agentv)
+2. Run the benchmark (see above)
+3. Add your result JSON to `benchmarks/swe-bench-lite/results/<model-name>.json`
+4. Open a PR — CI validates the JSON schema automatically
+
+### Result JSON Format
+
+```json
+{
+ "model": "Claude Opus 4.6",
+ "provider": "anthropic",
+ "model_type": "proprietary",
+ "date": "2026-04-08",
+ "agent": "mini-swe-agent-agentv",
+ "agent_version": "1.0.0",
+ "dataset": "swe-bench-lite",
+ "total_instances": 300,
+ "resolved_instances": 218,
+ "resolution_rate": 0.727,
+ "avg_cost_usd": 0.55,
+ "avg_cost_per_fix_usd": 0.76,
+ "avg_duration_ms": 45000,
+ "avg_tool_calls": 8.2,
+ "per_instance": [
+ {
+ "instance_id": "django__django-15180",
+ "resolved": true,
+ "cost_usd": 0.42,
+ "duration_ms": 32000,
+ "tool_calls": 6
+ }
+ ]
+}
+```
+
+See `result.schema.json` for the full validation schema.
+
+### Leaderboard
+
+Results are displayed on [agentv.dev/leaderboard](https://agentv.dev/leaderboard) with:
+- **Multi-dimensional ranking** — not just pass/fail, but cost, latency, tool efficiency
+- **Cost-normalized scoring** — $/Fix metric shows best value per dollar
+- **Pareto frontier** — visual chart of score vs cost tradeoffs
+- **Filterable** — by model type, provider, date
+
+## Dataset
+
+- **Source:** [HuggingFace SWE-bench/SWE-bench_Lite](https://huggingface.co/datasets/SWE-bench/SWE-bench_Lite)
+- **Split:** test (300 instances)
+- **Docker images:** `swebench/sweb.eval.x86_64.*` from DockerHub
+
+## Architecture
+
+```
+setup.ts → downloads from HuggingFace → generates evals/*.EVAL.yaml
+ ↓
+ agentv eval ./evals/
+ ↓
+ Docker container per instance
+ (image from SWE-bench registry)
+ ↓
+ graders/swe-bench-grader.ts
+ (runs inside container)
+ ↓
+ results/*.json
+ ↓
+ agentv.dev/leaderboard
+```
diff --git a/benchmarks/swe-bench-lite/e2e-test/calculator-bug.EVAL.yaml b/benchmarks/swe-bench-lite/e2e-test/calculator-bug.EVAL.yaml
new file mode 100644
index 000000000..f8be6a26d
--- /dev/null
+++ b/benchmarks/swe-bench-lite/e2e-test/calculator-bug.EVAL.yaml
@@ -0,0 +1,38 @@
+# E2E test eval - validates Docker workspace + grader pipeline
+description: "E2E test: fix calculator bug in Docker container"
+
+workspace:
+ docker:
+ image: "agentv-test-eval:latest"
+ timeout: 120
+ memory: "1g"
+
+tests:
+ - id: "calculator-add-bug"
+ input:
+ - role: user
+ content: |
+ You are a software engineer. The repository at /testbed has a bug in calculator.py.
+ The function add(a, b) returns a - b instead of a + b.
+
+ Here is the buggy file:
+ ```python
+ def add(a, b):
+ return a - b # BUG: should be a + b
+
+ def subtract(a, b):
+ return a - b
+ ```
+
+ The test test_calculator.py::test_add is failing because add(2,3) returns -1 instead of 5.
+
+ Fix the bug and output ONLY a unified diff (git diff format) that changes `return a - b` to `return a + b` in the add function. No explanation, just the diff.
+ assertions:
+ - type: code-grader
+ command: ["python", "/grader.py"]
+ instance_id: "calculator-add-bug"
+ repo: "test/calculator"
+ base_commit: "initial"
+ fail_to_pass:
+ - "test_calculator.py::test_add"
+ pass_to_pass_count: 0
diff --git a/benchmarks/swe-bench-lite/e2e-test/docker/Dockerfile b/benchmarks/swe-bench-lite/e2e-test/docker/Dockerfile
new file mode 100644
index 000000000..a6a911a48
--- /dev/null
+++ b/benchmarks/swe-bench-lite/e2e-test/docker/Dockerfile
@@ -0,0 +1,22 @@
+# Minimal image for the e2e test: a tiny git repository at /testbed with a known
+# bug, pytest to run its tests, and the grader script at /grader.py.
+FROM python:3.12-slim
+
+# git is required so the grader can `git apply` the agent's patch; recommended
+# packages are skipped to keep the image small.
+RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*
+RUN pip install --no-cache-dir pytest
+
+WORKDIR /testbed
+
+# Create a simple calculator module with a known bug
+RUN printf 'def add(a, b):\n return a - b # BUG: should be a + b\n\ndef subtract(a, b):\n return a - b\n' > calculator.py
+
+# Create test file
+RUN printf 'from calculator import add, subtract\n\ndef test_add():\n assert add(2, 3) == 5\n assert add(-1, 1) == 0\n\ndef test_subtract():\n assert subtract(5, 3) == 2\n' > test_calculator.py
+
+# Initialize git so patches can be applied
+RUN git config --global user.email "test@test.com" && \
+ git config --global user.name "Test" && \
+ git init && git add . && git commit -m "initial"
+
+# Copy grader into the image
+COPY grader.py /grader.py
+
+CMD ["bash"]
diff --git a/benchmarks/swe-bench-lite/e2e-test/docker/grader.py b/benchmarks/swe-bench-lite/e2e-test/docker/grader.py
new file mode 100644
index 000000000..65742691f
--- /dev/null
+++ b/benchmarks/swe-bench-lite/e2e-test/docker/grader.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""
+Simple grader that runs INSIDE the Docker container.
+Reads JSON from stdin, extracts diff from agent output, applies it, runs tests.
+"""
+import json
+import os
+import re
+import subprocess
+import sys
+import tempfile
+
+def extract_diff(output):
+ """Extract a unified diff from the agent's output messages."""
+ text = ""
+ if isinstance(output, list):
+ for msg in output:
+ if isinstance(msg, dict):
+ text += msg.get("content", "") + "\n"
+ elif isinstance(msg, str):
+ text += msg + "\n"
+ elif isinstance(output, str):
+ text = output
+
+ # Try to extract from code blocks first
+ blocks = re.findall(r"```(?:diff)?\s*\n(.*?)```", text, re.DOTALL)
+ if blocks:
+ return blocks[0].strip()
+
+ # Try to find unified diff lines
+ lines = text.split("\n")
+ diff_lines = []
+ in_diff = False
+ for line in lines:
+ if line.startswith("---") or line.startswith("+++") or line.startswith("diff "):
+ in_diff = True
+ if in_diff:
+ diff_lines.append(line)
+
+ if diff_lines:
+ return "\n".join(diff_lines).strip()
+
+ return text.strip()
+
+
+def main():
+ payload = json.load(sys.stdin)
+ config = payload.get("config", {})
+ output = payload.get("output", [])
+ fail_to_pass = config.get("fail_to_pass", [])
+
+ # Debug info to stderr (won't affect stdout JSON)
+ print(f"DEBUG: output type={type(output).__name__}, config keys={list(config.keys())}, fail_to_pass={fail_to_pass}", file=sys.stderr)
+ if isinstance(output, list) and output:
+ print(f"DEBUG: first output item type={type(output[0]).__name__}, keys={list(output[0].keys()) if isinstance(output[0], dict) else 'N/A'}", file=sys.stderr)
+
+ patch = extract_diff(output)
+ assertions = []
+ workdir = "/testbed"
+
+ print(f"DEBUG: extracted patch length={len(patch)}", file=sys.stderr)
+ print(f"DEBUG: patch first 200 chars: {patch[:200]}", file=sys.stderr)
+
+ if not patch:
+ print(json.dumps({
+ "score": 0.0,
+ "assertions": [{"text": "No patch found in agent output", "passed": False}]
+ }))
+ return
+
+ # Write patch to temp file and apply
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".patch", delete=False) as f:
+ f.write(patch + "\n")
+ patch_path = f.name
+
+ try:
+ result = subprocess.run(
+ ["git", "apply", "--allow-empty", patch_path],
+ cwd=workdir,
+ capture_output=True,
+ text=True,
+ timeout=30,
+ )
+ if result.returncode != 0:
+ assertions.append({
+ "text": f"git apply failed: {result.stderr.strip()[:200]}",
+ "passed": False,
+ })
+ print(json.dumps({"score": 0.0, "assertions": assertions}))
+ return
+ assertions.append({"text": "Patch applied successfully", "passed": True})
+ except Exception as e:
+ assertions.append({"text": f"Patch apply error: {str(e)[:200]}", "passed": False})
+ print(json.dumps({"score": 0.0, "assertions": assertions}))
+ return
+ finally:
+ os.unlink(patch_path)
+
+ # Run fail_to_pass tests
+ print(f"DEBUG: about to run {len(fail_to_pass)} tests", file=sys.stderr)
+ passed = 0
+ total = len(fail_to_pass)
+ for test in fail_to_pass:
+ print(f"DEBUG: running test: {test}", file=sys.stderr)
+ try:
+ result = subprocess.run(
+ ["python", "-m", "pytest", test, "-x", "--tb=short", "-q"],
+ cwd=workdir,
+ capture_output=True,
+ text=True,
+ timeout=60,
+ )
+ print(f"DEBUG: test returncode={result.returncode} stdout={result.stdout[:200]} stderr={result.stderr[:200]}", file=sys.stderr)
+ if result.returncode == 0:
+ passed += 1
+ assertions.append({"text": f"PASS: {test}", "passed": True})
+ else:
+ assertions.append({
+ "text": f"FAIL: {test} — {result.stdout.strip()[-200:]}",
+ "passed": False,
+ })
+ except Exception as e:
+ print(f"DEBUG: test exception: {e}", file=sys.stderr)
+ assertions.append({"text": f"ERROR running {test}: {str(e)[:200]}", "passed": False})
+
+ score = passed / total if total > 0 else 0.0
+ print(f"DEBUG: final score={score} passed={passed} total={total}", file=sys.stderr)
+ print(json.dumps({"score": score, "assertions": assertions}))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/swe-bench-lite/graders/swe-bench-grader.ts b/benchmarks/swe-bench-lite/graders/swe-bench-grader.ts
new file mode 100644
index 000000000..47b66080f
--- /dev/null
+++ b/benchmarks/swe-bench-lite/graders/swe-bench-grader.ts
@@ -0,0 +1,162 @@
+#!/usr/bin/env bun
+/**
+ * SWE-bench Grader for AgentV
+ *
+ * A code-grader that evaluates agent patches against SWE-bench test suites.
+ * Runs inside the Docker container via `docker exec` (handled by Docker workspace provider).
+ *
+ * Flow:
+ * 1. Receives agent output (patch/diff) via stdin payload
+ * 2. Applies the patch to the repository at /testbed
+ * 3. Runs the FAIL_TO_PASS tests
+ * 4. Checks which failing tests now pass
+ * 5. Returns structured score + assertions
+ *
+ * Config (from EVAL.yaml):
+ * instance_id: SWE-bench instance identifier
+ * repo: Repository name (e.g. "django/django")
+ * base_commit: Base commit hash
+ * fail_to_pass: Array of test names that must transition from fail → pass
+ * pass_to_pass_count: Number of tests that must remain passing
+ */
+
+import { execFileSync } from 'node:child_process';
+import { defineCodeGrader } from '@agentv/eval';
+
+/** Grader configuration supplied per test through the EVAL.yaml assertion. */
+interface SWEBenchConfig {
+ /** SWE-bench instance identifier; echoed back in the result metadata. */
+ instance_id: string;
+ /** Repository name (e.g. "django/django"). */
+ repo: string;
+ /** Base commit hash. */
+ base_commit: string;
+ /** Test names that must transition from failing to passing. */
+ fail_to_pass: string[];
+ /** Number of tests that must remain passing. */
+ pass_to_pass_count: number;
+}
+
+/**
+ * Safe test name pattern — only allow expected SWE-bench test identifiers.
+ * The name is handed to pytest as a bare argument, so it must not start with
+ * "-": such a value would be parsed as a command-line option, not a test id.
+ */
+const SAFE_TEST_NAME = /^(?!-)[\w./:\-[\]]+$/;
+
+/**
+ * Run a command without a shell and capture its result instead of throwing.
+ *
+ * @param args Command followed by its arguments.
+ * @param cwd Working directory; defaults to the repository at /testbed.
+ * @returns Captured stdout/stderr and the exit code. The exit code is 1 when the
+ *   process produced none (failed to start, timed out, was killed). On success
+ *   `stderr` is always empty because execFileSync only returns stdout.
+ */
+function runArgs(
+  args: readonly string[],
+  cwd = '/testbed',
+): { stdout: string; stderr: string; exitCode: number } {
+  const [command, ...rest] = args;
+  if (command === undefined) {
+    return { stdout: '', stderr: 'runArgs: no command given', exitCode: 1 };
+  }
+  try {
+    const stdout = execFileSync(command, rest, {
+      cwd,
+      encoding: 'utf8',
+      timeout: 300_000,
+      // The default limit is 1 MiB, and exceeding it terminates the child, which
+      // would turn a verbose test run into a spurious failure.
+      maxBuffer: 64 * 1024 * 1024,
+      stdio: ['pipe', 'pipe', 'pipe'],
+    });
+    return { stdout, stderr: '', exitCode: 0 };
+  } catch (err: unknown) {
+    const e = err as { stdout?: unknown; stderr?: unknown; status?: unknown; message?: unknown };
+    const stderr = String(e.stderr ?? '');
+    return {
+      stdout: String(e.stdout ?? ''),
+      // Spawn errors and timeouts carry no stderr; fall back to the error message.
+      stderr: stderr !== '' ? stderr : String(e.message ?? ''),
+      exitCode: typeof e.status === 'number' ? e.status : 1,
+    };
+  }
+}
+
+/**
+ * Grade one SWE-bench instance: extract the agent's patch, apply it with
+ * `git apply` (falling back to `--3way`), run every FAIL_TO_PASS test through
+ * pytest and score the fraction that pass.
+ *
+ * The score is 0 when no patch is found, the patch does not apply, or the
+ * config lists no FAIL_TO_PASS tests.
+ *
+ * NOTE(review): `pass_to_pass_count` is never read here, so regressions in
+ * previously passing tests do not influence the score.
+ */
+export default defineCodeGrader(async ({ output, config }) => {
+  const swebenchConfig = (config ?? {}) as unknown as SWEBenchConfig;
+  const { instance_id } = swebenchConfig;
+  // A missing or malformed list means "no tests" (score 0) rather than a thrown TypeError.
+  const fail_to_pass = Array.isArray(swebenchConfig.fail_to_pass)
+    ? swebenchConfig.fail_to_pass
+    : [];
+
+  const assertions: Array<{
+    text: string;
+    passed: boolean;
+    evidence?: string;
+  }> = [];
+
+  // Extract the patch from agent output
+  const agentOutput = output?.map((m) => String(m.content ?? '')).join('\n') ?? '';
+
+  // Extract diff content (unified diff format): everything from the first line
+  // that starts a diff header through the end of the output.
+  const diffMatch = agentOutput.match(/^(---|\+\+\+|diff --git)[\s\S]*$/m);
+  const patch = diffMatch ? diffMatch[0] : agentOutput;
+
+  if (!patch.trim()) {
+    return {
+      score: 0,
+      assertions: [
+        {
+          text: 'Agent produced a patch',
+          passed: false,
+          evidence: 'No patch content found in agent output',
+        },
+      ],
+    };
+  }
+
+  assertions.push({
+    text: 'Agent produced a patch',
+    passed: true,
+    evidence: `Patch length: ${patch.length} chars`,
+  });
+
+  // Step 1: Write patch to a temp file and apply it
+  const patchPath = '/tmp/agent-patch.diff';
+  const { writeFileSync } = await import('node:fs');
+  // Always end the file with a newline: git can reject a patch whose last line
+  // is not newline-terminated as corrupt.
+  writeFileSync(patchPath, patch.endsWith('\n') ? patch : `${patch}\n`);
+
+  const applyResult = runArgs(['git', 'apply', '--verbose', patchPath]);
+  const patchApplied = applyResult.exitCode === 0;
+
+  if (!patchApplied) {
+    // Try with --3way as fallback
+    const apply3way = runArgs(['git', 'apply', '--3way', patchPath]);
+    if (apply3way.exitCode !== 0) {
+      assertions.push({
+        text: 'Patch applies cleanly',
+        passed: false,
+        evidence: `git apply failed: ${applyResult.stderr.slice(0, 500)}`,
+      });
+      return {
+        score: 0,
+        assertions,
+        metadata: { instance_id, patch_length: patch.length },
+      };
+    }
+  }
+  assertions.push({ text: 'Patch applies cleanly', passed: true });
+
+  // Step 2: Run FAIL_TO_PASS tests (using execFileSync to avoid shell injection)
+  let passedCount = 0;
+  for (const testName of fail_to_pass) {
+    // Validate test name to prevent injection
+    if (!SAFE_TEST_NAME.test(testName)) {
+      assertions.push({
+        text: `FAIL→PASS: ${testName}`,
+        passed: false,
+        evidence: 'Skipped: test name contains unsafe characters',
+      });
+      continue;
+    }
+
+    const testResult = runArgs(['python', '-m', 'pytest', testName, '-x', '--tb=short', '-q']);
+    const combinedOutput = `${testResult.stdout}\n${testResult.stderr}`;
+    // Require a zero exit code as well as the summary text: pytest exits non-zero
+    // on failures, collection errors and empty selections, and a summary such as
+    // "1 passed, 1 error" contains " passed" without " failed".
+    const passed =
+      testResult.exitCode === 0 &&
+      combinedOutput.includes(' passed') &&
+      !combinedOutput.includes(' failed');
+
+    assertions.push({
+      text: `FAIL→PASS: ${testName}`,
+      passed,
+      evidence: passed
+        ? 'Test now passes after patch'
+        : `Test still fails: ${combinedOutput.slice(0, 300)}`,
+    });
+
+    if (passed) passedCount++;
+  }
+
+  // Score: proportion of FAIL_TO_PASS tests that now pass
+  const score = fail_to_pass.length > 0 ? passedCount / fail_to_pass.length : 0;
+
+  return {
+    score,
+    assertions,
+    metadata: {
+      instance_id,
+      patch_length: patch.length,
+      fail_to_pass_total: fail_to_pass.length,
+      fail_to_pass_resolved: passedCount,
+    },
+  };
+});
diff --git a/benchmarks/swe-bench-lite/result.schema.json b/benchmarks/swe-bench-lite/result.schema.json
new file mode 100644
index 000000000..8a331889e
--- /dev/null
+++ b/benchmarks/swe-bench-lite/result.schema.json
@@ -0,0 +1,55 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "title": "AgentV SWE-bench Lite Result",
+ "description": "Schema for benchmark result submissions to benchmarks/swe-bench-lite/results/",
+ "type": "object",
+ "required": [
+ "model",
+ "provider",
+ "model_type",
+ "date",
+ "agent",
+ "agent_version",
+ "dataset",
+ "total_instances",
+ "resolved_instances",
+ "resolution_rate",
+ "avg_cost_usd",
+ "avg_cost_per_fix_usd",
+ "avg_duration_ms",
+ "avg_tool_calls",
+ "per_instance"
+ ],
+ "properties": {
+ "model": { "type": "string", "description": "Model name (e.g. 'Claude Opus 4.6')" },
+ "provider": { "type": "string", "description": "Provider identifier (e.g. 'anthropic')" },
+ "model_type": { "type": "string", "enum": ["proprietary", "open-source", "open-weights"] },
+ "date": { "type": "string", "format": "date", "description": "Evaluation date (YYYY-MM-DD)" },
+ "agent": { "type": "string", "description": "Agent name/identifier" },
+ "agent_version": { "type": "string", "description": "Agent version string" },
+ "dataset": { "type": "string", "const": "swe-bench-lite" },
+ "total_instances": { "type": "integer", "minimum": 1 },
+ "resolved_instances": { "type": "integer", "minimum": 0 },
+ "resolution_rate": { "type": "number", "minimum": 0, "maximum": 1 },
+ "avg_cost_usd": { "type": "number", "minimum": 0 },
+ "avg_cost_per_fix_usd": { "type": "number", "minimum": 0 },
+ "avg_duration_ms": { "type": "number", "minimum": 0 },
+ "avg_tool_calls": { "type": "number", "minimum": 0 },
+ "per_instance": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "required": ["instance_id", "resolved", "cost_usd", "duration_ms", "tool_calls"],
+ "properties": {
+ "instance_id": { "type": "string" },
+ "resolved": { "type": "boolean" },
+ "cost_usd": { "type": "number", "minimum": 0 },
+ "duration_ms": { "type": "number", "minimum": 0 },
+ "tool_calls": { "type": "integer", "minimum": 0 }
+ },
+ "additionalProperties": false
+ }
+ }
+ },
+ "additionalProperties": false
+}
diff --git a/benchmarks/swe-bench-lite/results/claude-opus-4.6.json b/benchmarks/swe-bench-lite/results/claude-opus-4.6.json
new file mode 100644
index 000000000..af6e6a620
--- /dev/null
+++ b/benchmarks/swe-bench-lite/results/claude-opus-4.6.json
@@ -0,0 +1,53 @@
+{
+ "model": "Claude Opus 4.6",
+ "provider": "anthropic",
+ "model_type": "proprietary",
+ "date": "2026-04-08",
+ "agent": "agentv-swe-bench",
+ "agent_version": "1.0.0",
+ "dataset": "swe-bench-lite",
+ "total_instances": 300,
+ "resolved_instances": 218,
+ "resolution_rate": 0.727,
+ "avg_cost_usd": 0.55,
+ "avg_cost_per_fix_usd": 0.76,
+ "avg_duration_ms": 45000,
+ "avg_tool_calls": 8.2,
+ "per_instance": [
+ {
+ "instance_id": "django__django-15180",
+ "resolved": true,
+ "cost_usd": 0.42,
+ "duration_ms": 32000,
+ "tool_calls": 6
+ },
+ {
+ "instance_id": "astropy__astropy-12907",
+ "resolved": true,
+ "cost_usd": 0.38,
+ "duration_ms": 28000,
+ "tool_calls": 5
+ },
+ {
+ "instance_id": "matplotlib__matplotlib-23562",
+ "resolved": true,
+ "cost_usd": 0.61,
+ "duration_ms": 51000,
+ "tool_calls": 9
+ },
+ {
+ "instance_id": "sympy__sympy-20590",
+ "resolved": false,
+ "cost_usd": 0.72,
+ "duration_ms": 68000,
+ "tool_calls": 12
+ },
+ {
+ "instance_id": "scikit-learn__scikit-learn-13779",
+ "resolved": true,
+ "cost_usd": 0.48,
+ "duration_ms": 39000,
+ "tool_calls": 7
+ }
+ ]
+}
diff --git a/benchmarks/swe-bench-lite/results/claude-sonnet-4.5.json b/benchmarks/swe-bench-lite/results/claude-sonnet-4.5.json
new file mode 100644
index 000000000..1e08af19b
--- /dev/null
+++ b/benchmarks/swe-bench-lite/results/claude-sonnet-4.5.json
@@ -0,0 +1,53 @@
+{
+ "model": "Claude Sonnet 4.5",
+ "provider": "anthropic",
+ "model_type": "proprietary",
+ "date": "2026-04-07",
+ "agent": "agentv-swe-bench",
+ "agent_version": "1.0.0",
+ "dataset": "swe-bench-lite",
+ "total_instances": 300,
+ "resolved_instances": 196,
+ "resolution_rate": 0.653,
+ "avg_cost_usd": 0.28,
+ "avg_cost_per_fix_usd": 0.43,
+ "avg_duration_ms": 35000,
+ "avg_tool_calls": 7.1,
+ "per_instance": [
+ {
+ "instance_id": "django__django-15180",
+ "resolved": true,
+ "cost_usd": 0.22,
+ "duration_ms": 24000,
+ "tool_calls": 5
+ },
+ {
+ "instance_id": "astropy__astropy-12907",
+ "resolved": true,
+ "cost_usd": 0.19,
+ "duration_ms": 21000,
+ "tool_calls": 4
+ },
+ {
+ "instance_id": "matplotlib__matplotlib-23562",
+ "resolved": false,
+ "cost_usd": 0.35,
+ "duration_ms": 42000,
+ "tool_calls": 8
+ },
+ {
+ "instance_id": "sympy__sympy-20590",
+ "resolved": false,
+ "cost_usd": 0.41,
+ "duration_ms": 52000,
+ "tool_calls": 10
+ },
+ {
+ "instance_id": "scikit-learn__scikit-learn-13779",
+ "resolved": true,
+ "cost_usd": 0.25,
+ "duration_ms": 29000,
+ "tool_calls": 6
+ }
+ ]
+}
diff --git a/benchmarks/swe-bench-lite/results/codex-o3.json b/benchmarks/swe-bench-lite/results/codex-o3.json
new file mode 100644
index 000000000..fda4a90e9
--- /dev/null
+++ b/benchmarks/swe-bench-lite/results/codex-o3.json
@@ -0,0 +1,53 @@
+{
+ "model": "Codex o3",
+ "provider": "openai",
+ "model_type": "proprietary",
+ "date": "2026-04-04",
+ "agent": "agentv-swe-bench",
+ "agent_version": "1.0.0",
+ "dataset": "swe-bench-lite",
+ "total_instances": 300,
+ "resolved_instances": 231,
+ "resolution_rate": 0.77,
+ "avg_cost_usd": 0.82,
+ "avg_cost_per_fix_usd": 1.06,
+ "avg_duration_ms": 62000,
+ "avg_tool_calls": 11.5,
+ "per_instance": [
+ {
+ "instance_id": "django__django-15180",
+ "resolved": true,
+ "cost_usd": 0.68,
+ "duration_ms": 48000,
+ "tool_calls": 9
+ },
+ {
+ "instance_id": "astropy__astropy-12907",
+ "resolved": true,
+ "cost_usd": 0.59,
+ "duration_ms": 41000,
+ "tool_calls": 8
+ },
+ {
+ "instance_id": "matplotlib__matplotlib-23562",
+ "resolved": true,
+ "cost_usd": 0.91,
+ "duration_ms": 72000,
+ "tool_calls": 13
+ },
+ {
+ "instance_id": "sympy__sympy-20590",
+ "resolved": true,
+ "cost_usd": 1.12,
+ "duration_ms": 95000,
+ "tool_calls": 16
+ },
+ {
+ "instance_id": "scikit-learn__scikit-learn-13779",
+ "resolved": true,
+ "cost_usd": 0.74,
+ "duration_ms": 55000,
+ "tool_calls": 10
+ }
+ ]
+}
diff --git a/benchmarks/swe-bench-lite/results/deepseek-v3.json b/benchmarks/swe-bench-lite/results/deepseek-v3.json
new file mode 100644
index 000000000..be1e88419
--- /dev/null
+++ b/benchmarks/swe-bench-lite/results/deepseek-v3.json
@@ -0,0 +1,53 @@
+{
+ "model": "DeepSeek V3",
+ "provider": "deepseek",
+ "model_type": "open-weights",
+ "date": "2026-04-03",
+ "agent": "agentv-swe-bench",
+ "agent_version": "1.0.0",
+ "dataset": "swe-bench-lite",
+ "total_instances": 300,
+ "resolved_instances": 168,
+ "resolution_rate": 0.56,
+ "avg_cost_usd": 0.12,
+ "avg_cost_per_fix_usd": 0.21,
+ "avg_duration_ms": 52000,
+ "avg_tool_calls": 10.3,
+ "per_instance": [
+ {
+ "instance_id": "django__django-15180",
+ "resolved": true,
+ "cost_usd": 0.09,
+ "duration_ms": 38000,
+ "tool_calls": 8
+ },
+ {
+ "instance_id": "astropy__astropy-12907",
+ "resolved": false,
+ "cost_usd": 0.11,
+ "duration_ms": 45000,
+ "tool_calls": 9
+ },
+ {
+ "instance_id": "matplotlib__matplotlib-23562",
+ "resolved": true,
+ "cost_usd": 0.15,
+ "duration_ms": 58000,
+ "tool_calls": 12
+ },
+ {
+ "instance_id": "sympy__sympy-20590",
+ "resolved": false,
+ "cost_usd": 0.18,
+ "duration_ms": 72000,
+ "tool_calls": 14
+ },
+ {
+ "instance_id": "scikit-learn__scikit-learn-13779",
+ "resolved": true,
+ "cost_usd": 0.1,
+ "duration_ms": 41000,
+ "tool_calls": 9
+ }
+ ]
+}
diff --git a/benchmarks/swe-bench-lite/results/gemini-2.5-pro.json b/benchmarks/swe-bench-lite/results/gemini-2.5-pro.json
new file mode 100644
index 000000000..7e3e07826
--- /dev/null
+++ b/benchmarks/swe-bench-lite/results/gemini-2.5-pro.json
@@ -0,0 +1,53 @@
+{
+ "model": "Gemini 2.5 Pro",
+ "provider": "google",
+ "model_type": "proprietary",
+ "date": "2026-04-05",
+ "agent": "agentv-swe-bench",
+ "agent_version": "1.0.0",
+ "dataset": "swe-bench-lite",
+ "total_instances": 300,
+ "resolved_instances": 213,
+ "resolution_rate": 0.71,
+ "avg_cost_usd": 0.36,
+ "avg_cost_per_fix_usd": 0.51,
+ "avg_duration_ms": 38000,
+ "avg_tool_calls": 6.4,
+ "per_instance": [
+ {
+ "instance_id": "django__django-15180",
+ "resolved": true,
+ "cost_usd": 0.29,
+ "duration_ms": 26000,
+ "tool_calls": 5
+ },
+ {
+ "instance_id": "astropy__astropy-12907",
+ "resolved": true,
+ "cost_usd": 0.25,
+ "duration_ms": 22000,
+ "tool_calls": 4
+ },
+ {
+ "instance_id": "matplotlib__matplotlib-23562",
+ "resolved": true,
+ "cost_usd": 0.42,
+ "duration_ms": 44000,
+ "tool_calls": 7
+ },
+ {
+ "instance_id": "sympy__sympy-20590",
+ "resolved": false,
+ "cost_usd": 0.51,
+ "duration_ms": 55000,
+ "tool_calls": 9
+ },
+ {
+ "instance_id": "scikit-learn__scikit-learn-13779",
+ "resolved": true,
+ "cost_usd": 0.32,
+ "duration_ms": 31000,
+ "tool_calls": 5
+ }
+ ]
+}
diff --git a/benchmarks/swe-bench-lite/results/gpt-5.2.json b/benchmarks/swe-bench-lite/results/gpt-5.2.json
new file mode 100644
index 000000000..2405228e5
--- /dev/null
+++ b/benchmarks/swe-bench-lite/results/gpt-5.2.json
@@ -0,0 +1,53 @@
+{
+ "model": "GPT-5.2",
+ "provider": "openai",
+ "model_type": "proprietary",
+ "date": "2026-04-06",
+ "agent": "agentv-swe-bench",
+ "agent_version": "1.0.0",
+ "dataset": "swe-bench-lite",
+ "total_instances": 300,
+ "resolved_instances": 205,
+ "resolution_rate": 0.683,
+ "avg_cost_usd": 0.45,
+ "avg_cost_per_fix_usd": 0.66,
+ "avg_duration_ms": 42000,
+ "avg_tool_calls": 9.1,
+ "per_instance": [
+ {
+ "instance_id": "django__django-15180",
+ "resolved": true,
+ "cost_usd": 0.38,
+ "duration_ms": 31000,
+ "tool_calls": 7
+ },
+ {
+ "instance_id": "astropy__astropy-12907",
+ "resolved": true,
+ "cost_usd": 0.35,
+ "duration_ms": 27000,
+ "tool_calls": 6
+ },
+ {
+ "instance_id": "matplotlib__matplotlib-23562",
+ "resolved": true,
+ "cost_usd": 0.52,
+ "duration_ms": 48000,
+ "tool_calls": 10
+ },
+ {
+ "instance_id": "sympy__sympy-20590",
+ "resolved": false,
+ "cost_usd": 0.63,
+ "duration_ms": 61000,
+ "tool_calls": 13
+ },
+ {
+ "instance_id": "scikit-learn__scikit-learn-13779",
+ "resolved": true,
+ "cost_usd": 0.41,
+ "duration_ms": 36000,
+ "tool_calls": 8
+ }
+ ]
+}
diff --git a/benchmarks/swe-bench-lite/setup.ts b/benchmarks/swe-bench-lite/setup.ts
new file mode 100644
index 000000000..08d450e71
--- /dev/null
+++ b/benchmarks/swe-bench-lite/setup.ts
@@ -0,0 +1,201 @@
+#!/usr/bin/env bun
+/**
+ * SWE-bench Lite Setup Script
+ *
+ * Downloads the SWE-bench Lite dataset from HuggingFace and generates
+ * EVAL.yaml files for AgentV evaluation.
+ *
+ * Usage:
+ * bun run setup.ts # Generate all 300 EVAL.yaml files
+ * bun run setup.ts --limit 10 # Generate only first 10 (for testing)
+ *
+ * Output: evals/<instance_id>.EVAL.yaml (gitignored)
+ *
+ * Data source: https://huggingface.co/datasets/SWE-bench/SWE-bench_Lite (test split)
+ * Docker images: swebench/sweb.eval.x86_64.<instance_id> (instance ID lower-cased)
+ */
+
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
+import { join } from 'node:path';
+
+/** Base URL of the HuggingFace datasets-server `rows` endpoint for the SWE-bench Lite test split. */
+const DATASET_URL =
+ 'https://datasets-server.huggingface.co/rows?dataset=SWE-bench/SWE-bench_Lite&config=default&split=test';
+/** Directory, next to this script, that holds the cached dataset download. */
+const CACHE_DIR = join(import.meta.dir, '.cache');
+/** Directory, next to this script, that receives the generated EVAL.yaml files. */
+const EVALS_DIR = join(import.meta.dir, 'evals');
+/** Page size sent as the `length` query parameter on every API request. */
+const ROWS_PER_PAGE = 100;
+
+/**
+ * One dataset row as this script consumes it.
+ *
+ * The shape is asserted with a cast when rows are read, not checked at runtime,
+ * so any field may be absent in practice.
+ */
+interface SWEBenchInstance {
+ // Used as the EVAL.yaml file name, the test id, and the Docker image suffix.
+ instance_id: string;
+ repo: string;
+ base_commit: string;
+ patch: string;
+ test_patch: string;
+ // Embedded verbatim in the prompt sent to the agent.
+ problem_statement: string;
+ hints_text: string;
+ created_at: string;
+ version: string;
+ FAIL_TO_PASS: string; // JSON-encoded array
+ PASS_TO_PASS: string; // JSON-encoded array
+ environment_setup_commit: string;
+}
+
+/**
+ * Pattern for values that may be interpolated into YAML outside a block scalar:
+ * one or more word characters, dots, slashes, or hyphens, and nothing else.
+ */
+const SAFE_ID = /^[\w./-]+$/;
+
+/**
+ * Throw unless `value` is a string made up solely of SAFE_ID characters.
+ *
+ * Dataset rows are only cast to the expected shape, so a field can be missing
+ * at runtime. `RegExp.prototype.test` coerces its argument to a string, which
+ * would let `undefined` / `null` pass as the literal text "undefined" / "null";
+ * the explicit type check rejects those.
+ *
+ * @param name - Field name, used only in the error message.
+ * @param value - Field value to validate.
+ * @throws Error when `value` is not a string or contains an unsafe character.
+ */
+function assertSafeField(name: string, value: string): void {
+  if (typeof value !== 'string' || !SAFE_ID.test(value)) {
+    throw new Error(`Unsafe ${name}: ${JSON.stringify(value)}`);
+  }
+}
+
+/**
+ * Build the Docker image reference for a SWE-bench instance.
+ *
+ * The instance ID is lower-cased and appended verbatim to the
+ * `swebench/sweb.eval.x86_64.` prefix; its `__` separator
+ * (e.g. `django__django-15180`) is kept as-is.
+ */
+function instanceToImageTag(instanceId: string): string {
+  const imagePrefix = 'swebench/sweb.eval.x86_64.';
+  const normalizedId = instanceId.toLowerCase();
+  return imagePrefix + normalizedId;
+}
+
+/**
+ * Load every SWE-bench Lite row, from the local cache when it is fresh,
+ * otherwise from the HuggingFace datasets-server API (paginated).
+ *
+ * The full dataset is always downloaded and cached; `limit` only trims the
+ * returned array. A falsy `limit` (undefined, 0, NaN) returns every row.
+ *
+ * @param limit - Maximum number of instances to return.
+ * @returns The dataset rows, in API order.
+ * @throws Error when the API answers with a non-2xx status or a payload that
+ *   has no `rows` array.
+ */
+async function fetchDataset(limit?: number): Promise<SWEBenchInstance[]> {
+  mkdirSync(CACHE_DIR, { recursive: true });
+  const cachePath = join(CACHE_DIR, 'swe-bench-lite.json');
+  const maxCacheAgeMs = 24 * 60 * 60 * 1000;
+
+  // Use cache if available and less than 24h old
+  if (existsSync(cachePath)) {
+    // BunFile.lastModified is a plain number (ms since epoch); no await needed.
+    const age = Date.now() - Bun.file(cachePath).lastModified;
+    if (age < maxCacheAgeMs) {
+      try {
+        const parsed: unknown = JSON.parse(readFileSync(cachePath, 'utf8'));
+        if (!Array.isArray(parsed)) {
+          throw new Error('cache root is not an array');
+        }
+        const cached = parsed as SWEBenchInstance[];
+        console.log('Using cached dataset...');
+        return limit ? cached.slice(0, limit) : cached;
+      } catch (err) {
+        // A truncated or hand-edited cache must not block setup: fall through
+        // to a fresh download, which overwrites the bad file.
+        const reason = err instanceof Error ? err.message : String(err);
+        console.warn(`Ignoring unreadable cache at ${cachePath}: ${reason}`);
+      }
+    }
+  }
+
+  console.log('Downloading SWE-bench Lite dataset from HuggingFace...');
+  const allRows: SWEBenchInstance[] = [];
+  let offset = 0;
+
+  while (true) {
+    const url = `${DATASET_URL}&offset=${offset}&length=${ROWS_PER_PAGE}`;
+    const response = await fetch(url);
+    if (!response.ok) {
+      throw new Error(`HuggingFace API error: ${response.status} ${response.statusText}`);
+    }
+    const data = (await response.json()) as { rows?: Array<{ row: SWEBenchInstance }> } | null;
+    if (!Array.isArray(data?.rows)) {
+      // Fail with a clear message instead of a TypeError on `.map`.
+      throw new Error(`HuggingFace API error: response at offset ${offset} has no "rows" array`);
+    }
+    const rows = data.rows.map((r) => r.row);
+
+    if (rows.length === 0) break;
+    allRows.push(...rows);
+    offset += rows.length;
+
+    process.stdout.write(`\r Downloaded ${allRows.length} instances...`);
+
+    // A short page means the server has no further rows.
+    if (rows.length < ROWS_PER_PAGE) break;
+  }
+  console.log(`\n Total: ${allRows.length} instances`);
+
+  // Cache the dataset
+  writeFileSync(cachePath, JSON.stringify(allRows, null, 2));
+  console.log(` Cached to ${cachePath}`);
+
+  return limit ? allRows.slice(0, limit) : allRows;
+}
+
+/**
+ * Generate the EVAL.yaml text for a single SWE-bench instance.
+ *
+ * instance_id, repo, base_commit and version are checked with assertSafeField
+ * before being interpolated; FAIL_TO_PASS and PASS_TO_PASS are JSON-decoded.
+ * Every FAIL_TO_PASS entry is emitted under `fail_to_pass`, while only the
+ * number of PASS_TO_PASS entries is emitted (`pass_to_pass_count`).
+ *
+ * @param instance - Dataset row to render.
+ * @returns The complete YAML document as a string.
+ * @throws Error from assertSafeField when a checked field is rejected.
+ * @throws SyntaxError when FAIL_TO_PASS or PASS_TO_PASS is not valid JSON.
+ */
+function generateEvalYaml(instance: SWEBenchInstance): string {
+ // Validate fields that are interpolated into YAML outside block scalars
+ assertSafeField('instance_id', instance.instance_id);
+ assertSafeField('repo', instance.repo);
+ assertSafeField('base_commit', instance.base_commit);
+ assertSafeField('version', instance.version);
+
+ // Both fields arrive as JSON-encoded arrays; the string[] casts are unchecked.
+ const failToPass = JSON.parse(instance.FAIL_TO_PASS) as string[];
+ const passToPass = JSON.parse(instance.PASS_TO_PASS) as string[];
+ const imageTag = instanceToImageTag(instance.instance_id);
+
+ // Indent problem statement for YAML block scalar (10 spaces to match content block)
+ const indent = ' ';
+ const problemLines = instance.problem_statement.split('\n').map((line) => `${indent}${line}`);
+ const problemBlock = problemLines.join('\n');
+
+ // NOTE(review): created_at is interpolated into a double-quoted scalar below
+ // without going through assertSafeField — confirm it can never contain quotes
+ // or line breaks.
+ // NOTE(review): fail_to_pass entries escape only `"`; a backslash in a test
+ // name is emitted as-is inside a double-quoted YAML scalar — verify that this
+ // is acceptable or escape backslashes as well.
+ return `# Auto-generated by setup.ts — do not edit manually
+# Source: HuggingFace SWE-bench/SWE-bench_Lite (test split)
+# Instance: ${instance.instance_id}
+# Repo: ${instance.repo} @ ${instance.base_commit.slice(0, 8)}
+
+description: "SWE-bench Lite: ${instance.instance_id}"
+
+workspace:
+ docker:
+ image: "${imageTag}"
+ timeout: 1800
+ memory: "4g"
+ cpus: 2
+
+tests:
+ - id: "${instance.instance_id}"
+ metadata:
+ repo: "${instance.repo}"
+ base_commit: "${instance.base_commit}"
+ version: "${instance.version}"
+ created_at: "${instance.created_at}"
+ input:
+ - role: user
+ content: |
+ You are a software engineer working on the ${instance.repo} repository.
+ Your task is to fix the following issue. The repository is available at /testbed.
+
+ ## Issue
+
+${problemBlock}
+
+ ## Instructions
+
+ 1. Navigate to the repository at /testbed
+ 2. Understand the issue and identify the root cause
+ 3. Implement a fix
+ 4. Output your changes as a unified diff (git diff format)
+
+ Important: Only output the diff, no explanation needed.
+ assertions:
+ - type: code-grader
+ command: ["python", "/grader.py"]
+ instance_id: "${instance.instance_id}"
+ repo: "${instance.repo}"
+ base_commit: "${instance.base_commit}"
+ fail_to_pass:
+${failToPass.map((t) => ` - "${t.replace(/"/g, '\\"')}"`).join('\n')}
+ pass_to_pass_count: ${passToPass.length}
+`;
+}
+
+// --- Main ---
+/**
+ * CLI entry point: parse `--limit`, load the dataset, and write one
+ * `<instance_id>.EVAL.yaml` file per instance into EVALS_DIR.
+ *
+ * @throws Error when `--limit` is given without a positive integer, or when an
+ *   instance ID is not a string or contains a path separator (which would let
+ *   the output file land outside EVALS_DIR).
+ */
+async function main() {
+  const args = process.argv.slice(2);
+  const limitIdx = args.indexOf('--limit');
+  let limit: number | undefined;
+  if (limitIdx !== -1) {
+    const rawLimit = args[limitIdx + 1];
+    limit = Number.parseInt(rawLimit ?? '', 10);
+    // Without this check a missing or non-numeric value becomes NaN, which the
+    // dataset loader treats as "no limit" and silently generates every file.
+    if (!Number.isInteger(limit) || limit <= 0) {
+      throw new Error(`--limit expects a positive integer, got ${JSON.stringify(rawLimit)}`);
+    }
+  }
+
+  console.log('SWE-bench Lite Setup');
+  console.log('====================\n');
+
+  const instances = await fetchDataset(limit);
+
+  mkdirSync(EVALS_DIR, { recursive: true });
+
+  // The field validator permits `/` (needed for repo names), so file names
+  // built from instance_id are checked separately.
+  const pathSeparator = /[\\/]/;
+
+  let generated = 0;
+  for (const instance of instances) {
+    const id = instance.instance_id;
+    if (typeof id !== 'string' || pathSeparator.test(id)) {
+      throw new Error(`Refusing to write outside ${EVALS_DIR}: instance_id ${JSON.stringify(id)}`);
+    }
+    const yaml = generateEvalYaml(instance);
+    const filepath = join(EVALS_DIR, `${id}.EVAL.yaml`);
+    writeFileSync(filepath, yaml);
+    generated++;
+  }
+
+  console.log(`\nGenerated ${generated} EVAL.yaml files in ${EVALS_DIR}/`);
+  console.log('\nNext steps:');
+  console.log(' bun apps/cli/src/cli.ts eval benchmarks/swe-bench-lite/evals/ --target claude');
+}
+
+// Any failure is reported once and turned into a non-zero exit code.
+main().catch((err) => {
+  console.error('Setup failed:', err);
+  process.exit(1);
+});
diff --git a/benchmarks/swe-bench-lite/validate-result.ts b/benchmarks/swe-bench-lite/validate-result.ts
new file mode 100644
index 000000000..0f5e46e1a
--- /dev/null
+++ b/benchmarks/swe-bench-lite/validate-result.ts
@@ -0,0 +1,173 @@
+#!/usr/bin/env bun
+/**
+ * Validate SWE-bench Lite result JSON files against the schema.
+ *
+ * Zero-dependency validator — uses runtime type checks instead of Zod
+ * so it works standalone from the benchmarks/ directory.
+ *
+ * Usage:
+ * bun run validate-result.ts results/claude-opus-4.6.json
+ * bun run validate-result.ts results/*.json
+ *
+ * Used by CI to validate PR submissions.
+ */
+
+import { readFileSync } from 'node:fs';
+
+/** Top-level keys every result file must contain. */
+const REQUIRED_TOP_FIELDS = [
+  'model',
+  'provider',
+  'model_type',
+  'date',
+  'agent',
+  'agent_version',
+  'dataset',
+  'total_instances',
+  'resolved_instances',
+  'resolution_rate',
+  'avg_cost_usd',
+  'avg_cost_per_fix_usd',
+  'avg_duration_ms',
+  'avg_tool_calls',
+  'per_instance',
+] as const;
+
+/** Accepted values for `model_type`. */
+const VALID_MODEL_TYPES = ['proprietary', 'open-source', 'open-weights'];
+
+/** Keys every `per_instance` entry must contain. */
+const REQUIRED_INSTANCE_FIELDS = [
+  'instance_id',
+  'resolved',
+  'cost_usd',
+  'duration_ms',
+  'tool_calls',
+] as const;
+
+/** A single schema violation: where it occurred and what is wrong. */
+interface ValidationError {
+  path: string;
+  message: string;
+}
+
+/**
+ * Validate a parsed result file against the SWE-bench Lite result schema.
+ *
+ * Never throws on malformed input: every problem, including a `per_instance`
+ * entry that is not an object, is reported as a ValidationError.
+ *
+ * @param data - Value produced by `JSON.parse`.
+ * @returns All violations found; an empty array means the file is valid. When
+ *   the root is not an object, or top-level fields are missing, only those
+ *   errors are returned and no further checks run.
+ */
+function validateResult(data: unknown): ValidationError[] {
+  const errors: ValidationError[] = [];
+
+  if (typeof data !== 'object' || data === null || Array.isArray(data)) {
+    return [{ path: '', message: 'Root must be a JSON object' }];
+  }
+
+  const obj = data as Record<string, unknown>;
+
+  // Check required fields exist
+  for (const field of REQUIRED_TOP_FIELDS) {
+    if (!(field in obj)) {
+      errors.push({ path: field, message: 'Required field missing' });
+    }
+  }
+  if (errors.length > 0) return errors;
+
+  // Type checks with length limits
+  if (typeof obj.model !== 'string' || obj.model.length > 100)
+    errors.push({ path: 'model', message: 'Must be a string (max 100 chars)' });
+  if (typeof obj.provider !== 'string' || !/^[a-z0-9-]+$/.test(obj.provider))
+    errors.push({ path: 'provider', message: 'Must be lowercase alphanumeric with hyphens' });
+  if (!VALID_MODEL_TYPES.includes(obj.model_type as string))
+    errors.push({ path: 'model_type', message: `Must be one of: ${VALID_MODEL_TYPES.join(', ')}` });
+  if (typeof obj.date !== 'string' || !/^\d{4}-\d{2}-\d{2}$/.test(obj.date))
+    errors.push({ path: 'date', message: 'Must be YYYY-MM-DD format' });
+  if (typeof obj.agent !== 'string' || obj.agent.length > 100)
+    errors.push({ path: 'agent', message: 'Must be a string (max 100 chars)' });
+  if (typeof obj.agent_version !== 'string' || obj.agent_version.length > 50)
+    errors.push({ path: 'agent_version', message: 'Must be a string (max 50 chars)' });
+  if (obj.dataset !== 'swe-bench-lite')
+    errors.push({ path: 'dataset', message: 'Must be "swe-bench-lite"' });
+
+  const numFields = [
+    'total_instances',
+    'resolved_instances',
+    'resolution_rate',
+    'avg_cost_usd',
+    'avg_cost_per_fix_usd',
+    'avg_duration_ms',
+    'avg_tool_calls',
+  ];
+  for (const f of numFields) {
+    const value = obj[f];
+    // isFinite also rejects Infinity, which JSON.parse yields for e.g. 1e999.
+    if (typeof value !== 'number' || !Number.isFinite(value)) {
+      errors.push({ path: f, message: 'Must be a number' });
+    } else if (value < 0 && f !== 'resolution_rate') {
+      // resolution_rate has its own range check (and message) below.
+      errors.push({ path: f, message: 'Must not be negative' });
+    }
+  }
+
+  // Instance counts are whole numbers.
+  for (const f of ['total_instances', 'resolved_instances']) {
+    const value = obj[f];
+    if (typeof value === 'number' && Number.isFinite(value) && !Number.isInteger(value))
+      errors.push({ path: f, message: 'Must be an integer' });
+  }
+
+  const total = obj.total_instances;
+  const resolved = obj.resolved_instances;
+  if (typeof total === 'number' && typeof resolved === 'number' && resolved > total)
+    errors.push({ path: 'resolved_instances', message: 'Must not exceed total_instances' });
+
+  const rate = obj.resolution_rate;
+  if (typeof rate === 'number' && (rate < 0 || rate > 1))
+    errors.push({ path: 'resolution_rate', message: 'Must be between 0 and 1' });
+
+  // Validate per_instance array
+  const perInstance = obj.per_instance;
+  if (!Array.isArray(perInstance)) {
+    errors.push({ path: 'per_instance', message: 'Must be an array' });
+  } else {
+    perInstance.forEach((entry: unknown, i: number) => {
+      const base = `per_instance[${i}]`;
+      // `in` throws on null and primitives, so reject non-objects up front.
+      if (typeof entry !== 'object' || entry === null || Array.isArray(entry)) {
+        errors.push({ path: base, message: 'Must be an object' });
+        return;
+      }
+      const inst = entry as Record<string, unknown>;
+      for (const field of REQUIRED_INSTANCE_FIELDS) {
+        if (!(field in inst)) {
+          errors.push({ path: `${base}.${field}`, message: 'Required field missing' });
+        }
+      }
+      if (typeof inst.instance_id !== 'string')
+        errors.push({ path: `${base}.instance_id`, message: 'Must be a string' });
+      if (typeof inst.resolved !== 'boolean')
+        errors.push({ path: `${base}.resolved`, message: 'Must be a boolean' });
+      // Metrics are type-checked only when present; absence is reported above.
+      for (const field of ['cost_usd', 'duration_ms', 'tool_calls']) {
+        const value = inst[field];
+        if (field in inst && (typeof value !== 'number' || !Number.isFinite(value) || value < 0))
+          errors.push({ path: `${base}.${field}`, message: 'Must be a non-negative number' });
+      }
+    });
+  }
+
+  return errors;
+}
+
+// CLI entry point: validate every file named on the command line, print one
+// status line per file, and exit non-zero if any file failed.
+const files = process.argv.slice(2);
+if (files.length === 0) {
+  console.error('Usage: bun run validate-result.ts <result.json> [<result.json>...]');
+  process.exit(1);
+}
+
+let hasErrors = false;
+
+for (const file of files) {
+  try {
+    const content = readFileSync(file, 'utf8');
+    const data = JSON.parse(content) as Record<string, unknown>;
+    const errors = validateResult(data);
+
+    if (errors.length > 0) {
+      console.error(`❌ ${file}:`);
+      for (const err of errors) {
+        console.error(` ${err.path}: ${err.message}`);
+      }
+      hasErrors = true;
+    } else {
+      // Cross-validate computed fields (types were established by validateResult)
+      const totalInstances = data.total_instances as number;
+      const resolvedInstances = data.resolved_instances as number;
+      const resolutionRate = data.resolution_rate as number;
+      const perInstance = data.per_instance as unknown[];
+
+      // One decimal place avoids float noise such as 68.30000000000001.
+      const percent = (resolutionRate * 100).toFixed(1);
+
+      const expectedRate = totalInstances > 0 ? resolvedInstances / totalInstances : 0;
+      if (Math.abs(resolutionRate - expectedRate) > 0.01) {
+        console.error(
+          `❌ ${file}: resolution_rate ${resolutionRate} doesn't match resolved/total (${expectedRate.toFixed(3)})`,
+        );
+        hasErrors = true;
+      } else if (perInstance.length !== totalInstances) {
+        // Fewer per-instance rows than total_instances is allowed but flagged.
+        console.warn(
+          `⚠️ ${file}: per_instance has ${perInstance.length} entries but total_instances is ${totalInstances} (partial results)`,
+        );
+        console.log(`✅ ${file} — ${data.model} (${percent}% resolved, partial)`);
+      } else {
+        console.log(`✅ ${file} — ${data.model} (${percent}% resolved)`);
+      }
+    }
+  } catch (err) {
+    // Unreadable file or invalid JSON: report it and keep checking the rest.
+    console.error(`❌ ${file}: ${err instanceof Error ? err.message : String(err)}`);
+    hasErrors = true;
+  }
+}
+
+process.exit(hasErrors ? 1 : 0);
diff --git a/packages/core/src/evaluation/workspace/docker-workspace.ts b/packages/core/src/evaluation/workspace/docker-workspace.ts
index 1ce4f25e3..19c74692f 100644
--- a/packages/core/src/evaluation/workspace/docker-workspace.ts
+++ b/packages/core/src/evaluation/workspace/docker-workspace.ts
@@ -105,6 +105,14 @@ export class DockerWorkspaceProvider {
/** Pull the configured Docker image. No-op if already cached locally. */
async pullImage(): Promise {
+ // Skip pull if image already exists locally (e.g. locally-built images)
+ const inspectResult = await this.executor.exec(['docker', 'image', 'inspect', this.config.image], {
+ timeoutMs: 10_000,
+ });
+ if (inspectResult.exitCode === 0) {
+ return; // Image exists locally, no pull needed
+ }
+
const result = await this.executor.exec(['docker', 'pull', this.config.image], {
timeoutMs: this.timeoutMs,
});
diff --git a/packages/core/test/evaluation/workspace/docker-workspace.test.ts b/packages/core/test/evaluation/workspace/docker-workspace.test.ts
index 9452e0513..08bff49d2 100644
--- a/packages/core/test/evaluation/workspace/docker-workspace.test.ts
+++ b/packages/core/test/evaluation/workspace/docker-workspace.test.ts
@@ -84,24 +84,43 @@ describe('DockerWorkspaceProvider', () => {
});
describe('pullImage', () => {
- it('calls docker pull with the configured image', async () => {
+ it('skips pull when image exists locally', async () => {
+ // docker image inspect succeeds → image exists locally
+ executor.pushResponse({ exitCode: 0 });
+ const provider = new DockerWorkspaceProvider({ image: 'myimage:v1' }, executor);
+ await provider.pullImage();
+ expect(executor.callArgv(0)).toEqual(['docker', 'image', 'inspect', 'myimage:v1']);
+ expect(executor.calls.length).toBe(1); // no pull call
+ });
+
+ it('calls docker pull when image not found locally', async () => {
+ // docker image inspect fails → pull needed
+ executor.pushResponse({ exitCode: 1, stderr: 'No such image' });
executor.pushResponse({ stdout: 'Pull complete\n', exitCode: 0 });
const provider = new DockerWorkspaceProvider({ image: 'myimage:v1' }, executor);
await provider.pullImage();
- expect(executor.callArgv(0)).toEqual(['docker', 'pull', 'myimage:v1']);
+ expect(executor.callArgv(0)).toEqual(['docker', 'image', 'inspect', 'myimage:v1']);
+ expect(executor.callArgv(1)).toEqual(['docker', 'pull', 'myimage:v1']);
});
it('throws on pull failure', async () => {
+ // inspect fails, pull also fails
+ executor.pushResponse({ exitCode: 1, stderr: 'No such image' });
executor.pushResponse({ exitCode: 1, stderr: 'manifest not found' });
const provider = new DockerWorkspaceProvider({ image: 'bad:image' }, executor);
await expect(provider.pullImage()).rejects.toThrow('docker pull failed');
});
- it('uses configured timeout', async () => {
+ it('uses configured timeout for pull', async () => {
+ // inspect fails, then pull happens with configured timeout
+ executor.pushResponse({ exitCode: 1, stderr: 'No such image' });
executor.pushResponse({ exitCode: 0 });
const provider = new DockerWorkspaceProvider({ image: 'img:1', timeout: 60 }, executor);
await provider.pullImage();
- expect(executor.callOptions(0)?.timeoutMs).toBe(60_000);
+ // First call (inspect) uses 10s timeout
+ expect(executor.callOptions(0)?.timeoutMs).toBe(10_000);
+ // Second call (pull) uses configured timeout
+ expect(executor.callOptions(1)?.timeoutMs).toBe(60_000);
});
});
@@ -351,18 +370,24 @@ describe('DockerWorkspaceProvider', () => {
});
describe('timeout configuration', () => {
- it('defaults to 1800s (30 min) timeout', async () => {
+ it('defaults to 1800s (30 min) timeout for pull', async () => {
+ // inspect fails → pull with default timeout
+ executor.pushResponse({ exitCode: 1, stderr: 'No such image' });
executor.pushResponse({ exitCode: 0 });
const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor);
await provider.pullImage();
- expect(executor.callOptions(0)?.timeoutMs).toBe(1_800_000);
+ // Pull call (second) uses default timeout
+ expect(executor.callOptions(1)?.timeoutMs).toBe(1_800_000);
});
it('uses custom timeout from config', async () => {
+ // inspect fails → pull with custom timeout
+ executor.pushResponse({ exitCode: 1, stderr: 'No such image' });
executor.pushResponse({ exitCode: 0 });
const provider = new DockerWorkspaceProvider({ image: 'img:1', timeout: 300 }, executor);
await provider.pullImage();
- expect(executor.callOptions(0)?.timeoutMs).toBe(300_000);
+ // Pull call (second) uses custom timeout
+ expect(executor.callOptions(1)?.timeoutMs).toBe(300_000);
});
});
});
diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json
index 2792f120f..a7f142c04 100644
--- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json
+++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json
@@ -53,12 +53,7 @@
"properties": {
"role": {
"type": "string",
- "enum": [
- "system",
- "user",
- "assistant",
- "tool"
- ]
+ "enum": ["system", "user", "assistant", "tool"]
},
"content": {
"anyOf": [
@@ -72,30 +67,20 @@
"properties": {
"type": {
"type": "string",
- "enum": [
- "text",
- "file",
- "image"
- ]
+ "enum": ["text", "file", "image"]
},
"value": {
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
}
}
]
}
},
- "required": [
- "role",
- "content"
- ],
+ "required": ["role", "content"],
"additionalProperties": false
}
}
@@ -133,12 +118,7 @@
"properties": {
"role": {
"type": "string",
- "enum": [
- "system",
- "user",
- "assistant",
- "tool"
- ]
+ "enum": ["system", "user", "assistant", "tool"]
},
"content": {
"anyOf": [
@@ -152,30 +132,20 @@
"properties": {
"type": {
"type": "string",
- "enum": [
- "text",
- "file",
- "image"
- ]
+ "enum": ["text", "file", "image"]
},
"value": {
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
}
}
]
}
},
- "required": [
- "role",
- "content"
- ],
+ "required": ["role", "content"],
"additionalProperties": false
}
}
@@ -203,12 +173,7 @@
"properties": {
"role": {
"type": "string",
- "enum": [
- "system",
- "user",
- "assistant",
- "tool"
- ]
+ "enum": ["system", "user", "assistant", "tool"]
},
"content": {
"anyOf": [
@@ -222,30 +187,20 @@
"properties": {
"type": {
"type": "string",
- "enum": [
- "text",
- "file",
- "image"
- ]
+ "enum": ["text", "file", "image"]
},
"value": {
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
}
}
]
}
},
- "required": [
- "role",
- "content"
- ],
+ "required": ["role", "content"],
"additionalProperties": false
}
}
@@ -289,10 +244,7 @@
},
"type": {
"type": "string",
- "enum": [
- "code-grader",
- "code_grader"
- ]
+ "enum": ["code-grader", "code_grader"]
},
"command": {
"anyOf": [
@@ -366,18 +318,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
},
{
@@ -414,10 +360,7 @@
},
"type": {
"type": "string",
- "enum": [
- "llm-grader",
- "llm_grader"
- ]
+ "enum": ["llm-grader", "llm_grader"]
},
"prompt": {
"anyOf": [
@@ -512,10 +455,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -566,17 +506,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -639,9 +574,7 @@
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -657,10 +590,7 @@
"maximum": 1
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -677,10 +607,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
},
{
@@ -697,18 +624,13 @@
"type": "string"
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
}
]
}
},
- "required": [
- "type",
- "aggregator"
- ],
+ "required": ["type", "aggregator"],
"additionalProperties": false
},
{
@@ -745,20 +667,11 @@
},
"type": {
"type": "string",
- "enum": [
- "tool-trajectory",
- "tool_trajectory"
- ]
+ "enum": ["tool-trajectory", "tool_trajectory"]
},
"mode": {
"type": "string",
- "enum": [
- "any_order",
- "in_order",
- "exact",
- "subset",
- "superset"
- ]
+ "enum": ["any_order", "in_order", "exact", "subset", "superset"]
},
"minimums": {
"type": "object",
@@ -799,12 +712,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -818,12 +726,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -834,9 +737,7 @@
]
}
},
- "required": [
- "tool"
- ],
+ "required": ["tool"],
"additionalProperties": false
}
},
@@ -844,12 +745,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -863,12 +759,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -879,10 +770,7 @@
]
}
},
- "required": [
- "type",
- "mode"
- ],
+ "required": ["type", "mode"],
"additionalProperties": false
},
{
@@ -919,10 +807,7 @@
},
"type": {
"type": "string",
- "enum": [
- "field-accuracy",
- "field_accuracy"
- ]
+ "enum": ["field-accuracy", "field_accuracy"]
},
"fields": {
"type": "array",
@@ -934,11 +819,7 @@
},
"match": {
"type": "string",
- "enum": [
- "exact",
- "numeric_tolerance",
- "date"
- ]
+ "enum": ["exact", "numeric_tolerance", "date"]
},
"required": {
"type": "boolean"
@@ -960,26 +841,17 @@
}
}
},
- "required": [
- "path",
- "match"
- ],
+ "required": ["path", "match"],
"additionalProperties": false
},
"minItems": 1
},
"aggregation": {
"type": "string",
- "enum": [
- "weighted_average",
- "all_or_nothing"
- ]
+ "enum": ["weighted_average", "all_or_nothing"]
}
},
- "required": [
- "type",
- "fields"
- ],
+ "required": ["type", "fields"],
"additionalProperties": false
},
{
@@ -1023,10 +895,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -1070,10 +939,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "budget"
- ],
+ "required": ["type", "budget"],
"additionalProperties": false
},
{
@@ -1110,10 +976,7 @@
},
"type": {
"type": "string",
- "enum": [
- "token-usage",
- "token_usage"
- ]
+ "enum": ["token-usage", "token_usage"]
},
"max_total": {
"type": "number",
@@ -1128,9 +991,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -1167,10 +1028,7 @@
},
"type": {
"type": "string",
- "enum": [
- "execution-metrics",
- "execution_metrics"
- ]
+ "enum": ["execution-metrics", "execution_metrics"]
},
"max_tool_calls": {
"type": "number",
@@ -1202,9 +1060,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -1247,10 +1103,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -1293,10 +1146,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -1333,15 +1183,10 @@
},
"type": {
"type": "string",
- "enum": [
- "is-json",
- "is_json"
- ]
+ "enum": ["is-json", "is_json"]
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -1384,10 +1229,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -1476,10 +1318,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -1489,10 +1328,7 @@
"minItems": 1
}
},
- "required": [
- "type",
- "criteria"
- ],
+ "required": ["type", "criteria"],
"additionalProperties": false
}
]
@@ -1536,10 +1372,7 @@
},
"type": {
"type": "string",
- "enum": [
- "code-grader",
- "code_grader"
- ]
+ "enum": ["code-grader", "code_grader"]
},
"command": {
"anyOf": [
@@ -1613,18 +1446,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
},
{
@@ -1661,10 +1488,7 @@
},
"type": {
"type": "string",
- "enum": [
- "llm-grader",
- "llm_grader"
- ]
+ "enum": ["llm-grader", "llm_grader"]
},
"prompt": {
"anyOf": [
@@ -1759,10 +1583,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -1813,17 +1634,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -1886,9 +1702,7 @@
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -1904,10 +1718,7 @@
"maximum": 1
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -1924,10 +1735,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
},
{
@@ -1944,18 +1752,13 @@
"type": "string"
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
}
]
}
},
- "required": [
- "type",
- "aggregator"
- ],
+ "required": ["type", "aggregator"],
"additionalProperties": false
},
{
@@ -1992,20 +1795,11 @@
},
"type": {
"type": "string",
- "enum": [
- "tool-trajectory",
- "tool_trajectory"
- ]
+ "enum": ["tool-trajectory", "tool_trajectory"]
},
"mode": {
"type": "string",
- "enum": [
- "any_order",
- "in_order",
- "exact",
- "subset",
- "superset"
- ]
+ "enum": ["any_order", "in_order", "exact", "subset", "superset"]
},
"minimums": {
"type": "object",
@@ -2046,12 +1840,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -2065,12 +1854,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -2081,9 +1865,7 @@
]
}
},
- "required": [
- "tool"
- ],
+ "required": ["tool"],
"additionalProperties": false
}
},
@@ -2091,12 +1873,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -2110,12 +1887,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -2126,10 +1898,7 @@
]
}
},
- "required": [
- "type",
- "mode"
- ],
+ "required": ["type", "mode"],
"additionalProperties": false
},
{
@@ -2166,10 +1935,7 @@
},
"type": {
"type": "string",
- "enum": [
- "field-accuracy",
- "field_accuracy"
- ]
+ "enum": ["field-accuracy", "field_accuracy"]
},
"fields": {
"type": "array",
@@ -2181,11 +1947,7 @@
},
"match": {
"type": "string",
- "enum": [
- "exact",
- "numeric_tolerance",
- "date"
- ]
+ "enum": ["exact", "numeric_tolerance", "date"]
},
"required": {
"type": "boolean"
@@ -2207,26 +1969,17 @@
}
}
},
- "required": [
- "path",
- "match"
- ],
+ "required": ["path", "match"],
"additionalProperties": false
},
"minItems": 1
},
"aggregation": {
"type": "string",
- "enum": [
- "weighted_average",
- "all_or_nothing"
- ]
+ "enum": ["weighted_average", "all_or_nothing"]
}
},
- "required": [
- "type",
- "fields"
- ],
+ "required": ["type", "fields"],
"additionalProperties": false
},
{
@@ -2270,10 +2023,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -2317,10 +2067,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "budget"
- ],
+ "required": ["type", "budget"],
"additionalProperties": false
},
{
@@ -2357,10 +2104,7 @@
},
"type": {
"type": "string",
- "enum": [
- "token-usage",
- "token_usage"
- ]
+ "enum": ["token-usage", "token_usage"]
},
"max_total": {
"type": "number",
@@ -2375,9 +2119,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -2414,10 +2156,7 @@
},
"type": {
"type": "string",
- "enum": [
- "execution-metrics",
- "execution_metrics"
- ]
+ "enum": ["execution-metrics", "execution_metrics"]
},
"max_tool_calls": {
"type": "number",
@@ -2449,9 +2188,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -2494,10 +2231,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -2540,10 +2274,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -2580,15 +2311,10 @@
},
"type": {
"type": "string",
- "enum": [
- "is-json",
- "is_json"
- ]
+ "enum": ["is-json", "is_json"]
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -2631,10 +2357,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -2723,10 +2446,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -2736,10 +2456,7 @@
"minItems": 1
}
},
- "required": [
- "type",
- "criteria"
- ],
+ "required": ["type", "criteria"],
"additionalProperties": false
}
]
@@ -2800,10 +2517,7 @@
},
"type": {
"type": "string",
- "enum": [
- "code-grader",
- "code_grader"
- ]
+ "enum": ["code-grader", "code_grader"]
},
"command": {
"anyOf": [
@@ -2877,18 +2591,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
},
{
@@ -2925,10 +2633,7 @@
},
"type": {
"type": "string",
- "enum": [
- "llm-grader",
- "llm_grader"
- ]
+ "enum": ["llm-grader", "llm_grader"]
},
"prompt": {
"anyOf": [
@@ -3023,10 +2728,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -3077,17 +2779,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -3150,9 +2847,7 @@
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -3168,10 +2863,7 @@
"maximum": 1
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -3188,10 +2880,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
},
{
@@ -3208,18 +2897,13 @@
"type": "string"
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
}
]
}
},
- "required": [
- "type",
- "aggregator"
- ],
+ "required": ["type", "aggregator"],
"additionalProperties": false
},
{
@@ -3256,20 +2940,11 @@
},
"type": {
"type": "string",
- "enum": [
- "tool-trajectory",
- "tool_trajectory"
- ]
+ "enum": ["tool-trajectory", "tool_trajectory"]
},
"mode": {
"type": "string",
- "enum": [
- "any_order",
- "in_order",
- "exact",
- "subset",
- "superset"
- ]
+ "enum": ["any_order", "in_order", "exact", "subset", "superset"]
},
"minimums": {
"type": "object",
@@ -3310,12 +2985,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -3329,12 +2999,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -3345,9 +3010,7 @@
]
}
},
- "required": [
- "tool"
- ],
+ "required": ["tool"],
"additionalProperties": false
}
},
@@ -3355,12 +3018,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -3374,12 +3032,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -3390,10 +3043,7 @@
]
}
},
- "required": [
- "type",
- "mode"
- ],
+ "required": ["type", "mode"],
"additionalProperties": false
},
{
@@ -3430,10 +3080,7 @@
},
"type": {
"type": "string",
- "enum": [
- "field-accuracy",
- "field_accuracy"
- ]
+ "enum": ["field-accuracy", "field_accuracy"]
},
"fields": {
"type": "array",
@@ -3445,11 +3092,7 @@
},
"match": {
"type": "string",
- "enum": [
- "exact",
- "numeric_tolerance",
- "date"
- ]
+ "enum": ["exact", "numeric_tolerance", "date"]
},
"required": {
"type": "boolean"
@@ -3471,26 +3114,17 @@
}
}
},
- "required": [
- "path",
- "match"
- ],
+ "required": ["path", "match"],
"additionalProperties": false
},
"minItems": 1
},
"aggregation": {
"type": "string",
- "enum": [
- "weighted_average",
- "all_or_nothing"
- ]
+ "enum": ["weighted_average", "all_or_nothing"]
}
},
- "required": [
- "type",
- "fields"
- ],
+ "required": ["type", "fields"],
"additionalProperties": false
},
{
@@ -3534,10 +3168,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -3581,10 +3212,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "budget"
- ],
+ "required": ["type", "budget"],
"additionalProperties": false
},
{
@@ -3621,10 +3249,7 @@
},
"type": {
"type": "string",
- "enum": [
- "token-usage",
- "token_usage"
- ]
+ "enum": ["token-usage", "token_usage"]
},
"max_total": {
"type": "number",
@@ -3639,9 +3264,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -3678,10 +3301,7 @@
},
"type": {
"type": "string",
- "enum": [
- "execution-metrics",
- "execution_metrics"
- ]
+ "enum": ["execution-metrics", "execution_metrics"]
},
"max_tool_calls": {
"type": "number",
@@ -3713,9 +3333,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -3758,10 +3376,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -3804,10 +3419,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -3844,15 +3456,10 @@
},
"type": {
"type": "string",
- "enum": [
- "is-json",
- "is_json"
- ]
+ "enum": ["is-json", "is_json"]
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -3895,10 +3502,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -3987,10 +3591,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -4000,10 +3601,7 @@
"minItems": 1
}
},
- "required": [
- "type",
- "criteria"
- ],
+ "required": ["type", "criteria"],
"additionalProperties": false
}
]
@@ -4047,10 +3645,7 @@
},
"type": {
"type": "string",
- "enum": [
- "code-grader",
- "code_grader"
- ]
+ "enum": ["code-grader", "code_grader"]
},
"command": {
"anyOf": [
@@ -4124,18 +3719,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
},
{
@@ -4172,10 +3761,7 @@
},
"type": {
"type": "string",
- "enum": [
- "llm-grader",
- "llm_grader"
- ]
+ "enum": ["llm-grader", "llm_grader"]
},
"prompt": {
"anyOf": [
@@ -4270,10 +3856,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -4324,17 +3907,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -4397,9 +3975,7 @@
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -4415,10 +3991,7 @@
"maximum": 1
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -4435,10 +4008,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
},
{
@@ -4455,18 +4025,13 @@
"type": "string"
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
}
]
}
},
- "required": [
- "type",
- "aggregator"
- ],
+ "required": ["type", "aggregator"],
"additionalProperties": false
},
{
@@ -4503,20 +4068,11 @@
},
"type": {
"type": "string",
- "enum": [
- "tool-trajectory",
- "tool_trajectory"
- ]
+ "enum": ["tool-trajectory", "tool_trajectory"]
},
"mode": {
"type": "string",
- "enum": [
- "any_order",
- "in_order",
- "exact",
- "subset",
- "superset"
- ]
+ "enum": ["any_order", "in_order", "exact", "subset", "superset"]
},
"minimums": {
"type": "object",
@@ -4557,12 +4113,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -4576,12 +4127,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -4592,9 +4138,7 @@
]
}
},
- "required": [
- "tool"
- ],
+ "required": ["tool"],
"additionalProperties": false
}
},
@@ -4602,12 +4146,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -4621,12 +4160,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -4637,10 +4171,7 @@
]
}
},
- "required": [
- "type",
- "mode"
- ],
+ "required": ["type", "mode"],
"additionalProperties": false
},
{
@@ -4677,10 +4208,7 @@
},
"type": {
"type": "string",
- "enum": [
- "field-accuracy",
- "field_accuracy"
- ]
+ "enum": ["field-accuracy", "field_accuracy"]
},
"fields": {
"type": "array",
@@ -4692,11 +4220,7 @@
},
"match": {
"type": "string",
- "enum": [
- "exact",
- "numeric_tolerance",
- "date"
- ]
+ "enum": ["exact", "numeric_tolerance", "date"]
},
"required": {
"type": "boolean"
@@ -4718,26 +4242,17 @@
}
}
},
- "required": [
- "path",
- "match"
- ],
+ "required": ["path", "match"],
"additionalProperties": false
},
"minItems": 1
},
"aggregation": {
"type": "string",
- "enum": [
- "weighted_average",
- "all_or_nothing"
- ]
+ "enum": ["weighted_average", "all_or_nothing"]
}
},
- "required": [
- "type",
- "fields"
- ],
+ "required": ["type", "fields"],
"additionalProperties": false
},
{
@@ -4781,10 +4296,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -4828,10 +4340,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "budget"
- ],
+ "required": ["type", "budget"],
"additionalProperties": false
},
{
@@ -4868,10 +4377,7 @@
},
"type": {
"type": "string",
- "enum": [
- "token-usage",
- "token_usage"
- ]
+ "enum": ["token-usage", "token_usage"]
},
"max_total": {
"type": "number",
@@ -4886,9 +4392,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -4925,10 +4429,7 @@
},
"type": {
"type": "string",
- "enum": [
- "execution-metrics",
- "execution_metrics"
- ]
+ "enum": ["execution-metrics", "execution_metrics"]
},
"max_tool_calls": {
"type": "number",
@@ -4960,9 +4461,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -5005,10 +4504,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -5051,10 +4547,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -5091,15 +4584,10 @@
},
"type": {
"type": "string",
- "enum": [
- "is-json",
- "is_json"
- ]
+ "enum": ["is-json", "is_json"]
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -5142,10 +4630,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -5234,10 +4719,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -5247,10 +4729,7 @@
"minItems": 1
}
},
- "required": [
- "type",
- "criteria"
- ],
+ "required": ["type", "criteria"],
"additionalProperties": false
}
]
@@ -5271,11 +4750,7 @@
},
"strategy": {
"type": "string",
- "enum": [
- "pass_at_k",
- "mean",
- "confidence_interval"
- ]
+ "enum": ["pass_at_k", "mean", "confidence_interval"]
},
"cost_limit_usd": {
"type": "number",
@@ -5286,9 +4761,7 @@
"minimum": 0
}
},
- "required": [
- "count"
- ],
+ "required": ["count"],
"additionalProperties": false
},
"total_budget_usd": {
@@ -5321,10 +4794,7 @@
},
"isolation": {
"type": "string",
- "enum": [
- "shared",
- "per_test"
- ]
+ "enum": ["shared", "per_test"]
},
"repos": {
"type": "array",
@@ -5348,10 +4818,7 @@
"format": "uri"
}
},
- "required": [
- "type",
- "url"
- ],
+ "required": ["type", "url"],
"additionalProperties": false
},
{
@@ -5365,10 +4832,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
}
]
@@ -5381,10 +4845,7 @@
},
"resolve": {
"type": "string",
- "enum": [
- "remote",
- "local"
- ]
+ "enum": ["remote", "local"]
},
"ancestor": {
"type": "integer",
@@ -5413,10 +4874,7 @@
"additionalProperties": false
}
},
- "required": [
- "path",
- "source"
- ],
+ "required": ["path", "source"],
"additionalProperties": false
}
},
@@ -5452,11 +4910,7 @@
},
"reset": {
"type": "string",
- "enum": [
- "none",
- "fast",
- "strict"
- ]
+ "enum": ["none", "fast", "strict"]
}
},
"additionalProperties": false
@@ -5487,11 +4941,7 @@
},
"reset": {
"type": "string",
- "enum": [
- "none",
- "fast",
- "strict"
- ]
+ "enum": ["none", "fast", "strict"]
}
},
"additionalProperties": false
@@ -5522,11 +4972,7 @@
},
"reset": {
"type": "string",
- "enum": [
- "none",
- "fast",
- "strict"
- ]
+ "enum": ["none", "fast", "strict"]
}
},
"additionalProperties": false
@@ -5557,11 +5003,7 @@
},
"reset": {
"type": "string",
- "enum": [
- "none",
- "fast",
- "strict"
- ]
+ "enum": ["none", "fast", "strict"]
}
},
"additionalProperties": false
@@ -5571,11 +5013,7 @@
},
"mode": {
"type": "string",
- "enum": [
- "pooled",
- "temp",
- "static"
- ]
+ "enum": ["pooled", "temp", "static"]
},
"path": {
"type": "string"
@@ -5598,9 +5036,7 @@
"minimum": 0.1
}
},
- "required": [
- "image"
- ],
+ "required": ["image"],
"additionalProperties": false
}
},
@@ -5620,9 +5056,7 @@
"type": "string"
}
},
- "required": [
- "id"
- ],
+ "required": ["id"],
"additionalProperties": false
}
},
@@ -5657,12 +5091,7 @@
"properties": {
"role": {
"type": "string",
- "enum": [
- "system",
- "user",
- "assistant",
- "tool"
- ]
+ "enum": ["system", "user", "assistant", "tool"]
},
"content": {
"anyOf": [
@@ -5676,30 +5105,20 @@
"properties": {
"type": {
"type": "string",
- "enum": [
- "text",
- "file",
- "image"
- ]
+ "enum": ["text", "file", "image"]
},
"value": {
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
}
}
]
}
},
- "required": [
- "role",
- "content"
- ],
+ "required": ["role", "content"],
"additionalProperties": false
}
}
@@ -5727,12 +5146,7 @@
"properties": {
"role": {
"type": "string",
- "enum": [
- "system",
- "user",
- "assistant",
- "tool"
- ]
+ "enum": ["system", "user", "assistant", "tool"]
},
"content": {
"anyOf": [
@@ -5746,30 +5160,20 @@
"properties": {
"type": {
"type": "string",
- "enum": [
- "text",
- "file",
- "image"
- ]
+ "enum": ["text", "file", "image"]
},
"value": {
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
}
}
]
}
},
- "required": [
- "role",
- "content"
- ],
+ "required": ["role", "content"],
"additionalProperties": false
}
}
@@ -5813,10 +5217,7 @@
},
"type": {
"type": "string",
- "enum": [
- "code-grader",
- "code_grader"
- ]
+ "enum": ["code-grader", "code_grader"]
},
"command": {
"anyOf": [
@@ -5890,18 +5291,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
},
{
@@ -5938,10 +5333,7 @@
},
"type": {
"type": "string",
- "enum": [
- "llm-grader",
- "llm_grader"
- ]
+ "enum": ["llm-grader", "llm_grader"]
},
"prompt": {
"anyOf": [
@@ -6036,10 +5428,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -6090,17 +5479,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -6163,9 +5547,7 @@
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -6181,10 +5563,7 @@
"maximum": 1
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -6201,10 +5580,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
},
{
@@ -6221,18 +5597,13 @@
"type": "string"
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
}
]
}
},
- "required": [
- "type",
- "aggregator"
- ],
+ "required": ["type", "aggregator"],
"additionalProperties": false
},
{
@@ -6269,20 +5640,11 @@
},
"type": {
"type": "string",
- "enum": [
- "tool-trajectory",
- "tool_trajectory"
- ]
+ "enum": ["tool-trajectory", "tool_trajectory"]
},
"mode": {
"type": "string",
- "enum": [
- "any_order",
- "in_order",
- "exact",
- "subset",
- "superset"
- ]
+ "enum": ["any_order", "in_order", "exact", "subset", "superset"]
},
"minimums": {
"type": "object",
@@ -6323,12 +5685,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -6342,12 +5699,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -6358,9 +5710,7 @@
]
}
},
- "required": [
- "tool"
- ],
+ "required": ["tool"],
"additionalProperties": false
}
},
@@ -6368,12 +5718,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -6387,12 +5732,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -6403,10 +5743,7 @@
]
}
},
- "required": [
- "type",
- "mode"
- ],
+ "required": ["type", "mode"],
"additionalProperties": false
},
{
@@ -6443,10 +5780,7 @@
},
"type": {
"type": "string",
- "enum": [
- "field-accuracy",
- "field_accuracy"
- ]
+ "enum": ["field-accuracy", "field_accuracy"]
},
"fields": {
"type": "array",
@@ -6458,11 +5792,7 @@
},
"match": {
"type": "string",
- "enum": [
- "exact",
- "numeric_tolerance",
- "date"
- ]
+ "enum": ["exact", "numeric_tolerance", "date"]
},
"required": {
"type": "boolean"
@@ -6484,26 +5814,17 @@
}
}
},
- "required": [
- "path",
- "match"
- ],
+ "required": ["path", "match"],
"additionalProperties": false
},
"minItems": 1
},
"aggregation": {
"type": "string",
- "enum": [
- "weighted_average",
- "all_or_nothing"
- ]
+ "enum": ["weighted_average", "all_or_nothing"]
}
},
- "required": [
- "type",
- "fields"
- ],
+ "required": ["type", "fields"],
"additionalProperties": false
},
{
@@ -6547,10 +5868,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -6594,10 +5912,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "budget"
- ],
+ "required": ["type", "budget"],
"additionalProperties": false
},
{
@@ -6634,10 +5949,7 @@
},
"type": {
"type": "string",
- "enum": [
- "token-usage",
- "token_usage"
- ]
+ "enum": ["token-usage", "token_usage"]
},
"max_total": {
"type": "number",
@@ -6652,9 +5964,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -6691,10 +6001,7 @@
},
"type": {
"type": "string",
- "enum": [
- "execution-metrics",
- "execution_metrics"
- ]
+ "enum": ["execution-metrics", "execution_metrics"]
},
"max_tool_calls": {
"type": "number",
@@ -6726,9 +6033,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -6771,10 +6076,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -6817,10 +6119,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -6857,15 +6156,10 @@
},
"type": {
"type": "string",
- "enum": [
- "is-json",
- "is_json"
- ]
+ "enum": ["is-json", "is_json"]
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -6908,10 +6202,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -7000,10 +6291,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -7013,10 +6301,7 @@
"minItems": 1
}
},
- "required": [
- "type",
- "criteria"
- ],
+ "required": ["type", "criteria"],
"additionalProperties": false
}
]
@@ -7060,10 +6345,7 @@
},
"type": {
"type": "string",
- "enum": [
- "code-grader",
- "code_grader"
- ]
+ "enum": ["code-grader", "code_grader"]
},
"command": {
"anyOf": [
@@ -7137,18 +6419,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
},
{
@@ -7185,10 +6461,7 @@
},
"type": {
"type": "string",
- "enum": [
- "llm-grader",
- "llm_grader"
- ]
+ "enum": ["llm-grader", "llm_grader"]
},
"prompt": {
"anyOf": [
@@ -7283,10 +6556,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -7337,17 +6607,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -7410,9 +6675,7 @@
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -7428,10 +6691,7 @@
"maximum": 1
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -7448,10 +6708,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
},
{
@@ -7468,18 +6725,13 @@
"type": "string"
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
}
]
}
},
- "required": [
- "type",
- "aggregator"
- ],
+ "required": ["type", "aggregator"],
"additionalProperties": false
},
{
@@ -7516,20 +6768,11 @@
},
"type": {
"type": "string",
- "enum": [
- "tool-trajectory",
- "tool_trajectory"
- ]
+ "enum": ["tool-trajectory", "tool_trajectory"]
},
"mode": {
"type": "string",
- "enum": [
- "any_order",
- "in_order",
- "exact",
- "subset",
- "superset"
- ]
+ "enum": ["any_order", "in_order", "exact", "subset", "superset"]
},
"minimums": {
"type": "object",
@@ -7570,12 +6813,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -7589,12 +6827,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -7605,9 +6838,7 @@
]
}
},
- "required": [
- "tool"
- ],
+ "required": ["tool"],
"additionalProperties": false
}
},
@@ -7615,12 +6846,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -7634,12 +6860,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -7650,10 +6871,7 @@
]
}
},
- "required": [
- "type",
- "mode"
- ],
+ "required": ["type", "mode"],
"additionalProperties": false
},
{
@@ -7690,10 +6908,7 @@
},
"type": {
"type": "string",
- "enum": [
- "field-accuracy",
- "field_accuracy"
- ]
+ "enum": ["field-accuracy", "field_accuracy"]
},
"fields": {
"type": "array",
@@ -7705,11 +6920,7 @@
},
"match": {
"type": "string",
- "enum": [
- "exact",
- "numeric_tolerance",
- "date"
- ]
+ "enum": ["exact", "numeric_tolerance", "date"]
},
"required": {
"type": "boolean"
@@ -7731,26 +6942,17 @@
}
}
},
- "required": [
- "path",
- "match"
- ],
+ "required": ["path", "match"],
"additionalProperties": false
},
"minItems": 1
},
"aggregation": {
"type": "string",
- "enum": [
- "weighted_average",
- "all_or_nothing"
- ]
+ "enum": ["weighted_average", "all_or_nothing"]
}
},
- "required": [
- "type",
- "fields"
- ],
+ "required": ["type", "fields"],
"additionalProperties": false
},
{
@@ -7794,10 +6996,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -7841,10 +7040,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "budget"
- ],
+ "required": ["type", "budget"],
"additionalProperties": false
},
{
@@ -7881,10 +7077,7 @@
},
"type": {
"type": "string",
- "enum": [
- "token-usage",
- "token_usage"
- ]
+ "enum": ["token-usage", "token_usage"]
},
"max_total": {
"type": "number",
@@ -7899,9 +7092,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -7938,10 +7129,7 @@
},
"type": {
"type": "string",
- "enum": [
- "execution-metrics",
- "execution_metrics"
- ]
+ "enum": ["execution-metrics", "execution_metrics"]
},
"max_tool_calls": {
"type": "number",
@@ -7973,9 +7161,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -8018,10 +7204,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -8064,10 +7247,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -8104,15 +7284,10 @@
},
"type": {
"type": "string",
- "enum": [
- "is-json",
- "is_json"
- ]
+ "enum": ["is-json", "is_json"]
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -8155,10 +7330,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -8247,10 +7419,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -8260,10 +7429,7 @@
"minItems": 1
}
},
- "required": [
- "type",
- "criteria"
- ],
+ "required": ["type", "criteria"],
"additionalProperties": false
}
]
@@ -8324,10 +7490,7 @@
},
"type": {
"type": "string",
- "enum": [
- "code-grader",
- "code_grader"
- ]
+ "enum": ["code-grader", "code_grader"]
},
"command": {
"anyOf": [
@@ -8401,18 +7564,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
},
{
@@ -8449,10 +7606,7 @@
},
"type": {
"type": "string",
- "enum": [
- "llm-grader",
- "llm_grader"
- ]
+ "enum": ["llm-grader", "llm_grader"]
},
"prompt": {
"anyOf": [
@@ -8547,10 +7701,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -8601,17 +7752,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -8674,9 +7820,7 @@
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -8692,10 +7836,7 @@
"maximum": 1
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -8712,10 +7853,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
},
{
@@ -8732,18 +7870,13 @@
"type": "string"
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
}
]
}
},
- "required": [
- "type",
- "aggregator"
- ],
+ "required": ["type", "aggregator"],
"additionalProperties": false
},
{
@@ -8780,20 +7913,11 @@
},
"type": {
"type": "string",
- "enum": [
- "tool-trajectory",
- "tool_trajectory"
- ]
+ "enum": ["tool-trajectory", "tool_trajectory"]
},
"mode": {
"type": "string",
- "enum": [
- "any_order",
- "in_order",
- "exact",
- "subset",
- "superset"
- ]
+ "enum": ["any_order", "in_order", "exact", "subset", "superset"]
},
"minimums": {
"type": "object",
@@ -8834,12 +7958,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -8853,12 +7972,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -8869,9 +7983,7 @@
]
}
},
- "required": [
- "tool"
- ],
+ "required": ["tool"],
"additionalProperties": false
}
},
@@ -8879,12 +7991,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -8898,12 +8005,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -8914,10 +8016,7 @@
]
}
},
- "required": [
- "type",
- "mode"
- ],
+ "required": ["type", "mode"],
"additionalProperties": false
},
{
@@ -8954,10 +8053,7 @@
},
"type": {
"type": "string",
- "enum": [
- "field-accuracy",
- "field_accuracy"
- ]
+ "enum": ["field-accuracy", "field_accuracy"]
},
"fields": {
"type": "array",
@@ -8969,11 +8065,7 @@
},
"match": {
"type": "string",
- "enum": [
- "exact",
- "numeric_tolerance",
- "date"
- ]
+ "enum": ["exact", "numeric_tolerance", "date"]
},
"required": {
"type": "boolean"
@@ -8995,26 +8087,17 @@
}
}
},
- "required": [
- "path",
- "match"
- ],
+ "required": ["path", "match"],
"additionalProperties": false
},
"minItems": 1
},
"aggregation": {
"type": "string",
- "enum": [
- "weighted_average",
- "all_or_nothing"
- ]
+ "enum": ["weighted_average", "all_or_nothing"]
}
},
- "required": [
- "type",
- "fields"
- ],
+ "required": ["type", "fields"],
"additionalProperties": false
},
{
@@ -9058,10 +8141,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -9105,10 +8185,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "budget"
- ],
+ "required": ["type", "budget"],
"additionalProperties": false
},
{
@@ -9145,10 +8222,7 @@
},
"type": {
"type": "string",
- "enum": [
- "token-usage",
- "token_usage"
- ]
+ "enum": ["token-usage", "token_usage"]
},
"max_total": {
"type": "number",
@@ -9163,9 +8237,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -9202,10 +8274,7 @@
},
"type": {
"type": "string",
- "enum": [
- "execution-metrics",
- "execution_metrics"
- ]
+ "enum": ["execution-metrics", "execution_metrics"]
},
"max_tool_calls": {
"type": "number",
@@ -9237,9 +8306,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -9282,10 +8349,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -9328,10 +8392,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -9368,15 +8429,10 @@
},
"type": {
"type": "string",
- "enum": [
- "is-json",
- "is_json"
- ]
+ "enum": ["is-json", "is_json"]
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -9419,10 +8475,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -9511,10 +8564,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -9524,10 +8574,7 @@
"minItems": 1
}
},
- "required": [
- "type",
- "criteria"
- ],
+ "required": ["type", "criteria"],
"additionalProperties": false
}
]
@@ -9571,10 +8618,7 @@
},
"type": {
"type": "string",
- "enum": [
- "code-grader",
- "code_grader"
- ]
+ "enum": ["code-grader", "code_grader"]
},
"command": {
"anyOf": [
@@ -9648,18 +8692,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
},
{
@@ -9696,10 +8734,7 @@
},
"type": {
"type": "string",
- "enum": [
- "llm-grader",
- "llm_grader"
- ]
+ "enum": ["llm-grader", "llm_grader"]
},
"prompt": {
"anyOf": [
@@ -9794,10 +8829,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -9848,17 +8880,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -9921,9 +8948,7 @@
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -9939,10 +8964,7 @@
"maximum": 1
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -9959,10 +8981,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
},
{
@@ -9979,18 +8998,13 @@
"type": "string"
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
}
]
}
},
- "required": [
- "type",
- "aggregator"
- ],
+ "required": ["type", "aggregator"],
"additionalProperties": false
},
{
@@ -10027,20 +9041,11 @@
},
"type": {
"type": "string",
- "enum": [
- "tool-trajectory",
- "tool_trajectory"
- ]
+ "enum": ["tool-trajectory", "tool_trajectory"]
},
"mode": {
"type": "string",
- "enum": [
- "any_order",
- "in_order",
- "exact",
- "subset",
- "superset"
- ]
+ "enum": ["any_order", "in_order", "exact", "subset", "superset"]
},
"minimums": {
"type": "object",
@@ -10081,12 +9086,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -10100,12 +9100,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -10116,9 +9111,7 @@
]
}
},
- "required": [
- "tool"
- ],
+ "required": ["tool"],
"additionalProperties": false
}
},
@@ -10126,12 +9119,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -10145,12 +9133,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -10161,10 +9144,7 @@
]
}
},
- "required": [
- "type",
- "mode"
- ],
+ "required": ["type", "mode"],
"additionalProperties": false
},
{
@@ -10201,10 +9181,7 @@
},
"type": {
"type": "string",
- "enum": [
- "field-accuracy",
- "field_accuracy"
- ]
+ "enum": ["field-accuracy", "field_accuracy"]
},
"fields": {
"type": "array",
@@ -10216,11 +9193,7 @@
},
"match": {
"type": "string",
- "enum": [
- "exact",
- "numeric_tolerance",
- "date"
- ]
+ "enum": ["exact", "numeric_tolerance", "date"]
},
"required": {
"type": "boolean"
@@ -10242,26 +9215,17 @@
}
}
},
- "required": [
- "path",
- "match"
- ],
+ "required": ["path", "match"],
"additionalProperties": false
},
"minItems": 1
},
"aggregation": {
"type": "string",
- "enum": [
- "weighted_average",
- "all_or_nothing"
- ]
+ "enum": ["weighted_average", "all_or_nothing"]
}
},
- "required": [
- "type",
- "fields"
- ],
+ "required": ["type", "fields"],
"additionalProperties": false
},
{
@@ -10305,10 +9269,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -10352,10 +9313,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "budget"
- ],
+ "required": ["type", "budget"],
"additionalProperties": false
},
{
@@ -10392,10 +9350,7 @@
},
"type": {
"type": "string",
- "enum": [
- "token-usage",
- "token_usage"
- ]
+ "enum": ["token-usage", "token_usage"]
},
"max_total": {
"type": "number",
@@ -10410,9 +9365,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -10449,10 +9402,7 @@
},
"type": {
"type": "string",
- "enum": [
- "execution-metrics",
- "execution_metrics"
- ]
+ "enum": ["execution-metrics", "execution_metrics"]
},
"max_tool_calls": {
"type": "number",
@@ -10484,9 +9434,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -10529,10 +9477,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -10575,10 +9520,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -10615,15 +9557,10 @@
},
"type": {
"type": "string",
- "enum": [
- "is-json",
- "is_json"
- ]
+ "enum": ["is-json", "is_json"]
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -10666,10 +9603,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -10758,10 +9692,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -10771,10 +9702,7 @@
"minItems": 1
}
},
- "required": [
- "type",
- "criteria"
- ],
+ "required": ["type", "criteria"],
"additionalProperties": false
}
]
@@ -10795,11 +9723,7 @@
},
"strategy": {
"type": "string",
- "enum": [
- "pass_at_k",
- "mean",
- "confidence_interval"
- ]
+ "enum": ["pass_at_k", "mean", "confidence_interval"]
},
"cost_limit_usd": {
"type": "number",
@@ -10810,9 +9734,7 @@
"minimum": 0
}
},
- "required": [
- "count"
- ],
+ "required": ["count"],
"additionalProperties": false
},
"total_budget_usd": {
@@ -10845,10 +9767,7 @@
},
"isolation": {
"type": "string",
- "enum": [
- "shared",
- "per_test"
- ]
+ "enum": ["shared", "per_test"]
},
"repos": {
"type": "array",
@@ -10872,10 +9791,7 @@
"format": "uri"
}
},
- "required": [
- "type",
- "url"
- ],
+ "required": ["type", "url"],
"additionalProperties": false
},
{
@@ -10889,10 +9805,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
}
]
@@ -10905,10 +9818,7 @@
},
"resolve": {
"type": "string",
- "enum": [
- "remote",
- "local"
- ]
+ "enum": ["remote", "local"]
},
"ancestor": {
"type": "integer",
@@ -10937,10 +9847,7 @@
"additionalProperties": false
}
},
- "required": [
- "path",
- "source"
- ],
+ "required": ["path", "source"],
"additionalProperties": false
}
},
@@ -10976,11 +9883,7 @@
},
"reset": {
"type": "string",
- "enum": [
- "none",
- "fast",
- "strict"
- ]
+ "enum": ["none", "fast", "strict"]
}
},
"additionalProperties": false
@@ -11011,11 +9914,7 @@
},
"reset": {
"type": "string",
- "enum": [
- "none",
- "fast",
- "strict"
- ]
+ "enum": ["none", "fast", "strict"]
}
},
"additionalProperties": false
@@ -11046,11 +9945,7 @@
},
"reset": {
"type": "string",
- "enum": [
- "none",
- "fast",
- "strict"
- ]
+ "enum": ["none", "fast", "strict"]
}
},
"additionalProperties": false
@@ -11081,11 +9976,7 @@
},
"reset": {
"type": "string",
- "enum": [
- "none",
- "fast",
- "strict"
- ]
+ "enum": ["none", "fast", "strict"]
}
},
"additionalProperties": false
@@ -11095,11 +9986,7 @@
},
"mode": {
"type": "string",
- "enum": [
- "pooled",
- "temp",
- "static"
- ]
+ "enum": ["pooled", "temp", "static"]
},
"path": {
"type": "string"
@@ -11122,9 +10009,7 @@
"minimum": 0.1
}
},
- "required": [
- "image"
- ],
+ "required": ["image"],
"additionalProperties": false
}
},
@@ -11144,9 +10029,7 @@
"type": "string"
}
},
- "required": [
- "id"
- ],
+ "required": ["id"],
"additionalProperties": false
}
},
@@ -11213,10 +10096,7 @@
},
"type": {
"type": "string",
- "enum": [
- "code-grader",
- "code_grader"
- ]
+ "enum": ["code-grader", "code_grader"]
},
"command": {
"anyOf": [
@@ -11290,18 +10170,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
},
{
@@ -11338,10 +10212,7 @@
},
"type": {
"type": "string",
- "enum": [
- "llm-grader",
- "llm_grader"
- ]
+ "enum": ["llm-grader", "llm_grader"]
},
"prompt": {
"anyOf": [
@@ -11436,10 +10307,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -11490,17 +10358,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -11563,9 +10426,7 @@
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -11581,10 +10442,7 @@
"maximum": 1
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -11601,10 +10459,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
},
{
@@ -11621,18 +10476,13 @@
"type": "string"
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
}
]
}
},
- "required": [
- "type",
- "aggregator"
- ],
+ "required": ["type", "aggregator"],
"additionalProperties": false
},
{
@@ -11669,20 +10519,11 @@
},
"type": {
"type": "string",
- "enum": [
- "tool-trajectory",
- "tool_trajectory"
- ]
+ "enum": ["tool-trajectory", "tool_trajectory"]
},
"mode": {
"type": "string",
- "enum": [
- "any_order",
- "in_order",
- "exact",
- "subset",
- "superset"
- ]
+ "enum": ["any_order", "in_order", "exact", "subset", "superset"]
},
"minimums": {
"type": "object",
@@ -11723,12 +10564,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -11742,12 +10578,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -11758,9 +10589,7 @@
]
}
},
- "required": [
- "tool"
- ],
+ "required": ["tool"],
"additionalProperties": false
}
},
@@ -11768,12 +10597,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -11787,12 +10611,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -11803,10 +10622,7 @@
]
}
},
- "required": [
- "type",
- "mode"
- ],
+ "required": ["type", "mode"],
"additionalProperties": false
},
{
@@ -11843,10 +10659,7 @@
},
"type": {
"type": "string",
- "enum": [
- "field-accuracy",
- "field_accuracy"
- ]
+ "enum": ["field-accuracy", "field_accuracy"]
},
"fields": {
"type": "array",
@@ -11858,11 +10671,7 @@
},
"match": {
"type": "string",
- "enum": [
- "exact",
- "numeric_tolerance",
- "date"
- ]
+ "enum": ["exact", "numeric_tolerance", "date"]
},
"required": {
"type": "boolean"
@@ -11884,26 +10693,17 @@
}
}
},
- "required": [
- "path",
- "match"
- ],
+ "required": ["path", "match"],
"additionalProperties": false
},
"minItems": 1
},
"aggregation": {
"type": "string",
- "enum": [
- "weighted_average",
- "all_or_nothing"
- ]
+ "enum": ["weighted_average", "all_or_nothing"]
}
},
- "required": [
- "type",
- "fields"
- ],
+ "required": ["type", "fields"],
"additionalProperties": false
},
{
@@ -11947,10 +10747,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -11994,10 +10791,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "budget"
- ],
+ "required": ["type", "budget"],
"additionalProperties": false
},
{
@@ -12034,10 +10828,7 @@
},
"type": {
"type": "string",
- "enum": [
- "token-usage",
- "token_usage"
- ]
+ "enum": ["token-usage", "token_usage"]
},
"max_total": {
"type": "number",
@@ -12052,9 +10843,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -12091,10 +10880,7 @@
},
"type": {
"type": "string",
- "enum": [
- "execution-metrics",
- "execution_metrics"
- ]
+ "enum": ["execution-metrics", "execution_metrics"]
},
"max_tool_calls": {
"type": "number",
@@ -12126,9 +10912,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -12171,10 +10955,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -12217,10 +10998,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -12257,15 +11035,10 @@
},
"type": {
"type": "string",
- "enum": [
- "is-json",
- "is_json"
- ]
+ "enum": ["is-json", "is_json"]
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -12308,10 +11081,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -12400,10 +11170,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -12413,10 +11180,7 @@
"minItems": 1
}
},
- "required": [
- "type",
- "criteria"
- ],
+ "required": ["type", "criteria"],
"additionalProperties": false
}
]
@@ -12460,10 +11224,7 @@
},
"type": {
"type": "string",
- "enum": [
- "code-grader",
- "code_grader"
- ]
+ "enum": ["code-grader", "code_grader"]
},
"command": {
"anyOf": [
@@ -12537,18 +11298,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
},
{
@@ -12585,10 +11340,7 @@
},
"type": {
"type": "string",
- "enum": [
- "llm-grader",
- "llm_grader"
- ]
+ "enum": ["llm-grader", "llm_grader"]
},
"prompt": {
"anyOf": [
@@ -12683,10 +11435,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -12737,17 +11486,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -12810,9 +11554,7 @@
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -12828,10 +11570,7 @@
"maximum": 1
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -12848,10 +11587,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
},
{
@@ -12868,18 +11604,13 @@
"type": "string"
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
}
]
}
},
- "required": [
- "type",
- "aggregator"
- ],
+ "required": ["type", "aggregator"],
"additionalProperties": false
},
{
@@ -12916,20 +11647,11 @@
},
"type": {
"type": "string",
- "enum": [
- "tool-trajectory",
- "tool_trajectory"
- ]
+ "enum": ["tool-trajectory", "tool_trajectory"]
},
"mode": {
"type": "string",
- "enum": [
- "any_order",
- "in_order",
- "exact",
- "subset",
- "superset"
- ]
+ "enum": ["any_order", "in_order", "exact", "subset", "superset"]
},
"minimums": {
"type": "object",
@@ -12970,12 +11692,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -12989,12 +11706,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -13005,9 +11717,7 @@
]
}
},
- "required": [
- "tool"
- ],
+ "required": ["tool"],
"additionalProperties": false
}
},
@@ -13015,12 +11725,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -13034,12 +11739,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -13050,10 +11750,7 @@
]
}
},
- "required": [
- "type",
- "mode"
- ],
+ "required": ["type", "mode"],
"additionalProperties": false
},
{
@@ -13090,10 +11787,7 @@
},
"type": {
"type": "string",
- "enum": [
- "field-accuracy",
- "field_accuracy"
- ]
+ "enum": ["field-accuracy", "field_accuracy"]
},
"fields": {
"type": "array",
@@ -13105,11 +11799,7 @@
},
"match": {
"type": "string",
- "enum": [
- "exact",
- "numeric_tolerance",
- "date"
- ]
+ "enum": ["exact", "numeric_tolerance", "date"]
},
"required": {
"type": "boolean"
@@ -13131,26 +11821,17 @@
}
}
},
- "required": [
- "path",
- "match"
- ],
+ "required": ["path", "match"],
"additionalProperties": false
},
"minItems": 1
},
"aggregation": {
"type": "string",
- "enum": [
- "weighted_average",
- "all_or_nothing"
- ]
+ "enum": ["weighted_average", "all_or_nothing"]
}
},
- "required": [
- "type",
- "fields"
- ],
+ "required": ["type", "fields"],
"additionalProperties": false
},
{
@@ -13194,10 +11875,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -13241,10 +11919,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "budget"
- ],
+ "required": ["type", "budget"],
"additionalProperties": false
},
{
@@ -13281,10 +11956,7 @@
},
"type": {
"type": "string",
- "enum": [
- "token-usage",
- "token_usage"
- ]
+ "enum": ["token-usage", "token_usage"]
},
"max_total": {
"type": "number",
@@ -13299,9 +11971,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -13338,10 +12008,7 @@
},
"type": {
"type": "string",
- "enum": [
- "execution-metrics",
- "execution_metrics"
- ]
+ "enum": ["execution-metrics", "execution_metrics"]
},
"max_tool_calls": {
"type": "number",
@@ -13373,9 +12040,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -13418,10 +12083,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -13464,10 +12126,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -13504,15 +12163,10 @@
},
"type": {
"type": "string",
- "enum": [
- "is-json",
- "is_json"
- ]
+ "enum": ["is-json", "is_json"]
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -13555,10 +12209,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -13647,10 +12298,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -13660,10 +12308,7 @@
"minItems": 1
}
},
- "required": [
- "type",
- "criteria"
- ],
+ "required": ["type", "criteria"],
"additionalProperties": false
}
]
@@ -13684,11 +12329,7 @@
},
"strategy": {
"type": "string",
- "enum": [
- "pass_at_k",
- "mean",
- "confidence_interval"
- ]
+ "enum": ["pass_at_k", "mean", "confidence_interval"]
},
"cost_limit_usd": {
"type": "number",
@@ -13699,9 +12340,7 @@
"minimum": 0
}
},
- "required": [
- "count"
- ],
+ "required": ["count"],
"additionalProperties": false
},
"total_budget_usd": {
@@ -13764,10 +12403,7 @@
},
"type": {
"type": "string",
- "enum": [
- "code-grader",
- "code_grader"
- ]
+ "enum": ["code-grader", "code_grader"]
},
"command": {
"anyOf": [
@@ -13841,18 +12477,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
},
{
@@ -13889,10 +12519,7 @@
},
"type": {
"type": "string",
- "enum": [
- "llm-grader",
- "llm_grader"
- ]
+ "enum": ["llm-grader", "llm_grader"]
},
"prompt": {
"anyOf": [
@@ -13987,10 +12614,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -14041,17 +12665,12 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -14114,9 +12733,7 @@
}
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -14132,10 +12749,7 @@
"maximum": 1
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -14152,10 +12766,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
},
{
@@ -14172,18 +12783,13 @@
"type": "string"
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
}
]
}
},
- "required": [
- "type",
- "aggregator"
- ],
+ "required": ["type", "aggregator"],
"additionalProperties": false
},
{
@@ -14220,20 +12826,11 @@
},
"type": {
"type": "string",
- "enum": [
- "tool-trajectory",
- "tool_trajectory"
- ]
+ "enum": ["tool-trajectory", "tool_trajectory"]
},
"mode": {
"type": "string",
- "enum": [
- "any_order",
- "in_order",
- "exact",
- "subset",
- "superset"
- ]
+ "enum": ["any_order", "in_order", "exact", "subset", "superset"]
},
"minimums": {
"type": "object",
@@ -14274,12 +12871,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -14293,12 +12885,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -14309,9 +12896,7 @@
]
}
},
- "required": [
- "tool"
- ],
+ "required": ["tool"],
"additionalProperties": false
}
},
@@ -14319,12 +12904,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -14338,12 +12918,7 @@
"anyOf": [
{
"type": "string",
- "enum": [
- "exact",
- "ignore",
- "subset",
- "superset"
- ]
+ "enum": ["exact", "ignore", "subset", "superset"]
},
{
"type": "array",
@@ -14354,10 +12929,7 @@
]
}
},
- "required": [
- "type",
- "mode"
- ],
+ "required": ["type", "mode"],
"additionalProperties": false
},
{
@@ -14394,10 +12966,7 @@
},
"type": {
"type": "string",
- "enum": [
- "field-accuracy",
- "field_accuracy"
- ]
+ "enum": ["field-accuracy", "field_accuracy"]
},
"fields": {
"type": "array",
@@ -14409,11 +12978,7 @@
},
"match": {
"type": "string",
- "enum": [
- "exact",
- "numeric_tolerance",
- "date"
- ]
+ "enum": ["exact", "numeric_tolerance", "date"]
},
"required": {
"type": "boolean"
@@ -14435,26 +13000,17 @@
}
}
},
- "required": [
- "path",
- "match"
- ],
+ "required": ["path", "match"],
"additionalProperties": false
},
"minItems": 1
},
"aggregation": {
"type": "string",
- "enum": [
- "weighted_average",
- "all_or_nothing"
- ]
+ "enum": ["weighted_average", "all_or_nothing"]
}
},
- "required": [
- "type",
- "fields"
- ],
+ "required": ["type", "fields"],
"additionalProperties": false
},
{
@@ -14498,10 +13054,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "threshold"
- ],
+ "required": ["type", "threshold"],
"additionalProperties": false
},
{
@@ -14545,10 +13098,7 @@
"minimum": 0
}
},
- "required": [
- "type",
- "budget"
- ],
+ "required": ["type", "budget"],
"additionalProperties": false
},
{
@@ -14585,10 +13135,7 @@
},
"type": {
"type": "string",
- "enum": [
- "token-usage",
- "token_usage"
- ]
+ "enum": ["token-usage", "token_usage"]
},
"max_total": {
"type": "number",
@@ -14603,9 +13150,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -14642,10 +13187,7 @@
},
"type": {
"type": "string",
- "enum": [
- "execution-metrics",
- "execution_metrics"
- ]
+ "enum": ["execution-metrics", "execution_metrics"]
},
"max_tool_calls": {
"type": "number",
@@ -14677,9 +13219,7 @@
"minimum": 0
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -14722,10 +13262,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -14768,10 +13305,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -14808,15 +13342,10 @@
},
"type": {
"type": "string",
- "enum": [
- "is-json",
- "is_json"
- ]
+ "enum": ["is-json", "is_json"]
}
},
- "required": [
- "type"
- ],
+ "required": ["type"],
"additionalProperties": false
},
{
@@ -14859,10 +13388,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "value"
- ],
+ "required": ["type", "value"],
"additionalProperties": false
},
{
@@ -14951,10 +13477,7 @@
"minLength": 1
}
},
- "required": [
- "score_range",
- "outcome"
- ],
+ "required": ["score_range", "outcome"],
"additionalProperties": false
}
}
@@ -14964,10 +13487,7 @@
"minItems": 1
}
},
- "required": [
- "type",
- "criteria"
- ],
+ "required": ["type", "criteria"],
"additionalProperties": false
}
]
@@ -14996,10 +13516,7 @@
]
}
},
- "required": [
- "type",
- "command"
- ],
+ "required": ["type", "command"],
"additionalProperties": false
}
},
@@ -15013,10 +13530,7 @@
},
"isolation": {
"type": "string",
- "enum": [
- "shared",
- "per_test"
- ]
+ "enum": ["shared", "per_test"]
},
"repos": {
"type": "array",
@@ -15040,10 +13554,7 @@
"format": "uri"
}
},
- "required": [
- "type",
- "url"
- ],
+ "required": ["type", "url"],
"additionalProperties": false
},
{
@@ -15057,10 +13568,7 @@
"type": "string"
}
},
- "required": [
- "type",
- "path"
- ],
+ "required": ["type", "path"],
"additionalProperties": false
}
]
@@ -15073,10 +13581,7 @@
},
"resolve": {
"type": "string",
- "enum": [
- "remote",
- "local"
- ]
+ "enum": ["remote", "local"]
},
"ancestor": {
"type": "integer",
@@ -15105,10 +13610,7 @@
"additionalProperties": false
}
},
- "required": [
- "path",
- "source"
- ],
+ "required": ["path", "source"],
"additionalProperties": false
}
},
@@ -15144,11 +13646,7 @@
},
"reset": {
"type": "string",
- "enum": [
- "none",
- "fast",
- "strict"
- ]
+ "enum": ["none", "fast", "strict"]
}
},
"additionalProperties": false
@@ -15179,11 +13677,7 @@
},
"reset": {
"type": "string",
- "enum": [
- "none",
- "fast",
- "strict"
- ]
+ "enum": ["none", "fast", "strict"]
}
},
"additionalProperties": false
@@ -15214,11 +13708,7 @@
},
"reset": {
"type": "string",
- "enum": [
- "none",
- "fast",
- "strict"
- ]
+ "enum": ["none", "fast", "strict"]
}
},
"additionalProperties": false
@@ -15249,11 +13739,7 @@
},
"reset": {
"type": "string",
- "enum": [
- "none",
- "fast",
- "strict"
- ]
+ "enum": ["none", "fast", "strict"]
}
},
"additionalProperties": false
@@ -15263,11 +13749,7 @@
},
"mode": {
"type": "string",
- "enum": [
- "pooled",
- "temp",
- "static"
- ]
+ "enum": ["pooled", "temp", "static"]
},
"path": {
"type": "string"
@@ -15290,9 +13772,7 @@
"minimum": 0.1
}
},
- "required": [
- "image"
- ],
+ "required": ["image"],
"additionalProperties": false
}
},
@@ -15304,9 +13784,7 @@
]
}
},
- "required": [
- "tests"
- ],
+ "required": ["tests"],
"additionalProperties": false
}
}