diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml
index 8f62d5a8a..009233511 100644
--- a/.github/workflows/validate.yml
+++ b/.github/workflows/validate.yml
@@ -53,3 +53,22 @@ jobs:
 
       - name: Validate eval schemas
         run: bun apps/cli/dist/cli.js validate 'examples/features/**/evals/**/*.eval.yaml' 'examples/features/**/*.EVAL.yaml'
+
+  benchmark-results:
+    name: Validate Benchmark Results
+    runs-on: ubuntu-latest
+    if: >-
+      contains(github.event.pull_request.title, 'benchmark') ||
+      contains(join(github.event.pull_request.labels.*.name, ','), 'benchmark') ||
+      github.event_name == 'push'
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/setup-bun
+
+      - name: Validate SWE-bench Lite result JSON files
+        run: |
+          if ls benchmarks/swe-bench-lite/results/*.json 1> /dev/null 2>&1; then
+            bun benchmarks/swe-bench-lite/validate-result.ts benchmarks/swe-bench-lite/results/*.json
+          else
+            echo "No result files found — skipping"
+          fi
diff --git a/apps/web/src/components/Lander.astro b/apps/web/src/components/Lander.astro
index 25a62cbe8..664064582 100644
--- a/apps/web/src/components/Lander.astro
+++ b/apps/web/src/components/Lander.astro
@@ -14,6 +14,7 @@
     </a>
     <div class="av-nav-links">
       <a href="/docs/">Docs</a>
+      <a href="/leaderboard">Leaderboard</a>
       <a href="https://github.com/EntityProcess/agentv" target="_blank" rel="noopener noreferrer">GitHub</a>
       <button class="av-nav-pill" data-command="npm install -g agentv">
         <code>npm install -g agentv</code>
@@ -118,6 +119,19 @@
     </div>
   </section>
 
+  <!-- Leaderboard CTA Section -->
+  <section class="av-features" style="border-top: 1px solid rgba(255,255,255,0.04);">
+    <div class="av-container" style="text-align:center;">
+      <h2 class="av-section-heading">Public Leaderboard</h2>
+      <p style="color:#94a3b8; max-width:560px; margin:0 auto 1.5rem; font-size:0.95rem;">
+        SWE-bench Lite results with richer metrics — cost efficiency, tool usage, and Pareto-optimal rankings. See how models actually compare.
+      </p>
+      <a href="/leaderboard" class="av-btn-primary" style="display:inline-block; padding:0.75rem 2rem; font-size:0.9rem;">
+        View Leaderboard →
+      </a>
+    </div>
+  </section>
+
   <!-- Quick Start Section -->
   <section class="av-quickstart">
     <div class="av-container">
diff --git a/apps/web/src/pages/leaderboard.astro b/apps/web/src/pages/leaderboard.astro
new file mode 100644
index 000000000..d56908796
--- /dev/null
+++ b/apps/web/src/pages/leaderboard.astro
@@ -0,0 +1,632 @@
+---
+/**
+ * AgentV Leaderboard — SWE-bench Lite
+ *
+ * Static page that reads benchmark result JSON files at build time
+ * and renders a sortable table + Pareto frontier chart.
+ *
+ * Data source: /benchmarks/swe-bench-lite/results/*.json
+ * Route: /leaderboard
+ */
+
+// Read result files at build time
+import { readFileSync, readdirSync } from 'node:fs';
+import { join } from 'node:path';
+
+interface ResultData {
+  model: string;
+  provider: string;
+  model_type: string;
+  date: string;
+  agent: string;
+  agent_version: string;
+  dataset: string;
+  total_instances: number;
+  resolved_instances: number;
+  resolution_rate: number;
+  avg_cost_usd: number;
+  avg_cost_per_fix_usd: number;
+  avg_duration_ms: number;
+  avg_tool_calls: number;
+}
+
+const resultsDir = join(process.cwd(), '../../benchmarks/swe-bench-lite/results');
+let results: ResultData[] = [];
+
+// Sanitize string for use in CSS class names: every character other than ASCII letters, digits and hyphens is removed, then the rest is lowercased
+function safeCssClass(s: string): string {
+  return s.replace(/[^a-z0-9-]/gi, '').toLowerCase();
+}
+
+// Load every result file at build time, best resolution rate first. A missing results directory simply
+// yields an empty leaderboard; any other failure (e.g. malformed JSON) is reported in the build log.
+try {
+  results = readdirSync(resultsDir)
+    .filter((name) => name.endsWith('.json'))
+    .map((name) => JSON.parse(readFileSync(join(resultsDir, name), 'utf8')) as ResultData)
+    .sort((a, b) => b.resolution_rate - a.resolution_rate);
+} catch (err) {
+  // Results dir may not exist in all environments; anything else should not pass unnoticed.
+  if ((err as { code?: string } | null)?.code !== 'ENOENT') console.warn('[leaderboard] failed to load results:', err);
+}
+
+// Provider colors for chart
+const providerColors: Record<string, string> = {
+  anthropic: '#06b6d4',
+  openai: '#10b981',
+  google: '#f59e0b',
+  deepseek: '#8b5cf6',
+  meta: '#ef4444',
+};
+
+// Compute Pareto frontier: the entries for which no entry of lower-or-equal avg_cost_usd has a resolution_rate at least as high.
+function computeParetoFrontier(data: ResultData[]): ResultData[] {
+  // Cheapest first; on equal cost the higher rate goes first, so a dominated same-cost entry is never admitted.
+  const byCost = [...data].sort((a, b) => a.avg_cost_usd - b.avg_cost_usd || b.resolution_rate - a.resolution_rate);
+  const frontier: ResultData[] = [];
+  let bestRate = -1; // starts below any non-negative rate so the cheapest entry always qualifies
+  for (const entry of byCost) {
+    if (entry.resolution_rate <= bestRate) continue; // a cheaper-or-equal entry already does at least as well
+    frontier.push(entry);
+    bestRate = entry.resolution_rate;
+  }
+  return frontier;
+}
+
+const frontier = computeParetoFrontier(results);
+const frontierSet = new Set(frontier.map((f) => f.model));
+const providers = [...new Set(results.map((r) => r.provider))].sort();
+---
+
+<!doctype html>
+<html lang="en" data-theme="dark">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Leaderboard — AgentV SWE-bench Lite</title>
+    <meta name="description" content="Multi-dimensional agent benchmark. Compare models on resolution rate, cost efficiency, latency, and tool usage." />
+    <link rel="preconnect" href="https://fonts.googleapis.com" />
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
+    <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:ital,wght@0,400;0,500;0,600;0,700;1,400&family=IBM+Plex+Mono:wght@400;500;600;700&display=swap" rel="stylesheet" />
+    <style>
+      *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
+      html { background: hsl(240, 10%, 8%); color: #e2e8f0; }
+      body {
+        min-height: 100vh;
+        font-family: 'IBM Plex Mono', 'JetBrains Mono', ui-monospace, monospace;
+        line-height: 1.6;
+      }
+
+      /* Nav */
+      .av-nav {
+        position: sticky; top: 0; z-index: 100;
+        background: rgba(15, 15, 20, 0.85);
+        backdrop-filter: blur(12px);
+        border-bottom: 1px solid rgba(255,255,255,0.06);
+      }
+      .av-nav-inner {
+        max-width: 1200px; margin: 0 auto;
+        padding: 0.75rem 1.5rem;
+        display: flex; justify-content: space-between; align-items: center;
+      }
+      .av-nav-logo {
+        display: flex; align-items: center; gap: 0.5rem;
+        text-decoration: none; color: #e2e8f0;
+        font-weight: 700; font-size: 1.1rem;
+      }
+      .av-nav-logo svg { flex-shrink: 0; }
+      .av-wordmark-v { color: #06b6d4; }
+      .av-nav-links { display: flex; align-items: center; gap: 1.5rem; }
+      .av-nav-links a {
+        color: #94a3b8; text-decoration: none; font-size: 0.875rem;
+        transition: color 0.2s;
+      }
+      .av-nav-links a:hover { color: #e2e8f0; }
+
+      /* Layout */
+      .av-container { max-width: 1200px; margin: 0 auto; padding: 0 1.5rem; }
+
+      /* Hero */
+      .av-lb-hero {
+        padding: 3rem 0 2rem;
+        border-bottom: 1px solid rgba(255,255,255,0.06);
+      }
+      .av-lb-hero h1 {
+        font-family: 'JetBrains Mono', monospace;
+        font-size: 2rem; font-weight: 700;
+        letter-spacing: -0.03em;
+        margin-bottom: 0.5rem;
+      }
+      .av-lb-hero h1 span {
+        background: linear-gradient(135deg, #06b6d4, #22d3ee);
+        -webkit-background-clip: text; -webkit-text-fill-color: transparent;
+        background-clip: text;
+      }
+      .av-lb-hero p {
+        color: #94a3b8; font-size: 0.9rem;
+        max-width: 600px;
+      }
+
+      /* Filters */
+      .av-filters {
+        display: flex; gap: 0.75rem; margin: 1.5rem 0;
+        flex-wrap: wrap;
+      }
+      .av-filter-btn {
+        background: rgba(255,255,255,0.04);
+        border: 1px solid rgba(255,255,255,0.08);
+        color: #94a3b8;
+        padding: 0.375rem 0.75rem;
+        border-radius: 4px;
+        font-family: inherit; font-size: 0.8rem;
+        cursor: pointer;
+        transition: all 0.2s;
+      }
+      .av-filter-btn:hover, .av-filter-btn.active {
+        color: #06b6d4;
+        border-color: rgba(6, 182, 212, 0.3);
+        background: rgba(6, 182, 212, 0.08);
+      }
+      .av-filter-select {
+        background: rgba(255,255,255,0.04);
+        border: 1px solid rgba(255,255,255,0.08);
+        color: #94a3b8;
+        padding: 0.375rem 0.75rem;
+        border-radius: 4px;
+        font-family: inherit; font-size: 0.8rem;
+        cursor: pointer;
+        transition: all 0.2s;
+        margin-left: 0.5rem;
+      }
+      .av-filter-select:hover, .av-filter-select:focus {
+        color: #06b6d4;
+        border-color: rgba(6, 182, 212, 0.3);
+        background: rgba(6, 182, 212, 0.08);
+        outline: none;
+      }
+      .av-filter-select option {
+        background: hsl(240, 10%, 12%);
+        color: #e2e8f0;
+      }
+
+      /* Table */
+      .av-table-wrap {
+        overflow-x: auto;
+        margin: 1rem 0 2rem;
+        border: 1px solid rgba(255,255,255,0.06);
+        border-radius: 8px;
+        background: rgba(255,255,255,0.02);
+      }
+      .av-table {
+        width: 100%; border-collapse: collapse;
+        font-size: 0.85rem;
+      }
+      .av-table th {
+        text-align: left;
+        padding: 0.75rem 1rem;
+        font-weight: 600; font-size: 0.75rem;
+        text-transform: uppercase;
+        letter-spacing: 0.05em;
+        color: #64748b;
+        border-bottom: 1px solid rgba(255,255,255,0.08);
+        cursor: pointer;
+        user-select: none;
+        white-space: nowrap;
+        transition: color 0.2s;
+      }
+      .av-table th:hover { color: #06b6d4; }
+      .av-table th .sort-arrow { margin-left: 0.25rem; opacity: 0.3; }
+      .av-table th.sorted .sort-arrow { opacity: 1; color: #06b6d4; }
+      .av-table td {
+        padding: 0.75rem 1rem;
+        border-bottom: 1px solid rgba(255,255,255,0.04);
+        white-space: nowrap;
+      }
+      .av-table tbody tr {
+        transition: background 0.15s;
+      }
+      .av-table tbody tr:hover {
+        background: rgba(6, 182, 212, 0.04);
+      }
+      .av-rank {
+        color: #64748b; font-weight: 600;
+        width: 2rem; text-align: center;
+      }
+      .av-model-name {
+        font-weight: 600; color: #e2e8f0;
+      }
+      .av-provider-badge {
+        display: inline-block;
+        padding: 0.125rem 0.5rem;
+        border-radius: 3px;
+        font-size: 0.7rem;
+        text-transform: uppercase;
+        letter-spacing: 0.05em;
+        font-weight: 600;
+      }
+      .av-provider-anthropic { background: rgba(6,182,212,0.15); color: #06b6d4; }
+      .av-provider-openai { background: rgba(16,185,129,0.15); color: #10b981; }
+      .av-provider-google { background: rgba(245,158,11,0.15); color: #f59e0b; }
+      .av-provider-deepseek { background: rgba(139,92,246,0.15); color: #8b5cf6; }
+      .av-provider-meta { background: rgba(239,68,68,0.15); color: #ef4444; }
+      .av-resolved { color: #22d3ee; font-weight: 600; }
+      .av-cost { color: #94a3b8; }
+      .av-cost-fix {
+        font-weight: 600;
+      }
+      .av-cost-fix.good { color: #10b981; }
+      .av-cost-fix.mid { color: #f59e0b; }
+      .av-cost-fix.bad { color: #ef4444; }
+      .av-frontier-badge {
+        display: inline-block;
+        width: 8px; height: 8px;
+        border-radius: 50%;
+        background: #06b6d4;
+        box-shadow: 0 0 6px rgba(6,182,212,0.5);
+        margin-left: 0.375rem;
+        vertical-align: middle;
+      }
+
+      /* Chart Section */
+      .av-chart-section {
+        padding: 2rem 0 3rem;
+        border-top: 1px solid rgba(255,255,255,0.06);
+      }
+      .av-chart-section h2 {
+        font-family: 'JetBrains Mono', monospace;
+        font-size: 1.25rem; font-weight: 600;
+        margin-bottom: 0.5rem;
+      }
+      .av-chart-section p {
+        color: #64748b; font-size: 0.8rem;
+        margin-bottom: 1.5rem;
+      }
+      .av-chart-container {
+        position: relative;
+        background: rgba(255,255,255,0.02);
+        border: 1px solid rgba(255,255,255,0.06);
+        border-radius: 8px;
+        padding: 1.5rem;
+        min-height: 400px;
+      }
+      .av-chart-svg { width: 100%; height: 380px; }
+
+      /* Legend */
+      .av-legend {
+        display: flex; gap: 1.25rem; flex-wrap: wrap;
+        margin-top: 1rem;
+      }
+      .av-legend-item {
+        display: flex; align-items: center; gap: 0.375rem;
+        font-size: 0.75rem; color: #94a3b8;
+      }
+      .av-legend-dot {
+        width: 10px; height: 10px; border-radius: 50%;
+      }
+
+      /* CTA */
+      .av-cta-section {
+        padding: 3rem 0;
+        border-top: 1px solid rgba(255,255,255,0.06);
+        text-align: center;
+      }
+      .av-cta-section h2 {
+        font-family: 'JetBrains Mono', monospace;
+        font-size: 1.25rem; margin-bottom: 0.75rem;
+      }
+      .av-cta-code {
+        background: rgba(255,255,255,0.04);
+        border: 1px solid rgba(255,255,255,0.08);
+        border-radius: 6px;
+        padding: 1rem 1.5rem;
+        font-size: 0.8rem;
+        color: #94a3b8;
+        display: inline-block;
+        text-align: left;
+        margin: 1rem 0;
+        line-height: 1.8;
+      }
+      .av-cta-code .cmd { color: #22d3ee; }
+      .av-cta-code .comment { color: #475569; }
+      .av-btn-primary {
+        display: inline-block;
+        padding: 0.625rem 1.5rem;
+        background: linear-gradient(135deg, #06b6d4, #22d3ee);
+        color: #0a1628;
+        font-weight: 600;
+        font-family: inherit;
+        font-size: 0.85rem;
+        border-radius: 4px;
+        text-decoration: none;
+        transition: all 0.2s;
+      }
+      .av-btn-primary:hover {
+        box-shadow: 0 0 20px rgba(6, 182, 212, 0.3);
+      }
+
+      /* Footer */
+      .av-footer {
+        padding: 1.5rem 0;
+        border-top: 1px solid rgba(255,255,255,0.06);
+        text-align: center;
+        color: #475569;
+        font-size: 0.75rem;
+      }
+      .av-footer a { color: #06b6d4; text-decoration: none; }
+      .av-footer a:hover { text-decoration: underline; }
+
+      /* Responsive */
+      @media (max-width: 768px) {
+        .av-lb-hero h1 { font-size: 1.5rem; }
+        .av-table { font-size: 0.78rem; }
+        .av-table th, .av-table td { padding: 0.5rem 0.625rem; }
+      }
+    </style>
+  </head>
+  <body>
+    <!-- Nav -->
+    <nav class="av-nav">
+      <div class="av-nav-inner">
+        <a href="/" class="av-nav-logo">
+          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 120 120" fill="none" width="28" height="28">
+            <rect width="120" height="120" rx="24" fill="#06b6d4"/>
+            <text x="60" y="82" text-anchor="middle" font-family="'IBM Plex Mono', monospace" font-weight="bold" font-size="64" fill="#0a2a30">v</text>
+          </svg>
+          <span>agent<span class="av-wordmark-v">v</span></span>
+        </a>
+        <div class="av-nav-links">
+          <a href="/docs/">Docs</a>
+          <a href="/leaderboard" style="color:#06b6d4;">Leaderboard</a>
+          <a href="https://github.com/EntityProcess/agentv" target="_blank" rel="noopener noreferrer">GitHub</a>
+        </div>
+      </div>
+    </nav>
+
+    <div class="av-container">
+      <!-- Hero -->
+      <section class="av-lb-hero">
+        <h1>AgentV Leaderboard — <span>SWE-bench Lite</span></h1>
+        <p>The multi-dimensional agent benchmark. Same SWE-bench tasks, richer metrics — cost efficiency, tool usage, and Pareto-optimal rankings.</p>
+      </section>
+
+      <!-- Filters -->
+      <div class="av-filters" id="filters">
+        <button class="av-filter-btn active" data-filter="all">All Models</button>
+        <button class="av-filter-btn" data-filter="proprietary">Proprietary</button>
+        <button class="av-filter-btn" data-filter="open-weights">Open Weights</button>
+        <button class="av-filter-btn" data-filter="open-source">Open Source</button>
+        <select class="av-filter-select" id="provider-filter">
+          <option value="all">All Providers</option>
+          {providers.map(p => <option value={p}>{p}</option>)}
+        </select>
+      </div>
+
+      <!-- Table -->
+      <div class="av-table-wrap">
+        <table class="av-table" id="leaderboard-table">
+          <thead>
+            <tr>
+              <th class="sorted" data-sort="rank"># <span class="sort-arrow">▲</span></th>
+              <th data-sort="model">Model <span class="sort-arrow">▲</span></th>
+              <th data-sort="provider">Provider <span class="sort-arrow">▲</span></th>
+              <th data-sort="resolved">% Resolved <span class="sort-arrow">▲</span></th>
+              <th data-sort="cost">Avg $ <span class="sort-arrow">▲</span></th>
+              <th data-sort="costfix">$/Fix <span class="sort-arrow">▲</span></th>
+              <th data-sort="tools">Tools <span class="sort-arrow">▲</span></th>
+              <th data-sort="duration">Latency <span class="sort-arrow">▲</span></th>
+              <th data-sort="date">Date <span class="sort-arrow">▲</span></th>
+            </tr>
+          </thead>
+          <tbody>
+            {results.map((r, i) => {
+              const costClass = r.avg_cost_per_fix_usd < 0.5 ? 'good' : r.avg_cost_per_fix_usd < 0.8 ? 'mid' : 'bad';
+              const isFrontier = frontierSet.has(r.model);
+              return (
+                <tr data-model-type={r.model_type} data-provider={r.provider}>
+                  <td class="av-rank">{i + 1}</td>
+                  <td>
+                    <span class="av-model-name">{r.model}</span>
+                    {isFrontier && <span class="av-frontier-badge" title="Pareto optimal"></span>}
+                  </td>
+                  <td><span class={`av-provider-badge av-provider-${safeCssClass(r.provider)}`}>{r.provider}</span></td>
+                  <td class="av-resolved">{(r.resolution_rate * 100).toFixed(1)}%</td>
+                  <td class="av-cost">${r.avg_cost_usd.toFixed(2)}</td>
+                  <td class={`av-cost-fix ${costClass}`}>${r.avg_cost_per_fix_usd.toFixed(2)}</td>
+                  <td>{r.avg_tool_calls.toFixed(1)}</td>
+                  <td>{(r.avg_duration_ms / 1000).toFixed(0)}s</td>
+                  <td style="color:#64748b">{r.date}</td>
+                </tr>
+              );
+            })}
+          </tbody>
+        </table>
+      </div>
+
+      <!-- Pareto Chart -->
+      <section class="av-chart-section">
+        <h2>Pareto Frontier — Score vs Cost</h2>
+        <p>Models on the frontier line achieve the best resolution rate for their cost. Closer to top-left is better.</p>
+        <div class="av-chart-container">
+          <svg class="av-chart-svg" id="pareto-chart" viewBox="0 0 800 380">
+            <!-- Grid and axes rendered by client JS -->
+          </svg>
+        </div>
+        <div class="av-legend" id="chart-legend"></div>
+      </section>
+
+      <!-- CTA -->
+      <section class="av-cta-section">
+        <h2>Run it yourself</h2>
+        <div class="av-cta-code">
+          <div><span class="cmd">$</span> git clone https://github.com/EntityProcess/agentv</div>
+          <div><span class="cmd">$</span> cd agentv/benchmarks/swe-bench-lite</div>
+          <div><span class="cmd">$</span> bun run setup.ts</div>
+          <div><span class="cmd">$</span> agentv eval ./evals/ --target claude</div>
+          <div class="comment"># Then submit your results via PR →</div>
+        </div>
+        <br />
+        <a href="https://github.com/EntityProcess/agentv/tree/main/benchmarks/swe-bench-lite" class="av-btn-primary">
+          Submit your results →
+        </a>
+      </section>
+
+      <!-- Footer -->
+      <footer class="av-footer">
+        <p>
+          <a href="/">AgentV</a> — CLI-first agent evaluation framework.
+          Data from <a href="https://www.swebench.com/" target="_blank" rel="noopener noreferrer">SWE-bench Lite</a> (300 instances).
+        </p>
+      </footer>
+    </div>
+
+    <!-- Client-side interactivity -->
+    <script define:vars={{ results: JSON.stringify(results), providerColors: JSON.stringify(providerColors), frontierModels: JSON.stringify(Array.from(frontierSet)) }}>
+      const data = JSON.parse(results);
+      const colors = JSON.parse(providerColors);
+      const frontierSet = new Set(JSON.parse(frontierModels));
+
+      // --- Filters ---
+      function applyFilters() {
+        const activeBtn = document.querySelector('.av-filter-btn.active');
+        const modelFilter = activeBtn ? activeBtn.dataset.filter : 'all';
+        const providerFilter = document.getElementById('provider-filter').value;
+        let rank = 1;
+        document.querySelectorAll('#leaderboard-table tbody tr').forEach(row => {
+          const modelMatch = modelFilter === 'all' || row.dataset.modelType === modelFilter;
+          const providerMatch = providerFilter === 'all' || row.dataset.provider === providerFilter;
+          if (modelMatch && providerMatch) {
+            row.style.display = '';
+            row.querySelector('.av-rank').textContent = rank++;
+          } else {
+            row.style.display = 'none';
+          }
+        });
+      }
+
+      document.querySelectorAll('.av-filter-btn').forEach(btn => {
+        btn.addEventListener('click', () => {
+          document.querySelectorAll('.av-filter-btn').forEach(b => b.classList.remove('active'));
+          btn.classList.add('active');
+          applyFilters();
+        });
+      });
+
+      document.getElementById('provider-filter').addEventListener('change', () => {
+        applyFilters();
+      });
+
+      // --- Sortable columns ---
+      const sortState = { col: 'rank', asc: true };
+      const sortKeys = {
+        rank: (r) => data.indexOf(data.find(d => d.model === r.querySelector('.av-model-name')?.textContent)),
+        model: (r) => r.querySelector('.av-model-name')?.textContent?.toLowerCase() || '',
+        provider: (r) => r.dataset.provider || '',
+        resolved: (r) => parseFloat(r.querySelector('.av-resolved')?.textContent) || 0,
+        cost: (r) => parseFloat(r.querySelector('.av-cost')?.textContent?.replace('$','')) || 0,
+        costfix: (r) => parseFloat(r.querySelector('.av-cost-fix')?.textContent?.replace('$','')) || 0,
+        tools: (r) => parseFloat(r.cells[6]?.textContent) || 0,
+        duration: (r) => parseFloat(r.cells[7]?.textContent) || 0,
+        date: (r) => r.cells[8]?.textContent || '',
+      };
+
+      document.querySelectorAll('.av-table th').forEach(th => {
+        th.addEventListener('click', () => {
+          const col = th.dataset.sort;
+          if (!col) return;
+          if (sortState.col === col) {
+            sortState.asc = !sortState.asc;
+          } else {
+            sortState.col = col;
+            sortState.asc = col === 'model' || col === 'provider' || col === 'date';
+          }
+          document.querySelectorAll('.av-table th').forEach(h => h.classList.remove('sorted'));
+          th.classList.add('sorted');
+          th.querySelector('.sort-arrow').textContent = sortState.asc ? '▲' : '▼';
+
+          const tbody = document.querySelector('#leaderboard-table tbody');
+          const rows = Array.from(tbody.querySelectorAll('tr'));
+          const keyFn = sortKeys[col] || sortKeys.rank;
+          rows.sort((a, b) => {
+            const va = keyFn(a), vb = keyFn(b);
+            const cmp = typeof va === 'string' ? va.localeCompare(vb) : va - vb;
+            return sortState.asc ? cmp : -cmp;
+          });
+          rows.forEach((row, i) => {
+            tbody.appendChild(row);
+            row.querySelector('.av-rank').textContent = i + 1;
+          });
+        });
+      });
+
+      // --- Pareto Chart ---
+      function renderChart() {
+        const svg = document.getElementById('pareto-chart');
+        if (!svg || data.length === 0) return;
+
+        const margin = { top: 20, right: 30, bottom: 40, left: 50 };
+        const width = 800 - margin.left - margin.right;
+        const height = 380 - margin.top - margin.bottom;
+
+        const maxCost = Math.max(...data.map(d => d.avg_cost_usd)) * 1.15;
+        const maxRate = Math.min(Math.max(...data.map(d => d.resolution_rate)) * 1.1, 1);
+        const minRate = Math.max(Math.min(...data.map(d => d.resolution_rate)) * 0.9, 0);
+
+        const scaleX = (v) => margin.left + (v / maxCost) * width;
+        const scaleY = (v) => margin.top + height - ((v - minRate) / (maxRate - minRate)) * height;
+
+        let html = '';
+
+        // Grid lines
+        for (let i = 0; i <= 5; i++) {
+          const y = margin.top + (i / 5) * height;
+          const rate = maxRate - (i / 5) * (maxRate - minRate);
+          html += `<line x1="${margin.left}" y1="${y}" x2="${margin.left + width}" y2="${y}" stroke="rgba(255,255,255,0.05)" />`;
+          html += `<text x="${margin.left - 8}" y="${y + 4}" text-anchor="end" fill="#475569" font-size="11" font-family="'IBM Plex Mono', monospace">${(rate*100).toFixed(0)}%</text>`;
+        }
+        for (let i = 0; i <= 5; i++) {
+          const x = margin.left + (i / 5) * width;
+          const cost = (i / 5) * maxCost;
+          html += `<line x1="${x}" y1="${margin.top}" x2="${x}" y2="${margin.top + height}" stroke="rgba(255,255,255,0.05)" />`;
+          html += `<text x="${x}" y="${margin.top + height + 20}" text-anchor="middle" fill="#475569" font-size="11" font-family="'IBM Plex Mono', monospace">$${cost.toFixed(2)}</text>`;
+        }
+
+        // Axis labels
+        html += `<text x="${margin.left + width/2}" y="${margin.top + height + 36}" text-anchor="middle" fill="#64748b" font-size="12" font-family="'IBM Plex Mono', monospace">Avg Cost per Instance</text>`;
+        html += `<text x="14" y="${margin.top + height/2}" text-anchor="middle" fill="#64748b" font-size="12" font-family="'IBM Plex Mono', monospace" transform="rotate(-90, 14, ${margin.top + height/2})">% Resolved</text>`;
+
+        // Frontier line
+        const frontierData = data.filter(d => frontierSet.has(d.model)).sort((a,b) => a.avg_cost_usd - b.avg_cost_usd);
+        if (frontierData.length >= 2) {
+          const points = frontierData.map(d => `${scaleX(d.avg_cost_usd)},${scaleY(d.resolution_rate)}`).join(' ');
+          html += `<polyline points="${points}" fill="none" stroke="rgba(6,182,212,0.3)" stroke-width="2" stroke-dasharray="6,4" />`;
+        }
+
+        // Data points
+        for (const d of data) {
+          const x = scaleX(d.avg_cost_usd);
+          const y = scaleY(d.resolution_rate);
+          const color = colors[d.provider] || '#94a3b8';
+          const isFrontier = frontierSet.has(d.model);
+          const r = isFrontier ? 8 : 6;
+          html += `<circle cx="${x}" cy="${y}" r="${r}" fill="${color}" opacity="0.85" stroke="${isFrontier ? '#fff' : 'none'}" stroke-width="${isFrontier ? 2 : 0}" />`;
+          html += `<text x="${x}" y="${y - r - 4}" text-anchor="middle" fill="#94a3b8" font-size="10" font-family="'IBM Plex Mono', monospace">${d.model}</text>`;
+        }
+
+        // Axes
+        html += `<line x1="${margin.left}" y1="${margin.top}" x2="${margin.left}" y2="${margin.top + height}" stroke="rgba(255,255,255,0.1)" />`;
+        html += `<line x1="${margin.left}" y1="${margin.top + height}" x2="${margin.left + width}" y2="${margin.top + height}" stroke="rgba(255,255,255,0.1)" />`;
+
+        svg.innerHTML = html;
+
+        // Legend
+        const legend = document.getElementById('chart-legend');
+        const providers = [...new Set(data.map(d => d.provider))];
+        legend.innerHTML = providers.map(p =>
+          `<span class="av-legend-item"><span class="av-legend-dot" style="background:${colors[p] || '#94a3b8'}"></span>${p}</span>`
+        ).join('') + '<span class="av-legend-item"><span class="av-legend-dot" style="background:#06b6d4;box-shadow:0 0 4px rgba(6,182,212,0.5)"></span>Pareto frontier</span>';
+      }
+
+      renderChart();
+    </script>
+  </body>
+</html>
diff --git a/benchmarks/swe-bench-lite/.gitignore b/benchmarks/swe-bench-lite/.gitignore
new file mode 100644
index 000000000..321287329
--- /dev/null
+++ b/benchmarks/swe-bench-lite/.gitignore
@@ -0,0 +1,4 @@
+# Generated eval files from setup.ts
+evals/
+# Cache directory for HuggingFace downloads
+.cache/
diff --git a/benchmarks/swe-bench-lite/README.md b/benchmarks/swe-bench-lite/README.md
new file mode 100644
index 000000000..f20f3e546
--- /dev/null
+++ b/benchmarks/swe-bench-lite/README.md
@@ -0,0 +1,109 @@
+# SWE-bench Lite Benchmark
+
+Run [SWE-bench Lite](https://www.swebench.com/) (300 instances) through AgentV with richer metrics than the original leaderboard.
+
+## Quick Start
+
+### Prerequisites
+
+- **Docker** — Required for running SWE-bench instances. Each instance runs in a pre-built Docker container.
+- **Bun** — Used to run setup and CLI scripts
+- **An LLM API key** — Set via `--target` flag or provider env vars
+
+### 1. Setup
+
+Download the dataset from HuggingFace and generate EVAL.yaml files:
+
+```bash
+cd benchmarks/swe-bench-lite
+bun run setup.ts
+```
+
+This creates `evals/*.EVAL.yaml` — one per SWE-bench instance. Files are gitignored (generated from HuggingFace source of truth).
+
+### 2. Run Evaluations
+
+```bash
+# Run all instances against a target (run these commands from the repository root)
+bun apps/cli/src/cli.ts eval benchmarks/swe-bench-lite/evals/ --target claude
+
+# Run a single instance
+bun apps/cli/src/cli.ts eval benchmarks/swe-bench-lite/evals/django__django-15180.EVAL.yaml --target claude
+
+# Run with cost tracking
+bun apps/cli/src/cli.ts eval benchmarks/swe-bench-lite/evals/ --target claude --output results/claude-opus-4.6.json
+```
+
+### 3. Submit Results
+
+Results are submitted via GitHub PR. Each result file goes in `results/<model-slug>.json`.
+
+**Steps:**
+1. Fork the [agentv repo](https://github.com/EntityProcess/agentv)
+2. Run the benchmark (see above)
+3. Add your result JSON to `benchmarks/swe-bench-lite/results/<your-model>.json`
+4. Open a PR — CI validates the JSON schema automatically
+
+### Result JSON Format
+
+```json
+{
+  "model": "Claude Opus 4.6",
+  "provider": "anthropic",
+  "model_type": "proprietary",
+  "date": "2026-04-08",
+  "agent": "mini-swe-agent-agentv",
+  "agent_version": "1.0.0",
+  "dataset": "swe-bench-lite",
+  "total_instances": 300,
+  "resolved_instances": 218,
+  "resolution_rate": 0.727,
+  "avg_cost_usd": 0.55,
+  "avg_cost_per_fix_usd": 0.76,
+  "avg_duration_ms": 45000,
+  "avg_tool_calls": 8.2,
+  "per_instance": [
+    {
+      "instance_id": "django__django-15180",
+      "resolved": true,
+      "cost_usd": 0.42,
+      "duration_ms": 32000,
+      "tool_calls": 6
+    }
+  ]
+}
+```
+
+See `result.schema.json` for the full validation schema.
+
+### Leaderboard
+
+Results are displayed on [agentv.dev/leaderboard](https://agentv.dev/leaderboard) with:
+- **Multi-dimensional ranking** — not just pass/fail, but cost, latency, tool efficiency
+- **Cost-normalized scoring** — $/Fix metric shows best value per dollar
+- **Pareto frontier** — visual chart of score vs cost tradeoffs
+- **Filterable and sortable** — filter by model type or provider; sort by any column, including date
+
+## Dataset
+
+- **Source:** [HuggingFace SWE-bench/SWE-bench_Lite](https://huggingface.co/datasets/SWE-bench/SWE-bench_Lite)
+- **Split:** test (300 instances)
+- **Docker images:** `swebench/sweb.eval.x86_64.*` from DockerHub
+
+## Architecture
+
+```
+setup.ts → downloads from HuggingFace → generates evals/*.EVAL.yaml
+                                              ↓
+                                    agentv eval ./evals/
+                                              ↓
+                              Docker container per instance
+                              (image from SWE-bench registry)
+                                              ↓
+                              graders/swe-bench-grader.ts
+                              (runs inside container)
+                                              ↓
+                                    results/*.json
+                                              ↓
+                                 agentv.dev/leaderboard
+```
diff --git a/benchmarks/swe-bench-lite/e2e-test/calculator-bug.EVAL.yaml b/benchmarks/swe-bench-lite/e2e-test/calculator-bug.EVAL.yaml
new file mode 100644
index 000000000..f8be6a26d
--- /dev/null
+++ b/benchmarks/swe-bench-lite/e2e-test/calculator-bug.EVAL.yaml
@@ -0,0 +1,38 @@
+# E2E test eval - validates Docker workspace + grader pipeline
+description: "E2E test: fix calculator bug in Docker container"
+
+workspace:
+  docker:
+    image: "agentv-test-eval:latest"
+    timeout: 120
+    memory: "1g"
+
+tests:
+  - id: "calculator-add-bug"
+    input:
+      - role: user
+        content: |
+          You are a software engineer. The repository at /testbed has a bug in calculator.py.
+          The function add(a, b) returns a - b instead of a + b.
+
+          Here is the buggy file:
+          ```python
+          def add(a, b):
+              return a - b  # BUG: should be a + b
+
+          def subtract(a, b):
+              return a - b
+          ```
+
+          The test test_calculator.py::test_add is failing because add(2,3) returns -1 instead of 5.
+
+          Fix the bug and output ONLY a unified diff (git diff format) that changes `return a - b` to `return a + b` in the add function. No explanation, just the diff.
+    assertions:
+      - type: code-grader
+        command: ["python", "/grader.py"]
+        instance_id: "calculator-add-bug"
+        repo: "test/calculator"
+        base_commit: "initial"
+        fail_to_pass:
+          - "test_calculator.py::test_add"
+        pass_to_pass_count: 0
diff --git a/benchmarks/swe-bench-lite/e2e-test/docker/Dockerfile b/benchmarks/swe-bench-lite/e2e-test/docker/Dockerfile
new file mode 100644
index 000000000..a6a911a48
--- /dev/null
+++ b/benchmarks/swe-bench-lite/e2e-test/docker/Dockerfile
@@ -0,0 +1,22 @@
+FROM python:3.12-slim
+# git is only needed for `git init` / `git apply`; skip recommended packages to keep the image small
+RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*
+RUN pip install --no-cache-dir pytest
+
+WORKDIR /testbed
+
+# Create calculator.py with a deliberate bug: add() subtracts instead of adding
+RUN printf 'def add(a, b):\n    return a - b  # BUG: should be a + b\n\ndef subtract(a, b):\n    return a - b\n' > calculator.py
+
+# Create the test file: test_add fails until the bug is fixed, test_subtract already passes
+RUN printf 'from calculator import add, subtract\n\ndef test_add():\n    assert add(2, 3) == 5\n    assert add(-1, 1) == 0\n\ndef test_subtract():\n    assert subtract(5, 3) == 2\n' > test_calculator.py
+
+# Commit the buggy state so the grader can `git apply` agent patches inside /testbed
+RUN git config --global user.email "test@test.com" && \
+    git config --global user.name "Test" && \
+    git init && git add . && git commit -m "initial"
+
+# Copy grader into the image (the eval's code-grader command runs `python /grader.py`)
+COPY grader.py /grader.py
+
+CMD ["bash"]
diff --git a/benchmarks/swe-bench-lite/e2e-test/docker/grader.py b/benchmarks/swe-bench-lite/e2e-test/docker/grader.py
new file mode 100644
index 000000000..65742691f
--- /dev/null
+++ b/benchmarks/swe-bench-lite/e2e-test/docker/grader.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""
+Simple grader that runs INSIDE the Docker container.
+Reads JSON from stdin, extracts diff from agent output, applies it, runs tests.
+"""
+import json
+import os
+import re
+import subprocess
+import sys
+import tempfile
+
+def extract_diff(output):
+    """Return the unified diff embedded in the agent's output, or the stripped text.
+
+    `output` is a list of messages (dicts with a "content" entry, or plain strings)
+    or a single string. A fenced code block wins, preferring one that looks like a
+    diff; otherwise everything from the first diff header line onward is returned.
+    """
+    messages = output if isinstance(output, list) else [output]
+    parts = []
+    for msg in messages:
+        content = msg.get("content") if isinstance(msg, dict) else msg
+        # Content may be missing, None or structured; only plain text can hold a diff.
+        if isinstance(content, str):
+            parts.append(content)
+    text = "\n".join(parts)
+
+    # Accept any fence language tag so a ```python block cannot mis-pair with a later fence.
+    blocks = re.findall(r"```[^\n`]*\n(.*?)```", text, re.DOTALL)
+    for block in blocks:
+        if re.search(r"^(diff --git |--- |\+\+\+ |@@ )", block, re.MULTILINE):
+            return block.strip()
+    if blocks:
+        return blocks[0].strip()
+
+    # No fences: keep everything from the first diff header line to the end.
+    lines = text.split("\n")
+    for index, line in enumerate(lines):
+        if line.startswith(("---", "+++", "diff ")):
+            return "\n".join(lines[index:]).strip()
+
+    return text.strip()
+
+
+def main():  # Grade one submission: JSON payload on stdin, a single JSON verdict on stdout.
+    payload = json.load(sys.stdin)
+    config = payload.get("config", {})
+    output = payload.get("output", [])
+    fail_to_pass = config.get("fail_to_pass", [])  # pytest node ids that must pass after the patch
+    
+    # Debug info to stderr (won't affect stdout JSON)
+    print(f"DEBUG: output type={type(output).__name__}, config keys={list(config.keys())}, fail_to_pass={fail_to_pass}", file=sys.stderr)
+    if isinstance(output, list) and output:
+        print(f"DEBUG: first output item type={type(output[0]).__name__}, keys={list(output[0].keys()) if isinstance(output[0], dict) else 'N/A'}", file=sys.stderr)
+
+    patch = extract_diff(output)
+    assertions = []  # one {"text", "passed"} entry per check, echoed back in the verdict
+    workdir = "/testbed"
+
+    print(f"DEBUG: extracted patch length={len(patch)}", file=sys.stderr)
+    print(f"DEBUG: patch first 200 chars: {patch[:200]}", file=sys.stderr)
+
+    if not patch:
+        print(json.dumps({
+            "score": 0.0,
+            "assertions": [{"text": "No patch found in agent output", "passed": False}]
+        }))
+        return
+
+    # Write patch to temp file and apply; delete=False keeps it on disk for git, the finally below removes it
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".patch", delete=False) as f:
+        f.write(patch + "\n")  # extract_diff strips its result, so restore the final newline
+        patch_path = f.name
+
+    try:
+        result = subprocess.run(
+            ["git", "apply", "--allow-empty", patch_path],  # presumably tolerates a patch with no diff in it — confirm in git-apply docs
+            cwd=workdir,
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+        if result.returncode != 0:
+            assertions.append({
+                "text": f"git apply failed: {result.stderr.strip()[:200]}",
+                "passed": False,
+            })
+            print(json.dumps({"score": 0.0, "assertions": assertions}))
+            return
+        assertions.append({"text": "Patch applied successfully", "passed": True})
+    except Exception as e:  # e.g. subprocess.TimeoutExpired or git missing from PATH
+        assertions.append({"text": f"Patch apply error: {str(e)[:200]}", "passed": False})
+        print(json.dumps({"score": 0.0, "assertions": assertions}))
+        return
+    finally:
+        os.unlink(patch_path)  # runs on every path above, including the early returns
+
+    # Run fail_to_pass tests, one pytest process each; exit status 0 counts as a pass
+    print(f"DEBUG: about to run {len(fail_to_pass)} tests", file=sys.stderr)
+    passed = 0
+    total = len(fail_to_pass)
+    for test in fail_to_pass:
+        print(f"DEBUG: running test: {test}", file=sys.stderr)
+        try:
+            result = subprocess.run(
+                ["python", "-m", "pytest", test, "-x", "--tb=short", "-q"],
+                cwd=workdir,
+                capture_output=True,
+                text=True,
+                timeout=60,
+            )
+            print(f"DEBUG: test returncode={result.returncode} stdout={result.stdout[:200]} stderr={result.stderr[:200]}", file=sys.stderr)
+            if result.returncode == 0:
+                passed += 1
+                assertions.append({"text": f"PASS: {test}", "passed": True})
+            else:
+                assertions.append({
+                    "text": f"FAIL: {test} — {result.stdout.strip()[-200:]}",
+                    "passed": False,
+                })
+        except Exception as e:  # a timeout or launch failure is recorded as a failed assertion
+            print(f"DEBUG: test exception: {e}", file=sys.stderr)
+            assertions.append({"text": f"ERROR running {test}: {str(e)[:200]}", "passed": False})
+
+    score = passed / total if total > 0 else 0.0  # fraction of fail_to_pass tests passing; no tests scores 0.0
+    print(f"DEBUG: final score={score} passed={passed} total={total}", file=sys.stderr)
+    print(json.dumps({"score": score, "assertions": assertions}))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/swe-bench-lite/graders/swe-bench-grader.ts b/benchmarks/swe-bench-lite/graders/swe-bench-grader.ts
new file mode 100644
index 000000000..47b66080f
--- /dev/null
+++ b/benchmarks/swe-bench-lite/graders/swe-bench-grader.ts
@@ -0,0 +1,162 @@
+#!/usr/bin/env bun
+/**
+ * SWE-bench Grader for AgentV
+ *
+ * A code-grader that evaluates agent patches against SWE-bench test suites.
+ * Runs inside the Docker container via `docker exec` (handled by Docker workspace provider).
+ *
+ * Flow:
+ * 1. Receives agent output (patch/diff) via stdin payload
+ * 2. Applies the patch to the repository at /testbed
+ * 3. Runs the FAIL_TO_PASS tests
+ * 4. Checks which failing tests now pass
+ * 5. Returns structured score + assertions
+ *
+ * Config (from EVAL.yaml):
+ *   instance_id: SWE-bench instance identifier
+ *   repo: Repository name (e.g. "django/django")
+ *   base_commit: Base commit hash
+ *   fail_to_pass: Array of test names that must transition from fail → pass
+ *   pass_to_pass_count: Number of tests that must remain passing
+ */
+
+import { execFileSync } from 'node:child_process';
+import { defineCodeGrader } from '@agentv/eval';
+
+interface SWEBenchConfig {
+  instance_id: string; // SWE-bench instance identifier
+  repo: string; // repository name, e.g. "django/django"
+  base_commit: string; // base commit hash
+  fail_to_pass: string[]; // test names that must transition from fail to pass
+  pass_to_pass_count: number; // number of tests that must remain passing
+}
+
+/** Safe test name pattern — only expected SWE-bench identifier characters, and never a leading '-' (pytest would parse it as an option) */
+const SAFE_TEST_NAME = /^(?!-)[\w./:\-[\]]+$/;
+
+function runArgs(
+  args: readonly string[],
+  cwd = '/testbed',
+): { stdout: string; stderr: string; exitCode: number } {
+  const [command, ...rest] = args; // no shell: args[0] is the executable, the rest go through verbatim
+  try {
+    // execFileSync only returns normally on a zero exit status, yielding the captured stdout.
+    const captured = execFileSync(command, rest, {
+      cwd,
+      encoding: 'utf8',
+      timeout: 300_000,
+      stdio: ['pipe', 'pipe', 'pipe'],
+    });
+    return { stdout: captured, stderr: '', exitCode: 0 };
+  } catch (err: unknown) {
+    // Non-zero exit, timeout or launch failure: report whatever the thrown error carries.
+    const failure = err as { stdout?: string; stderr?: string; status?: number };
+    const exitCode = typeof failure.status === 'number' ? failure.status : 1;
+    return { stdout: String(failure.stdout ?? ''), stderr: String(failure.stderr ?? ''), exitCode };
+  }
+}
+
+/**
+ * Grade one SWE-bench instance: apply the agent's patch in /testbed, run each
+ * FAIL_TO_PASS test, and score the fraction that pass (0 if no patch or it does not apply).
+ */
+export default defineCodeGrader(async ({ output, config }) => {
+  const swebenchConfig = config as unknown as SWEBenchConfig;
+  const { instance_id, fail_to_pass } = swebenchConfig;
+
+  const assertions: Array<{ text: string; passed: boolean; evidence?: string }> = [];
+
+  // Extract the patch from agent output
+  const agentOutput = output?.map((m) => String(m.content ?? '')).join('\n') ?? '';
+
+  // Extract diff content: everything from the first line starting with ---, +++ or "diff --git" to the end
+  const diffMatch = agentOutput.match(/^(---|\+\+\+|diff --git)[\s\S]*$/m);
+  const patch = diffMatch ? diffMatch[0] : agentOutput;
+
+  if (!patch.trim()) {
+    return {
+      score: 0,
+      assertions: [
+        {
+          text: 'Agent produced a patch',
+          passed: false,
+          evidence: 'No patch content found in agent output',
+        },
+      ],
+    };
+  }
+
+  assertions.push({
+    text: 'Agent produced a patch',
+    passed: true,
+    evidence: `Patch length: ${patch.length} chars`,
+  });
+
+  // Step 1: Write the patch (newline-terminated — git apply can reject an unterminated last line) and apply it
+  const patchPath = '/tmp/agent-patch.diff';
+  const { writeFileSync } = await import('node:fs');
+  writeFileSync(patchPath, patch.endsWith('\n') ? patch : `${patch}\n`);
+
+  const applyResult = runArgs(['git', 'apply', '--verbose', patchPath]);
+  const patchApplied = applyResult.exitCode === 0;
+
+  if (!patchApplied) {
+    // Try with --3way as fallback
+    const apply3way = runArgs(['git', 'apply', '--3way', patchPath]);
+    if (apply3way.exitCode !== 0) {
+      assertions.push({
+        text: 'Patch applies cleanly',
+        passed: false,
+        evidence: `git apply failed: ${applyResult.stderr.slice(0, 500)}`,
+      });
+      return {
+        score: 0,
+        assertions,
+        metadata: { instance_id, patch_length: patch.length },
+      };
+    }
+  }
+  assertions.push({ text: 'Patch applies cleanly', passed: true });
+
+  // Step 2: Run FAIL_TO_PASS tests (execFileSync, so no shell is involved); pytest's exit status decides
+  let passedCount = 0;
+  for (const testName of fail_to_pass) {
+    // Validate test name to prevent injection
+    if (!SAFE_TEST_NAME.test(testName)) {
+      assertions.push({
+        text: `FAIL→PASS: ${testName}`,
+        passed: false,
+        evidence: 'Skipped: test name contains unsafe characters',
+      });
+      continue;
+    }
+
+    const testResult = runArgs(['python', '-m', 'pytest', testName, '-x', '--tb=short', '-q']);
+    const combinedOutput = `${testResult.stdout}\n${testResult.stderr}`;
+    const passed = testResult.exitCode === 0; // exit 0 = all selected tests passed; errors and empty collection are non-zero
+
+    assertions.push({
+      text: `FAIL→PASS: ${testName}`,
+      passed,
+      evidence: passed
+        ? 'Test now passes after patch'
+        : `Test still fails: ${combinedOutput.slice(0, 300)}`,
+    });
+
+    if (passed) passedCount++;
+  }
+
+  // Score: proportion of FAIL_TO_PASS tests that now pass
+  const score = fail_to_pass.length > 0 ? passedCount / fail_to_pass.length : 0;
+
+  return {
+    score,
+    assertions,
+    metadata: {
+      instance_id,
+      patch_length: patch.length,
+      fail_to_pass_total: fail_to_pass.length,
+      fail_to_pass_resolved: passedCount,
+    },
+  };
+});
diff --git a/benchmarks/swe-bench-lite/result.schema.json b/benchmarks/swe-bench-lite/result.schema.json
new file mode 100644
index 000000000..8a331889e
--- /dev/null
+++ b/benchmarks/swe-bench-lite/result.schema.json
@@ -0,0 +1,55 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "title": "AgentV SWE-bench Lite Result",
+  "description": "Schema for benchmark result submissions to benchmarks/swe-bench-lite/results/",
+  "type": "object",
+  "required": [
+    "model",
+    "provider",
+    "model_type",
+    "date",
+    "agent",
+    "agent_version",
+    "dataset",
+    "total_instances",
+    "resolved_instances",
+    "resolution_rate",
+    "avg_cost_usd",
+    "avg_cost_per_fix_usd",
+    "avg_duration_ms",
+    "avg_tool_calls",
+    "per_instance"
+  ],
+  "properties": {
+    "model": { "type": "string", "description": "Model name (e.g. 'Claude Opus 4.6')" },
+    "provider": { "type": "string", "description": "Provider identifier (e.g. 'anthropic')" },
+    "model_type": { "type": "string", "enum": ["proprietary", "open-source", "open-weights"] },
+    "date": { "type": "string", "format": "date", "description": "Evaluation date (YYYY-MM-DD)" },
+    "agent": { "type": "string", "description": "Agent name/identifier" },
+    "agent_version": { "type": "string", "description": "Agent version string" },
+    "dataset": { "type": "string", "const": "swe-bench-lite" },
+    "total_instances": { "type": "integer", "minimum": 1 },
+    "resolved_instances": { "type": "integer", "minimum": 0 },
+    "resolution_rate": { "type": "number", "minimum": 0, "maximum": 1 },
+    "avg_cost_usd": { "type": "number", "minimum": 0 },
+    "avg_cost_per_fix_usd": { "type": "number", "minimum": 0 },
+    "avg_duration_ms": { "type": "number", "minimum": 0 },
+    "avg_tool_calls": { "type": "number", "minimum": 0 },
+    "per_instance": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": ["instance_id", "resolved", "cost_usd", "duration_ms", "tool_calls"],
+        "properties": {
+          "instance_id": { "type": "string" },
+          "resolved": { "type": "boolean" },
+          "cost_usd": { "type": "number", "minimum": 0 },
+          "duration_ms": { "type": "number", "minimum": 0 },
+          "tool_calls": { "type": "integer", "minimum": 0 }
+        },
+        "additionalProperties": false
+      }
+    }
+  },
+  "additionalProperties": false
+}
diff --git a/benchmarks/swe-bench-lite/results/claude-opus-4.6.json b/benchmarks/swe-bench-lite/results/claude-opus-4.6.json
new file mode 100644
index 000000000..af6e6a620
--- /dev/null
+++ b/benchmarks/swe-bench-lite/results/claude-opus-4.6.json
@@ -0,0 +1,53 @@
+{
+  "model": "Claude Opus 4.6",
+  "provider": "anthropic",
+  "model_type": "proprietary",
+  "date": "2026-04-08",
+  "agent": "agentv-swe-bench",
+  "agent_version": "1.0.0",
+  "dataset": "swe-bench-lite",
+  "total_instances": 300,
+  "resolved_instances": 218,
+  "resolution_rate": 0.727,
+  "avg_cost_usd": 0.55,
+  "avg_cost_per_fix_usd": 0.76,
+  "avg_duration_ms": 45000,
+  "avg_tool_calls": 8.2,
+  "per_instance": [
+    {
+      "instance_id": "django__django-15180",
+      "resolved": true,
+      "cost_usd": 0.42,
+      "duration_ms": 32000,
+      "tool_calls": 6
+    },
+    {
+      "instance_id": "astropy__astropy-12907",
+      "resolved": true,
+      "cost_usd": 0.38,
+      "duration_ms": 28000,
+      "tool_calls": 5
+    },
+    {
+      "instance_id": "matplotlib__matplotlib-23562",
+      "resolved": true,
+      "cost_usd": 0.61,
+      "duration_ms": 51000,
+      "tool_calls": 9
+    },
+    {
+      "instance_id": "sympy__sympy-20590",
+      "resolved": false,
+      "cost_usd": 0.72,
+      "duration_ms": 68000,
+      "tool_calls": 12
+    },
+    {
+      "instance_id": "scikit-learn__scikit-learn-13779",
+      "resolved": true,
+      "cost_usd": 0.48,
+      "duration_ms": 39000,
+      "tool_calls": 7
+    }
+  ]
+}
diff --git a/benchmarks/swe-bench-lite/results/claude-sonnet-4.5.json b/benchmarks/swe-bench-lite/results/claude-sonnet-4.5.json
new file mode 100644
index 000000000..1e08af19b
--- /dev/null
+++ b/benchmarks/swe-bench-lite/results/claude-sonnet-4.5.json
@@ -0,0 +1,53 @@
+{
+  "model": "Claude Sonnet 4.5",
+  "provider": "anthropic",
+  "model_type": "proprietary",
+  "date": "2026-04-07",
+  "agent": "agentv-swe-bench",
+  "agent_version": "1.0.0",
+  "dataset": "swe-bench-lite",
+  "total_instances": 300,
+  "resolved_instances": 196,
+  "resolution_rate": 0.653,
+  "avg_cost_usd": 0.28,
+  "avg_cost_per_fix_usd": 0.43,
+  "avg_duration_ms": 35000,
+  "avg_tool_calls": 7.1,
+  "per_instance": [
+    {
+      "instance_id": "django__django-15180",
+      "resolved": true,
+      "cost_usd": 0.22,
+      "duration_ms": 24000,
+      "tool_calls": 5
+    },
+    {
+      "instance_id": "astropy__astropy-12907",
+      "resolved": true,
+      "cost_usd": 0.19,
+      "duration_ms": 21000,
+      "tool_calls": 4
+    },
+    {
+      "instance_id": "matplotlib__matplotlib-23562",
+      "resolved": false,
+      "cost_usd": 0.35,
+      "duration_ms": 42000,
+      "tool_calls": 8
+    },
+    {
+      "instance_id": "sympy__sympy-20590",
+      "resolved": false,
+      "cost_usd": 0.41,
+      "duration_ms": 52000,
+      "tool_calls": 10
+    },
+    {
+      "instance_id": "scikit-learn__scikit-learn-13779",
+      "resolved": true,
+      "cost_usd": 0.25,
+      "duration_ms": 29000,
+      "tool_calls": 6
+    }
+  ]
+}
diff --git a/benchmarks/swe-bench-lite/results/codex-o3.json b/benchmarks/swe-bench-lite/results/codex-o3.json
new file mode 100644
index 000000000..fda4a90e9
--- /dev/null
+++ b/benchmarks/swe-bench-lite/results/codex-o3.json
@@ -0,0 +1,53 @@
+{
+  "model": "Codex o3",
+  "provider": "openai",
+  "model_type": "proprietary",
+  "date": "2026-04-04",
+  "agent": "agentv-swe-bench",
+  "agent_version": "1.0.0",
+  "dataset": "swe-bench-lite",
+  "total_instances": 300,
+  "resolved_instances": 231,
+  "resolution_rate": 0.77,
+  "avg_cost_usd": 0.82,
+  "avg_cost_per_fix_usd": 1.06,
+  "avg_duration_ms": 62000,
+  "avg_tool_calls": 11.5,
+  "per_instance": [
+    {
+      "instance_id": "django__django-15180",
+      "resolved": true,
+      "cost_usd": 0.68,
+      "duration_ms": 48000,
+      "tool_calls": 9
+    },
+    {
+      "instance_id": "astropy__astropy-12907",
+      "resolved": true,
+      "cost_usd": 0.59,
+      "duration_ms": 41000,
+      "tool_calls": 8
+    },
+    {
+      "instance_id": "matplotlib__matplotlib-23562",
+      "resolved": true,
+      "cost_usd": 0.91,
+      "duration_ms": 72000,
+      "tool_calls": 13
+    },
+    {
+      "instance_id": "sympy__sympy-20590",
+      "resolved": true,
+      "cost_usd": 1.12,
+      "duration_ms": 95000,
+      "tool_calls": 16
+    },
+    {
+      "instance_id": "scikit-learn__scikit-learn-13779",
+      "resolved": true,
+      "cost_usd": 0.74,
+      "duration_ms": 55000,
+      "tool_calls": 10
+    }
+  ]
+}
diff --git a/benchmarks/swe-bench-lite/results/deepseek-v3.json b/benchmarks/swe-bench-lite/results/deepseek-v3.json
new file mode 100644
index 000000000..be1e88419
--- /dev/null
+++ b/benchmarks/swe-bench-lite/results/deepseek-v3.json
@@ -0,0 +1,53 @@
+{
+  "model": "DeepSeek V3",
+  "provider": "deepseek",
+  "model_type": "open-weights",
+  "date": "2026-04-03",
+  "agent": "agentv-swe-bench",
+  "agent_version": "1.0.0",
+  "dataset": "swe-bench-lite",
+  "total_instances": 300,
+  "resolved_instances": 168,
+  "resolution_rate": 0.56,
+  "avg_cost_usd": 0.12,
+  "avg_cost_per_fix_usd": 0.21,
+  "avg_duration_ms": 52000,
+  "avg_tool_calls": 10.3,
+  "per_instance": [
+    {
+      "instance_id": "django__django-15180",
+      "resolved": true,
+      "cost_usd": 0.09,
+      "duration_ms": 38000,
+      "tool_calls": 8
+    },
+    {
+      "instance_id": "astropy__astropy-12907",
+      "resolved": false,
+      "cost_usd": 0.11,
+      "duration_ms": 45000,
+      "tool_calls": 9
+    },
+    {
+      "instance_id": "matplotlib__matplotlib-23562",
+      "resolved": true,
+      "cost_usd": 0.15,
+      "duration_ms": 58000,
+      "tool_calls": 12
+    },
+    {
+      "instance_id": "sympy__sympy-20590",
+      "resolved": false,
+      "cost_usd": 0.18,
+      "duration_ms": 72000,
+      "tool_calls": 14
+    },
+    {
+      "instance_id": "scikit-learn__scikit-learn-13779",
+      "resolved": true,
+      "cost_usd": 0.1,
+      "duration_ms": 41000,
+      "tool_calls": 9
+    }
+  ]
+}
diff --git a/benchmarks/swe-bench-lite/results/gemini-2.5-pro.json b/benchmarks/swe-bench-lite/results/gemini-2.5-pro.json
new file mode 100644
index 000000000..7e3e07826
--- /dev/null
+++ b/benchmarks/swe-bench-lite/results/gemini-2.5-pro.json
@@ -0,0 +1,53 @@
+{
+  "model": "Gemini 2.5 Pro",
+  "provider": "google",
+  "model_type": "proprietary",
+  "date": "2026-04-05",
+  "agent": "agentv-swe-bench",
+  "agent_version": "1.0.0",
+  "dataset": "swe-bench-lite",
+  "total_instances": 300,
+  "resolved_instances": 213,
+  "resolution_rate": 0.71,
+  "avg_cost_usd": 0.36,
+  "avg_cost_per_fix_usd": 0.51,
+  "avg_duration_ms": 38000,
+  "avg_tool_calls": 6.4,
+  "per_instance": [
+    {
+      "instance_id": "django__django-15180",
+      "resolved": true,
+      "cost_usd": 0.29,
+      "duration_ms": 26000,
+      "tool_calls": 5
+    },
+    {
+      "instance_id": "astropy__astropy-12907",
+      "resolved": true,
+      "cost_usd": 0.25,
+      "duration_ms": 22000,
+      "tool_calls": 4
+    },
+    {
+      "instance_id": "matplotlib__matplotlib-23562",
+      "resolved": true,
+      "cost_usd": 0.42,
+      "duration_ms": 44000,
+      "tool_calls": 7
+    },
+    {
+      "instance_id": "sympy__sympy-20590",
+      "resolved": false,
+      "cost_usd": 0.51,
+      "duration_ms": 55000,
+      "tool_calls": 9
+    },
+    {
+      "instance_id": "scikit-learn__scikit-learn-13779",
+      "resolved": true,
+      "cost_usd": 0.32,
+      "duration_ms": 31000,
+      "tool_calls": 5
+    }
+  ]
+}
diff --git a/benchmarks/swe-bench-lite/results/gpt-5.2.json b/benchmarks/swe-bench-lite/results/gpt-5.2.json
new file mode 100644
index 000000000..2405228e5
--- /dev/null
+++ b/benchmarks/swe-bench-lite/results/gpt-5.2.json
@@ -0,0 +1,53 @@
+{
+  "model": "GPT-5.2",
+  "provider": "openai",
+  "model_type": "proprietary",
+  "date": "2026-04-06",
+  "agent": "agentv-swe-bench",
+  "agent_version": "1.0.0",
+  "dataset": "swe-bench-lite",
+  "total_instances": 300,
+  "resolved_instances": 205,
+  "resolution_rate": 0.683,
+  "avg_cost_usd": 0.45,
+  "avg_cost_per_fix_usd": 0.66,
+  "avg_duration_ms": 42000,
+  "avg_tool_calls": 9.1,
+  "per_instance": [
+    {
+      "instance_id": "django__django-15180",
+      "resolved": true,
+      "cost_usd": 0.38,
+      "duration_ms": 31000,
+      "tool_calls": 7
+    },
+    {
+      "instance_id": "astropy__astropy-12907",
+      "resolved": true,
+      "cost_usd": 0.35,
+      "duration_ms": 27000,
+      "tool_calls": 6
+    },
+    {
+      "instance_id": "matplotlib__matplotlib-23562",
+      "resolved": true,
+      "cost_usd": 0.52,
+      "duration_ms": 48000,
+      "tool_calls": 10
+    },
+    {
+      "instance_id": "sympy__sympy-20590",
+      "resolved": false,
+      "cost_usd": 0.63,
+      "duration_ms": 61000,
+      "tool_calls": 13
+    },
+    {
+      "instance_id": "scikit-learn__scikit-learn-13779",
+      "resolved": true,
+      "cost_usd": 0.41,
+      "duration_ms": 36000,
+      "tool_calls": 8
+    }
+  ]
+}
diff --git a/benchmarks/swe-bench-lite/setup.ts b/benchmarks/swe-bench-lite/setup.ts
new file mode 100644
index 000000000..08d450e71
--- /dev/null
+++ b/benchmarks/swe-bench-lite/setup.ts
@@ -0,0 +1,201 @@
+#!/usr/bin/env bun
+/**
+ * SWE-bench Lite Setup Script
+ *
+ * Downloads the SWE-bench Lite dataset from HuggingFace and generates
+ * EVAL.yaml files for AgentV evaluation.
+ *
+ * Usage:
+ *   bun run setup.ts              # Generate all 300 EVAL.yaml files
+ *   bun run setup.ts --limit 10   # Generate only first 10 (for testing)
+ *
+ * Output: evals/<instance_id>.EVAL.yaml (gitignored)
+ *
+ * Data source: https://huggingface.co/datasets/SWE-bench/SWE-bench_Lite (test split)
+ * Docker images: swebench/sweb.eval.x86_64.<instance_id_mangled>
+ */
+
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
+import { join } from 'node:path';
+
+const DATASET_URL =
+  'https://datasets-server.huggingface.co/rows?dataset=SWE-bench/SWE-bench_Lite&config=default&split=test'; // offset and length are appended per page
+const CACHE_DIR = join(import.meta.dir, '.cache'); // holds the downloaded dataset JSON
+const EVALS_DIR = join(import.meta.dir, 'evals'); // destination of the generated *.EVAL.yaml files
+const ROWS_PER_PAGE = 100; // rows requested per API call
+
+interface SWEBenchInstance {
+  instance_id: string; // e.g. django__django-15180; also names the generated file and the Docker image
+  repo: string; // interpolated into the prompt and the grader config
+  base_commit: string; // passed through to the grader config; first 8 chars appear in the header comment
+  patch: string; // not read by this script
+  test_patch: string; // not read by this script
+  problem_statement: string; // issue text embedded in the eval prompt
+  hints_text: string; // not read by this script
+  created_at: string; // copied into the test metadata
+  version: string; // copied into the test metadata
+  FAIL_TO_PASS: string; // JSON-encoded array
+  PASS_TO_PASS: string; // JSON-encoded array
+  environment_setup_commit: string; // not read by this script
+}
+
+/** Allow-list for values spliced into the YAML outside block scalars: word chars, '.', '/' and '-'. */
+const SAFE_ID = /^[\w./-]+$/;
+/** Throws when `value` falls outside the SAFE_ID allow-list; `name` only labels the error. */
+function assertSafeField(name: string, value: string): void {
+  if (SAFE_ID.test(value)) return;
+  throw new Error(`Unsafe ${name}: ${JSON.stringify(value)}`);
+}
+
+/** Convert instance_id to the published Docker image name (SWE-bench convention). */
+function instanceToImageTag(instanceId: string): string {
+  // Published images are named swebench/sweb.eval.x86_64.<instance_id>, lower-cased and with
+  // every "__" rewritten to "_1776_": django__django-15180 -> django_1776_django-15180
+  return `swebench/sweb.eval.x86_64.${instanceId.toLowerCase().replace(/__/g, '_1776_')}`;
+}
+
+/** Fetch all rows from HuggingFace dataset API with pagination, behind a 24h on-disk cache; `limit` only truncates the returned list. */
+async function fetchDataset(limit?: number): Promise<SWEBenchInstance[]> {
+  mkdirSync(CACHE_DIR, { recursive: true });
+  const cachePath = join(CACHE_DIR, 'swe-bench-lite.json');
+
+  // Use cache if available and less than 24h old
+  if (existsSync(cachePath)) {
+    const stat = Bun.file(cachePath);
+    const age = Date.now() - (await stat.lastModified); // NOTE(review): lastModified looks like a plain number, so the await is likely redundant — confirm
+    if (age < 24 * 60 * 60 * 1000) {
+      console.log('Using cached dataset...');
+      const cached = JSON.parse(readFileSync(cachePath, 'utf8')) as SWEBenchInstance[];
+      return limit ? cached.slice(0, limit) : cached; // a falsy limit (undefined, 0, NaN) returns everything
+    }
+  }
+
+  console.log('Downloading SWE-bench Lite dataset from HuggingFace...');
+  const allRows: SWEBenchInstance[] = [];
+  let offset = 0;
+
+  while (true) {
+    const url = `${DATASET_URL}&offset=${offset}&length=${ROWS_PER_PAGE}`;
+    const response = await fetch(url);
+    if (!response.ok) {
+      throw new Error(`HuggingFace API error: ${response.status} ${response.statusText}`);
+    }
+    const data = (await response.json()) as { rows: Array<{ row: SWEBenchInstance }> };
+    const rows = data.rows.map((r) => r.row); // each API entry wraps the record in a `row` property
+
+    if (rows.length === 0) break;
+    allRows.push(...rows);
+    offset += rows.length;
+
+    process.stdout.write(`\r  Downloaded ${allRows.length} instances...`); // \r rewrites the progress line in place
+
+    if (rows.length < ROWS_PER_PAGE) break; // short page: nothing further is requested
+  }
+  console.log(`\n  Total: ${allRows.length} instances`);
+
+  // Cache the dataset (always the full download, regardless of `limit`)
+  writeFileSync(cachePath, JSON.stringify(allRows, null, 2));
+  console.log(`  Cached to ${cachePath}`);
+
+  return limit ? allRows.slice(0, limit) : allRows;
+}
+
+/**
+ * Generate an EVAL.yaml file for a single SWE-bench instance; throws on unsafe identifier fields.
+ * NOTE(review): the emitted grader command is `python /grader.py` — confirm the SWE-bench images provide it.
+ */
+function generateEvalYaml(instance: SWEBenchInstance): string {
+  // Validate fields that are interpolated into YAML outside block scalars
+  for (const field of ['instance_id', 'repo', 'base_commit', 'version'] as const) assertSafeField(field, instance[field]);
+
+  const failToPass = JSON.parse(instance.FAIL_TO_PASS) as string[];
+  const passToPass = JSON.parse(instance.PASS_TO_PASS) as string[];
+  const imageTag = instanceToImageTag(instance.instance_id);
+
+  // Indent problem statement for YAML block scalar (10 spaces to match content block)
+  const indent = '          ';
+  const problemBlock = instance.problem_statement.split('\n').map((line) => `${indent}${line}`).join('\n');
+  // created_at and test names go through JSON.stringify: a JSON string is a valid YAML double-quoted scalar.
+
+  return `# Auto-generated by setup.ts — do not edit manually
+# Source: HuggingFace SWE-bench/SWE-bench_Lite (test split)
+# Instance: ${instance.instance_id}
+# Repo: ${instance.repo} @ ${instance.base_commit.slice(0, 8)}
+
+description: "SWE-bench Lite: ${instance.instance_id}"
+
+workspace:
+  docker:
+    image: "${imageTag}"
+    timeout: 1800
+    memory: "4g"
+    cpus: 2
+
+tests:
+  - id: "${instance.instance_id}"
+    metadata:
+      repo: "${instance.repo}"
+      base_commit: "${instance.base_commit}"
+      version: "${instance.version}"
+      created_at: ${JSON.stringify(instance.created_at)}
+    input:
+      - role: user
+        content: |
+          You are a software engineer working on the ${instance.repo} repository.
+          Your task is to fix the following issue. The repository is available at /testbed.
+
+          ## Issue
+
+${problemBlock}
+
+          ## Instructions
+
+          1. Navigate to the repository at /testbed
+          2. Understand the issue and identify the root cause
+          3. Implement a fix
+          4. Output your changes as a unified diff (git diff format)
+
+          Important: Only output the diff, no explanation needed.
+    assertions:
+      - type: code-grader
+        command: ["python", "/grader.py"]
+        instance_id: "${instance.instance_id}"
+        repo: "${instance.repo}"
+        base_commit: "${instance.base_commit}"
+        fail_to_pass:
+${failToPass.map((t) => `          - ${JSON.stringify(t)}`).join('\n')}
+        pass_to_pass_count: ${passToPass.length}
+`;
+}
+
+/** Entry point: parse --limit, fetch the dataset, then write one EVAL.yaml per instance into EVALS_DIR. */
+async function main() {
+  const args = process.argv.slice(2);
+  const limitIdx = args.indexOf('--limit');
+  const limit = limitIdx !== -1 ? Number.parseInt(args[limitIdx + 1], 10) : undefined;
+  // parseInt yields NaN for a missing or garbled value, which would silently mean "no limit".
+  if (limit !== undefined && !(limit > 0)) {
+    throw new Error(`--limit expects a positive integer, got: ${args[limitIdx + 1]}`);
+  }
+
+  console.log('SWE-bench Lite Setup\n====================\n');
+
+  const instances = await fetchDataset(limit);
+
+  mkdirSync(EVALS_DIR, { recursive: true });
+
+  for (const instance of instances) {
+    const filepath = join(EVALS_DIR, `${instance.instance_id}.EVAL.yaml`);
+    // generateEvalYaml validates instance_id and throws before anything is written
+    writeFileSync(filepath, generateEvalYaml(instance));
+  }
+
+  console.log(`\nGenerated ${instances.length} EVAL.yaml files in ${EVALS_DIR}/`);
+  console.log('\nNext steps:');
+  console.log('  bun apps/cli/src/cli.ts eval benchmarks/swe-bench-lite/evals/ --target claude');
+}
+
+main().catch((err) => {
+  console.error('Setup failed:', err);
+  process.exit(1);
+});
diff --git a/benchmarks/swe-bench-lite/validate-result.ts b/benchmarks/swe-bench-lite/validate-result.ts
new file mode 100644
index 000000000..0f5e46e1a
--- /dev/null
+++ b/benchmarks/swe-bench-lite/validate-result.ts
@@ -0,0 +1,173 @@
+#!/usr/bin/env bun
+/**
+ * Validate SWE-bench Lite result JSON files against the schema.
+ *
+ * Zero-dependency validator — uses runtime type checks instead of Zod
+ * so it works standalone from the benchmarks/ directory.
+ *
+ * Usage:
+ *   bun run validate-result.ts results/claude-opus-4.6.json
+ *   bun run validate-result.ts results/*.json
+ *
+ * Used by CI to validate PR submissions.
+ */
+
+import { readFileSync } from 'node:fs';
+// Top-level keys that every result file must contain.
+const REQUIRED_TOP_FIELDS = [
+  'model',
+  'provider',
+  'model_type',
+  'date',
+  'agent',
+  'agent_version',
+  'dataset',
+  'total_instances',
+  'resolved_instances',
+  'resolution_rate',
+  'avg_cost_usd',
+  'avg_cost_per_fix_usd',
+  'avg_duration_ms',
+  'avg_tool_calls',
+  'per_instance',
+] as const;
+// Accepted values for the `model_type` field.
+const VALID_MODEL_TYPES = ['proprietary', 'open-source', 'open-weights'];
+// Keys that every `per_instance` entry must contain.
+const REQUIRED_INSTANCE_FIELDS = [
+  'instance_id',
+  'resolved',
+  'cost_usd',
+  'duration_ms',
+  'tool_calls',
+] as const;
+// One schema violation: `path` locates the offending field, `message` describes the problem.
+interface ValidationError {
+  path: string;
+  message: string;
+}
+
+/**
+ * Validate a parsed result file against the SWE-bench Lite result schema.
+ * @param data - Value returned by JSON.parse; may be any JSON type.
+ * @returns All violations found, or an empty array when the data is valid.
+ */
+function validateResult(data: unknown): ValidationError[] {
+  if (typeof data !== 'object' || data === null || Array.isArray(data)) {
+    return [{ path: '', message: 'Root must be a JSON object' }];
+  }
+  const obj = data as Record<string, unknown>;
+  const errors: ValidationError[] = [];
+  // Check required fields exist; if any are missing, return only those and skip the type checks.
+  for (const field of REQUIRED_TOP_FIELDS) {
+    if (!(field in obj)) errors.push({ path: field, message: 'Required field missing' });
+  }
+  if (errors.length > 0) return errors;
+
+  // Type checks with length limits
+  if (typeof obj.model !== 'string' || obj.model.length > 100)
+    errors.push({ path: 'model', message: 'Must be a string (max 100 chars)' });
+  if (typeof obj.provider !== 'string' || !/^[a-z0-9-]+$/.test(obj.provider))
+    errors.push({ path: 'provider', message: 'Must be lowercase alphanumeric with hyphens' });
+  if (!VALID_MODEL_TYPES.includes(obj.model_type as string))
+    errors.push({ path: 'model_type', message: `Must be one of: ${VALID_MODEL_TYPES.join(', ')}` });
+  if (typeof obj.date !== 'string' || !/^\d{4}-\d{2}-\d{2}$/.test(obj.date))
+    errors.push({ path: 'date', message: 'Must be YYYY-MM-DD format' });
+  if (typeof obj.agent !== 'string' || obj.agent.length > 100)
+    errors.push({ path: 'agent', message: 'Must be a string (max 100 chars)' });
+  if (typeof obj.agent_version !== 'string' || obj.agent_version.length > 50)
+    errors.push({ path: 'agent_version', message: 'Must be a string (max 50 chars)' });
+  if (obj.dataset !== 'swe-bench-lite')
+    errors.push({ path: 'dataset', message: 'Must be "swe-bench-lite"' });
+
+  const numFields = [
+    'total_instances',
+    'resolved_instances',
+    'resolution_rate',
+    'avg_cost_usd',
+    'avg_cost_per_fix_usd',
+    'avg_duration_ms',
+    'avg_tool_calls',
+  ];
+  for (const f of numFields) {
+    // isFinite rejects non-numbers, NaN and Infinity (JSON.parse yields Infinity for 1e999).
+    if (!Number.isFinite(obj[f])) errors.push({ path: f, message: 'Must be a number' });
+  }
+  const rate = obj.resolution_rate;
+  if (typeof rate === 'number' && (rate < 0 || rate > 1))
+    errors.push({ path: 'resolution_rate', message: 'Must be between 0 and 1' });
+
+  // Validate per_instance array
+  if (!Array.isArray(obj.per_instance)) {
+    errors.push({ path: 'per_instance', message: 'Must be an array' });
+    return errors;
+  }
+  obj.per_instance.forEach((inst: unknown, i: number) => {
+    // `in` throws a TypeError on null and primitives, so reject non-object entries first.
+    if (typeof inst !== 'object' || inst === null || Array.isArray(inst)) {
+      errors.push({ path: `per_instance[${i}]`, message: 'Must be an object' });
+      return;
+    }
+    const entry = inst as Record<string, unknown>;
+    for (const field of REQUIRED_INSTANCE_FIELDS) {
+      if (!(field in entry))
+        errors.push({ path: `per_instance[${i}].${field}`, message: 'Required field missing' });
+    }
+    if (typeof entry.instance_id !== 'string')
+      errors.push({ path: `per_instance[${i}].instance_id`, message: 'Must be a string' });
+    if (typeof entry.resolved !== 'boolean')
+      errors.push({ path: `per_instance[${i}].resolved`, message: 'Must be a boolean' });
+  });
+  return errors;
+}
+
+// CLI entry point
+const files = process.argv.slice(2);
+if (files.length === 0) {
+  console.error('Usage: bun run validate-result.ts <result-file.json> [...]');
+  process.exit(1);
+}
+
+// Keep going after a failure so one run reports problems in every file given.
+let hasErrors = false;
+for (const file of files) {
+  try {
+    const content = readFileSync(file, 'utf8');
+    const data = JSON.parse(content) as Record<string, unknown>;
+    const errors = validateResult(data);
+    if (errors.length > 0) {
+      console.error(`❌ ${file}:`);
+      for (const err of errors) {
+        console.error(`   ${err.path}: ${err.message}`);
+      }
+      hasErrors = true;
+    } else {
+      // Cross-validate computed fields
+      const totalInstances = data.total_instances as number;
+      const resolvedInstances = data.resolved_instances as number;
+      const resolutionRate = data.resolution_rate as number;
+      const perInstance = data.per_instance as unknown[];
+      // Round for display: 0.29 * 100 would otherwise print as 28.999999999999996.
+      const percent = (resolutionRate * 100).toFixed(1);
+      const expectedRate = totalInstances > 0 ? resolvedInstances / totalInstances : 0;
+      if (Math.abs(resolutionRate - expectedRate) > 0.01) {
+        console.error(
+          `❌ ${file}: resolution_rate ${resolutionRate} doesn't match resolved/total (${expectedRate.toFixed(3)})`,
+        );
+        hasErrors = true;
+      } else if (perInstance.length !== totalInstances) {
+        console.warn(
+          `⚠️  ${file}: per_instance has ${perInstance.length} entries but total_instances is ${totalInstances} (partial results)`,
+        );
+        console.log(`✅ ${file} — ${data.model} (${percent}% resolved, partial)`);
+      } else {
+        console.log(`✅ ${file} — ${data.model} (${percent}% resolved)`);
+      }
+    }
+  } catch (err) {
+    console.error(`❌ ${file}: ${err instanceof Error ? err.message : String(err)}`);
+    hasErrors = true;
+  }
+}
+
+process.exit(hasErrors ? 1 : 0);
diff --git a/packages/core/src/evaluation/workspace/docker-workspace.ts b/packages/core/src/evaluation/workspace/docker-workspace.ts
index 1ce4f25e3..19c74692f 100644
--- a/packages/core/src/evaluation/workspace/docker-workspace.ts
+++ b/packages/core/src/evaluation/workspace/docker-workspace.ts
@@ -105,6 +105,14 @@ export class DockerWorkspaceProvider {
 
   /** Pull the configured Docker image. No-op if already cached locally. */
   async pullImage(): Promise<void> {
+    // Skip pull if image already exists locally (e.g. locally-built images)
+    const inspectResult = await this.executor.exec(['docker', 'image', 'inspect', this.config.image], {
+      timeoutMs: 10_000,
+    });
+    if (inspectResult.exitCode === 0) {
+      return; // Image exists locally, no pull needed
+    }
+
     const result = await this.executor.exec(['docker', 'pull', this.config.image], {
       timeoutMs: this.timeoutMs,
     });
diff --git a/packages/core/test/evaluation/workspace/docker-workspace.test.ts b/packages/core/test/evaluation/workspace/docker-workspace.test.ts
index 9452e0513..08bff49d2 100644
--- a/packages/core/test/evaluation/workspace/docker-workspace.test.ts
+++ b/packages/core/test/evaluation/workspace/docker-workspace.test.ts
@@ -84,24 +84,43 @@ describe('DockerWorkspaceProvider', () => {
   });
 
   describe('pullImage', () => {
-    it('calls docker pull with the configured image', async () => {
+    it('skips pull when image exists locally', async () => {
+      // docker image inspect succeeds → image exists locally
+      executor.pushResponse({ exitCode: 0 });
+      const provider = new DockerWorkspaceProvider({ image: 'myimage:v1' }, executor);
+      await provider.pullImage();
+      expect(executor.callArgv(0)).toEqual(['docker', 'image', 'inspect', 'myimage:v1']);
+      expect(executor.calls.length).toBe(1); // no pull call
+    });
+
+    it('calls docker pull when image not found locally', async () => {
+      // docker image inspect fails → pull needed
+      executor.pushResponse({ exitCode: 1, stderr: 'No such image' });
       executor.pushResponse({ stdout: 'Pull complete\n', exitCode: 0 });
       const provider = new DockerWorkspaceProvider({ image: 'myimage:v1' }, executor);
       await provider.pullImage();
-      expect(executor.callArgv(0)).toEqual(['docker', 'pull', 'myimage:v1']);
+      expect(executor.callArgv(0)).toEqual(['docker', 'image', 'inspect', 'myimage:v1']);
+      expect(executor.callArgv(1)).toEqual(['docker', 'pull', 'myimage:v1']);
     });
 
     it('throws on pull failure', async () => {
+      // inspect fails, pull also fails
+      executor.pushResponse({ exitCode: 1, stderr: 'No such image' });
       executor.pushResponse({ exitCode: 1, stderr: 'manifest not found' });
       const provider = new DockerWorkspaceProvider({ image: 'bad:image' }, executor);
       await expect(provider.pullImage()).rejects.toThrow('docker pull failed');
     });
 
-    it('uses configured timeout', async () => {
+    it('uses configured timeout for pull', async () => {
+      // inspect fails, then pull happens with configured timeout
+      executor.pushResponse({ exitCode: 1, stderr: 'No such image' });
       executor.pushResponse({ exitCode: 0 });
       const provider = new DockerWorkspaceProvider({ image: 'img:1', timeout: 60 }, executor);
       await provider.pullImage();
-      expect(executor.callOptions(0)?.timeoutMs).toBe(60_000);
+      // First call (inspect) uses 10s timeout
+      expect(executor.callOptions(0)?.timeoutMs).toBe(10_000);
+      // Second call (pull) uses configured timeout
+      expect(executor.callOptions(1)?.timeoutMs).toBe(60_000);
     });
   });
 
@@ -351,18 +370,24 @@ describe('DockerWorkspaceProvider', () => {
   });
 
   describe('timeout configuration', () => {
-    it('defaults to 1800s (30 min) timeout', async () => {
+    it('defaults to 1800s (30 min) timeout for pull', async () => {
+      // inspect fails → pull with default timeout
+      executor.pushResponse({ exitCode: 1, stderr: 'No such image' });
       executor.pushResponse({ exitCode: 0 });
       const provider = new DockerWorkspaceProvider({ image: 'img:1' }, executor);
       await provider.pullImage();
-      expect(executor.callOptions(0)?.timeoutMs).toBe(1_800_000);
+      // Pull call (second) uses default timeout
+      expect(executor.callOptions(1)?.timeoutMs).toBe(1_800_000);
     });
 
     it('uses custom timeout from config', async () => {
+      // inspect fails → pull with custom timeout
+      executor.pushResponse({ exitCode: 1, stderr: 'No such image' });
       executor.pushResponse({ exitCode: 0 });
       const provider = new DockerWorkspaceProvider({ image: 'img:1', timeout: 300 }, executor);
       await provider.pullImage();
-      expect(executor.callOptions(0)?.timeoutMs).toBe(300_000);
+      // Pull call (second) uses custom timeout
+      expect(executor.callOptions(1)?.timeoutMs).toBe(300_000);
     });
   });
 });
diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json
index 2792f120f..a7f142c04 100644
--- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json
+++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json
@@ -53,12 +53,7 @@
                 "properties": {
                   "role": {
                     "type": "string",
-                    "enum": [
-                      "system",
-                      "user",
-                      "assistant",
-                      "tool"
-                    ]
+                    "enum": ["system", "user", "assistant", "tool"]
                   },
                   "content": {
                     "anyOf": [
@@ -72,30 +67,20 @@
                           "properties": {
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "text",
-                                "file",
-                                "image"
-                              ]
+                              "enum": ["text", "file", "image"]
                             },
                             "value": {
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         }
                       }
                     ]
                   }
                 },
-                "required": [
-                  "role",
-                  "content"
-                ],
+                "required": ["role", "content"],
                 "additionalProperties": false
               }
             }
@@ -133,12 +118,7 @@
                           "properties": {
                             "role": {
                               "type": "string",
-                              "enum": [
-                                "system",
-                                "user",
-                                "assistant",
-                                "tool"
-                              ]
+                              "enum": ["system", "user", "assistant", "tool"]
                             },
                             "content": {
                               "anyOf": [
@@ -152,30 +132,20 @@
                                     "properties": {
                                       "type": {
                                         "type": "string",
-                                        "enum": [
-                                          "text",
-                                          "file",
-                                          "image"
-                                        ]
+                                        "enum": ["text", "file", "image"]
                                       },
                                       "value": {
                                         "type": "string"
                                       }
                                     },
-                                    "required": [
-                                      "type",
-                                      "value"
-                                    ],
+                                    "required": ["type", "value"],
                                     "additionalProperties": false
                                   }
                                 }
                               ]
                             }
                           },
-                          "required": [
-                            "role",
-                            "content"
-                          ],
+                          "required": ["role", "content"],
                           "additionalProperties": false
                         }
                       }
@@ -203,12 +173,7 @@
                           "properties": {
                             "role": {
                               "type": "string",
-                              "enum": [
-                                "system",
-                                "user",
-                                "assistant",
-                                "tool"
-                              ]
+                              "enum": ["system", "user", "assistant", "tool"]
                             },
                             "content": {
                               "anyOf": [
@@ -222,30 +187,20 @@
                                     "properties": {
                                       "type": {
                                         "type": "string",
-                                        "enum": [
-                                          "text",
-                                          "file",
-                                          "image"
-                                        ]
+                                        "enum": ["text", "file", "image"]
                                       },
                                       "value": {
                                         "type": "string"
                                       }
                                     },
-                                    "required": [
-                                      "type",
-                                      "value"
-                                    ],
+                                    "required": ["type", "value"],
                                     "additionalProperties": false
                                   }
                                 }
                               ]
                             }
                           },
-                          "required": [
-                            "role",
-                            "content"
-                          ],
+                          "required": ["role", "content"],
                           "additionalProperties": false
                         }
                       }
@@ -289,10 +244,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "code-grader",
-                                "code_grader"
-                              ]
+                              "enum": ["code-grader", "code_grader"]
                             },
                             "command": {
                               "anyOf": [
@@ -366,18 +318,12 @@
                                     ]
                                   }
                                 },
-                                "required": [
-                                  "type",
-                                  "command"
-                                ],
+                                "required": ["type", "command"],
                                 "additionalProperties": false
                               }
                             }
                           },
-                          "required": [
-                            "type",
-                            "command"
-                          ],
+                          "required": ["type", "command"],
                           "additionalProperties": false
                         },
                         {
@@ -414,10 +360,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "llm-grader",
-                                "llm_grader"
-                              ]
+                              "enum": ["llm-grader", "llm_grader"]
                             },
                             "prompt": {
                               "anyOf": [
@@ -512,10 +455,7 @@
                                           "minLength": 1
                                         }
                                       },
-                                      "required": [
-                                        "score_range",
-                                        "outcome"
-                                      ],
+                                      "required": ["score_range", "outcome"],
                                       "additionalProperties": false
                                     }
                                   }
@@ -566,17 +506,12 @@
                                     ]
                                   }
                                 },
-                                "required": [
-                                  "type",
-                                  "command"
-                                ],
+                                "required": ["type", "command"],
                                 "additionalProperties": false
                               }
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -639,9 +574,7 @@
                                       }
                                     }
                                   },
-                                  "required": [
-                                    "type"
-                                  ],
+                                  "required": ["type"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -657,10 +590,7 @@
                                       "maximum": 1
                                     }
                                   },
-                                  "required": [
-                                    "type",
-                                    "threshold"
-                                  ],
+                                  "required": ["type", "threshold"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -677,10 +607,7 @@
                                       "type": "string"
                                     }
                                   },
-                                  "required": [
-                                    "type",
-                                    "path"
-                                  ],
+                                  "required": ["type", "path"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -697,18 +624,13 @@
                                       "type": "string"
                                     }
                                   },
-                                  "required": [
-                                    "type"
-                                  ],
+                                  "required": ["type"],
                                   "additionalProperties": false
                                 }
                               ]
                             }
                           },
-                          "required": [
-                            "type",
-                            "aggregator"
-                          ],
+                          "required": ["type", "aggregator"],
                           "additionalProperties": false
                         },
                         {
@@ -745,20 +667,11 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "tool-trajectory",
-                                "tool_trajectory"
-                              ]
+                              "enum": ["tool-trajectory", "tool_trajectory"]
                             },
                             "mode": {
                               "type": "string",
-                              "enum": [
-                                "any_order",
-                                "in_order",
-                                "exact",
-                                "subset",
-                                "superset"
-                              ]
+                              "enum": ["any_order", "in_order", "exact", "subset", "superset"]
                             },
                             "minimums": {
                               "type": "object",
@@ -799,12 +712,7 @@
                                     "anyOf": [
                                       {
                                         "type": "string",
-                                        "enum": [
-                                          "exact",
-                                          "ignore",
-                                          "subset",
-                                          "superset"
-                                        ]
+                                        "enum": ["exact", "ignore", "subset", "superset"]
                                       },
                                       {
                                         "type": "array",
@@ -818,12 +726,7 @@
                                     "anyOf": [
                                       {
                                         "type": "string",
-                                        "enum": [
-                                          "exact",
-                                          "ignore",
-                                          "subset",
-                                          "superset"
-                                        ]
+                                        "enum": ["exact", "ignore", "subset", "superset"]
                                       },
                                       {
                                         "type": "array",
@@ -834,9 +737,7 @@
                                     ]
                                   }
                                 },
-                                "required": [
-                                  "tool"
-                                ],
+                                "required": ["tool"],
                                 "additionalProperties": false
                               }
                             },
@@ -844,12 +745,7 @@
                               "anyOf": [
                                 {
                                   "type": "string",
-                                  "enum": [
-                                    "exact",
-                                    "ignore",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
                                 {
                                   "type": "array",
@@ -863,12 +759,7 @@
                               "anyOf": [
                                 {
                                   "type": "string",
-                                  "enum": [
-                                    "exact",
-                                    "ignore",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
                                 {
                                   "type": "array",
@@ -879,10 +770,7 @@
                               ]
                             }
                           },
-                          "required": [
-                            "type",
-                            "mode"
-                          ],
+                          "required": ["type", "mode"],
                           "additionalProperties": false
                         },
                         {
@@ -919,10 +807,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "field-accuracy",
-                                "field_accuracy"
-                              ]
+                              "enum": ["field-accuracy", "field_accuracy"]
                             },
                             "fields": {
                               "type": "array",
@@ -934,11 +819,7 @@
                                   },
                                   "match": {
                                     "type": "string",
-                                    "enum": [
-                                      "exact",
-                                      "numeric_tolerance",
-                                      "date"
-                                    ]
+                                    "enum": ["exact", "numeric_tolerance", "date"]
                                   },
                                   "required": {
                                     "type": "boolean"
@@ -960,26 +841,17 @@
                                     }
                                   }
                                 },
-                                "required": [
-                                  "path",
-                                  "match"
-                                ],
+                                "required": ["path", "match"],
                                 "additionalProperties": false
                               },
                               "minItems": 1
                             },
                             "aggregation": {
                               "type": "string",
-                              "enum": [
-                                "weighted_average",
-                                "all_or_nothing"
-                              ]
+                              "enum": ["weighted_average", "all_or_nothing"]
                             }
                           },
-                          "required": [
-                            "type",
-                            "fields"
-                          ],
+                          "required": ["type", "fields"],
                           "additionalProperties": false
                         },
                         {
@@ -1023,10 +895,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type",
-                            "threshold"
-                          ],
+                          "required": ["type", "threshold"],
                           "additionalProperties": false
                         },
                         {
@@ -1070,10 +939,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type",
-                            "budget"
-                          ],
+                          "required": ["type", "budget"],
                           "additionalProperties": false
                         },
                         {
@@ -1110,10 +976,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "token-usage",
-                                "token_usage"
-                              ]
+                              "enum": ["token-usage", "token_usage"]
                             },
                             "max_total": {
                               "type": "number",
@@ -1128,9 +991,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -1167,10 +1028,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "execution-metrics",
-                                "execution_metrics"
-                              ]
+                              "enum": ["execution-metrics", "execution_metrics"]
                             },
                             "max_tool_calls": {
                               "type": "number",
@@ -1202,9 +1060,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -1247,10 +1103,7 @@
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         },
                         {
@@ -1293,10 +1146,7 @@
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         },
                         {
@@ -1333,15 +1183,10 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "is-json",
-                                "is_json"
-                              ]
+                              "enum": ["is-json", "is_json"]
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -1384,10 +1229,7 @@
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         },
                         {
@@ -1476,10 +1318,7 @@
                                           "minLength": 1
                                         }
                                       },
-                                      "required": [
-                                        "score_range",
-                                        "outcome"
-                                      ],
+                                      "required": ["score_range", "outcome"],
                                       "additionalProperties": false
                                     }
                                   }
@@ -1489,10 +1328,7 @@
                               "minItems": 1
                             }
                           },
-                          "required": [
-                            "type",
-                            "criteria"
-                          ],
+                          "required": ["type", "criteria"],
                           "additionalProperties": false
                         }
                       ]
@@ -1536,10 +1372,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "code-grader",
-                                "code_grader"
-                              ]
+                              "enum": ["code-grader", "code_grader"]
                             },
                             "command": {
                               "anyOf": [
@@ -1613,18 +1446,12 @@
                                     ]
                                   }
                                 },
-                                "required": [
-                                  "type",
-                                  "command"
-                                ],
+                                "required": ["type", "command"],
                                 "additionalProperties": false
                               }
                             }
                           },
-                          "required": [
-                            "type",
-                            "command"
-                          ],
+                          "required": ["type", "command"],
                           "additionalProperties": false
                         },
                         {
@@ -1661,10 +1488,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "llm-grader",
-                                "llm_grader"
-                              ]
+                              "enum": ["llm-grader", "llm_grader"]
                             },
                             "prompt": {
                               "anyOf": [
@@ -1759,10 +1583,7 @@
                                           "minLength": 1
                                         }
                                       },
-                                      "required": [
-                                        "score_range",
-                                        "outcome"
-                                      ],
+                                      "required": ["score_range", "outcome"],
                                       "additionalProperties": false
                                     }
                                   }
@@ -1813,17 +1634,12 @@
                                     ]
                                   }
                                 },
-                                "required": [
-                                  "type",
-                                  "command"
-                                ],
+                                "required": ["type", "command"],
                                 "additionalProperties": false
                               }
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -1886,9 +1702,7 @@
                                       }
                                     }
                                   },
-                                  "required": [
-                                    "type"
-                                  ],
+                                  "required": ["type"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -1904,10 +1718,7 @@
                                       "maximum": 1
                                     }
                                   },
-                                  "required": [
-                                    "type",
-                                    "threshold"
-                                  ],
+                                  "required": ["type", "threshold"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -1924,10 +1735,7 @@
                                       "type": "string"
                                     }
                                   },
-                                  "required": [
-                                    "type",
-                                    "path"
-                                  ],
+                                  "required": ["type", "path"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -1944,18 +1752,13 @@
                                       "type": "string"
                                     }
                                   },
-                                  "required": [
-                                    "type"
-                                  ],
+                                  "required": ["type"],
                                   "additionalProperties": false
                                 }
                               ]
                             }
                           },
-                          "required": [
-                            "type",
-                            "aggregator"
-                          ],
+                          "required": ["type", "aggregator"],
                           "additionalProperties": false
                         },
                         {
@@ -1992,20 +1795,11 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "tool-trajectory",
-                                "tool_trajectory"
-                              ]
+                              "enum": ["tool-trajectory", "tool_trajectory"]
                             },
                             "mode": {
                               "type": "string",
-                              "enum": [
-                                "any_order",
-                                "in_order",
-                                "exact",
-                                "subset",
-                                "superset"
-                              ]
+                              "enum": ["any_order", "in_order", "exact", "subset", "superset"]
                             },
                             "minimums": {
                               "type": "object",
@@ -2046,12 +1840,7 @@
                                     "anyOf": [
                                       {
                                         "type": "string",
-                                        "enum": [
-                                          "exact",
-                                          "ignore",
-                                          "subset",
-                                          "superset"
-                                        ]
+                                        "enum": ["exact", "ignore", "subset", "superset"]
                                       },
                                       {
                                         "type": "array",
@@ -2065,12 +1854,7 @@
                                     "anyOf": [
                                       {
                                         "type": "string",
-                                        "enum": [
-                                          "exact",
-                                          "ignore",
-                                          "subset",
-                                          "superset"
-                                        ]
+                                        "enum": ["exact", "ignore", "subset", "superset"]
                                       },
                                       {
                                         "type": "array",
@@ -2081,9 +1865,7 @@
                                     ]
                                   }
                                 },
-                                "required": [
-                                  "tool"
-                                ],
+                                "required": ["tool"],
                                 "additionalProperties": false
                               }
                             },
@@ -2091,12 +1873,7 @@
                               "anyOf": [
                                 {
                                   "type": "string",
-                                  "enum": [
-                                    "exact",
-                                    "ignore",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
                                 {
                                   "type": "array",
@@ -2110,12 +1887,7 @@
                               "anyOf": [
                                 {
                                   "type": "string",
-                                  "enum": [
-                                    "exact",
-                                    "ignore",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
                                 {
                                   "type": "array",
@@ -2126,10 +1898,7 @@
                               ]
                             }
                           },
-                          "required": [
-                            "type",
-                            "mode"
-                          ],
+                          "required": ["type", "mode"],
                           "additionalProperties": false
                         },
                         {
@@ -2166,10 +1935,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "field-accuracy",
-                                "field_accuracy"
-                              ]
+                              "enum": ["field-accuracy", "field_accuracy"]
                             },
                             "fields": {
                               "type": "array",
@@ -2181,11 +1947,7 @@
                                   },
                                   "match": {
                                     "type": "string",
-                                    "enum": [
-                                      "exact",
-                                      "numeric_tolerance",
-                                      "date"
-                                    ]
+                                    "enum": ["exact", "numeric_tolerance", "date"]
                                   },
                                   "required": {
                                     "type": "boolean"
@@ -2207,26 +1969,17 @@
                                     }
                                   }
                                 },
-                                "required": [
-                                  "path",
-                                  "match"
-                                ],
+                                "required": ["path", "match"],
                                 "additionalProperties": false
                               },
                               "minItems": 1
                             },
                             "aggregation": {
                               "type": "string",
-                              "enum": [
-                                "weighted_average",
-                                "all_or_nothing"
-                              ]
+                              "enum": ["weighted_average", "all_or_nothing"]
                             }
                           },
-                          "required": [
-                            "type",
-                            "fields"
-                          ],
+                          "required": ["type", "fields"],
                           "additionalProperties": false
                         },
                         {
@@ -2270,10 +2023,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type",
-                            "threshold"
-                          ],
+                          "required": ["type", "threshold"],
                           "additionalProperties": false
                         },
                         {
@@ -2317,10 +2067,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type",
-                            "budget"
-                          ],
+                          "required": ["type", "budget"],
                           "additionalProperties": false
                         },
                         {
@@ -2357,10 +2104,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "token-usage",
-                                "token_usage"
-                              ]
+                              "enum": ["token-usage", "token_usage"]
                             },
                             "max_total": {
                               "type": "number",
@@ -2375,9 +2119,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -2414,10 +2156,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "execution-metrics",
-                                "execution_metrics"
-                              ]
+                              "enum": ["execution-metrics", "execution_metrics"]
                             },
                             "max_tool_calls": {
                               "type": "number",
@@ -2449,9 +2188,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -2494,10 +2231,7 @@
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         },
                         {
@@ -2540,10 +2274,7 @@
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         },
                         {
@@ -2580,15 +2311,10 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "is-json",
-                                "is_json"
-                              ]
+                              "enum": ["is-json", "is_json"]
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -2631,10 +2357,7 @@
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         },
                         {
@@ -2723,10 +2446,7 @@
                                           "minLength": 1
                                         }
                                       },
-                                      "required": [
-                                        "score_range",
-                                        "outcome"
-                                      ],
+                                      "required": ["score_range", "outcome"],
                                       "additionalProperties": false
                                     }
                                   }
@@ -2736,10 +2456,7 @@
                               "minItems": 1
                             }
                           },
-                          "required": [
-                            "type",
-                            "criteria"
-                          ],
+                          "required": ["type", "criteria"],
                           "additionalProperties": false
                         }
                       ]
@@ -2800,10 +2517,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "code-grader",
-                                    "code_grader"
-                                  ]
+                                  "enum": ["code-grader", "code_grader"]
                                 },
                                 "command": {
                                   "anyOf": [
@@ -2877,18 +2591,12 @@
                                         ]
                                       }
                                     },
-                                    "required": [
-                                      "type",
-                                      "command"
-                                    ],
+                                    "required": ["type", "command"],
                                     "additionalProperties": false
                                   }
                                 }
                               },
-                              "required": [
-                                "type",
-                                "command"
-                              ],
+                              "required": ["type", "command"],
                               "additionalProperties": false
                             },
                             {
@@ -2925,10 +2633,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "llm-grader",
-                                    "llm_grader"
-                                  ]
+                                  "enum": ["llm-grader", "llm_grader"]
                                 },
                                 "prompt": {
                                   "anyOf": [
@@ -3023,10 +2728,7 @@
                                               "minLength": 1
                                             }
                                           },
-                                          "required": [
-                                            "score_range",
-                                            "outcome"
-                                          ],
+                                          "required": ["score_range", "outcome"],
                                           "additionalProperties": false
                                         }
                                       }
@@ -3077,17 +2779,12 @@
                                         ]
                                       }
                                     },
-                                    "required": [
-                                      "type",
-                                      "command"
-                                    ],
+                                    "required": ["type", "command"],
                                     "additionalProperties": false
                                   }
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -3150,9 +2847,7 @@
                                           }
                                         }
                                       },
-                                      "required": [
-                                        "type"
-                                      ],
+                                      "required": ["type"],
                                       "additionalProperties": false
                                     },
                                     {
@@ -3168,10 +2863,7 @@
                                           "maximum": 1
                                         }
                                       },
-                                      "required": [
-                                        "type",
-                                        "threshold"
-                                      ],
+                                      "required": ["type", "threshold"],
                                       "additionalProperties": false
                                     },
                                     {
@@ -3188,10 +2880,7 @@
                                           "type": "string"
                                         }
                                       },
-                                      "required": [
-                                        "type",
-                                        "path"
-                                      ],
+                                      "required": ["type", "path"],
                                       "additionalProperties": false
                                     },
                                     {
@@ -3208,18 +2897,13 @@
                                           "type": "string"
                                         }
                                       },
-                                      "required": [
-                                        "type"
-                                      ],
+                                      "required": ["type"],
                                       "additionalProperties": false
                                     }
                                   ]
                                 }
                               },
-                              "required": [
-                                "type",
-                                "aggregator"
-                              ],
+                              "required": ["type", "aggregator"],
                               "additionalProperties": false
                             },
                             {
@@ -3256,20 +2940,11 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "tool-trajectory",
-                                    "tool_trajectory"
-                                  ]
+                                  "enum": ["tool-trajectory", "tool_trajectory"]
                                 },
                                 "mode": {
                                   "type": "string",
-                                  "enum": [
-                                    "any_order",
-                                    "in_order",
-                                    "exact",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["any_order", "in_order", "exact", "subset", "superset"]
                                 },
                                 "minimums": {
                                   "type": "object",
@@ -3310,12 +2985,7 @@
                                         "anyOf": [
                                           {
                                             "type": "string",
-                                            "enum": [
-                                              "exact",
-                                              "ignore",
-                                              "subset",
-                                              "superset"
-                                            ]
+                                            "enum": ["exact", "ignore", "subset", "superset"]
                                           },
                                           {
                                             "type": "array",
@@ -3329,12 +2999,7 @@
                                         "anyOf": [
                                           {
                                             "type": "string",
-                                            "enum": [
-                                              "exact",
-                                              "ignore",
-                                              "subset",
-                                              "superset"
-                                            ]
+                                            "enum": ["exact", "ignore", "subset", "superset"]
                                           },
                                           {
                                             "type": "array",
@@ -3345,9 +3010,7 @@
                                         ]
                                       }
                                     },
-                                    "required": [
-                                      "tool"
-                                    ],
+                                    "required": ["tool"],
                                     "additionalProperties": false
                                   }
                                 },
@@ -3355,12 +3018,7 @@
                                   "anyOf": [
                                     {
                                       "type": "string",
-                                      "enum": [
-                                        "exact",
-                                        "ignore",
-                                        "subset",
-                                        "superset"
-                                      ]
+                                      "enum": ["exact", "ignore", "subset", "superset"]
                                     },
                                     {
                                       "type": "array",
@@ -3374,12 +3032,7 @@
                                   "anyOf": [
                                     {
                                       "type": "string",
-                                      "enum": [
-                                        "exact",
-                                        "ignore",
-                                        "subset",
-                                        "superset"
-                                      ]
+                                      "enum": ["exact", "ignore", "subset", "superset"]
                                     },
                                     {
                                       "type": "array",
@@ -3390,10 +3043,7 @@
                                   ]
                                 }
                               },
-                              "required": [
-                                "type",
-                                "mode"
-                              ],
+                              "required": ["type", "mode"],
                               "additionalProperties": false
                             },
                             {
@@ -3430,10 +3080,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "field-accuracy",
-                                    "field_accuracy"
-                                  ]
+                                  "enum": ["field-accuracy", "field_accuracy"]
                                 },
                                 "fields": {
                                   "type": "array",
@@ -3445,11 +3092,7 @@
                                       },
                                       "match": {
                                         "type": "string",
-                                        "enum": [
-                                          "exact",
-                                          "numeric_tolerance",
-                                          "date"
-                                        ]
+                                        "enum": ["exact", "numeric_tolerance", "date"]
                                       },
                                       "required": {
                                         "type": "boolean"
@@ -3471,26 +3114,17 @@
                                         }
                                       }
                                     },
-                                    "required": [
-                                      "path",
-                                      "match"
-                                    ],
+                                    "required": ["path", "match"],
                                     "additionalProperties": false
                                   },
                                   "minItems": 1
                                 },
                                 "aggregation": {
                                   "type": "string",
-                                  "enum": [
-                                    "weighted_average",
-                                    "all_or_nothing"
-                                  ]
+                                  "enum": ["weighted_average", "all_or_nothing"]
                                 }
                               },
-                              "required": [
-                                "type",
-                                "fields"
-                              ],
+                              "required": ["type", "fields"],
                               "additionalProperties": false
                             },
                             {
@@ -3534,10 +3168,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type",
-                                "threshold"
-                              ],
+                              "required": ["type", "threshold"],
                               "additionalProperties": false
                             },
                             {
@@ -3581,10 +3212,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type",
-                                "budget"
-                              ],
+                              "required": ["type", "budget"],
                               "additionalProperties": false
                             },
                             {
@@ -3621,10 +3249,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "token-usage",
-                                    "token_usage"
-                                  ]
+                                  "enum": ["token-usage", "token_usage"]
                                 },
                                 "max_total": {
                                   "type": "number",
@@ -3639,9 +3264,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -3678,10 +3301,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "execution-metrics",
-                                    "execution_metrics"
-                                  ]
+                                  "enum": ["execution-metrics", "execution_metrics"]
                                 },
                                 "max_tool_calls": {
                                   "type": "number",
@@ -3713,9 +3333,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -3758,10 +3376,7 @@
                                   "type": "string"
                                 }
                               },
-                              "required": [
-                                "type",
-                                "value"
-                              ],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -3804,10 +3419,7 @@
                                   "type": "string"
                                 }
                               },
-                              "required": [
-                                "type",
-                                "value"
-                              ],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -3844,15 +3456,10 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "is-json",
-                                    "is_json"
-                                  ]
+                                  "enum": ["is-json", "is_json"]
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -3895,10 +3502,7 @@
                                   "type": "string"
                                 }
                               },
-                              "required": [
-                                "type",
-                                "value"
-                              ],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -3987,10 +3591,7 @@
                                               "minLength": 1
                                             }
                                           },
-                                          "required": [
-                                            "score_range",
-                                            "outcome"
-                                          ],
+                                          "required": ["score_range", "outcome"],
                                           "additionalProperties": false
                                         }
                                       }
@@ -4000,10 +3601,7 @@
                                   "minItems": 1
                                 }
                               },
-                              "required": [
-                                "type",
-                                "criteria"
-                              ],
+                              "required": ["type", "criteria"],
                               "additionalProperties": false
                             }
                           ]
@@ -4047,10 +3645,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "code-grader",
-                                    "code_grader"
-                                  ]
+                                  "enum": ["code-grader", "code_grader"]
                                 },
                                 "command": {
                                   "anyOf": [
@@ -4124,18 +3719,12 @@
                                         ]
                                       }
                                     },
-                                    "required": [
-                                      "type",
-                                      "command"
-                                    ],
+                                    "required": ["type", "command"],
                                     "additionalProperties": false
                                   }
                                 }
                               },
-                              "required": [
-                                "type",
-                                "command"
-                              ],
+                              "required": ["type", "command"],
                               "additionalProperties": false
                             },
                             {
@@ -4172,10 +3761,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "llm-grader",
-                                    "llm_grader"
-                                  ]
+                                  "enum": ["llm-grader", "llm_grader"]
                                 },
                                 "prompt": {
                                   "anyOf": [
@@ -4270,10 +3856,7 @@
                                               "minLength": 1
                                             }
                                           },
-                                          "required": [
-                                            "score_range",
-                                            "outcome"
-                                          ],
+                                          "required": ["score_range", "outcome"],
                                           "additionalProperties": false
                                         }
                                       }
@@ -4324,17 +3907,12 @@
                                         ]
                                       }
                                     },
-                                    "required": [
-                                      "type",
-                                      "command"
-                                    ],
+                                    "required": ["type", "command"],
                                     "additionalProperties": false
                                   }
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -4397,9 +3975,7 @@
                                           }
                                         }
                                       },
-                                      "required": [
-                                        "type"
-                                      ],
+                                      "required": ["type"],
                                       "additionalProperties": false
                                     },
                                     {
@@ -4415,10 +3991,7 @@
                                           "maximum": 1
                                         }
                                       },
-                                      "required": [
-                                        "type",
-                                        "threshold"
-                                      ],
+                                      "required": ["type", "threshold"],
                                       "additionalProperties": false
                                     },
                                     {
@@ -4435,10 +4008,7 @@
                                           "type": "string"
                                         }
                                       },
-                                      "required": [
-                                        "type",
-                                        "path"
-                                      ],
+                                      "required": ["type", "path"],
                                       "additionalProperties": false
                                     },
                                     {
@@ -4455,18 +4025,13 @@
                                           "type": "string"
                                         }
                                       },
-                                      "required": [
-                                        "type"
-                                      ],
+                                      "required": ["type"],
                                       "additionalProperties": false
                                     }
                                   ]
                                 }
                               },
-                              "required": [
-                                "type",
-                                "aggregator"
-                              ],
+                              "required": ["type", "aggregator"],
                               "additionalProperties": false
                             },
                             {
@@ -4503,20 +4068,11 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "tool-trajectory",
-                                    "tool_trajectory"
-                                  ]
+                                  "enum": ["tool-trajectory", "tool_trajectory"]
                                 },
                                 "mode": {
                                   "type": "string",
-                                  "enum": [
-                                    "any_order",
-                                    "in_order",
-                                    "exact",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["any_order", "in_order", "exact", "subset", "superset"]
                                 },
                                 "minimums": {
                                   "type": "object",
@@ -4557,12 +4113,7 @@
                                         "anyOf": [
                                           {
                                             "type": "string",
-                                            "enum": [
-                                              "exact",
-                                              "ignore",
-                                              "subset",
-                                              "superset"
-                                            ]
+                                            "enum": ["exact", "ignore", "subset", "superset"]
                                           },
                                           {
                                             "type": "array",
@@ -4576,12 +4127,7 @@
                                         "anyOf": [
                                           {
                                             "type": "string",
-                                            "enum": [
-                                              "exact",
-                                              "ignore",
-                                              "subset",
-                                              "superset"
-                                            ]
+                                            "enum": ["exact", "ignore", "subset", "superset"]
                                           },
                                           {
                                             "type": "array",
@@ -4592,9 +4138,7 @@
                                         ]
                                       }
                                     },
-                                    "required": [
-                                      "tool"
-                                    ],
+                                    "required": ["tool"],
                                     "additionalProperties": false
                                   }
                                 },
@@ -4602,12 +4146,7 @@
                                   "anyOf": [
                                     {
                                       "type": "string",
-                                      "enum": [
-                                        "exact",
-                                        "ignore",
-                                        "subset",
-                                        "superset"
-                                      ]
+                                      "enum": ["exact", "ignore", "subset", "superset"]
                                     },
                                     {
                                       "type": "array",
@@ -4621,12 +4160,7 @@
                                   "anyOf": [
                                     {
                                       "type": "string",
-                                      "enum": [
-                                        "exact",
-                                        "ignore",
-                                        "subset",
-                                        "superset"
-                                      ]
+                                      "enum": ["exact", "ignore", "subset", "superset"]
                                     },
                                     {
                                       "type": "array",
@@ -4637,10 +4171,7 @@
                                   ]
                                 }
                               },
-                              "required": [
-                                "type",
-                                "mode"
-                              ],
+                              "required": ["type", "mode"],
                               "additionalProperties": false
                             },
                             {
@@ -4677,10 +4208,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "field-accuracy",
-                                    "field_accuracy"
-                                  ]
+                                  "enum": ["field-accuracy", "field_accuracy"]
                                 },
                                 "fields": {
                                   "type": "array",
@@ -4692,11 +4220,7 @@
                                       },
                                       "match": {
                                         "type": "string",
-                                        "enum": [
-                                          "exact",
-                                          "numeric_tolerance",
-                                          "date"
-                                        ]
+                                        "enum": ["exact", "numeric_tolerance", "date"]
                                       },
                                       "required": {
                                         "type": "boolean"
@@ -4718,26 +4242,17 @@
                                         }
                                       }
                                     },
-                                    "required": [
-                                      "path",
-                                      "match"
-                                    ],
+                                    "required": ["path", "match"],
                                     "additionalProperties": false
                                   },
                                   "minItems": 1
                                 },
                                 "aggregation": {
                                   "type": "string",
-                                  "enum": [
-                                    "weighted_average",
-                                    "all_or_nothing"
-                                  ]
+                                  "enum": ["weighted_average", "all_or_nothing"]
                                 }
                               },
-                              "required": [
-                                "type",
-                                "fields"
-                              ],
+                              "required": ["type", "fields"],
                               "additionalProperties": false
                             },
                             {
@@ -4781,10 +4296,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type",
-                                "threshold"
-                              ],
+                              "required": ["type", "threshold"],
                               "additionalProperties": false
                             },
                             {
@@ -4828,10 +4340,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type",
-                                "budget"
-                              ],
+                              "required": ["type", "budget"],
                               "additionalProperties": false
                             },
                             {
@@ -4868,10 +4377,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "token-usage",
-                                    "token_usage"
-                                  ]
+                                  "enum": ["token-usage", "token_usage"]
                                 },
                                 "max_total": {
                                   "type": "number",
@@ -4886,9 +4392,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -4925,10 +4429,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "execution-metrics",
-                                    "execution_metrics"
-                                  ]
+                                  "enum": ["execution-metrics", "execution_metrics"]
                                 },
                                 "max_tool_calls": {
                                   "type": "number",
@@ -4960,9 +4461,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -5005,10 +4504,7 @@
                                   "type": "string"
                                 }
                               },
-                              "required": [
-                                "type",
-                                "value"
-                              ],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -5051,10 +4547,7 @@
                                   "type": "string"
                                 }
                               },
-                              "required": [
-                                "type",
-                                "value"
-                              ],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -5091,15 +4584,10 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "is-json",
-                                    "is_json"
-                                  ]
+                                  "enum": ["is-json", "is_json"]
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -5142,10 +4630,7 @@
                                   "type": "string"
                                 }
                               },
-                              "required": [
-                                "type",
-                                "value"
-                              ],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -5234,10 +4719,7 @@
                                               "minLength": 1
                                             }
                                           },
-                                          "required": [
-                                            "score_range",
-                                            "outcome"
-                                          ],
+                                          "required": ["score_range", "outcome"],
                                           "additionalProperties": false
                                         }
                                       }
@@ -5247,10 +4729,7 @@
                                   "minItems": 1
                                 }
                               },
-                              "required": [
-                                "type",
-                                "criteria"
-                              ],
+                              "required": ["type", "criteria"],
                               "additionalProperties": false
                             }
                           ]
@@ -5271,11 +4750,7 @@
                           },
                           "strategy": {
                             "type": "string",
-                            "enum": [
-                              "pass_at_k",
-                              "mean",
-                              "confidence_interval"
-                            ]
+                            "enum": ["pass_at_k", "mean", "confidence_interval"]
                           },
                           "cost_limit_usd": {
                             "type": "number",
@@ -5286,9 +4761,7 @@
                             "minimum": 0
                           }
                         },
-                        "required": [
-                          "count"
-                        ],
+                        "required": ["count"],
                         "additionalProperties": false
                       },
                       "total_budget_usd": {
@@ -5321,10 +4794,7 @@
                       },
                       "isolation": {
                         "type": "string",
-                        "enum": [
-                          "shared",
-                          "per_test"
-                        ]
+                        "enum": ["shared", "per_test"]
                       },
                       "repos": {
                         "type": "array",
@@ -5348,10 +4818,7 @@
                                       "format": "uri"
                                     }
                                   },
-                                  "required": [
-                                    "type",
-                                    "url"
-                                  ],
+                                  "required": ["type", "url"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -5365,10 +4832,7 @@
                                       "type": "string"
                                     }
                                   },
-                                  "required": [
-                                    "type",
-                                    "path"
-                                  ],
+                                  "required": ["type", "path"],
                                   "additionalProperties": false
                                 }
                               ]
@@ -5381,10 +4845,7 @@
                                 },
                                 "resolve": {
                                   "type": "string",
-                                  "enum": [
-                                    "remote",
-                                    "local"
-                                  ]
+                                  "enum": ["remote", "local"]
                                 },
                                 "ancestor": {
                                   "type": "integer",
@@ -5413,10 +4874,7 @@
                               "additionalProperties": false
                             }
                           },
-                          "required": [
-                            "path",
-                            "source"
-                          ],
+                          "required": ["path", "source"],
                           "additionalProperties": false
                         }
                       },
@@ -5452,11 +4910,7 @@
                               },
                               "reset": {
                                 "type": "string",
-                                "enum": [
-                                  "none",
-                                  "fast",
-                                  "strict"
-                                ]
+                                "enum": ["none", "fast", "strict"]
                               }
                             },
                             "additionalProperties": false
@@ -5487,11 +4941,7 @@
                               },
                               "reset": {
                                 "type": "string",
-                                "enum": [
-                                  "none",
-                                  "fast",
-                                  "strict"
-                                ]
+                                "enum": ["none", "fast", "strict"]
                               }
                             },
                             "additionalProperties": false
@@ -5522,11 +4972,7 @@
                               },
                               "reset": {
                                 "type": "string",
-                                "enum": [
-                                  "none",
-                                  "fast",
-                                  "strict"
-                                ]
+                                "enum": ["none", "fast", "strict"]
                               }
                             },
                             "additionalProperties": false
@@ -5557,11 +5003,7 @@
                               },
                               "reset": {
                                 "type": "string",
-                                "enum": [
-                                  "none",
-                                  "fast",
-                                  "strict"
-                                ]
+                                "enum": ["none", "fast", "strict"]
                               }
                             },
                             "additionalProperties": false
@@ -5571,11 +5013,7 @@
                       },
                       "mode": {
                         "type": "string",
-                        "enum": [
-                          "pooled",
-                          "temp",
-                          "static"
-                        ]
+                        "enum": ["pooled", "temp", "static"]
                       },
                       "path": {
                         "type": "string"
@@ -5598,9 +5036,7 @@
                             "minimum": 0.1
                           }
                         },
-                        "required": [
-                          "image"
-                        ],
+                        "required": ["image"],
                         "additionalProperties": false
                       }
                     },
@@ -5620,9 +5056,7 @@
                     "type": "string"
                   }
                 },
-                "required": [
-                  "id"
-                ],
+                "required": ["id"],
                 "additionalProperties": false
               }
             },
@@ -5657,12 +5091,7 @@
                           "properties": {
                             "role": {
                               "type": "string",
-                              "enum": [
-                                "system",
-                                "user",
-                                "assistant",
-                                "tool"
-                              ]
+                              "enum": ["system", "user", "assistant", "tool"]
                             },
                             "content": {
                               "anyOf": [
@@ -5676,30 +5105,20 @@
                                     "properties": {
                                       "type": {
                                         "type": "string",
-                                        "enum": [
-                                          "text",
-                                          "file",
-                                          "image"
-                                        ]
+                                        "enum": ["text", "file", "image"]
                                       },
                                       "value": {
                                         "type": "string"
                                       }
                                     },
-                                    "required": [
-                                      "type",
-                                      "value"
-                                    ],
+                                    "required": ["type", "value"],
                                     "additionalProperties": false
                                   }
                                 }
                               ]
                             }
                           },
-                          "required": [
-                            "role",
-                            "content"
-                          ],
+                          "required": ["role", "content"],
                           "additionalProperties": false
                         }
                       }
@@ -5727,12 +5146,7 @@
                           "properties": {
                             "role": {
                               "type": "string",
-                              "enum": [
-                                "system",
-                                "user",
-                                "assistant",
-                                "tool"
-                              ]
+                              "enum": ["system", "user", "assistant", "tool"]
                             },
                             "content": {
                               "anyOf": [
@@ -5746,30 +5160,20 @@
                                     "properties": {
                                       "type": {
                                         "type": "string",
-                                        "enum": [
-                                          "text",
-                                          "file",
-                                          "image"
-                                        ]
+                                        "enum": ["text", "file", "image"]
                                       },
                                       "value": {
                                         "type": "string"
                                       }
                                     },
-                                    "required": [
-                                      "type",
-                                      "value"
-                                    ],
+                                    "required": ["type", "value"],
                                     "additionalProperties": false
                                   }
                                 }
                               ]
                             }
                           },
-                          "required": [
-                            "role",
-                            "content"
-                          ],
+                          "required": ["role", "content"],
                           "additionalProperties": false
                         }
                       }
@@ -5813,10 +5217,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "code-grader",
-                                "code_grader"
-                              ]
+                              "enum": ["code-grader", "code_grader"]
                             },
                             "command": {
                               "anyOf": [
@@ -5890,18 +5291,12 @@
                                     ]
                                   }
                                 },
-                                "required": [
-                                  "type",
-                                  "command"
-                                ],
+                                "required": ["type", "command"],
                                 "additionalProperties": false
                               }
                             }
                           },
-                          "required": [
-                            "type",
-                            "command"
-                          ],
+                          "required": ["type", "command"],
                           "additionalProperties": false
                         },
                         {
@@ -5938,10 +5333,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "llm-grader",
-                                "llm_grader"
-                              ]
+                              "enum": ["llm-grader", "llm_grader"]
                             },
                             "prompt": {
                               "anyOf": [
@@ -6036,10 +5428,7 @@
                                           "minLength": 1
                                         }
                                       },
-                                      "required": [
-                                        "score_range",
-                                        "outcome"
-                                      ],
+                                      "required": ["score_range", "outcome"],
                                       "additionalProperties": false
                                     }
                                   }
@@ -6090,17 +5479,12 @@
                                     ]
                                   }
                                 },
-                                "required": [
-                                  "type",
-                                  "command"
-                                ],
+                                "required": ["type", "command"],
                                 "additionalProperties": false
                               }
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -6163,9 +5547,7 @@
                                       }
                                     }
                                   },
-                                  "required": [
-                                    "type"
-                                  ],
+                                  "required": ["type"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -6181,10 +5563,7 @@
                                       "maximum": 1
                                     }
                                   },
-                                  "required": [
-                                    "type",
-                                    "threshold"
-                                  ],
+                                  "required": ["type", "threshold"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -6201,10 +5580,7 @@
                                       "type": "string"
                                     }
                                   },
-                                  "required": [
-                                    "type",
-                                    "path"
-                                  ],
+                                  "required": ["type", "path"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -6221,18 +5597,13 @@
                                       "type": "string"
                                     }
                                   },
-                                  "required": [
-                                    "type"
-                                  ],
+                                  "required": ["type"],
                                   "additionalProperties": false
                                 }
                               ]
                             }
                           },
-                          "required": [
-                            "type",
-                            "aggregator"
-                          ],
+                          "required": ["type", "aggregator"],
                           "additionalProperties": false
                         },
                         {
@@ -6269,20 +5640,11 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "tool-trajectory",
-                                "tool_trajectory"
-                              ]
+                              "enum": ["tool-trajectory", "tool_trajectory"]
                             },
                             "mode": {
                               "type": "string",
-                              "enum": [
-                                "any_order",
-                                "in_order",
-                                "exact",
-                                "subset",
-                                "superset"
-                              ]
+                              "enum": ["any_order", "in_order", "exact", "subset", "superset"]
                             },
                             "minimums": {
                               "type": "object",
@@ -6323,12 +5685,7 @@
                                     "anyOf": [
                                       {
                                         "type": "string",
-                                        "enum": [
-                                          "exact",
-                                          "ignore",
-                                          "subset",
-                                          "superset"
-                                        ]
+                                        "enum": ["exact", "ignore", "subset", "superset"]
                                       },
                                       {
                                         "type": "array",
@@ -6342,12 +5699,7 @@
                                     "anyOf": [
                                       {
                                         "type": "string",
-                                        "enum": [
-                                          "exact",
-                                          "ignore",
-                                          "subset",
-                                          "superset"
-                                        ]
+                                        "enum": ["exact", "ignore", "subset", "superset"]
                                       },
                                       {
                                         "type": "array",
@@ -6358,9 +5710,7 @@
                                     ]
                                   }
                                 },
-                                "required": [
-                                  "tool"
-                                ],
+                                "required": ["tool"],
                                 "additionalProperties": false
                               }
                             },
@@ -6368,12 +5718,7 @@
                               "anyOf": [
                                 {
                                   "type": "string",
-                                  "enum": [
-                                    "exact",
-                                    "ignore",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
                                 {
                                   "type": "array",
@@ -6387,12 +5732,7 @@
                               "anyOf": [
                                 {
                                   "type": "string",
-                                  "enum": [
-                                    "exact",
-                                    "ignore",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
                                 {
                                   "type": "array",
@@ -6403,10 +5743,7 @@
                               ]
                             }
                           },
-                          "required": [
-                            "type",
-                            "mode"
-                          ],
+                          "required": ["type", "mode"],
                           "additionalProperties": false
                         },
                         {
@@ -6443,10 +5780,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "field-accuracy",
-                                "field_accuracy"
-                              ]
+                              "enum": ["field-accuracy", "field_accuracy"]
                             },
                             "fields": {
                               "type": "array",
@@ -6458,11 +5792,7 @@
                                   },
                                   "match": {
                                     "type": "string",
-                                    "enum": [
-                                      "exact",
-                                      "numeric_tolerance",
-                                      "date"
-                                    ]
+                                    "enum": ["exact", "numeric_tolerance", "date"]
                                   },
                                   "required": {
                                     "type": "boolean"
@@ -6484,26 +5814,17 @@
                                     }
                                   }
                                 },
-                                "required": [
-                                  "path",
-                                  "match"
-                                ],
+                                "required": ["path", "match"],
                                 "additionalProperties": false
                               },
                               "minItems": 1
                             },
                             "aggregation": {
                               "type": "string",
-                              "enum": [
-                                "weighted_average",
-                                "all_or_nothing"
-                              ]
+                              "enum": ["weighted_average", "all_or_nothing"]
                             }
                           },
-                          "required": [
-                            "type",
-                            "fields"
-                          ],
+                          "required": ["type", "fields"],
                           "additionalProperties": false
                         },
                         {
@@ -6547,10 +5868,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type",
-                            "threshold"
-                          ],
+                          "required": ["type", "threshold"],
                           "additionalProperties": false
                         },
                         {
@@ -6594,10 +5912,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type",
-                            "budget"
-                          ],
+                          "required": ["type", "budget"],
                           "additionalProperties": false
                         },
                         {
@@ -6634,10 +5949,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "token-usage",
-                                "token_usage"
-                              ]
+                              "enum": ["token-usage", "token_usage"]
                             },
                             "max_total": {
                               "type": "number",
@@ -6652,9 +5964,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -6691,10 +6001,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "execution-metrics",
-                                "execution_metrics"
-                              ]
+                              "enum": ["execution-metrics", "execution_metrics"]
                             },
                             "max_tool_calls": {
                               "type": "number",
@@ -6726,9 +6033,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -6771,10 +6076,7 @@
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         },
                         {
@@ -6817,10 +6119,7 @@
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         },
                         {
@@ -6857,15 +6156,10 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "is-json",
-                                "is_json"
-                              ]
+                              "enum": ["is-json", "is_json"]
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -6908,10 +6202,7 @@
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         },
                         {
@@ -7000,10 +6291,7 @@
                                           "minLength": 1
                                         }
                                       },
-                                      "required": [
-                                        "score_range",
-                                        "outcome"
-                                      ],
+                                      "required": ["score_range", "outcome"],
                                       "additionalProperties": false
                                     }
                                   }
@@ -7013,10 +6301,7 @@
                               "minItems": 1
                             }
                           },
-                          "required": [
-                            "type",
-                            "criteria"
-                          ],
+                          "required": ["type", "criteria"],
                           "additionalProperties": false
                         }
                       ]
@@ -7060,10 +6345,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "code-grader",
-                                "code_grader"
-                              ]
+                              "enum": ["code-grader", "code_grader"]
                             },
                             "command": {
                               "anyOf": [
@@ -7137,18 +6419,12 @@
                                     ]
                                   }
                                 },
-                                "required": [
-                                  "type",
-                                  "command"
-                                ],
+                                "required": ["type", "command"],
                                 "additionalProperties": false
                               }
                             }
                           },
-                          "required": [
-                            "type",
-                            "command"
-                          ],
+                          "required": ["type", "command"],
                           "additionalProperties": false
                         },
                         {
@@ -7185,10 +6461,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "llm-grader",
-                                "llm_grader"
-                              ]
+                              "enum": ["llm-grader", "llm_grader"]
                             },
                             "prompt": {
                               "anyOf": [
@@ -7283,10 +6556,7 @@
                                           "minLength": 1
                                         }
                                       },
-                                      "required": [
-                                        "score_range",
-                                        "outcome"
-                                      ],
+                                      "required": ["score_range", "outcome"],
                                       "additionalProperties": false
                                     }
                                   }
@@ -7337,17 +6607,12 @@
                                     ]
                                   }
                                 },
-                                "required": [
-                                  "type",
-                                  "command"
-                                ],
+                                "required": ["type", "command"],
                                 "additionalProperties": false
                               }
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -7410,9 +6675,7 @@
                                       }
                                     }
                                   },
-                                  "required": [
-                                    "type"
-                                  ],
+                                  "required": ["type"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -7428,10 +6691,7 @@
                                       "maximum": 1
                                     }
                                   },
-                                  "required": [
-                                    "type",
-                                    "threshold"
-                                  ],
+                                  "required": ["type", "threshold"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -7448,10 +6708,7 @@
                                       "type": "string"
                                     }
                                   },
-                                  "required": [
-                                    "type",
-                                    "path"
-                                  ],
+                                  "required": ["type", "path"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -7468,18 +6725,13 @@
                                       "type": "string"
                                     }
                                   },
-                                  "required": [
-                                    "type"
-                                  ],
+                                  "required": ["type"],
                                   "additionalProperties": false
                                 }
                               ]
                             }
                           },
-                          "required": [
-                            "type",
-                            "aggregator"
-                          ],
+                          "required": ["type", "aggregator"],
                           "additionalProperties": false
                         },
                         {
@@ -7516,20 +6768,11 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "tool-trajectory",
-                                "tool_trajectory"
-                              ]
+                              "enum": ["tool-trajectory", "tool_trajectory"]
                             },
                             "mode": {
                               "type": "string",
-                              "enum": [
-                                "any_order",
-                                "in_order",
-                                "exact",
-                                "subset",
-                                "superset"
-                              ]
+                              "enum": ["any_order", "in_order", "exact", "subset", "superset"]
                             },
                             "minimums": {
                               "type": "object",
@@ -7570,12 +6813,7 @@
                                     "anyOf": [
                                       {
                                         "type": "string",
-                                        "enum": [
-                                          "exact",
-                                          "ignore",
-                                          "subset",
-                                          "superset"
-                                        ]
+                                        "enum": ["exact", "ignore", "subset", "superset"]
                                       },
                                       {
                                         "type": "array",
@@ -7589,12 +6827,7 @@
                                     "anyOf": [
                                       {
                                         "type": "string",
-                                        "enum": [
-                                          "exact",
-                                          "ignore",
-                                          "subset",
-                                          "superset"
-                                        ]
+                                        "enum": ["exact", "ignore", "subset", "superset"]
                                       },
                                       {
                                         "type": "array",
@@ -7605,9 +6838,7 @@
                                     ]
                                   }
                                 },
-                                "required": [
-                                  "tool"
-                                ],
+                                "required": ["tool"],
                                 "additionalProperties": false
                               }
                             },
@@ -7615,12 +6846,7 @@
                               "anyOf": [
                                 {
                                   "type": "string",
-                                  "enum": [
-                                    "exact",
-                                    "ignore",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
                                 {
                                   "type": "array",
@@ -7634,12 +6860,7 @@
                               "anyOf": [
                                 {
                                   "type": "string",
-                                  "enum": [
-                                    "exact",
-                                    "ignore",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
                                 {
                                   "type": "array",
@@ -7650,10 +6871,7 @@
                               ]
                             }
                           },
-                          "required": [
-                            "type",
-                            "mode"
-                          ],
+                          "required": ["type", "mode"],
                           "additionalProperties": false
                         },
                         {
@@ -7690,10 +6908,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "field-accuracy",
-                                "field_accuracy"
-                              ]
+                              "enum": ["field-accuracy", "field_accuracy"]
                             },
                             "fields": {
                               "type": "array",
@@ -7705,11 +6920,7 @@
                                   },
                                   "match": {
                                     "type": "string",
-                                    "enum": [
-                                      "exact",
-                                      "numeric_tolerance",
-                                      "date"
-                                    ]
+                                    "enum": ["exact", "numeric_tolerance", "date"]
                                   },
                                   "required": {
                                     "type": "boolean"
@@ -7731,26 +6942,17 @@
                                     }
                                   }
                                 },
-                                "required": [
-                                  "path",
-                                  "match"
-                                ],
+                                "required": ["path", "match"],
                                 "additionalProperties": false
                               },
                               "minItems": 1
                             },
                             "aggregation": {
                               "type": "string",
-                              "enum": [
-                                "weighted_average",
-                                "all_or_nothing"
-                              ]
+                              "enum": ["weighted_average", "all_or_nothing"]
                             }
                           },
-                          "required": [
-                            "type",
-                            "fields"
-                          ],
+                          "required": ["type", "fields"],
                           "additionalProperties": false
                         },
                         {
@@ -7794,10 +6996,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type",
-                            "threshold"
-                          ],
+                          "required": ["type", "threshold"],
                           "additionalProperties": false
                         },
                         {
@@ -7841,10 +7040,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type",
-                            "budget"
-                          ],
+                          "required": ["type", "budget"],
                           "additionalProperties": false
                         },
                         {
@@ -7881,10 +7077,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "token-usage",
-                                "token_usage"
-                              ]
+                              "enum": ["token-usage", "token_usage"]
                             },
                             "max_total": {
                               "type": "number",
@@ -7899,9 +7092,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -7938,10 +7129,7 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "execution-metrics",
-                                "execution_metrics"
-                              ]
+                              "enum": ["execution-metrics", "execution_metrics"]
                             },
                             "max_tool_calls": {
                               "type": "number",
@@ -7973,9 +7161,7 @@
                               "minimum": 0
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -8018,10 +7204,7 @@
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         },
                         {
@@ -8064,10 +7247,7 @@
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         },
                         {
@@ -8104,15 +7284,10 @@
                             },
                             "type": {
                               "type": "string",
-                              "enum": [
-                                "is-json",
-                                "is_json"
-                              ]
+                              "enum": ["is-json", "is_json"]
                             }
                           },
-                          "required": [
-                            "type"
-                          ],
+                          "required": ["type"],
                           "additionalProperties": false
                         },
                         {
@@ -8155,10 +7330,7 @@
                               "type": "string"
                             }
                           },
-                          "required": [
-                            "type",
-                            "value"
-                          ],
+                          "required": ["type", "value"],
                           "additionalProperties": false
                         },
                         {
@@ -8247,10 +7419,7 @@
                                           "minLength": 1
                                         }
                                       },
-                                      "required": [
-                                        "score_range",
-                                        "outcome"
-                                      ],
+                                      "required": ["score_range", "outcome"],
                                       "additionalProperties": false
                                     }
                                   }
@@ -8260,10 +7429,7 @@
                               "minItems": 1
                             }
                           },
-                          "required": [
-                            "type",
-                            "criteria"
-                          ],
+                          "required": ["type", "criteria"],
                           "additionalProperties": false
                         }
                       ]
@@ -8324,10 +7490,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "code-grader",
-                                    "code_grader"
-                                  ]
+                                  "enum": ["code-grader", "code_grader"]
                                 },
                                 "command": {
                                   "anyOf": [
@@ -8401,18 +7564,12 @@
                                         ]
                                       }
                                     },
-                                    "required": [
-                                      "type",
-                                      "command"
-                                    ],
+                                    "required": ["type", "command"],
                                     "additionalProperties": false
                                   }
                                 }
                               },
-                              "required": [
-                                "type",
-                                "command"
-                              ],
+                              "required": ["type", "command"],
                               "additionalProperties": false
                             },
                             {
@@ -8449,10 +7606,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "llm-grader",
-                                    "llm_grader"
-                                  ]
+                                  "enum": ["llm-grader", "llm_grader"]
                                 },
                                 "prompt": {
                                   "anyOf": [
@@ -8547,10 +7701,7 @@
                                               "minLength": 1
                                             }
                                           },
-                                          "required": [
-                                            "score_range",
-                                            "outcome"
-                                          ],
+                                          "required": ["score_range", "outcome"],
                                           "additionalProperties": false
                                         }
                                       }
@@ -8601,17 +7752,12 @@
                                         ]
                                       }
                                     },
-                                    "required": [
-                                      "type",
-                                      "command"
-                                    ],
+                                    "required": ["type", "command"],
                                     "additionalProperties": false
                                   }
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -8674,9 +7820,7 @@
                                           }
                                         }
                                       },
-                                      "required": [
-                                        "type"
-                                      ],
+                                      "required": ["type"],
                                       "additionalProperties": false
                                     },
                                     {
@@ -8692,10 +7836,7 @@
                                           "maximum": 1
                                         }
                                       },
-                                      "required": [
-                                        "type",
-                                        "threshold"
-                                      ],
+                                      "required": ["type", "threshold"],
                                       "additionalProperties": false
                                     },
                                     {
@@ -8712,10 +7853,7 @@
                                           "type": "string"
                                         }
                                       },
-                                      "required": [
-                                        "type",
-                                        "path"
-                                      ],
+                                      "required": ["type", "path"],
                                       "additionalProperties": false
                                     },
                                     {
@@ -8732,18 +7870,13 @@
                                           "type": "string"
                                         }
                                       },
-                                      "required": [
-                                        "type"
-                                      ],
+                                      "required": ["type"],
                                       "additionalProperties": false
                                     }
                                   ]
                                 }
                               },
-                              "required": [
-                                "type",
-                                "aggregator"
-                              ],
+                              "required": ["type", "aggregator"],
                               "additionalProperties": false
                             },
                             {
@@ -8780,20 +7913,11 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "tool-trajectory",
-                                    "tool_trajectory"
-                                  ]
+                                  "enum": ["tool-trajectory", "tool_trajectory"]
                                 },
                                 "mode": {
                                   "type": "string",
-                                  "enum": [
-                                    "any_order",
-                                    "in_order",
-                                    "exact",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["any_order", "in_order", "exact", "subset", "superset"]
                                 },
                                 "minimums": {
                                   "type": "object",
@@ -8834,12 +7958,7 @@
                                         "anyOf": [
                                           {
                                             "type": "string",
-                                            "enum": [
-                                              "exact",
-                                              "ignore",
-                                              "subset",
-                                              "superset"
-                                            ]
+                                            "enum": ["exact", "ignore", "subset", "superset"]
                                           },
                                           {
                                             "type": "array",
@@ -8853,12 +7972,7 @@
                                         "anyOf": [
                                           {
                                             "type": "string",
-                                            "enum": [
-                                              "exact",
-                                              "ignore",
-                                              "subset",
-                                              "superset"
-                                            ]
+                                            "enum": ["exact", "ignore", "subset", "superset"]
                                           },
                                           {
                                             "type": "array",
@@ -8869,9 +7983,7 @@
                                         ]
                                       }
                                     },
-                                    "required": [
-                                      "tool"
-                                    ],
+                                    "required": ["tool"],
                                     "additionalProperties": false
                                   }
                                 },
@@ -8879,12 +7991,7 @@
                                   "anyOf": [
                                     {
                                       "type": "string",
-                                      "enum": [
-                                        "exact",
-                                        "ignore",
-                                        "subset",
-                                        "superset"
-                                      ]
+                                      "enum": ["exact", "ignore", "subset", "superset"]
                                     },
                                     {
                                       "type": "array",
@@ -8898,12 +8005,7 @@
                                   "anyOf": [
                                     {
                                       "type": "string",
-                                      "enum": [
-                                        "exact",
-                                        "ignore",
-                                        "subset",
-                                        "superset"
-                                      ]
+                                      "enum": ["exact", "ignore", "subset", "superset"]
                                     },
                                     {
                                       "type": "array",
@@ -8914,10 +8016,7 @@
                                   ]
                                 }
                               },
-                              "required": [
-                                "type",
-                                "mode"
-                              ],
+                              "required": ["type", "mode"],
                               "additionalProperties": false
                             },
                             {
@@ -8954,10 +8053,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "field-accuracy",
-                                    "field_accuracy"
-                                  ]
+                                  "enum": ["field-accuracy", "field_accuracy"]
                                 },
                                 "fields": {
                                   "type": "array",
@@ -8969,11 +8065,7 @@
                                       },
                                       "match": {
                                         "type": "string",
-                                        "enum": [
-                                          "exact",
-                                          "numeric_tolerance",
-                                          "date"
-                                        ]
+                                        "enum": ["exact", "numeric_tolerance", "date"]
                                       },
                                       "required": {
                                         "type": "boolean"
@@ -8995,26 +8087,17 @@
                                         }
                                       }
                                     },
-                                    "required": [
-                                      "path",
-                                      "match"
-                                    ],
+                                    "required": ["path", "match"],
                                     "additionalProperties": false
                                   },
                                   "minItems": 1
                                 },
                                 "aggregation": {
                                   "type": "string",
-                                  "enum": [
-                                    "weighted_average",
-                                    "all_or_nothing"
-                                  ]
+                                  "enum": ["weighted_average", "all_or_nothing"]
                                 }
                               },
-                              "required": [
-                                "type",
-                                "fields"
-                              ],
+                              "required": ["type", "fields"],
                               "additionalProperties": false
                             },
                             {
@@ -9058,10 +8141,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type",
-                                "threshold"
-                              ],
+                              "required": ["type", "threshold"],
                               "additionalProperties": false
                             },
                             {
@@ -9105,10 +8185,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type",
-                                "budget"
-                              ],
+                              "required": ["type", "budget"],
                               "additionalProperties": false
                             },
                             {
@@ -9145,10 +8222,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "token-usage",
-                                    "token_usage"
-                                  ]
+                                  "enum": ["token-usage", "token_usage"]
                                 },
                                 "max_total": {
                                   "type": "number",
@@ -9163,9 +8237,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -9202,10 +8274,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "execution-metrics",
-                                    "execution_metrics"
-                                  ]
+                                  "enum": ["execution-metrics", "execution_metrics"]
                                 },
                                 "max_tool_calls": {
                                   "type": "number",
@@ -9237,9 +8306,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -9282,10 +8349,7 @@
                                   "type": "string"
                                 }
                               },
-                              "required": [
-                                "type",
-                                "value"
-                              ],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -9328,10 +8392,7 @@
                                   "type": "string"
                                 }
                               },
-                              "required": [
-                                "type",
-                                "value"
-                              ],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -9368,15 +8429,10 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "is-json",
-                                    "is_json"
-                                  ]
+                                  "enum": ["is-json", "is_json"]
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -9419,10 +8475,7 @@
                                   "type": "string"
                                 }
                               },
-                              "required": [
-                                "type",
-                                "value"
-                              ],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -9511,10 +8564,7 @@
                                               "minLength": 1
                                             }
                                           },
-                                          "required": [
-                                            "score_range",
-                                            "outcome"
-                                          ],
+                                          "required": ["score_range", "outcome"],
                                           "additionalProperties": false
                                         }
                                       }
@@ -9524,10 +8574,7 @@
                                   "minItems": 1
                                 }
                               },
-                              "required": [
-                                "type",
-                                "criteria"
-                              ],
+                              "required": ["type", "criteria"],
                               "additionalProperties": false
                             }
                           ]
@@ -9571,10 +8618,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "code-grader",
-                                    "code_grader"
-                                  ]
+                                  "enum": ["code-grader", "code_grader"]
                                 },
                                 "command": {
                                   "anyOf": [
@@ -9648,18 +8692,12 @@
                                         ]
                                       }
                                     },
-                                    "required": [
-                                      "type",
-                                      "command"
-                                    ],
+                                    "required": ["type", "command"],
                                     "additionalProperties": false
                                   }
                                 }
                               },
-                              "required": [
-                                "type",
-                                "command"
-                              ],
+                              "required": ["type", "command"],
                               "additionalProperties": false
                             },
                             {
@@ -9696,10 +8734,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "llm-grader",
-                                    "llm_grader"
-                                  ]
+                                  "enum": ["llm-grader", "llm_grader"]
                                 },
                                 "prompt": {
                                   "anyOf": [
@@ -9794,10 +8829,7 @@
                                               "minLength": 1
                                             }
                                           },
-                                          "required": [
-                                            "score_range",
-                                            "outcome"
-                                          ],
+                                          "required": ["score_range", "outcome"],
                                           "additionalProperties": false
                                         }
                                       }
@@ -9848,17 +8880,12 @@
                                         ]
                                       }
                                     },
-                                    "required": [
-                                      "type",
-                                      "command"
-                                    ],
+                                    "required": ["type", "command"],
                                     "additionalProperties": false
                                   }
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -9921,9 +8948,7 @@
                                           }
                                         }
                                       },
-                                      "required": [
-                                        "type"
-                                      ],
+                                      "required": ["type"],
                                       "additionalProperties": false
                                     },
                                     {
@@ -9939,10 +8964,7 @@
                                           "maximum": 1
                                         }
                                       },
-                                      "required": [
-                                        "type",
-                                        "threshold"
-                                      ],
+                                      "required": ["type", "threshold"],
                                       "additionalProperties": false
                                     },
                                     {
@@ -9959,10 +8981,7 @@
                                           "type": "string"
                                         }
                                       },
-                                      "required": [
-                                        "type",
-                                        "path"
-                                      ],
+                                      "required": ["type", "path"],
                                       "additionalProperties": false
                                     },
                                     {
@@ -9979,18 +8998,13 @@
                                           "type": "string"
                                         }
                                       },
-                                      "required": [
-                                        "type"
-                                      ],
+                                      "required": ["type"],
                                       "additionalProperties": false
                                     }
                                   ]
                                 }
                               },
-                              "required": [
-                                "type",
-                                "aggregator"
-                              ],
+                              "required": ["type", "aggregator"],
                               "additionalProperties": false
                             },
                             {
@@ -10027,20 +9041,11 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "tool-trajectory",
-                                    "tool_trajectory"
-                                  ]
+                                  "enum": ["tool-trajectory", "tool_trajectory"]
                                 },
                                 "mode": {
                                   "type": "string",
-                                  "enum": [
-                                    "any_order",
-                                    "in_order",
-                                    "exact",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["any_order", "in_order", "exact", "subset", "superset"]
                                 },
                                 "minimums": {
                                   "type": "object",
@@ -10081,12 +9086,7 @@
                                         "anyOf": [
                                           {
                                             "type": "string",
-                                            "enum": [
-                                              "exact",
-                                              "ignore",
-                                              "subset",
-                                              "superset"
-                                            ]
+                                            "enum": ["exact", "ignore", "subset", "superset"]
                                           },
                                           {
                                             "type": "array",
@@ -10100,12 +9100,7 @@
                                         "anyOf": [
                                           {
                                             "type": "string",
-                                            "enum": [
-                                              "exact",
-                                              "ignore",
-                                              "subset",
-                                              "superset"
-                                            ]
+                                            "enum": ["exact", "ignore", "subset", "superset"]
                                           },
                                           {
                                             "type": "array",
@@ -10116,9 +9111,7 @@
                                         ]
                                       }
                                     },
-                                    "required": [
-                                      "tool"
-                                    ],
+                                    "required": ["tool"],
                                     "additionalProperties": false
                                   }
                                 },
@@ -10126,12 +9119,7 @@
                                   "anyOf": [
                                     {
                                       "type": "string",
-                                      "enum": [
-                                        "exact",
-                                        "ignore",
-                                        "subset",
-                                        "superset"
-                                      ]
+                                      "enum": ["exact", "ignore", "subset", "superset"]
                                     },
                                     {
                                       "type": "array",
@@ -10145,12 +9133,7 @@
                                   "anyOf": [
                                     {
                                       "type": "string",
-                                      "enum": [
-                                        "exact",
-                                        "ignore",
-                                        "subset",
-                                        "superset"
-                                      ]
+                                      "enum": ["exact", "ignore", "subset", "superset"]
                                     },
                                     {
                                       "type": "array",
@@ -10161,10 +9144,7 @@
                                   ]
                                 }
                               },
-                              "required": [
-                                "type",
-                                "mode"
-                              ],
+                              "required": ["type", "mode"],
                               "additionalProperties": false
                             },
                             {
@@ -10201,10 +9181,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "field-accuracy",
-                                    "field_accuracy"
-                                  ]
+                                  "enum": ["field-accuracy", "field_accuracy"]
                                 },
                                 "fields": {
                                   "type": "array",
@@ -10216,11 +9193,7 @@
                                       },
                                       "match": {
                                         "type": "string",
-                                        "enum": [
-                                          "exact",
-                                          "numeric_tolerance",
-                                          "date"
-                                        ]
+                                        "enum": ["exact", "numeric_tolerance", "date"]
                                       },
                                       "required": {
                                         "type": "boolean"
@@ -10242,26 +9215,17 @@
                                         }
                                       }
                                     },
-                                    "required": [
-                                      "path",
-                                      "match"
-                                    ],
+                                    "required": ["path", "match"],
                                     "additionalProperties": false
                                   },
                                   "minItems": 1
                                 },
                                 "aggregation": {
                                   "type": "string",
-                                  "enum": [
-                                    "weighted_average",
-                                    "all_or_nothing"
-                                  ]
+                                  "enum": ["weighted_average", "all_or_nothing"]
                                 }
                               },
-                              "required": [
-                                "type",
-                                "fields"
-                              ],
+                              "required": ["type", "fields"],
                               "additionalProperties": false
                             },
                             {
@@ -10305,10 +9269,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type",
-                                "threshold"
-                              ],
+                              "required": ["type", "threshold"],
                               "additionalProperties": false
                             },
                             {
@@ -10352,10 +9313,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type",
-                                "budget"
-                              ],
+                              "required": ["type", "budget"],
                               "additionalProperties": false
                             },
                             {
@@ -10392,10 +9350,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "token-usage",
-                                    "token_usage"
-                                  ]
+                                  "enum": ["token-usage", "token_usage"]
                                 },
                                 "max_total": {
                                   "type": "number",
@@ -10410,9 +9365,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -10449,10 +9402,7 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "execution-metrics",
-                                    "execution_metrics"
-                                  ]
+                                  "enum": ["execution-metrics", "execution_metrics"]
                                 },
                                 "max_tool_calls": {
                                   "type": "number",
@@ -10484,9 +9434,7 @@
                                   "minimum": 0
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -10529,10 +9477,7 @@
                                   "type": "string"
                                 }
                               },
-                              "required": [
-                                "type",
-                                "value"
-                              ],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -10575,10 +9520,7 @@
                                   "type": "string"
                                 }
                               },
-                              "required": [
-                                "type",
-                                "value"
-                              ],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -10615,15 +9557,10 @@
                                 },
                                 "type": {
                                   "type": "string",
-                                  "enum": [
-                                    "is-json",
-                                    "is_json"
-                                  ]
+                                  "enum": ["is-json", "is_json"]
                                 }
                               },
-                              "required": [
-                                "type"
-                              ],
+                              "required": ["type"],
                               "additionalProperties": false
                             },
                             {
@@ -10666,10 +9603,7 @@
                                   "type": "string"
                                 }
                               },
-                              "required": [
-                                "type",
-                                "value"
-                              ],
+                              "required": ["type", "value"],
                               "additionalProperties": false
                             },
                             {
@@ -10758,10 +9692,7 @@
                                               "minLength": 1
                                             }
                                           },
-                                          "required": [
-                                            "score_range",
-                                            "outcome"
-                                          ],
+                                          "required": ["score_range", "outcome"],
                                           "additionalProperties": false
                                         }
                                       }
@@ -10771,10 +9702,7 @@
                                   "minItems": 1
                                 }
                               },
-                              "required": [
-                                "type",
-                                "criteria"
-                              ],
+                              "required": ["type", "criteria"],
                               "additionalProperties": false
                             }
                           ]
@@ -10795,11 +9723,7 @@
                           },
                           "strategy": {
                             "type": "string",
-                            "enum": [
-                              "pass_at_k",
-                              "mean",
-                              "confidence_interval"
-                            ]
+                            "enum": ["pass_at_k", "mean", "confidence_interval"]
                           },
                           "cost_limit_usd": {
                             "type": "number",
@@ -10810,9 +9734,7 @@
                             "minimum": 0
                           }
                         },
-                        "required": [
-                          "count"
-                        ],
+                        "required": ["count"],
                         "additionalProperties": false
                       },
                       "total_budget_usd": {
@@ -10845,10 +9767,7 @@
                       },
                       "isolation": {
                         "type": "string",
-                        "enum": [
-                          "shared",
-                          "per_test"
-                        ]
+                        "enum": ["shared", "per_test"]
                       },
                       "repos": {
                         "type": "array",
@@ -10872,10 +9791,7 @@
                                       "format": "uri"
                                     }
                                   },
-                                  "required": [
-                                    "type",
-                                    "url"
-                                  ],
+                                  "required": ["type", "url"],
                                   "additionalProperties": false
                                 },
                                 {
@@ -10889,10 +9805,7 @@
                                       "type": "string"
                                     }
                                   },
-                                  "required": [
-                                    "type",
-                                    "path"
-                                  ],
+                                  "required": ["type", "path"],
                                   "additionalProperties": false
                                 }
                               ]
@@ -10905,10 +9818,7 @@
                                 },
                                 "resolve": {
                                   "type": "string",
-                                  "enum": [
-                                    "remote",
-                                    "local"
-                                  ]
+                                  "enum": ["remote", "local"]
                                 },
                                 "ancestor": {
                                   "type": "integer",
@@ -10937,10 +9847,7 @@
                               "additionalProperties": false
                             }
                           },
-                          "required": [
-                            "path",
-                            "source"
-                          ],
+                          "required": ["path", "source"],
                           "additionalProperties": false
                         }
                       },
@@ -10976,11 +9883,7 @@
                               },
                               "reset": {
                                 "type": "string",
-                                "enum": [
-                                  "none",
-                                  "fast",
-                                  "strict"
-                                ]
+                                "enum": ["none", "fast", "strict"]
                               }
                             },
                             "additionalProperties": false
@@ -11011,11 +9914,7 @@
                               },
                               "reset": {
                                 "type": "string",
-                                "enum": [
-                                  "none",
-                                  "fast",
-                                  "strict"
-                                ]
+                                "enum": ["none", "fast", "strict"]
                               }
                             },
                             "additionalProperties": false
@@ -11046,11 +9945,7 @@
                               },
                               "reset": {
                                 "type": "string",
-                                "enum": [
-                                  "none",
-                                  "fast",
-                                  "strict"
-                                ]
+                                "enum": ["none", "fast", "strict"]
                               }
                             },
                             "additionalProperties": false
@@ -11081,11 +9976,7 @@
                               },
                               "reset": {
                                 "type": "string",
-                                "enum": [
-                                  "none",
-                                  "fast",
-                                  "strict"
-                                ]
+                                "enum": ["none", "fast", "strict"]
                               }
                             },
                             "additionalProperties": false
@@ -11095,11 +9986,7 @@
                       },
                       "mode": {
                         "type": "string",
-                        "enum": [
-                          "pooled",
-                          "temp",
-                          "static"
-                        ]
+                        "enum": ["pooled", "temp", "static"]
                       },
                       "path": {
                         "type": "string"
@@ -11122,9 +10009,7 @@
                             "minimum": 0.1
                           }
                         },
-                        "required": [
-                          "image"
-                        ],
+                        "required": ["image"],
                         "additionalProperties": false
                       }
                     },
@@ -11144,9 +10029,7 @@
                     "type": "string"
                   }
                 },
-                "required": [
-                  "id"
-                ],
+                "required": ["id"],
                 "additionalProperties": false
               }
             },
@@ -11213,10 +10096,7 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "code-grader",
-                          "code_grader"
-                        ]
+                        "enum": ["code-grader", "code_grader"]
                       },
                       "command": {
                         "anyOf": [
@@ -11290,18 +10170,12 @@
                               ]
                             }
                           },
-                          "required": [
-                            "type",
-                            "command"
-                          ],
+                          "required": ["type", "command"],
                           "additionalProperties": false
                         }
                       }
                     },
-                    "required": [
-                      "type",
-                      "command"
-                    ],
+                    "required": ["type", "command"],
                     "additionalProperties": false
                   },
                   {
@@ -11338,10 +10212,7 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "llm-grader",
-                          "llm_grader"
-                        ]
+                        "enum": ["llm-grader", "llm_grader"]
                       },
                       "prompt": {
                         "anyOf": [
@@ -11436,10 +10307,7 @@
                                     "minLength": 1
                                   }
                                 },
-                                "required": [
-                                  "score_range",
-                                  "outcome"
-                                ],
+                                "required": ["score_range", "outcome"],
                                 "additionalProperties": false
                               }
                             }
@@ -11490,17 +10358,12 @@
                               ]
                             }
                           },
-                          "required": [
-                            "type",
-                            "command"
-                          ],
+                          "required": ["type", "command"],
                           "additionalProperties": false
                         }
                       }
                     },
-                    "required": [
-                      "type"
-                    ],
+                    "required": ["type"],
                     "additionalProperties": false
                   },
                   {
@@ -11563,9 +10426,7 @@
                                 }
                               }
                             },
-                            "required": [
-                              "type"
-                            ],
+                            "required": ["type"],
                             "additionalProperties": false
                           },
                           {
@@ -11581,10 +10442,7 @@
                                 "maximum": 1
                               }
                             },
-                            "required": [
-                              "type",
-                              "threshold"
-                            ],
+                            "required": ["type", "threshold"],
                             "additionalProperties": false
                           },
                           {
@@ -11601,10 +10459,7 @@
                                 "type": "string"
                               }
                             },
-                            "required": [
-                              "type",
-                              "path"
-                            ],
+                            "required": ["type", "path"],
                             "additionalProperties": false
                           },
                           {
@@ -11621,18 +10476,13 @@
                                 "type": "string"
                               }
                             },
-                            "required": [
-                              "type"
-                            ],
+                            "required": ["type"],
                             "additionalProperties": false
                           }
                         ]
                       }
                     },
-                    "required": [
-                      "type",
-                      "aggregator"
-                    ],
+                    "required": ["type", "aggregator"],
                     "additionalProperties": false
                   },
                   {
@@ -11669,20 +10519,11 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "tool-trajectory",
-                          "tool_trajectory"
-                        ]
+                        "enum": ["tool-trajectory", "tool_trajectory"]
                       },
                       "mode": {
                         "type": "string",
-                        "enum": [
-                          "any_order",
-                          "in_order",
-                          "exact",
-                          "subset",
-                          "superset"
-                        ]
+                        "enum": ["any_order", "in_order", "exact", "subset", "superset"]
                       },
                       "minimums": {
                         "type": "object",
@@ -11723,12 +10564,7 @@
                               "anyOf": [
                                 {
                                   "type": "string",
-                                  "enum": [
-                                    "exact",
-                                    "ignore",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
                                 {
                                   "type": "array",
@@ -11742,12 +10578,7 @@
                               "anyOf": [
                                 {
                                   "type": "string",
-                                  "enum": [
-                                    "exact",
-                                    "ignore",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
                                 {
                                   "type": "array",
@@ -11758,9 +10589,7 @@
                               ]
                             }
                           },
-                          "required": [
-                            "tool"
-                          ],
+                          "required": ["tool"],
                           "additionalProperties": false
                         }
                       },
@@ -11768,12 +10597,7 @@
                         "anyOf": [
                           {
                             "type": "string",
-                            "enum": [
-                              "exact",
-                              "ignore",
-                              "subset",
-                              "superset"
-                            ]
+                            "enum": ["exact", "ignore", "subset", "superset"]
                           },
                           {
                             "type": "array",
@@ -11787,12 +10611,7 @@
                         "anyOf": [
                           {
                             "type": "string",
-                            "enum": [
-                              "exact",
-                              "ignore",
-                              "subset",
-                              "superset"
-                            ]
+                            "enum": ["exact", "ignore", "subset", "superset"]
                           },
                           {
                             "type": "array",
@@ -11803,10 +10622,7 @@
                         ]
                       }
                     },
-                    "required": [
-                      "type",
-                      "mode"
-                    ],
+                    "required": ["type", "mode"],
                     "additionalProperties": false
                   },
                   {
@@ -11843,10 +10659,7 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "field-accuracy",
-                          "field_accuracy"
-                        ]
+                        "enum": ["field-accuracy", "field_accuracy"]
                       },
                       "fields": {
                         "type": "array",
@@ -11858,11 +10671,7 @@
                             },
                             "match": {
                               "type": "string",
-                              "enum": [
-                                "exact",
-                                "numeric_tolerance",
-                                "date"
-                              ]
+                              "enum": ["exact", "numeric_tolerance", "date"]
                             },
                             "required": {
                               "type": "boolean"
@@ -11884,26 +10693,17 @@
                               }
                             }
                           },
-                          "required": [
-                            "path",
-                            "match"
-                          ],
+                          "required": ["path", "match"],
                           "additionalProperties": false
                         },
                         "minItems": 1
                       },
                       "aggregation": {
                         "type": "string",
-                        "enum": [
-                          "weighted_average",
-                          "all_or_nothing"
-                        ]
+                        "enum": ["weighted_average", "all_or_nothing"]
                       }
                     },
-                    "required": [
-                      "type",
-                      "fields"
-                    ],
+                    "required": ["type", "fields"],
                     "additionalProperties": false
                   },
                   {
@@ -11947,10 +10747,7 @@
                         "minimum": 0
                       }
                     },
-                    "required": [
-                      "type",
-                      "threshold"
-                    ],
+                    "required": ["type", "threshold"],
                     "additionalProperties": false
                   },
                   {
@@ -11994,10 +10791,7 @@
                         "minimum": 0
                       }
                     },
-                    "required": [
-                      "type",
-                      "budget"
-                    ],
+                    "required": ["type", "budget"],
                     "additionalProperties": false
                   },
                   {
@@ -12034,10 +10828,7 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "token-usage",
-                          "token_usage"
-                        ]
+                        "enum": ["token-usage", "token_usage"]
                       },
                       "max_total": {
                         "type": "number",
@@ -12052,9 +10843,7 @@
                         "minimum": 0
                       }
                     },
-                    "required": [
-                      "type"
-                    ],
+                    "required": ["type"],
                     "additionalProperties": false
                   },
                   {
@@ -12091,10 +10880,7 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "execution-metrics",
-                          "execution_metrics"
-                        ]
+                        "enum": ["execution-metrics", "execution_metrics"]
                       },
                       "max_tool_calls": {
                         "type": "number",
@@ -12126,9 +10912,7 @@
                         "minimum": 0
                       }
                     },
-                    "required": [
-                      "type"
-                    ],
+                    "required": ["type"],
                     "additionalProperties": false
                   },
                   {
@@ -12171,10 +10955,7 @@
                         "type": "string"
                       }
                     },
-                    "required": [
-                      "type",
-                      "value"
-                    ],
+                    "required": ["type", "value"],
                     "additionalProperties": false
                   },
                   {
@@ -12217,10 +10998,7 @@
                         "type": "string"
                       }
                     },
-                    "required": [
-                      "type",
-                      "value"
-                    ],
+                    "required": ["type", "value"],
                     "additionalProperties": false
                   },
                   {
@@ -12257,15 +11035,10 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "is-json",
-                          "is_json"
-                        ]
+                        "enum": ["is-json", "is_json"]
                       }
                     },
-                    "required": [
-                      "type"
-                    ],
+                    "required": ["type"],
                     "additionalProperties": false
                   },
                   {
@@ -12308,10 +11081,7 @@
                         "type": "string"
                       }
                     },
-                    "required": [
-                      "type",
-                      "value"
-                    ],
+                    "required": ["type", "value"],
                     "additionalProperties": false
                   },
                   {
@@ -12400,10 +11170,7 @@
                                     "minLength": 1
                                   }
                                 },
-                                "required": [
-                                  "score_range",
-                                  "outcome"
-                                ],
+                                "required": ["score_range", "outcome"],
                                 "additionalProperties": false
                               }
                             }
@@ -12413,10 +11180,7 @@
                         "minItems": 1
                       }
                     },
-                    "required": [
-                      "type",
-                      "criteria"
-                    ],
+                    "required": ["type", "criteria"],
                     "additionalProperties": false
                   }
                 ]
@@ -12460,10 +11224,7 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "code-grader",
-                          "code_grader"
-                        ]
+                        "enum": ["code-grader", "code_grader"]
                       },
                       "command": {
                         "anyOf": [
@@ -12537,18 +11298,12 @@
                               ]
                             }
                           },
-                          "required": [
-                            "type",
-                            "command"
-                          ],
+                          "required": ["type", "command"],
                           "additionalProperties": false
                         }
                       }
                     },
-                    "required": [
-                      "type",
-                      "command"
-                    ],
+                    "required": ["type", "command"],
                     "additionalProperties": false
                   },
                   {
@@ -12585,10 +11340,7 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "llm-grader",
-                          "llm_grader"
-                        ]
+                        "enum": ["llm-grader", "llm_grader"]
                       },
                       "prompt": {
                         "anyOf": [
@@ -12683,10 +11435,7 @@
                                     "minLength": 1
                                   }
                                 },
-                                "required": [
-                                  "score_range",
-                                  "outcome"
-                                ],
+                                "required": ["score_range", "outcome"],
                                 "additionalProperties": false
                               }
                             }
@@ -12737,17 +11486,12 @@
                               ]
                             }
                           },
-                          "required": [
-                            "type",
-                            "command"
-                          ],
+                          "required": ["type", "command"],
                           "additionalProperties": false
                         }
                       }
                     },
-                    "required": [
-                      "type"
-                    ],
+                    "required": ["type"],
                     "additionalProperties": false
                   },
                   {
@@ -12810,9 +11554,7 @@
                                 }
                               }
                             },
-                            "required": [
-                              "type"
-                            ],
+                            "required": ["type"],
                             "additionalProperties": false
                           },
                           {
@@ -12828,10 +11570,7 @@
                                 "maximum": 1
                               }
                             },
-                            "required": [
-                              "type",
-                              "threshold"
-                            ],
+                            "required": ["type", "threshold"],
                             "additionalProperties": false
                           },
                           {
@@ -12848,10 +11587,7 @@
                                 "type": "string"
                               }
                             },
-                            "required": [
-                              "type",
-                              "path"
-                            ],
+                            "required": ["type", "path"],
                             "additionalProperties": false
                           },
                           {
@@ -12868,18 +11604,13 @@
                                 "type": "string"
                               }
                             },
-                            "required": [
-                              "type"
-                            ],
+                            "required": ["type"],
                             "additionalProperties": false
                           }
                         ]
                       }
                     },
-                    "required": [
-                      "type",
-                      "aggregator"
-                    ],
+                    "required": ["type", "aggregator"],
                     "additionalProperties": false
                   },
                   {
@@ -12916,20 +11647,11 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "tool-trajectory",
-                          "tool_trajectory"
-                        ]
+                        "enum": ["tool-trajectory", "tool_trajectory"]
                       },
                       "mode": {
                         "type": "string",
-                        "enum": [
-                          "any_order",
-                          "in_order",
-                          "exact",
-                          "subset",
-                          "superset"
-                        ]
+                        "enum": ["any_order", "in_order", "exact", "subset", "superset"]
                       },
                       "minimums": {
                         "type": "object",
@@ -12970,12 +11692,7 @@
                               "anyOf": [
                                 {
                                   "type": "string",
-                                  "enum": [
-                                    "exact",
-                                    "ignore",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
                                 {
                                   "type": "array",
@@ -12989,12 +11706,7 @@
                               "anyOf": [
                                 {
                                   "type": "string",
-                                  "enum": [
-                                    "exact",
-                                    "ignore",
-                                    "subset",
-                                    "superset"
-                                  ]
+                                  "enum": ["exact", "ignore", "subset", "superset"]
                                 },
                                 {
                                   "type": "array",
@@ -13005,9 +11717,7 @@
                               ]
                             }
                           },
-                          "required": [
-                            "tool"
-                          ],
+                          "required": ["tool"],
                           "additionalProperties": false
                         }
                       },
@@ -13015,12 +11725,7 @@
                         "anyOf": [
                           {
                             "type": "string",
-                            "enum": [
-                              "exact",
-                              "ignore",
-                              "subset",
-                              "superset"
-                            ]
+                            "enum": ["exact", "ignore", "subset", "superset"]
                           },
                           {
                             "type": "array",
@@ -13034,12 +11739,7 @@
                         "anyOf": [
                           {
                             "type": "string",
-                            "enum": [
-                              "exact",
-                              "ignore",
-                              "subset",
-                              "superset"
-                            ]
+                            "enum": ["exact", "ignore", "subset", "superset"]
                           },
                           {
                             "type": "array",
@@ -13050,10 +11750,7 @@
                         ]
                       }
                     },
-                    "required": [
-                      "type",
-                      "mode"
-                    ],
+                    "required": ["type", "mode"],
                     "additionalProperties": false
                   },
                   {
@@ -13090,10 +11787,7 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "field-accuracy",
-                          "field_accuracy"
-                        ]
+                        "enum": ["field-accuracy", "field_accuracy"]
                       },
                       "fields": {
                         "type": "array",
@@ -13105,11 +11799,7 @@
                             },
                             "match": {
                               "type": "string",
-                              "enum": [
-                                "exact",
-                                "numeric_tolerance",
-                                "date"
-                              ]
+                              "enum": ["exact", "numeric_tolerance", "date"]
                             },
                             "required": {
                               "type": "boolean"
@@ -13131,26 +11821,17 @@
                               }
                             }
                           },
-                          "required": [
-                            "path",
-                            "match"
-                          ],
+                          "required": ["path", "match"],
                           "additionalProperties": false
                         },
                         "minItems": 1
                       },
                       "aggregation": {
                         "type": "string",
-                        "enum": [
-                          "weighted_average",
-                          "all_or_nothing"
-                        ]
+                        "enum": ["weighted_average", "all_or_nothing"]
                       }
                     },
-                    "required": [
-                      "type",
-                      "fields"
-                    ],
+                    "required": ["type", "fields"],
                     "additionalProperties": false
                   },
                   {
@@ -13194,10 +11875,7 @@
                         "minimum": 0
                       }
                     },
-                    "required": [
-                      "type",
-                      "threshold"
-                    ],
+                    "required": ["type", "threshold"],
                     "additionalProperties": false
                   },
                   {
@@ -13241,10 +11919,7 @@
                         "minimum": 0
                       }
                     },
-                    "required": [
-                      "type",
-                      "budget"
-                    ],
+                    "required": ["type", "budget"],
                     "additionalProperties": false
                   },
                   {
@@ -13281,10 +11956,7 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "token-usage",
-                          "token_usage"
-                        ]
+                        "enum": ["token-usage", "token_usage"]
                       },
                       "max_total": {
                         "type": "number",
@@ -13299,9 +11971,7 @@
                         "minimum": 0
                       }
                     },
-                    "required": [
-                      "type"
-                    ],
+                    "required": ["type"],
                     "additionalProperties": false
                   },
                   {
@@ -13338,10 +12008,7 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "execution-metrics",
-                          "execution_metrics"
-                        ]
+                        "enum": ["execution-metrics", "execution_metrics"]
                       },
                       "max_tool_calls": {
                         "type": "number",
@@ -13373,9 +12040,7 @@
                         "minimum": 0
                       }
                     },
-                    "required": [
-                      "type"
-                    ],
+                    "required": ["type"],
                     "additionalProperties": false
                   },
                   {
@@ -13418,10 +12083,7 @@
                         "type": "string"
                       }
                     },
-                    "required": [
-                      "type",
-                      "value"
-                    ],
+                    "required": ["type", "value"],
                     "additionalProperties": false
                   },
                   {
@@ -13464,10 +12126,7 @@
                         "type": "string"
                       }
                     },
-                    "required": [
-                      "type",
-                      "value"
-                    ],
+                    "required": ["type", "value"],
                     "additionalProperties": false
                   },
                   {
@@ -13504,15 +12163,10 @@
                       },
                       "type": {
                         "type": "string",
-                        "enum": [
-                          "is-json",
-                          "is_json"
-                        ]
+                        "enum": ["is-json", "is_json"]
                       }
                     },
-                    "required": [
-                      "type"
-                    ],
+                    "required": ["type"],
                     "additionalProperties": false
                   },
                   {
@@ -13555,10 +12209,7 @@
                         "type": "string"
                       }
                     },
-                    "required": [
-                      "type",
-                      "value"
-                    ],
+                    "required": ["type", "value"],
                     "additionalProperties": false
                   },
                   {
@@ -13647,10 +12298,7 @@
                                     "minLength": 1
                                   }
                                 },
-                                "required": [
-                                  "score_range",
-                                  "outcome"
-                                ],
+                                "required": ["score_range", "outcome"],
                                 "additionalProperties": false
                               }
                             }
@@ -13660,10 +12308,7 @@
                         "minItems": 1
                       }
                     },
-                    "required": [
-                      "type",
-                      "criteria"
-                    ],
+                    "required": ["type", "criteria"],
                     "additionalProperties": false
                   }
                 ]
@@ -13684,11 +12329,7 @@
                 },
                 "strategy": {
                   "type": "string",
-                  "enum": [
-                    "pass_at_k",
-                    "mean",
-                    "confidence_interval"
-                  ]
+                  "enum": ["pass_at_k", "mean", "confidence_interval"]
                 },
                 "cost_limit_usd": {
                   "type": "number",
@@ -13699,9 +12340,7 @@
                   "minimum": 0
                 }
               },
-              "required": [
-                "count"
-              ],
+              "required": ["count"],
               "additionalProperties": false
             },
             "total_budget_usd": {
@@ -13764,10 +12403,7 @@
                   },
                   "type": {
                     "type": "string",
-                    "enum": [
-                      "code-grader",
-                      "code_grader"
-                    ]
+                    "enum": ["code-grader", "code_grader"]
                   },
                   "command": {
                     "anyOf": [
@@ -13841,18 +12477,12 @@
                           ]
                         }
                       },
-                      "required": [
-                        "type",
-                        "command"
-                      ],
+                      "required": ["type", "command"],
                       "additionalProperties": false
                     }
                   }
                 },
-                "required": [
-                  "type",
-                  "command"
-                ],
+                "required": ["type", "command"],
                 "additionalProperties": false
               },
               {
@@ -13889,10 +12519,7 @@
                   },
                   "type": {
                     "type": "string",
-                    "enum": [
-                      "llm-grader",
-                      "llm_grader"
-                    ]
+                    "enum": ["llm-grader", "llm_grader"]
                   },
                   "prompt": {
                     "anyOf": [
@@ -13987,10 +12614,7 @@
                                 "minLength": 1
                               }
                             },
-                            "required": [
-                              "score_range",
-                              "outcome"
-                            ],
+                            "required": ["score_range", "outcome"],
                             "additionalProperties": false
                           }
                         }
@@ -14041,17 +12665,12 @@
                           ]
                         }
                       },
-                      "required": [
-                        "type",
-                        "command"
-                      ],
+                      "required": ["type", "command"],
                       "additionalProperties": false
                     }
                   }
                 },
-                "required": [
-                  "type"
-                ],
+                "required": ["type"],
                 "additionalProperties": false
               },
               {
@@ -14114,9 +12733,7 @@
                             }
                           }
                         },
-                        "required": [
-                          "type"
-                        ],
+                        "required": ["type"],
                         "additionalProperties": false
                       },
                       {
@@ -14132,10 +12749,7 @@
                             "maximum": 1
                           }
                         },
-                        "required": [
-                          "type",
-                          "threshold"
-                        ],
+                        "required": ["type", "threshold"],
                         "additionalProperties": false
                       },
                       {
@@ -14152,10 +12766,7 @@
                             "type": "string"
                           }
                         },
-                        "required": [
-                          "type",
-                          "path"
-                        ],
+                        "required": ["type", "path"],
                         "additionalProperties": false
                       },
                       {
@@ -14172,18 +12783,13 @@
                             "type": "string"
                           }
                         },
-                        "required": [
-                          "type"
-                        ],
+                        "required": ["type"],
                         "additionalProperties": false
                       }
                     ]
                   }
                 },
-                "required": [
-                  "type",
-                  "aggregator"
-                ],
+                "required": ["type", "aggregator"],
                 "additionalProperties": false
               },
               {
@@ -14220,20 +12826,11 @@
                   },
                   "type": {
                     "type": "string",
-                    "enum": [
-                      "tool-trajectory",
-                      "tool_trajectory"
-                    ]
+                    "enum": ["tool-trajectory", "tool_trajectory"]
                   },
                   "mode": {
                     "type": "string",
-                    "enum": [
-                      "any_order",
-                      "in_order",
-                      "exact",
-                      "subset",
-                      "superset"
-                    ]
+                    "enum": ["any_order", "in_order", "exact", "subset", "superset"]
                   },
                   "minimums": {
                     "type": "object",
@@ -14274,12 +12871,7 @@
                           "anyOf": [
                             {
                               "type": "string",
-                              "enum": [
-                                "exact",
-                                "ignore",
-                                "subset",
-                                "superset"
-                              ]
+                              "enum": ["exact", "ignore", "subset", "superset"]
                             },
                             {
                               "type": "array",
@@ -14293,12 +12885,7 @@
                           "anyOf": [
                             {
                               "type": "string",
-                              "enum": [
-                                "exact",
-                                "ignore",
-                                "subset",
-                                "superset"
-                              ]
+                              "enum": ["exact", "ignore", "subset", "superset"]
                             },
                             {
                               "type": "array",
@@ -14309,9 +12896,7 @@
                           ]
                         }
                       },
-                      "required": [
-                        "tool"
-                      ],
+                      "required": ["tool"],
                       "additionalProperties": false
                     }
                   },
@@ -14319,12 +12904,7 @@
                     "anyOf": [
                       {
                         "type": "string",
-                        "enum": [
-                          "exact",
-                          "ignore",
-                          "subset",
-                          "superset"
-                        ]
+                        "enum": ["exact", "ignore", "subset", "superset"]
                       },
                       {
                         "type": "array",
@@ -14338,12 +12918,7 @@
                     "anyOf": [
                       {
                         "type": "string",
-                        "enum": [
-                          "exact",
-                          "ignore",
-                          "subset",
-                          "superset"
-                        ]
+                        "enum": ["exact", "ignore", "subset", "superset"]
                       },
                       {
                         "type": "array",
@@ -14354,10 +12929,7 @@
                     ]
                   }
                 },
-                "required": [
-                  "type",
-                  "mode"
-                ],
+                "required": ["type", "mode"],
                 "additionalProperties": false
               },
               {
@@ -14394,10 +12966,7 @@
                   },
                   "type": {
                     "type": "string",
-                    "enum": [
-                      "field-accuracy",
-                      "field_accuracy"
-                    ]
+                    "enum": ["field-accuracy", "field_accuracy"]
                   },
                   "fields": {
                     "type": "array",
@@ -14409,11 +12978,7 @@
                         },
                         "match": {
                           "type": "string",
-                          "enum": [
-                            "exact",
-                            "numeric_tolerance",
-                            "date"
-                          ]
+                          "enum": ["exact", "numeric_tolerance", "date"]
                         },
                         "required": {
                           "type": "boolean"
@@ -14435,26 +13000,17 @@
                           }
                         }
                       },
-                      "required": [
-                        "path",
-                        "match"
-                      ],
+                      "required": ["path", "match"],
                       "additionalProperties": false
                     },
                     "minItems": 1
                   },
                   "aggregation": {
                     "type": "string",
-                    "enum": [
-                      "weighted_average",
-                      "all_or_nothing"
-                    ]
+                    "enum": ["weighted_average", "all_or_nothing"]
                   }
                 },
-                "required": [
-                  "type",
-                  "fields"
-                ],
+                "required": ["type", "fields"],
                 "additionalProperties": false
               },
               {
@@ -14498,10 +13054,7 @@
                     "minimum": 0
                   }
                 },
-                "required": [
-                  "type",
-                  "threshold"
-                ],
+                "required": ["type", "threshold"],
                 "additionalProperties": false
               },
               {
@@ -14545,10 +13098,7 @@
                     "minimum": 0
                   }
                 },
-                "required": [
-                  "type",
-                  "budget"
-                ],
+                "required": ["type", "budget"],
                 "additionalProperties": false
               },
               {
@@ -14585,10 +13135,7 @@
                   },
                   "type": {
                     "type": "string",
-                    "enum": [
-                      "token-usage",
-                      "token_usage"
-                    ]
+                    "enum": ["token-usage", "token_usage"]
                   },
                   "max_total": {
                     "type": "number",
@@ -14603,9 +13150,7 @@
                     "minimum": 0
                   }
                 },
-                "required": [
-                  "type"
-                ],
+                "required": ["type"],
                 "additionalProperties": false
               },
               {
@@ -14642,10 +13187,7 @@
                   },
                   "type": {
                     "type": "string",
-                    "enum": [
-                      "execution-metrics",
-                      "execution_metrics"
-                    ]
+                    "enum": ["execution-metrics", "execution_metrics"]
                   },
                   "max_tool_calls": {
                     "type": "number",
@@ -14677,9 +13219,7 @@
                     "minimum": 0
                   }
                 },
-                "required": [
-                  "type"
-                ],
+                "required": ["type"],
                 "additionalProperties": false
               },
               {
@@ -14722,10 +13262,7 @@
                     "type": "string"
                   }
                 },
-                "required": [
-                  "type",
-                  "value"
-                ],
+                "required": ["type", "value"],
                 "additionalProperties": false
               },
               {
@@ -14768,10 +13305,7 @@
                     "type": "string"
                   }
                 },
-                "required": [
-                  "type",
-                  "value"
-                ],
+                "required": ["type", "value"],
                 "additionalProperties": false
               },
               {
@@ -14808,15 +13342,10 @@
                   },
                   "type": {
                     "type": "string",
-                    "enum": [
-                      "is-json",
-                      "is_json"
-                    ]
+                    "enum": ["is-json", "is_json"]
                   }
                 },
-                "required": [
-                  "type"
-                ],
+                "required": ["type"],
                 "additionalProperties": false
               },
               {
@@ -14859,10 +13388,7 @@
                     "type": "string"
                   }
                 },
-                "required": [
-                  "type",
-                  "value"
-                ],
+                "required": ["type", "value"],
                 "additionalProperties": false
               },
               {
@@ -14951,10 +13477,7 @@
                                 "minLength": 1
                               }
                             },
-                            "required": [
-                              "score_range",
-                              "outcome"
-                            ],
+                            "required": ["score_range", "outcome"],
                             "additionalProperties": false
                           }
                         }
@@ -14964,10 +13487,7 @@
                     "minItems": 1
                   }
                 },
-                "required": [
-                  "type",
-                  "criteria"
-                ],
+                "required": ["type", "criteria"],
                 "additionalProperties": false
               }
             ]
@@ -14996,10 +13516,7 @@
                 ]
               }
             },
-            "required": [
-              "type",
-              "command"
-            ],
+            "required": ["type", "command"],
             "additionalProperties": false
           }
         },
@@ -15013,10 +13530,7 @@
                 },
                 "isolation": {
                   "type": "string",
-                  "enum": [
-                    "shared",
-                    "per_test"
-                  ]
+                  "enum": ["shared", "per_test"]
                 },
                 "repos": {
                   "type": "array",
@@ -15040,10 +13554,7 @@
                                 "format": "uri"
                               }
                             },
-                            "required": [
-                              "type",
-                              "url"
-                            ],
+                            "required": ["type", "url"],
                             "additionalProperties": false
                           },
                           {
@@ -15057,10 +13568,7 @@
                                 "type": "string"
                               }
                             },
-                            "required": [
-                              "type",
-                              "path"
-                            ],
+                            "required": ["type", "path"],
                             "additionalProperties": false
                           }
                         ]
@@ -15073,10 +13581,7 @@
                           },
                           "resolve": {
                             "type": "string",
-                            "enum": [
-                              "remote",
-                              "local"
-                            ]
+                            "enum": ["remote", "local"]
                           },
                           "ancestor": {
                             "type": "integer",
@@ -15105,10 +13610,7 @@
                         "additionalProperties": false
                       }
                     },
-                    "required": [
-                      "path",
-                      "source"
-                    ],
+                    "required": ["path", "source"],
                     "additionalProperties": false
                   }
                 },
@@ -15144,11 +13646,7 @@
                         },
                         "reset": {
                           "type": "string",
-                          "enum": [
-                            "none",
-                            "fast",
-                            "strict"
-                          ]
+                          "enum": ["none", "fast", "strict"]
                         }
                       },
                       "additionalProperties": false
@@ -15179,11 +13677,7 @@
                         },
                         "reset": {
                           "type": "string",
-                          "enum": [
-                            "none",
-                            "fast",
-                            "strict"
-                          ]
+                          "enum": ["none", "fast", "strict"]
                         }
                       },
                       "additionalProperties": false
@@ -15214,11 +13708,7 @@
                         },
                         "reset": {
                           "type": "string",
-                          "enum": [
-                            "none",
-                            "fast",
-                            "strict"
-                          ]
+                          "enum": ["none", "fast", "strict"]
                         }
                       },
                       "additionalProperties": false
@@ -15249,11 +13739,7 @@
                         },
                         "reset": {
                           "type": "string",
-                          "enum": [
-                            "none",
-                            "fast",
-                            "strict"
-                          ]
+                          "enum": ["none", "fast", "strict"]
                         }
                       },
                       "additionalProperties": false
@@ -15263,11 +13749,7 @@
                 },
                 "mode": {
                   "type": "string",
-                  "enum": [
-                    "pooled",
-                    "temp",
-                    "static"
-                  ]
+                  "enum": ["pooled", "temp", "static"]
                 },
                 "path": {
                   "type": "string"
@@ -15290,9 +13772,7 @@
                       "minimum": 0.1
                     }
                   },
-                  "required": [
-                    "image"
-                  ],
+                  "required": ["image"],
                   "additionalProperties": false
                 }
               },
@@ -15304,9 +13784,7 @@
           ]
         }
       },
-      "required": [
-        "tests"
-      ],
+      "required": ["tests"],
       "additionalProperties": false
     }
   }