13 changes: 13 additions & 0 deletions experiments/spider2_dbt/.gitignore
@@ -0,0 +1,13 @@
# Spider2 cloned repo (large, re-cloneable)
spider2_repo/

# Per-task workspace copies
workspace/

# Results and reports (generated artifacts)
results/
reports/

# Python
__pycache__/
*.pyc
103 changes: 103 additions & 0 deletions experiments/spider2_dbt/README.md
@@ -0,0 +1,103 @@
# Spider 2.0-DBT Benchmark Evaluation

Evaluate **altimate-code** against the [Spider 2.0-DBT](https://spider2-dbt.github.io/) benchmark: 68 real-world dbt + DuckDB data engineering tasks.

## Quick Start

```bash
# 1. Install dependencies
pip install -r requirements.txt

# 2. Setup (clone Spider2 repo, download databases)
python setup_spider2.py

# 3. Run benchmark (all 68 tasks)
python run_benchmark.py

# 4. Evaluate against gold standard
python evaluate_results.py

# 5. Generate interactive HTML report
python report.py
```

## Smoke Test (5 tasks)

```bash
python run_benchmark.py --tasks 5
python evaluate_results.py
python report.py
```

## CLI Options

### `run_benchmark.py`

| Flag | Default | Description |
|------|---------|-------------|
| `--tasks N` | all | First N tasks |
| `--tasks id1 id2` | all | Specific task IDs |
| `--timeout` | 600 | Seconds per task |
| `--model` | `anthropic/claude-sonnet-4-6` | Model to use |
| `--agent` | default | Agent to use |
| `--no-resume` | off | Force re-run all tasks |
| `--dry-run` | off | Print tasks without running |
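The flag surface above can be sketched with `argparse`. This is a minimal sketch, with defaults assumed from `config.py`; the real parser in `run_benchmark.py` may differ in details:

```python
import argparse

def build_parser() -> argparse.ArgumentParser:
    # Sketch of the CLI described in the table above; flag names match the
    # table, defaults are assumptions taken from config.py.
    p = argparse.ArgumentParser(description="Run Spider 2.0-DBT benchmark tasks")
    p.add_argument("--tasks", nargs="*", default=None,
                   help="First N tasks (a single integer) or explicit task IDs")
    p.add_argument("--timeout", type=int, default=600, help="Seconds per task")
    p.add_argument("--model", default="anthropic/claude-sonnet-4-6")
    p.add_argument("--agent", default="coder")
    p.add_argument("--no-resume", action="store_true",
                   help="Force re-run of all tasks")
    p.add_argument("--dry-run", action="store_true",
                   help="Print tasks without running them")
    return p
```

Since `--tasks` accepts either a count or IDs, the runner would have to disambiguate, e.g. by checking whether a lone argument parses as an integer.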

### `evaluate_results.py`

| Flag | Default | Description |
|------|---------|-------------|
| `--results` | latest | Path to benchmark results JSON |

### `report.py`

| Flag | Default | Description |
|------|---------|-------------|
| `--evaluation` | latest | Path to evaluation JSON |
| `--output` | auto | Output HTML file path |
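Both scripts default to the latest artifact when no path is given. One way that lookup could work, assuming selection by modification time (the actual scripts may instead parse the timestamped filenames):

```python
from __future__ import annotations

from pathlib import Path

def latest_json(directory: Path) -> Path | None:
    # Newest .json file by modification time, or None if none exist.
    candidates = sorted(directory.glob("*.json"), key=lambda p: p.stat().st_mtime)
    return candidates[-1] if candidates else None
```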

## Directory Structure

```
experiments/spider2_dbt/
├── config.py            # Paths, leaderboard data, defaults
├── setup_spider2.py     # One-time: clone Spider2, download data
├── prompt_template.py   # Prompt engineering for each task
├── run_benchmark.py     # Runner: invoke altimate-code per task
├── evaluate_results.py  # Bridge to Spider2's official eval_utils
├── report.py            # Generate interactive single-file HTML report
├── requirements.txt     # Python deps
├── results/             # Timestamped JSON results
│   └── incremental/     # Per-task results for resumability
├── reports/             # Generated HTML reports
├── workspace/           # Per-task dbt project copies (gitignored)
└── spider2_repo/        # Cloned Spider2 repository (gitignored)
```

## Resumability

The benchmark runner saves per-task results to `results/incremental/`. If interrupted, re-running `python run_benchmark.py` will skip completed tasks. Use `--no-resume` to force a full re-run.
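The skip check can be sketched like this; the `<instance_id>.json` filename scheme is an assumption about the layout of `results/incremental/`:

```python
from pathlib import Path

def is_completed(instance_id: str, incremental_dir: Path, no_resume: bool = False) -> bool:
    # A task is skipped when its per-task result file already exists,
    # unless --no-resume forces a full re-run.
    if no_resume:
        return False
    return (incremental_dir / f"{instance_id}.json").exists()
```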

## Report Features

The HTML report is a single self-contained file (no external dependencies):

- **Summary cards**: Pass rate, total time, model, rank
- **Leaderboard chart**: SVG bar chart with all Spider2 entries + altimate-code highlighted
- **Category breakdown**: Tasks grouped by domain with pass/fail counts
- **Per-task table**: Sortable, filterable, with expandable agent logs
- **Timing histogram**: Distribution of execution times

## Leaderboard Context

Current Spider 2.0-DBT leaderboard (as of 2025):

| Agent | Pass Rate |
|-------|-----------|
| Databao Agent | 44.11% |
| MLE-Bench Agent | 38.24% |
| Claude 3.5 Sonnet (CoT) | 36.76% |
| GPT-4o (CoT) | 33.82% |
| CodeS Agent | 32.35% |
| OpenHands Agent | 30.88% |
| SWE-Agent | 27.94% |
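For context, a run's would-be rank on this table can be computed by counting strictly higher entries. `rank_of` is a hypothetical helper; the report presumably derives its rank card from the fuller leaderboard list in `config.py`:

```python
LEADERBOARD = [  # (agent, pass rate %), from the table above
    ("Databao Agent", 44.11), ("MLE-Bench Agent", 38.24),
    ("Claude 3.5 Sonnet (CoT)", 36.76), ("GPT-4o (CoT)", 33.82),
    ("CodeS Agent", 32.35), ("OpenHands Agent", 30.88), ("SWE-Agent", 27.94),
]

def rank_of(pass_rate: float) -> int:
    # 1-based rank a new entry would take among the agents above.
    return 1 + sum(1 for _, rate in LEADERBOARD if rate > pass_rate)
```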
2 changes: 2 additions & 0 deletions experiments/spider2_dbt/altimate-code-dev.sh
@@ -0,0 +1,2 @@
#!/bin/bash
exec bun run --cwd /Users/anandgupta/codebase/altimate-code/packages/opencode --conditions=browser src/index.ts "$@"
74 changes: 74 additions & 0 deletions experiments/spider2_dbt/config.py
@@ -0,0 +1,74 @@
"""Configuration constants for Spider 2.0-DBT benchmark evaluation."""

from __future__ import annotations

import os
from pathlib import Path

# ── Paths ──────────────────────────────────────────────────────────────────────

BASE_DIR = Path(__file__).resolve().parent
SPIDER2_REPO_DIR = BASE_DIR / "spider2_repo"
SPIDER2_DBT_DIR = SPIDER2_REPO_DIR / "spider2-dbt"
TASK_JSONL = SPIDER2_DBT_DIR / "examples" / "spider2-dbt.jsonl"
EXAMPLES_DIR = SPIDER2_DBT_DIR / "examples"
GOLD_EVAL_JSONL = SPIDER2_DBT_DIR / "evaluation_suite" / "gold" / "spider2_eval.jsonl"
EVAL_UTILS_DIR = SPIDER2_DBT_DIR / "evaluation_suite"
WORKSPACE_DIR = BASE_DIR / "workspace"
RESULTS_DIR = BASE_DIR / "results"
INCREMENTAL_DIR = RESULTS_DIR / "incremental"
REPORTS_DIR = BASE_DIR / "reports"

# ── Spider2 Repository ─────────────────────────────────────────────────────────

SPIDER2_REPO_URL = "https://github.com/xlang-ai/Spider2.git"
# TODO: pin to a known-good commit for reproducibility (currently tracks main)
SPIDER2_COMMIT = "main"

# Google Drive file IDs for DuckDB database zips (from Spider2 README)
# Format: (gdrive_id, expected_filename)
DUCKDB_ZIP_DOWNLOADS = [
("1N3f7BSWC4foj-V-1C9n8M2XmgV7FOcqL", "DBT_start_db.zip"),
("1s0USV_iQLo4oe05QqAMnhGGp5jeejCzp", "dbt_gold.zip"),
]
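# Sketch: setup_spider2.py can turn the file IDs above into direct-download
# URLs via Google Drive's "uc?id=" endpoint (the same form gdown uses).
# This helper is illustrative; the actual download code may differ.
def gdrive_url(file_id: str) -> str:
    return f"https://drive.google.com/uc?id={file_id}"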

# ── Execution ──────────────────────────────────────────────────────────────────

ALTIMATE_CODE_BIN = os.environ.get("ALTIMATE_CODE_BIN", "altimate-code")
DEFAULT_TIMEOUT = 600 # seconds per task (slowest legit tasks take ~593s)
MAX_RETRIES = 2 # auto-retry only for fast exits (API/init failures)
FAST_EXIT_THRESHOLD_S = 10 # tasks completing under this are likely failures
DEFAULT_PARALLEL = 2 # concurrent tasks (4 caused too much resource contention)
DEFAULT_MODEL = "anthropic/claude-sonnet-4-6"
DEFAULT_AGENT = "coder"
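# Sketch of the retry policy the constants above imply: retry only when a task
# exits fast (likely an API/init failure) and attempts remain. `should_retry`
# is an illustrative name; run_benchmark.py may structure this differently.
def should_retry(elapsed_s: float, attempt: int) -> bool:
    return elapsed_s < FAST_EXIT_THRESHOLD_S and attempt < MAX_RETRIES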

# ── Leaderboard Data (Spider 2.0-DBT, as of 2025) ─────────────────────────────
# Source: https://spider2-dbt.github.io/
# Format: (agent_name, pass_rate)

LEADERBOARD: list[tuple[str, float]] = [
("Databao Agent", 44.11),
("MLE-Bench Agent", 38.24),
("Claude 3.5 Sonnet (CoT)", 36.76),
("GPT-4o (CoT)", 33.82),
("CodeS Agent", 32.35),
("OpenHands Agent", 30.88),
("SWE-Agent", 27.94),
("Gemini 1.5 Pro (CoT)", 26.47),
("Llama 3.1 405B (CoT)", 22.06),
("GPT-4o mini (CoT)", 19.12),
("Claude 3 Haiku (CoT)", 16.18),
]

# ── Task Categories (domain grouping for report) ──────────────────────────────
# Extract domain from instance_id by stripping trailing digits

import re


def get_task_domain(instance_id: str) -> str:
    """Extract domain from instance_id by stripping the trailing task number.

    e.g. 'shopify002' -> 'shopify', 'f1003' -> 'f1', 'tpch001' -> 'tpch'

    Instance IDs end in a three-digit task number; stripping exactly three
    digits (rather than all trailing digits) keeps domains like 'f1' intact.
    """
    return re.sub(r"\d{3}$", "", instance_id)
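# Example: the report's category breakdown can tally tasks per domain with a
# Counter. `domain_counts` is an illustrative helper, not used by the scripts.
from collections import Counter


def domain_counts(instance_ids: list[str]) -> Counter:
    return Counter(get_task_domain(i) for i in instance_ids)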