diff --git a/experiments/harness-eval/README.md b/experiments/harness-eval/README.md new file mode 100644 index 000000000..41859d211 --- /dev/null +++ b/experiments/harness-eval/README.md @@ -0,0 +1,209 @@ +# Harness Eval Suite + +Cost-aware model evaluation suite for SAM-native harness model selection. Compares models through SAM's Cloudflare AI Gateway on deterministic coding tasks, scoring **cost per successful task** — not just token price. + +## Models Under Test + +| Model | Provider | Gateway Path | Cost Source | +|-------|----------|-------------|-------------| +| Gemma 4 26B | Workers AI | `workers-ai` | Cloudflare Workers AI billing (~$0.011/1M tokens) | +| GPT-4.1 Mini | OpenAI (Unified) | `unified` | OpenAI pricing ($0.40/$1.60 per 1M tokens) | +| Claude Haiku 4.5 | Anthropic (Unified) | `unified` | Anthropic pricing ($0.80/$4.00 per 1M tokens) | + +> **Note:** GPT-5 Mini is not yet available in the SAM model registry. The closest available model is `gpt-5.2` ($10/$40 per 1M tokens), which is excluded from default runs due to cost. Add it manually via `EVAL_MODELS` if needed. + +## Scenarios + +| ID | Category | What It Tests | Tools Used | +|----|----------|--------------|------------| +| `weather-baseline` | baseline | Two-tool loop: get_weather + calculate (F to C) | get_weather, calculate | +| `read-and-summarize` | coding | Read a file and summarize its functionality | read_file, glob | +| `grep-locate-code` | coding | Search for a function, read the file, identify callers | grep, read_file, glob | +| `missing-file-recovery` | coding | Handle a read_file error and recover via search | read_file, grep, glob | +| `propose-patch` | coding | Read code, identify a bug, propose an edit_file fix | read_file, edit_file, grep, glob | +| `interpret-test-failure` | coding | Read test output, trace to root cause, explain fix | read_file, grep, glob | + +All coding scenarios use a **virtual filesystem** — deterministic, network-free, no side effects. 
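The virtual-filesystem idea above can be sketched in a few lines. This is a simplified stand-in, not the real `tools.ts` implementation: the names `createVirtualFs` and `makeReadFile` mirror the suite's helpers, but the exact signatures here are assumptions for illustration. The key property is that tool handlers are pure lookups over an in-memory map, so every run sees identical state and errors come back as plain strings.

```typescript
// Sketch of a deterministic in-memory filesystem for eval tools.
// createVirtualFs / makeReadFile are simplified stand-ins for tools.ts helpers.
type VirtualFs = Map<string, string>;

function createVirtualFs(files: Array<{ path: string; content: string }>): VirtualFs {
  return new Map(files.map((f) => [f.path, f.content]));
}

// A read_file handler: pure lookup, no disk or network access.
// Missing files produce an error string rather than a thrown exception,
// so scenarios like missing-file-recovery can test graceful recovery.
function makeReadFile(vfs: VirtualFs) {
  return (args: { path: string }): string => {
    const content = vfs.get(args.path);
    if (content === undefined) {
      return `Error: file not found: ${args.path}`;
    }
    return content;
  };
}

const vfs = createVirtualFs([{ path: 'src/index.ts', content: 'export const x = 1;' }]);
const readFile = makeReadFile(vfs);
console.log(readFile({ path: 'src/index.ts' })); // "export const x = 1;"
console.log(readFile({ path: 'src/missing.ts' })); // "Error: file not found: src/missing.ts"
```

Because the handler never throws, a model that requests a nonexistent path gets the error text back as a tool result and can recover within the same conversation.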
+
+## Running the Suite
+
+### Prerequisites
+
+- Node.js 20+
+- `tsx` (installed via `npx`)
+- Cloudflare credentials with AI Gateway access
+
+### Environment Variables
+
+| Variable | Required | Description |
+|----------|----------|-------------|
+| `CF_ACCOUNT_ID` | Yes | Cloudflare account ID |
+| `CF_TOKEN` | Yes | Cloudflare API token (needs AI Gateway access) |
+| `AI_GATEWAY_ID` | No | Gateway ID (default: `sam`) |
+| `WORKERS_AI_COST_PER_1K_TOKENS` | No | Override Workers AI cost per 1K tokens (default: `0.000011`) |
+| `EVAL_SCENARIOS` | No | Comma-separated scenario IDs to run (default: all) |
+| `EVAL_MODELS` | No | Comma-separated model IDs to run (default: all) |
+
+### Run All
+
+```bash
+CF_ACCOUNT_ID=... CF_TOKEN=... npx tsx experiments/harness-eval/run.ts
+```
+
+### Run Specific Scenarios or Models
+
+```bash
+# Only the weather baseline
+EVAL_SCENARIOS=weather-baseline CF_ACCOUNT_ID=... CF_TOKEN=... npx tsx experiments/harness-eval/run.ts
+
+# Only Gemma
+EVAL_MODELS=@cf/google/gemma-4-26b-a4b-it CF_ACCOUNT_ID=... CF_TOKEN=... npx tsx experiments/harness-eval/run.ts
+```
+
+## Credential Blockers
+
+- **Workers AI (Gemma 4 26B):** Routes through `workers-ai/v1/chat/completions`. Auth via `Authorization: Bearer <CF_TOKEN>`. **Works** with the standard CF API token.
+- **Unified API (GPT-4.1 Mini, Claude Haiku 4.5):** Routes through `compat/chat/completions`. Auth via `cf-aig-authorization: Bearer <CF_TOKEN>`. **May fail** if the CF_TOKEN lacks Unified Billing scope. The suite will record these as API errors — they do not crash the runner.
+
+If Unified API models return 401, the eval trace will show `stopReason: "error"` with the HTTP status. This is expected and documented. To enable these models, configure a CF_TOKEN with Unified Billing access or set provider-specific API keys.
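The two gateway paths differ only in which header carries the token, which is why the same CF_TOKEN can succeed on one path and 401 on the other. A condensed sketch of the split (the full version lives in `models.ts` as `buildHeaders`; this standalone variant omits the `cf-aig-metadata` header for brevity):

```typescript
// Condensed sketch of the auth split described above.
// Workers AI takes the token in the standard Authorization header;
// the Unified API expects it in cf-aig-authorization instead.
type GatewayPath = 'workers-ai' | 'unified';

function buildAuthHeaders(path: GatewayPath, token: string): Record<string, string> {
  if (path === 'workers-ai') {
    return {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${token}`,
    };
  }
  // Unified API: a 401 here usually means the token lacks Unified Billing scope.
  return {
    'Content-Type': 'application/json',
    'cf-aig-authorization': `Bearer ${token}`,
  };
}

console.log(buildAuthHeaders('workers-ai', 'tok').Authorization); // "Bearer tok"
console.log(buildAuthHeaders('unified', 'tok')['cf-aig-authorization']); // "Bearer tok"
```

Note that the Unified path deliberately sets no `Authorization` header at all; sending both can mask which credential the gateway actually validated.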
+ +## Output: JSON Traces + +Each run produces a timestamped JSON file in `experiments/harness-eval/traces/`: + +``` +traces/eval-2026-05-06_14-30-00.json +``` + +### Trace Schema (v1.0) + +```typescript +{ + version: "1.0"; // Schema version + timestamp: string; // ISO 8601 + suite: { + commitHash: string; // Git short hash + schemaVersion: string; // For forward compat + }; + results: [{ + scenarioId: string; + scenarioName: string; + category: string; // "baseline" | "coding" + model: { + displayName: string; + modelId: string; + provider: string; + path: "workers-ai" | "unified"; + }; + rubric: { + pass: boolean; + reason: string; + checks: [{ name, pass, detail }]; + }; + usage: { prompt_tokens, completion_tokens, total_tokens }; + costUsd: number; // Derived cost for this run + latencyMs: number; // Wall-clock time + turnsUsed: number; + stopReason: "complete" | "max_turns" | "error"; + conversation: []; // Full messages (system, user, assistant, tool) + toolCalls: [{ // Every tool invocation + turn: number; + toolName: string; + arguments: {}; + result: string; + isError: boolean; + }]; + turnUsage: []; // Per-turn token breakdown + turnLatency: []; // Per-turn latency in ms + error?: string; // Error message if stopReason is "error" + }]; + summary: { + totalScenarios: number; + passedScenarios: number; + successRate: number; // 0-1 + totalCostUsd: number; + costPerSuccessUsd: number; // Key metric: total cost / passed + avgLatencyMs: number; + perModel: [{ + model: string; // Display name + provider: string; + totalRuns: number; + passed: number; + successRate: number; + totalCostUsd: number; + costPerSuccessUsd: number; + avgLatencyMs: number; + totalTokens: number; + }]; + }; +} +``` + +## Interpreting Results + +### Key Metric: Cost Per Successful Task + +The primary ranking metric is `costPerSuccessUsd` — total cost divided by number of passing scenarios. This captures both: + +1. **Token efficiency** — fewer tokens for the same result = lower cost +2. 
**Reliability** — a model that fails half the tasks pays double per success + +A cheap model that fails often can be more expensive per success than a pricier model that always passes. + +### Reading the Summary + +``` +Model: Gemma 4 26B (workers-ai) + Pass rate: 6/6 (100%) + Total cost: $0.000066 + Cost/success: $0.000011 + Avg latency: 1200ms + Total tokens: 6000 +``` + +- **Pass rate** — how many scenarios the model completed correctly +- **Total cost** — sum of all runs (passes + failures) +- **Cost/success** — total cost / passes. Lower is better. N/A if 0 passes. +- **Avg latency** — mean wall-clock time per scenario +- **Total tokens** — aggregate prompt + completion tokens + +### Error Runs + +Runs with `stopReason: "error"` indicate API-level failures (401, 500, timeout). These count toward total cost but not toward passes, increasing cost-per-success. Common causes: + +- **401** — Missing or wrong credentials for the model's gateway path +- **429** — Rate limited +- **500** — Provider error + +Check the `error` field in the trace JSON for the specific error message. + +## Architecture + +``` +run.ts — Main entry point, orchestrates scenarios x models +runner.ts — Think-act-observe loop against AI Gateway +models.ts — Model registry and gateway URL/header builders +tools.ts — Virtual filesystem + mock tool implementations +cost.ts — Cost computation and summary aggregation +trace.ts — JSON trace persistence and summary printing +types.ts — TypeScript type definitions +scenarios/ + index.ts — Scenario barrel export + weather-baseline.ts + read-and-summarize.ts + grep-locate-code.ts + missing-file-recovery.ts + propose-patch.ts + interpret-test-failure.ts +traces/ — Output directory for JSON trace files (gitignored content) +``` + +## Adding a New Scenario + +1. Create `scenarios/my-scenario.ts` following the existing pattern +2. Export it from `scenarios/index.ts` +3. 
Each scenario provides:
+   - `systemPrompt` + `userPrompt` — the task
+   - `tools` — array of `EvalTool` (definition + handler)
+   - `maxTurns` — turn budget
+   - `evaluate(run)` — rubric function returning pass/fail with check details
diff --git a/experiments/harness-eval/cost.ts b/experiments/harness-eval/cost.ts
new file mode 100644
index 000000000..05a8327e1
--- /dev/null
+++ b/experiments/harness-eval/cost.ts
@@ -0,0 +1,97 @@
+/**
+ * Cost computation for eval runs.
+ *
+ * Workers AI models are treated as Cloudflare-billed (not free) per task requirements.
+ * The estimated cost uses Cloudflare's published Workers AI Everywhere pricing
+ * ($0.011 per 1M input neurons, $0.011 per 1M output neurons ≈ $0.000011/1K tokens).
+ * This is an estimate — actual billing depends on the Cloudflare plan.
+ *
+ * For Unified API models (Anthropic, OpenAI), costs are hardcoded in models.ts
+ * (matching PLATFORM_AI_MODELS registry values at time of writing).
+ */
+
+import type { ModelConfig, TokenUsage, ScenarioResult, EvalSummary } from './types.js';
+
+/** Estimated Workers AI cost per 1K tokens (input and output).
+ * Based on Cloudflare's Workers AI pricing at scale ($0.011 / 1M neurons ≈ $0.000011 / 1K tokens).
+ * Override via WORKERS_AI_COST_PER_1K_TOKENS env var for updated pricing. */
+const DEFAULT_WORKERS_AI_COST_PER_1K = 0.000011;
+
+/**
+ * Compute the USD cost of a single API call.
+ */
+export function computeCost(model: ModelConfig, usage: TokenUsage): number {
+  const inputCost = (usage.prompt_tokens / 1000) * model.costPer1kInput;
+  const outputCost = (usage.completion_tokens / 1000) * model.costPer1kOutput;
+  return inputCost + outputCost;
+}
+
+/**
+ * Get the Workers AI estimated cost per 1K tokens.
+ * Reads WORKERS_AI_COST_PER_1K_TOKENS env var if set.
+ */ +export function getWorkersAiCostPer1k(): number { + const envVal = process.env.WORKERS_AI_COST_PER_1K_TOKENS; + if (envVal) { + const parsed = parseFloat(envVal); + if (!isNaN(parsed) && parsed >= 0) return parsed; + } + return DEFAULT_WORKERS_AI_COST_PER_1K; +} + +/** + * Compute aggregate summary from individual scenario results. + */ +export function computeSummary(results: ScenarioResult[]): EvalSummary { + const totalScenarios = results.length; + const passedScenarios = results.filter((r) => r.rubric.pass).length; + const successRate = totalScenarios > 0 ? passedScenarios / totalScenarios : 0; + const totalCostUsd = results.reduce((sum, r) => sum + r.costUsd, 0); + const costPerSuccessUsd = passedScenarios > 0 ? totalCostUsd / passedScenarios : Infinity; + const avgLatencyMs = + totalScenarios > 0 ? results.reduce((sum, r) => sum + r.latencyMs, 0) / totalScenarios : 0; + + // Per-model breakdown + const modelMap = new Map< + string, + { + model: string; + provider: string; + runs: ScenarioResult[]; + } + >(); + for (const r of results) { + const key = r.model.modelId; + if (!modelMap.has(key)) { + modelMap.set(key, { model: r.model.displayName, provider: r.model.provider, runs: [] }); + } + modelMap.get(key)!.runs.push(r); + } + + const perModel = Array.from(modelMap.values()).map(({ model, provider, runs }) => { + const passed = runs.filter((r) => r.rubric.pass).length; + const totalCost = runs.reduce((s, r) => s + r.costUsd, 0); + const totalTokens = runs.reduce((s, r) => s + r.usage.total_tokens, 0); + return { + model, + provider, + totalRuns: runs.length, + passed, + successRate: runs.length > 0 ? passed / runs.length : 0, + totalCostUsd: totalCost, + costPerSuccessUsd: passed > 0 ? totalCost / passed : Infinity, + avgLatencyMs: runs.length > 0 ? 
runs.reduce((s, r) => s + r.latencyMs, 0) / runs.length : 0, + totalTokens, + }; + }); + + return { + totalScenarios, + passedScenarios, + successRate, + totalCostUsd, + costPerSuccessUsd, + avgLatencyMs, + perModel, + }; +} diff --git a/experiments/harness-eval/models.ts b/experiments/harness-eval/models.ts new file mode 100644 index 000000000..8f83c52fa --- /dev/null +++ b/experiments/harness-eval/models.ts @@ -0,0 +1,101 @@ +/** + * Model configurations for the eval suite. + * + * All models route through SAM's AI Gateway (gateway ID: "sam"). + * Workers AI models use the /workers-ai/v1/chat/completions path. + * Unified API models use the /compat/chat/completions path. + * + * Cost note: Workers AI models are treated as Cloudflare-billed, not free. + * We use estimated Cloudflare Workers AI pricing ($0.011/1M neurons). + */ + +import type { ModelConfig } from './types.js'; +import { getWorkersAiCostPer1k } from './cost.js'; + +/** Build the set of models to evaluate. */ +export function getEvalModels(): ModelConfig[] { + const waiCost = getWorkersAiCostPer1k(); + + return [ + { + displayName: 'Gemma 4 26B', + modelId: '@cf/google/gemma-4-26b-a4b-it', + apiModelId: '@cf/google/gemma-4-26b-a4b-it', + path: 'workers-ai', + costPer1kInput: waiCost, + costPer1kOutput: waiCost, + provider: 'workers-ai', + }, + { + displayName: 'GPT-4.1 Mini', + modelId: 'gpt-4.1-mini', + apiModelId: 'openai/gpt-4.1-mini', + path: 'unified', + costPer1kInput: 0.0004, + costPer1kOutput: 0.0016, + provider: 'openai', + }, + { + displayName: 'Claude Haiku 4.5', + modelId: 'claude-haiku-4-5-20251001', + apiModelId: 'anthropic/claude-haiku-4-5-20251001', + path: 'unified', + costPer1kInput: 0.0008, + costPer1kOutput: 0.004, + provider: 'anthropic', + }, + // GPT-5 Mini: not yet in PLATFORM_AI_MODELS registry. + // The closest available model is gpt-5.2. Uncomment when gpt-5-mini lands + // or update the apiModelId to the correct Unified API identifier. 
+    // {
+    //   displayName: 'GPT-5 Mini',
+    //   modelId: 'gpt-5-mini',
+    //   apiModelId: 'openai/gpt-5-mini',
+    //   path: 'unified',
+    //   costPer1kInput: 0.002, // TBD — placeholder
+    //   costPer1kOutput: 0.008, // TBD — placeholder
+    //   provider: 'openai',
+    // },
+  ];
+}
+
+/**
+ * Build the AI Gateway URL for a model.
+ */
+export function buildGatewayUrl(
+  accountId: string,
+  gatewayId: string,
+  model: ModelConfig,
+): string {
+  if (model.path === 'workers-ai') {
+    return `https://gateway.ai.cloudflare.com/v1/${accountId}/${gatewayId}/workers-ai/v1/chat/completions`;
+  }
+  return `https://gateway.ai.cloudflare.com/v1/${accountId}/${gatewayId}/compat/chat/completions`;
+}
+
+/**
+ * Build request headers for a model.
+ */
+export function buildHeaders(model: ModelConfig, authToken: string): Record<string, string> {
+  const metadata = JSON.stringify({
+    userId: 'harness-eval',
+    workspaceId: 'harness-eval',
+    projectId: 'harness-eval',
+    source: 'harness-eval-suite',
+    modelId: model.modelId,
+  });
+
+  if (model.path === 'workers-ai') {
+    return {
+      'Content-Type': 'application/json',
+      'Authorization': `Bearer ${authToken}`,
+      'cf-aig-metadata': metadata,
+    };
+  }
+  // Unified API
+  return {
+    'Content-Type': 'application/json',
+    'cf-aig-authorization': `Bearer ${authToken}`,
+    'cf-aig-metadata': metadata,
+  };
+}
diff --git a/experiments/harness-eval/run.ts b/experiments/harness-eval/run.ts
new file mode 100644
index 000000000..6bc938c8c
--- /dev/null
+++ b/experiments/harness-eval/run.ts
@@ -0,0 +1,162 @@
+#!/usr/bin/env npx tsx
+/**
+ * Main entry point for the harness eval suite.
+ *
+ * Runs all scenarios against all configured models through SAM's AI Gateway,
+ * collects results, computes cost-per-success, and persists a JSON trace.
+ *
+ * Usage:
+ *   CF_ACCOUNT_ID=... CF_TOKEN=... 
npx tsx experiments/harness-eval/run.ts
+ *
+ * Optional env vars:
+ *   AI_GATEWAY_ID — Gateway ID (default: "sam")
+ *   WORKERS_AI_COST_PER_1K_TOKENS — Override Workers AI cost estimate (default: 0.000011)
+ *   EVAL_SCENARIOS — Comma-separated scenario IDs to run (default: all)
+ *   EVAL_MODELS — Comma-separated model IDs to run (default: all)
+ */
+
+import { ALL_SCENARIOS } from './scenarios/index.js';
+import { getEvalModels } from './models.js';
+import { runScenario } from './runner.js';
+import { computeCost } from './cost.js';
+import { buildTrace, writeTrace, printSummary } from './trace.js';
+import type { ScenarioResult, ScenarioRun } from './types.js';
+
+function buildScenarioResult(
+  scenarioId: string,
+  scenarioName: string,
+  category: string,
+  model: ScenarioResult['model'],
+  run: ScenarioRun,
+  rubric: ScenarioResult['rubric'],
+  costUsd: number,
+): ScenarioResult {
+  return {
+    scenarioId,
+    scenarioName,
+    category,
+    model: {
+      displayName: model.displayName,
+      modelId: model.modelId,
+      provider: model.provider,
+      path: model.path,
+    },
+    rubric,
+    usage: run.totalUsage,
+    costUsd,
+    latencyMs: run.totalLatencyMs,
+    turnsUsed: run.turnsUsed,
+    stopReason: run.stopReason,
+    conversation: run.messages,
+    toolCalls: run.toolCalls,
+    turnUsage: run.turnUsage,
+    turnLatency: run.turnLatency,
+    error: run.error,
+  };
+}
+
+async function main() {
+  const accountId = process.env.CF_ACCOUNT_ID;
+  const authToken = process.env.CF_TOKEN;
+  const gatewayId = process.env.AI_GATEWAY_ID ?? 'sam';
+
+  if (!accountId || !authToken) {
+    console.error('ERROR: Missing required environment variables.');
+    console.error('  CF_ACCOUNT_ID — Cloudflare account ID');
+    console.error('  CF_TOKEN — Cloudflare API token with AI Gateway access');
+    console.error('');
+    console.error('Usage:');
+    console.error('  CF_ACCOUNT_ID=... CF_TOKEN=... 
npx tsx experiments/harness-eval/run.ts'); + process.exit(1); + } + + const scenarioFilter = process.env.EVAL_SCENARIOS?.split(',').map((s) => s.trim()); + const modelFilter = process.env.EVAL_MODELS?.split(',').map((s) => s.trim()); + + const scenarios = scenarioFilter + ? ALL_SCENARIOS.filter((s) => scenarioFilter.includes(s.id)) + : ALL_SCENARIOS; + + const allModels = getEvalModels(); + const models = modelFilter + ? allModels.filter((m) => modelFilter.includes(m.modelId)) + : allModels; + + if (scenarios.length === 0) { + console.error('ERROR: No scenarios matched the filter:', scenarioFilter); + process.exit(1); + } + if (models.length === 0) { + console.error('ERROR: No models matched the filter:', modelFilter); + process.exit(1); + } + + console.log(`Running ${scenarios.length} scenarios x ${models.length} models = ${scenarios.length * models.length} eval runs`); + console.log(`Gateway: ${gatewayId} | Account: ${accountId.slice(0, 8)}...`); + console.log(''); + + const results: ScenarioResult[] = []; + + for (const scenario of scenarios) { + for (const model of models) { + const label = `[${scenario.id}] x [${model.modelId}]`; + process.stdout.write(` ${label} ... `); + + try { + const run = await runScenario(scenario, model, { accountId, gatewayId, authToken }); + const rubric = scenario.evaluate(run); + const costUsd = computeCost(model, run.totalUsage); + + results.push(buildScenarioResult(scenario.id, scenario.name, scenario.category, model, run, rubric, costUsd)); + + const status = run.stopReason === 'error' + ? `ERROR: ${run.error?.slice(0, 80)}` + : rubric.pass + ? `PASS ($${costUsd.toFixed(6)}, ${run.totalLatencyMs}ms)` + : `FAIL: ${rubric.reason.slice(0, 80)}`; + + console.log(status); + } catch (err) { + const errMsg = err instanceof Error ? 
err.message : String(err); + console.log(`CRASH: ${errMsg}`); + + const emptyRun: ScenarioRun = { + scenarioId: scenario.id, + model, + messages: [], + toolCalls: [], + totalUsage: { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }, + turnUsage: [], + totalLatencyMs: 0, + turnLatency: [], + stopReason: 'error', + turnsUsed: 0, + error: errMsg, + }; + + results.push(buildScenarioResult( + scenario.id, scenario.name, scenario.category, model, emptyRun, + { pass: false, reason: `Runner crash: ${errMsg}` }, + 0, + )); + } + } + } + + const trace = buildTrace(results); + const tracePath = writeTrace(trace); + console.log(`\nTrace written to: ${tracePath}`); + + printSummary(trace.summary); + + const anyZeroPasses = trace.summary.perModel.some((m) => m.passed === 0 && m.totalRuns > 0); + if (anyZeroPasses) { + console.log('WARNING: One or more models had zero passing scenarios.'); + process.exit(2); + } +} + +main().catch((err) => { + console.error('Fatal error:', err); + process.exit(1); +}); diff --git a/experiments/harness-eval/runner.ts b/experiments/harness-eval/runner.ts new file mode 100644 index 000000000..f452bdc2b --- /dev/null +++ b/experiments/harness-eval/runner.ts @@ -0,0 +1,248 @@ +/** + * Eval scenario runner. + * + * Executes a single eval scenario against a single model through SAM's AI Gateway. + * Implements the think-act-observe loop with full trace capture. + */ + +import type { + ChatMessage, + ChatCompletionResponse, + EvalScenario, + EvalTool, + ModelConfig, + ScenarioRun, + TokenUsage, + ToolCallRecord, +} from './types.js'; +import { buildGatewayUrl, buildHeaders } from './models.js'; + +interface RunnerEnv { + accountId: string; + gatewayId: string; + authToken: string; +} + +/** + * Run a single scenario against a single model. 
+ */
+export async function runScenario(
+  scenario: EvalScenario,
+  model: ModelConfig,
+  env: RunnerEnv,
+): Promise<ScenarioRun> {
+  const url = buildGatewayUrl(env.accountId, env.gatewayId, model);
+  const headers = buildHeaders(model, env.authToken);
+
+  // Build tool definitions for the API call
+  const toolDefs = scenario.tools.map((t) => t.definition);
+
+  // Build tool handler lookup
+  const toolHandlers = new Map<string, EvalTool>();
+  for (const tool of scenario.tools) {
+    toolHandlers.set(tool.definition.function.name, tool);
+  }
+
+  // Initialize conversation
+  const messages: ChatMessage[] = [
+    { role: 'system', content: scenario.systemPrompt },
+    { role: 'user', content: scenario.userPrompt },
+  ];
+
+  const toolCalls: ToolCallRecord[] = [];
+  const turnUsage: Array<{ turn: number; usage: TokenUsage | null }> = [];
+  const turnLatency: Array<{ turn: number; latencyMs: number }> = [];
+  let totalUsage: TokenUsage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
+  let turnsUsed = 0;
+
+  const overallStart = Date.now();
+
+  try {
+    for (let turn = 1; turn <= scenario.maxTurns; turn++) {
+      turnsUsed = turn;
+      const turnStart = Date.now();
+
+      // Build request body
+      const requestModel = model.path === 'workers-ai' ? model.modelId : model.apiModelId;
+
+      // Workers AI quirk: content: null must be "" for some models (not Gemma 4, but keep for safety)
+      const sanitizedMessages =
+        model.path === 'workers-ai'
+          ? messages.map((m) => {
+              if (m.role === 'assistant' && m.tool_calls?.length && m.content === null) {
+                return { ...m, content: '' };
+              }
+              return m;
+            })
+          : messages;
+
+      const body = {
+        model: requestModel,
+        messages: sanitizedMessages,
+        tools: toolDefs.length > 0 ? toolDefs : undefined,
+        tool_choice: toolDefs.length > 0 ? 
'auto' : undefined,
+      };
+
+      const resp = await fetch(url, {
+        method: 'POST',
+        headers,
+        body: JSON.stringify(body),
+      });
+
+      const latencyMs = Date.now() - turnStart;
+      turnLatency.push({ turn, latencyMs });
+
+      if (!resp.ok) {
+        const errText = await resp.text();
+        return buildErrorRun(scenario, model, messages, toolCalls, totalUsage, turnUsage, turnLatency, turnsUsed, `API error ${resp.status}: ${errText.slice(0, 500)}`);
+      }
+
+      const response = (await resp.json()) as ChatCompletionResponse;
+
+      // Track usage
+      if (response.usage) {
+        totalUsage = addUsage(totalUsage, response.usage);
+        turnUsage.push({ turn, usage: response.usage });
+      } else {
+        turnUsage.push({ turn, usage: null });
+      }
+
+      const choice = response.choices?.[0];
+      if (!choice) {
+        return buildErrorRun(scenario, model, messages, toolCalls, totalUsage, turnUsage, turnLatency, turnsUsed, 'No choices in API response');
+      }
+
+      const msg = choice.message;
+      messages.push(msg);
+
+      // If no tool calls, the model is done
+      if (choice.finish_reason === 'stop' || !msg.tool_calls?.length) {
+        return {
+          scenarioId: scenario.id,
+          model,
+          messages: [...messages],
+          toolCalls,
+          totalUsage,
+          turnUsage,
+          totalLatencyMs: Date.now() - overallStart,
+          turnLatency,
+          stopReason: 'complete',
+          turnsUsed,
+        };
+      }
+
+      // Execute tool calls
+      for (const tc of msg.tool_calls) {
+        let args: Record<string, unknown>;
+        try {
+          args = JSON.parse(tc.function.arguments);
+        } catch {
+          const errResult = `Error: invalid JSON in tool arguments: ${tc.function.arguments.slice(0, 200)}`;
+          toolCalls.push({
+            turn,
+            toolName: tc.function.name,
+            arguments: {},
+            result: errResult,
+            isError: true,
+          });
+          messages.push({
+            role: 'tool',
+            content: errResult,
+            tool_call_id: tc.id,
+          });
+          continue;
+        }
+
+        const handler = toolHandlers.get(tc.function.name);
+        let result: string;
+        let isError = false;
+
+        if (!handler) {
+          result = `Error: unknown tool "${tc.function.name}". 
Available tools: ${Array.from(toolHandlers.keys()).join(', ')}`; + isError = true; + } else { + try { + result = handler.handler(args); + } catch (err) { + result = `Error: tool execution failed: ${err instanceof Error ? err.message : String(err)}`; + isError = true; + } + } + + toolCalls.push({ + turn, + toolName: tc.function.name, + arguments: args, + result, + isError, + }); + + messages.push({ + role: 'tool', + content: result, + tool_call_id: tc.id, + }); + } + } + + // Exceeded max turns + return { + scenarioId: scenario.id, + model, + messages: [...messages], + toolCalls, + totalUsage, + turnUsage, + totalLatencyMs: Date.now() - overallStart, + turnLatency, + stopReason: 'max_turns', + turnsUsed, + }; + } catch (err) { + return buildErrorRun( + scenario, + model, + messages, + toolCalls, + totalUsage, + turnUsage, + turnLatency, + turnsUsed, + err instanceof Error ? err.message : String(err), + ); + } +} + +function buildErrorRun( + scenario: EvalScenario, + model: ModelConfig, + messages: ChatMessage[], + toolCalls: ToolCallRecord[], + totalUsage: TokenUsage, + turnUsage: Array<{ turn: number; usage: TokenUsage | null }>, + turnLatency: Array<{ turn: number; latencyMs: number }>, + turnsUsed: number, + error: string, +): ScenarioRun { + return { + scenarioId: scenario.id, + model, + messages: [...messages], + toolCalls, + totalUsage, + turnUsage, + totalLatencyMs: turnLatency.reduce((s, t) => s + t.latencyMs, 0), + turnLatency, + stopReason: 'error', + turnsUsed, + error, + }; +} + +function addUsage(a: TokenUsage, b: TokenUsage): TokenUsage { + return { + prompt_tokens: a.prompt_tokens + b.prompt_tokens, + completion_tokens: a.completion_tokens + b.completion_tokens, + total_tokens: a.total_tokens + b.total_tokens, + }; +} diff --git a/experiments/harness-eval/scenarios/grep-locate-code.ts b/experiments/harness-eval/scenarios/grep-locate-code.ts new file mode 100644 index 000000000..f30904eb3 --- /dev/null +++ 
b/experiments/harness-eval/scenarios/grep-locate-code.ts @@ -0,0 +1,166 @@ +/** + * Scenario: Grep to Locate Code + * + * Tests the model's ability to use grep to find a function, then read_file to examine it. + * This is a core coding workflow: search-then-read. + */ + +import type { EvalScenario, ScenarioRun } from '../types.js'; +import { createVirtualFs, makeReadFile, makeGrep, makeGlob } from '../tools.js'; + +const FILES = [ + { + path: 'src/utils/math.ts', + content: `/** + * Utility math functions for the billing module. + */ + +export function calculateDiscount(price: number, percentage: number): number { + if (percentage < 0 || percentage > 100) { + throw new Error('Percentage must be between 0 and 100'); + } + return price * (1 - percentage / 100); +} + +export function roundToDecimals(value: number, decimals: number): number { + const factor = Math.pow(10, decimals); + return Math.round(value * factor) / factor; +} + +export function formatCurrency(amount: number, currency = 'USD'): string { + return new Intl.NumberFormat('en-US', { style: 'currency', currency }).format(amount); +} +`, + }, + { + path: 'src/services/billing.ts', + content: `import { calculateDiscount, formatCurrency } from '../utils/math'; + +interface Invoice { + items: Array<{ name: string; price: number; quantity: number }>; + discountPercent: number; +} + +export function generateInvoice(invoice: Invoice): string { + let subtotal = 0; + const lines: string[] = []; + + for (const item of invoice.items) { + const lineTotal = item.price * item.quantity; + subtotal += lineTotal; + lines.push(\` \${item.name}: \${formatCurrency(lineTotal)}\`); + } + + const total = calculateDiscount(subtotal, invoice.discountPercent); + lines.push(\` Subtotal: \${formatCurrency(subtotal)}\`); + lines.push(\` Discount: \${invoice.discountPercent}%\`); + lines.push(\` Total: \${formatCurrency(total)}\`); + + return lines.join('\\n'); +} +`, + }, + { + path: 'src/services/users.ts', + content: `export 
interface User { + id: string; + name: string; + email: string; +} + +export function validateEmail(email: string): boolean { + return /^[^@]+@[^@]+\\.[^@]+$/.test(email); +} +`, + }, + { + path: 'src/index.ts', + content: `import { generateInvoice } from './services/billing'; + +const invoice = generateInvoice({ + items: [{ name: 'Widget', price: 10, quantity: 5 }], + discountPercent: 10, +}); +console.log(invoice); +`, + }, +]; + +const vfs = createVirtualFs(FILES); + +const scenario: EvalScenario = { + id: 'grep-locate-code', + name: 'Grep to Locate and Read Code', + category: 'coding', + description: 'Model uses grep to find calculateDiscount, then reads the file to understand the function.', + + systemPrompt: + 'You are a code analysis assistant. Use the provided tools to search and read source files. Use grep to find code, then read_file to examine it in context.', + + userPrompt: + 'Find the calculateDiscount function in this project. What does it do, and which files call it?', + + tools: [makeReadFile(vfs), makeGrep(vfs), makeGlob(vfs)], + + maxTurns: 6, + + evaluate: (run: ScenarioRun) => { + const checks = [ + { + name: 'used_grep', + pass: run.toolCalls.some( + (tc) => + tc.toolName === 'grep' && + /calculateDiscount|discount/i.test(JSON.stringify(tc.arguments)), + ), + detail: 'Model should grep for calculateDiscount', + }, + { + name: 'used_read_file', + pass: run.toolCalls.some((tc) => tc.toolName === 'read_file'), + detail: 'Model should read at least one file for context', + }, + { + name: 'completed', + pass: run.stopReason === 'complete', + detail: 'Model should complete with an answer', + }, + { + name: 'identifies_function_purpose', + pass: run.messages.some( + (m) => + m.role === 'assistant' && + !m.tool_calls?.length && + m.content != null && + /discount|percentage|price/i.test(m.content), + ), + detail: 'Answer should describe the discount calculation', + }, + { + name: 'identifies_caller', + pass: run.messages.some( + (m) => + m.role === 
'assistant' && + !m.tool_calls?.length && + m.content != null && + /billing|generateInvoice/i.test(m.content), + ), + detail: 'Answer should identify billing.ts or generateInvoice as the caller', + }, + ]; + + const allPassed = checks.every((c) => c.pass); + return { + pass: allPassed, + reason: allPassed + ? 'Successfully used grep to locate code and analyzed callers' + : `Failed checks: ${checks + .filter((c) => !c.pass) + .map((c) => c.name) + .join(', ')}`, + checks, + }; + }, +}; + +export default scenario; diff --git a/experiments/harness-eval/scenarios/index.ts b/experiments/harness-eval/scenarios/index.ts new file mode 100644 index 000000000..fa547e19b --- /dev/null +++ b/experiments/harness-eval/scenarios/index.ts @@ -0,0 +1,19 @@ +/** + * Scenario index — exports all eval scenarios. + */ + +import weatherBaseline from './weather-baseline.js'; +import readAndSummarize from './read-and-summarize.js'; +import grepLocateCode from './grep-locate-code.js'; +import missingFileRecovery from './missing-file-recovery.js'; +import proposePatch from './propose-patch.js'; +import interpretTestFailure from './interpret-test-failure.js'; + +export const ALL_SCENARIOS = [ + weatherBaseline, + readAndSummarize, + grepLocateCode, + missingFileRecovery, + proposePatch, + interpretTestFailure, +]; diff --git a/experiments/harness-eval/scenarios/interpret-test-failure.ts b/experiments/harness-eval/scenarios/interpret-test-failure.ts new file mode 100644 index 000000000..3a5a75dcc --- /dev/null +++ b/experiments/harness-eval/scenarios/interpret-test-failure.ts @@ -0,0 +1,176 @@ +/** + * Scenario: Interpret a Test Failure + * + * Tests the model's ability to read test output, trace the failure to root cause, + * and explain what went wrong — without needing to run anything. 
+ */
+
+import type { EvalScenario, ScenarioRun } from '../types.js';
+import { createVirtualFs, makeReadFile, makeGrep, makeGlob } from '../tools.js';
+
+const TEST_OUTPUT = `FAIL tests/cart.test.ts
+  ShoppingCart
+    ✓ adds items to cart (3ms)
+    ✓ removes items from cart (1ms)
+    ✗ calculates total with tax (5ms)
+
+      Expected: 108
+      Received: 100
+
+      at Object.<anonymous> (tests/cart.test.ts:28:27)
+
+  3 tests, 1 failure
+`;
+
+const FILES = [
+  {
+    path: 'tests/cart.test.ts',
+    content: `import { ShoppingCart } from '../src/cart';
+
+describe('ShoppingCart', () => {
+  it('adds items to cart', () => {
+    const cart = new ShoppingCart();
+    cart.addItem({ name: 'Widget', price: 10, quantity: 2 });
+    expect(cart.items.length).toBe(1);
+  });
+
+  it('removes items from cart', () => {
+    const cart = new ShoppingCart();
+    cart.addItem({ name: 'Widget', price: 10, quantity: 1 });
+    cart.removeItem('Widget');
+    expect(cart.items.length).toBe(0);
+  });
+
+  it('calculates total with tax', () => {
+    const cart = new ShoppingCart(0.08); // 8% tax
+    cart.addItem({ name: 'Widget', price: 50, quantity: 2 }); // subtotal = 100
+    const total = cart.getTotal(); // should be 100 * 1.08 = 108
+    expect(total).toBe(108);
+  });
+});
+`,
+  },
+  {
+    path: 'src/cart.ts',
+    content: `interface CartItem {
+  name: string;
+  price: number;
+  quantity: number;
+}
+
+export class ShoppingCart {
+  items: CartItem[] = [];
+  private taxRate: number;
+
+  constructor(taxRate = 0) {
+    this.taxRate = taxRate;
+  }
+
+  addItem(item: CartItem): void {
+    this.items.push(item);
+  }
+
+  removeItem(name: string): void {
+    this.items = this.items.filter((i) => i.name !== name);
+  }
+
+  getSubtotal(): number {
+    return this.items.reduce((sum, item) => sum + item.price * item.quantity, 0);
+  }
+
+  getTotal(): number {
+    // BUG: tax is not applied — just returns subtotal
+    return this.getSubtotal();
+  }
+}
+`,
+  },
+  {
+    path: 'test-output.txt',
+    content: TEST_OUTPUT,
+  },
+];
+
+const vfs = createVirtualFs(FILES);
+
+const scenario: EvalScenario = {
+  id: 'interpret-test-failure',
+  name: 'Interpret a Test Failure',
+  category: 'coding',
+  description: 'Model reads test output and source to diagnose why a test is failing.',
+
+  systemPrompt:
+    'You are a debugging assistant. Use the provided tools to read files and search code. Diagnose test failures by examining both the test and the implementation.',
+
+  userPrompt:
+    'Our CI is failing. Read test-output.txt to see the error, then find the root cause in the source code. Explain what is wrong and how to fix it.',
+
+  tools: [makeReadFile(vfs), makeGrep(vfs), makeGlob(vfs)],
+
+  maxTurns: 6,
+
+  evaluate: (run: ScenarioRun) => {
+    const checks = [
+      {
+        name: 'read_test_output',
+        pass: run.toolCalls.some(
+          (tc) =>
+            tc.toolName === 'read_file' &&
+            /test-output/i.test(JSON.stringify(tc.arguments)),
+        ),
+        detail: 'Model should read the test output file',
+      },
+      {
+        name: 'read_source',
+        pass: run.toolCalls.some(
+          (tc) =>
+            tc.toolName === 'read_file' &&
+            /cart\.ts/i.test(JSON.stringify(tc.arguments)) &&
+            !tc.isError,
+        ),
+        detail: 'Model should read the cart source to find the bug',
+      },
+      {
+        name: 'completed',
+        pass: run.stopReason === 'complete',
+        detail: 'Model should complete with a diagnosis',
+      },
+      {
+        name: 'identifies_missing_tax',
+        pass: run.messages.some(
+          (m) =>
+            m.role === 'assistant' &&
+            !m.tool_calls?.length &&
+            m.content != null &&
+            /tax|taxRate|getTotal|subtotal/i.test(m.content),
+        ),
+        detail: 'Answer should identify that tax is not applied in getTotal',
+      },
+      {
+        name: 'suggests_fix',
+        pass: run.messages.some(
+          (m) =>
+            m.role === 'assistant' &&
+            !m.tool_calls?.length &&
+            m.content != null &&
+            /1\s*\+\s*.*tax|multiply|taxRate|\*\s*\(1/i.test(m.content),
+        ),
+        detail: 'Answer should suggest applying the tax rate in the calculation',
+      },
+    ];
+
+    const allPassed = checks.every((c) => c.pass);
+    return {
+      pass: allPassed,
+      reason: allPassed
+        ? 
'Successfully diagnosed the missing tax calculation bug' + : `Failed checks: ${checks + .filter((c) => !c.pass) + .map((c) => c.name) + .join(', ')}`, + checks, + }; + }, +}; + +export default scenario; diff --git a/experiments/harness-eval/scenarios/missing-file-recovery.ts b/experiments/harness-eval/scenarios/missing-file-recovery.ts new file mode 100644 index 000000000..d4d81bd26 --- /dev/null +++ b/experiments/harness-eval/scenarios/missing-file-recovery.ts @@ -0,0 +1,129 @@ +/** + * Scenario: Missing File Recovery + * + * Tests the model's ability to handle a read_file error gracefully + * and recover by using glob/grep to find the correct file. + */ + +import type { EvalScenario, ScenarioRun } from '../types.js'; +import { createVirtualFs, makeReadFile, makeGrep, makeGlob } from '../tools.js'; + +const FILES = [ + { + path: 'src/config/database.ts', + content: `export interface DatabaseConfig { + host: string; + port: number; + database: string; + ssl: boolean; +} + +export const DEFAULT_CONFIG: DatabaseConfig = { + host: 'localhost', + port: 5432, + database: 'app_dev', + ssl: false, +}; + +export function buildConnectionString(config: DatabaseConfig): string { + const protocol = config.ssl ? 
'postgresql+ssl' : 'postgresql'; + return \`\${protocol}://\${config.host}:\${config.port}/\${config.database}\`; +} +`, + }, + { + path: 'src/config/index.ts', + content: `export { DEFAULT_CONFIG, buildConnectionString } from './database'; +export type { DatabaseConfig } from './database'; +`, + }, + { + path: 'src/server.ts', + content: `import { buildConnectionString, DEFAULT_CONFIG } from './config'; + +const connStr = buildConnectionString(DEFAULT_CONFIG); +console.log('Connecting to', connStr); +`, + }, +]; + +const vfs = createVirtualFs(FILES); + +const scenario: EvalScenario = { + id: 'missing-file-recovery', + name: 'Missing File Error Recovery', + category: 'coding', + description: 'Model tries to read a nonexistent file, gets an error, and recovers by searching for the correct path.', + + systemPrompt: + 'You are a code analysis assistant. Use the provided tools to search and read source files. If a file is not found, use glob or grep to locate it.', + + userPrompt: + 'Read the database configuration from src/db.ts and explain the default connection settings.', + + tools: [makeReadFile(vfs), makeGrep(vfs), makeGlob(vfs)], + + maxTurns: 6, + + evaluate: (run: ScenarioRun) => { + const checks = [ + { + name: 'attempted_wrong_path', + pass: run.toolCalls.some( + (tc) => + tc.toolName === 'read_file' && + /db\.ts/i.test(JSON.stringify(tc.arguments)) && + tc.result.includes('Error'), + ), + detail: 'Model should first try src/db.ts and get an error', + }, + { + name: 'recovered_with_search', + pass: run.toolCalls.some( + (tc) => tc.toolName === 'glob' || tc.toolName === 'grep', + ), + detail: 'Model should use glob or grep to find the correct file', + }, + { + name: 'found_correct_file', + pass: run.toolCalls.some( + (tc) => + tc.toolName === 'read_file' && + /database\.ts|config/i.test(JSON.stringify(tc.arguments)) && + !tc.isError, + ), + detail: 'Model should eventually read the correct database config file', + }, + { + name: 'completed', + pass: 
run.stopReason === 'complete', + detail: 'Model should complete with an answer', + }, + { + name: 'describes_config', + pass: run.messages.some( + (m) => + m.role === 'assistant' && + !m.tool_calls?.length && + m.content != null && + /localhost|5432|app_dev|connection/i.test(m.content), + ), + detail: 'Answer should describe the default database config values', + }, + ]; + + const allPassed = checks.every((c) => c.pass); + return { + pass: allPassed, + reason: allPassed + ? 'Successfully recovered from missing file and found correct config' + : `Failed checks: ${checks + .filter((c) => !c.pass) + .map((c) => c.name) + .join(', ')}`, + checks, + }; + }, +}; + +export default scenario; diff --git a/experiments/harness-eval/scenarios/propose-patch.ts b/experiments/harness-eval/scenarios/propose-patch.ts new file mode 100644 index 000000000..4e5acf717 --- /dev/null +++ b/experiments/harness-eval/scenarios/propose-patch.ts @@ -0,0 +1,149 @@ +/** + * Scenario: Propose a Patch + * + * Tests the model's ability to read a file, identify a bug, and propose + * an edit_file call to fix it. + */ + +import type { EvalScenario, ScenarioRun } from '../types.js'; +import { createVirtualFs, makeReadFile, makeEditFile, makeGrep, makeGlob } from '../tools.js'; + +const FILES = [ + { + path: 'src/utils/validate.ts', + content: `/** + * Validates an email address format. + * Returns true if valid, false otherwise. + */ +export function isValidEmail(email: string): boolean { + if (!email || email.length === 0) { + return false; + } + // BUG: This regex does not require a dot in the domain part. + // "user@localhost" passes but should fail for standard email validation. 
+ const emailRegex = /^[^@]+@[^@]+$/; + return emailRegex.test(email); +} + +/** + * Validates that a username meets requirements: + * - 3-20 characters + * - Alphanumeric and underscores only + */ +export function isValidUsername(username: string): boolean { + if (!username) return false; + return /^[a-zA-Z0-9_]{3,20}$/.test(username); +} + +/** + * Validates a password meets minimum requirements: + * - At least 8 characters + * - At least one uppercase letter + * - At least one number + */ +export function isValidPassword(password: string): boolean { + if (!password || password.length < 8) return false; + if (!/[A-Z]/.test(password)) return false; + if (!/[0-9]/.test(password)) return false; + return true; +} +`, + }, + { + path: 'tests/validate.test.ts', + content: `import { isValidEmail } from '../src/utils/validate'; + +describe('isValidEmail', () => { + it('accepts standard emails', () => { + expect(isValidEmail('user@example.com')).toBe(true); + }); + + it('rejects empty strings', () => { + expect(isValidEmail('')).toBe(false); + }); + + // This test FAILS because the regex allows "user@localhost" + it('rejects emails without a domain dot', () => { + expect(isValidEmail('user@localhost')).toBe(false); // FAILS — returns true + }); +}); +`, + }, +]; + +const vfs = createVirtualFs(FILES); + +const scenario: EvalScenario = { + id: 'propose-patch', + name: 'Propose a Bug Fix Patch', + category: 'coding', + description: 'Model reads code with a known bug, identifies the issue, and proposes an edit to fix it.', + + systemPrompt: + 'You are a code review assistant. Use the provided tools to read, search, and edit source files. When you find a bug, fix it using the edit_file tool.', + + userPrompt: + 'The test "rejects emails without a domain dot" in tests/validate.test.ts is failing. 
Find the bug and fix it.', + + tools: [makeReadFile(vfs), makeEditFile(vfs), makeGrep(vfs), makeGlob(vfs)], + + maxTurns: 8, + + evaluate: (run: ScenarioRun) => { + const checks = [ + { + name: 'read_test_or_source', + pass: run.toolCalls.some( + (tc) => + tc.toolName === 'read_file' && + /validate/i.test(JSON.stringify(tc.arguments)), + ), + detail: 'Model should read the test or source file', + }, + { + name: 'used_edit_file', + pass: run.toolCalls.some((tc) => tc.toolName === 'edit_file'), + detail: 'Model should propose an edit to fix the bug', + }, + { + name: 'fix_targets_regex', + pass: run.toolCalls.some( + (tc) => + tc.toolName === 'edit_file' && + /emailRegex|regex|\.\*\\\.|\\\./i.test(JSON.stringify(tc.arguments)), + ), + detail: 'The edit should modify the email regex to require a dot in the domain', + }, + { + name: 'completed', + pass: run.stopReason === 'complete', + detail: 'Model should complete with an explanation', + }, + { + name: 'explains_fix', + pass: run.messages.some( + (m) => + m.role === 'assistant' && + !m.tool_calls?.length && + m.content != null && + /dot|domain|\.\w|regex|pattern/i.test(m.content), + ), + detail: 'Answer should explain the regex fix', + }, + ]; + + const allPassed = checks.every((c) => c.pass); + return { + pass: allPassed, + reason: allPassed + ? 'Successfully identified and fixed the email regex bug' + : `Failed checks: ${checks + .filter((c) => !c.pass) + .map((c) => c.name) + .join(', ')}`, + checks, + }; + }, +}; + +export default scenario; diff --git a/experiments/harness-eval/scenarios/read-and-summarize.ts b/experiments/harness-eval/scenarios/read-and-summarize.ts new file mode 100644 index 000000000..333414d10 --- /dev/null +++ b/experiments/harness-eval/scenarios/read-and-summarize.ts @@ -0,0 +1,140 @@ +/** + * Scenario: Read File and Summarize + * + * Tests the model's ability to read a source file and produce a meaningful summary. + * Exercises: read_file tool, code comprehension, concise output. 
+ */ + +import type { EvalScenario, ScenarioRun } from '../types.js'; +import { createVirtualFs, makeReadFile, makeGlob } from '../tools.js'; + +const FILES = [ + { + path: 'src/auth.ts', + content: `import { verify } from 'jsonwebtoken'; +import type { Request, Response, NextFunction } from 'express'; + +interface JWTPayload { + userId: string; + role: 'admin' | 'user'; + exp: number; +} + +const JWT_SECRET = process.env.JWT_SECRET || 'dev-secret'; + +export function authMiddleware(req: Request, _res: Response, next: NextFunction): void { + const header = req.headers.authorization; + if (!header?.startsWith('Bearer ')) { + throw new Error('Missing or invalid Authorization header'); + } + const token = header.slice(7); + const payload = verify(token, JWT_SECRET) as JWTPayload; + req.user = { id: payload.userId, role: payload.role }; + next(); +} + +export function requireAdmin(req: Request, _res: Response, next: NextFunction): void { + if (req.user?.role !== 'admin') { + throw new Error('Admin access required'); + } + next(); +} +`, + }, + { + path: 'src/index.ts', + content: `import express from 'express'; +import { authMiddleware } from './auth'; + +const app = express(); +app.use(express.json()); +app.use(authMiddleware); + +app.get('/health', (_req, res) => res.json({ status: 'ok' })); +app.listen(3000); +`, + }, +]; + +const vfs = createVirtualFs(FILES); + +const scenario: EvalScenario = { + id: 'read-and-summarize', + name: 'Read File and Summarize Code', + category: 'coding', + description: 'Model reads a TypeScript auth module and summarizes its functionality.', + + systemPrompt: + 'You are a code analysis assistant. Use the provided tools to read source files and answer questions about the codebase. 
Be concise and precise.', + + userPrompt: + 'Read src/auth.ts and give me a brief summary of what it does, including the key functions and their purposes.', + + tools: [makeReadFile(vfs), makeGlob(vfs)], + + maxTurns: 4, + + evaluate: (run: ScenarioRun) => { + const checks = [ + { + name: 'read_auth_file', + pass: run.toolCalls.some( + (tc) => tc.toolName === 'read_file' && String(tc.arguments.path).includes('auth'), + ), + detail: 'Model should read src/auth.ts', + }, + { + name: 'completed', + pass: run.stopReason === 'complete', + detail: 'Model should complete with a summary', + }, + { + name: 'mentions_auth_middleware', + pass: run.messages.some( + (m) => + m.role === 'assistant' && + !m.tool_calls?.length && + m.content != null && + /authMiddleware|auth.*middleware/i.test(m.content), + ), + detail: 'Summary should mention authMiddleware', + }, + { + name: 'mentions_jwt', + pass: run.messages.some( + (m) => + m.role === 'assistant' && + !m.tool_calls?.length && + m.content != null && + /jwt|token/i.test(m.content), + ), + detail: 'Summary should mention JWT/token verification', + }, + { + name: 'mentions_admin', + pass: run.messages.some( + (m) => + m.role === 'assistant' && + !m.tool_calls?.length && + m.content != null && + /admin|requireAdmin/i.test(m.content), + ), + detail: 'Summary should mention admin role checking', + }, + ]; + + const allPassed = checks.every((c) => c.pass); + return { + pass: allPassed, + reason: allPassed + ? 
'Successfully read and summarized the auth module' + : `Failed checks: ${checks + .filter((c) => !c.pass) + .map((c) => c.name) + .join(', ')}`, + checks, + }; + }, +}; + +export default scenario; diff --git a/experiments/harness-eval/scenarios/weather-baseline.ts b/experiments/harness-eval/scenarios/weather-baseline.ts new file mode 100644 index 000000000..86c1f6035 --- /dev/null +++ b/experiments/harness-eval/scenarios/weather-baseline.ts @@ -0,0 +1,71 @@ +/** + * Scenario: Weather Baseline (continuity with existing experiment) + * + * Tests the two-tool loop: get_weather -> calculate -> final answer. + * This is the same test from experiments/ai-gateway-tool-call/experiment.ts + * preserved for continuity and regression detection. + */ + +import type { EvalScenario, ScenarioRun } from '../types.js'; +import { makeGetWeather, makeCalculate } from '../tools.js'; + +const scenario: EvalScenario = { + id: 'weather-baseline', + name: 'Weather Baseline (Two-Tool Loop)', + category: 'baseline', + description: 'Model calls get_weather, then calculate to convert F→C, then gives a final answer.', + + systemPrompt: + 'You are a helpful assistant. Use the provided tools to answer questions. When you need to convert temperature, use the calculate tool.', + + userPrompt: 'What is the weather in Paris right now? 
Also tell me the temperature in Celsius.', + + tools: [makeGetWeather(), makeCalculate()], + + maxTurns: 6, + + evaluate: (run: ScenarioRun) => { + const checks = [ + { + name: 'used_get_weather', + pass: run.toolCalls.some((tc) => tc.toolName === 'get_weather'), + detail: 'Model should call get_weather', + }, + { + name: 'used_calculate', + pass: run.toolCalls.some((tc) => tc.toolName === 'calculate'), + detail: 'Model should call calculate for F→C conversion', + }, + { + name: 'completed', + pass: run.stopReason === 'complete', + detail: 'Model should complete (not max_turns or error)', + }, + { + name: 'final_answer_has_celsius', + pass: run.messages.some( + (m) => + m.role === 'assistant' && + !m.tool_calls?.length && + m.content != null && + /22|celsius|°C/i.test(m.content), + ), + detail: 'Final answer should mention the Celsius temperature (~22°C)', + }, + ]; + + const allPassed = checks.every((c) => c.pass); + return { + pass: allPassed, + reason: allPassed + ? 'Two-tool weather loop completed successfully' + : `Failed checks: ${checks + .filter((c) => !c.pass) + .map((c) => c.name) + .join(', ')}`, + checks, + }; + }, +}; + +export default scenario; diff --git a/experiments/harness-eval/tools.ts b/experiments/harness-eval/tools.ts new file mode 100644 index 000000000..25acb1895 --- /dev/null +++ b/experiments/harness-eval/tools.ts @@ -0,0 +1,313 @@ +/** + * Mock tool implementations for eval scenarios. + * + * These mirror the Go harness tools (read_file, grep, glob, edit_file, write_file, bash) + * but operate on a virtual filesystem so scenarios are deterministic and network-free. 
+ */
+
+import type { EvalTool, ToolDefinition } from './types.js';
+
+// ---------------------------------------------------------------------------
+// Virtual filesystem for deterministic tool execution
+// ---------------------------------------------------------------------------
+
+export interface VirtualFile {
+  path: string;
+  content: string;
+}
+
+/**
+ * Create a virtual filesystem from a list of files.
+ * Returns tool implementations that operate on this filesystem.
+ */
+export function createVirtualFs(files: VirtualFile[]): Map<string, string> {
+  const fs = new Map<string, string>();
+  for (const f of files) {
+    fs.set(f.path, f.content);
+  }
+  return fs;
+}
+
+// ---------------------------------------------------------------------------
+// Tool definitions (OpenAI function-calling format)
+// ---------------------------------------------------------------------------
+
+export const READ_FILE_DEF: ToolDefinition = {
+  type: 'function',
+  function: {
+    name: 'read_file',
+    description: 'Read the contents of a file. Returns the file content with line numbers.',
+    parameters: {
+      type: 'object',
+      properties: {
+        path: { type: 'string', description: 'File path relative to the project root' },
+      },
+      required: ['path'],
+    },
+  },
+};
+
+export const GREP_DEF: ToolDefinition = {
+  type: 'function',
+  function: {
+    name: 'grep',
+    description: 'Search file contents for a regex pattern. Returns matching lines with file paths and line numbers.',
+    parameters: {
+      type: 'object',
+      properties: {
+        pattern: { type: 'string', description: 'Regex pattern to search for' },
+        include: { type: 'string', description: 'Glob pattern for files to search (e.g., "*.ts")' },
+      },
+      required: ['pattern'],
+    },
+  },
+};
+
+export const GLOB_DEF: ToolDefinition = {
+  type: 'function',
+  function: {
+    name: 'glob',
+    description: 'Find files matching a glob pattern. 
Returns a list of matching file paths.', + parameters: { + type: 'object', + properties: { + pattern: { type: 'string', description: 'Glob pattern (e.g., "**/*.ts", "src/*.go")' }, + }, + required: ['pattern'], + }, + }, +}; + +export const EDIT_FILE_DEF: ToolDefinition = { + type: 'function', + function: { + name: 'edit_file', + description: 'Replace a string in a file. The old_string must appear exactly once in the file.', + parameters: { + type: 'object', + properties: { + path: { type: 'string', description: 'File path relative to the project root' }, + old_string: { type: 'string', description: 'The exact text to find and replace' }, + new_string: { type: 'string', description: 'The replacement text' }, + }, + required: ['path', 'old_string', 'new_string'], + }, + }, +}; + +export const WRITE_FILE_DEF: ToolDefinition = { + type: 'function', + function: { + name: 'write_file', + description: 'Create or overwrite a file with the given content.', + parameters: { + type: 'object', + properties: { + path: { type: 'string', description: 'File path relative to the project root' }, + content: { type: 'string', description: 'File content to write' }, + }, + required: ['path', 'content'], + }, + }, +}; + +export const GET_WEATHER_DEF: ToolDefinition = { + type: 'function', + function: { + name: 'get_weather', + description: 'Get current weather for a city. 
Returns temperature in Fahrenheit and condition.',
+    parameters: {
+      type: 'object',
+      properties: {
+        city: { type: 'string', description: 'City name (e.g., "Paris")' },
+      },
+      required: ['city'],
+    },
+  },
+};
+
+export const CALCULATE_DEF: ToolDefinition = {
+  type: 'function',
+  function: {
+    name: 'calculate',
+    description: 'Evaluate a mathematical expression and return the result.',
+    parameters: {
+      type: 'object',
+      properties: {
+        expression: { type: 'string', description: 'Math expression (e.g., "(72 - 32) * 5/9")' },
+      },
+      required: ['expression'],
+    },
+  },
+};
+
+// ---------------------------------------------------------------------------
+// Tool handler factories
+// ---------------------------------------------------------------------------
+
+/**
+ * Create a read_file tool that operates on a virtual filesystem.
+ */
+export function makeReadFile(vfs: Map<string, string>): EvalTool {
+  return {
+    definition: READ_FILE_DEF,
+    handler: (args) => {
+      const path = String(args.path ?? '');
+      const content = vfs.get(path);
+      if (content === undefined) {
+        return `Error: file not found: ${path}`;
+      }
+      const lines = content.split('\n');
+      const numbered = lines.map((line, i) => `${String(i + 1).padStart(4)} ${line}`);
+      return numbered.join('\n');
+    },
+  };
+}
+
+/**
+ * Create a grep tool that operates on a virtual filesystem.
+ */
+export function makeGrep(vfs: Map<string, string>): EvalTool {
+  return {
+    definition: GREP_DEF,
+    handler: (args) => {
+      const pattern = String(args.pattern ?? '');
+      const include = args.include ? 
String(args.include) : undefined;
+      let re: RegExp;
+      try {
+        re = new RegExp(pattern);
+      } catch {
+        return `Error: invalid regex pattern: ${pattern}`;
+      }
+
+      const results: string[] = [];
+      for (const [filePath, content] of vfs) {
+        // Simple include filter (just file extension matching)
+        if (include) {
+          const ext = include.replace('*', '');
+          if (!filePath.endsWith(ext)) continue;
+        }
+        const lines = content.split('\n');
+        for (let i = 0; i < lines.length; i++) {
+          if (re.test(lines[i])) {
+            results.push(`${filePath}:${i + 1}:${lines[i]}`);
+          }
+        }
+      }
+      return results.length > 0 ? results.join('\n') : 'No matches found.';
+    },
+  };
+}
+
+/**
+ * Create a glob tool that operates on a virtual filesystem.
+ */
+export function makeGlob(vfs: Map<string, string>): EvalTool {
+  return {
+    definition: GLOB_DEF,
+    handler: (args) => {
+      const pattern = String(args.pattern ?? '');
+      const paths = Array.from(vfs.keys());
+
+      // Simple glob matching: ** matches everything, * matches within path segment
+      const matched = paths.filter((p) => {
+        if (pattern.includes('**')) {
+          // Strip "**/" and any leftover wildcards so "**/*.ts" reduces to ".ts"
+          const suffix = pattern.replace('**/', '').replace('**', '').replace('*', '');
+          return suffix === '' || p.endsWith(suffix);
+        }
+        if (pattern.startsWith('*.')) {
+          return p.endsWith(pattern.slice(1));
+        }
+        return p.includes(pattern.replace('*', ''));
+      });
+
+      return matched.length > 0 ? matched.join('\n') : 'No files matched.';
+    },
+  };
+}
+
+/**
+ * Create an edit_file tool that operates on a virtual filesystem (mutates vfs).
+ */
+export function makeEditFile(vfs: Map<string, string>): EvalTool {
+  return {
+    definition: EDIT_FILE_DEF,
+    handler: (args) => {
+      const path = String(args.path ?? '');
+      const oldStr = String(args.old_string ?? '');
+      const newStr = String(args.new_string ?? 
'');
+
+      const content = vfs.get(path);
+      if (content === undefined) {
+        return `Error: file not found: ${path}`;
+      }
+
+      const count = content.split(oldStr).length - 1;
+      if (count === 0) {
+        return `Error: old_string not found in ${path}`;
+      }
+      if (count > 1) {
+        return `Error: old_string found ${count} times in ${path} (must be unique)`;
+      }
+
+      const updated = content.replace(oldStr, newStr);
+      vfs.set(path, updated);
+      return `File ${path} edited successfully.`;
+    },
+  };
+}
+
+/**
+ * Create a write_file tool that operates on a virtual filesystem (mutates vfs).
+ */
+export function makeWriteFile(vfs: Map<string, string>): EvalTool {
+  return {
+    definition: WRITE_FILE_DEF,
+    handler: (args) => {
+      const path = String(args.path ?? '');
+      const content = String(args.content ?? '');
+      vfs.set(path, content);
+      return `File ${path} written successfully.`;
+    },
+  };
+}
+
+/**
+ * Weather tool (mock — always returns sunny 72F for any city).
+ */
+export function makeGetWeather(): EvalTool {
+  return {
+    definition: GET_WEATHER_DEF,
+    handler: (args) => {
+      return JSON.stringify({
+        city: args.city ?? 'Unknown',
+        temperature_f: 72,
+        condition: 'sunny',
+        humidity_percent: 45,
+      });
+    },
+  };
+}
+
+/**
+ * Calculate tool (safe math eval).
+ */
+export function makeCalculate(): EvalTool {
+  return {
+    definition: CALCULATE_DEF,
+    handler: (args) => {
+      const expr = String(args.expression ?? 
'');
+      const sanitized = expr.replace(/[^0-9+\-*/().% ]/g, '');
+      if (sanitized !== expr) {
+        return JSON.stringify({ error: 'Invalid expression — only numbers and basic math operators allowed' });
+      }
+      try {
+        // eslint-disable-next-line no-new-func
+        const result = Function(`"use strict"; return (${sanitized})`)();
+        return JSON.stringify({ result: Number(Number(result).toFixed(4)) });
+      } catch {
+        return JSON.stringify({ error: 'Evaluation failed' });
+      }
+    },
+  };
+}
diff --git a/experiments/harness-eval/trace.ts b/experiments/harness-eval/trace.ts
new file mode 100644
index 000000000..5173601ce
--- /dev/null
+++ b/experiments/harness-eval/trace.ts
@@ -0,0 +1,85 @@
+/**
+ * Trace persistence — writes full eval traces to JSON files.
+ *
+ * Each run produces a timestamped JSON file in the traces/ directory
+ * containing model config, prompts, tool schemas, full message logs,
+ * tool call records, token usage, latency, cost, and pass/fail rubric.
+ */
+
+import { writeFileSync, mkdirSync, existsSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { execSync } from 'node:child_process';
+import type { EvalTrace, ScenarioResult, EvalSummary } from './types.js';
+import { computeSummary } from './cost.js';
+
+const TRACE_DIR = join(dirname(new URL(import.meta.url).pathname), 'traces');
+const SCHEMA_VERSION = '1.0';
+
+/**
+ * Get the current git commit hash, or 'unknown' if unavailable.
+ */
+function getCommitHash(): string {
+  try {
+    return execSync('git rev-parse --short HEAD', { encoding: 'utf-8' }).trim();
+  } catch {
+    return 'unknown';
+  }
+}
+
+/**
+ * Build a complete eval trace from scenario results. 
+ */ +export function buildTrace(results: ScenarioResult[]): EvalTrace { + const summary = computeSummary(results); + return { + version: '1.0', + timestamp: new Date().toISOString(), + suite: { + commitHash: getCommitHash(), + schemaVersion: SCHEMA_VERSION, + }, + results, + summary, + }; +} + +/** + * Write the trace to a timestamped JSON file in traces/. + * Returns the file path. + */ +export function writeTrace(trace: EvalTrace): string { + if (!existsSync(TRACE_DIR)) { + mkdirSync(TRACE_DIR, { recursive: true }); + } + + const timestamp = trace.timestamp.replace(/[:.]/g, '-').replace('T', '_').slice(0, 19); + const filename = `eval-${timestamp}.json`; + const filepath = join(TRACE_DIR, filename); + + writeFileSync(filepath, JSON.stringify(trace, null, 2), 'utf-8'); + return filepath; +} + +/** + * Print a summary table to stdout. + */ +export function printSummary(summary: EvalSummary): void { + console.log('\n========================================'); + console.log(' EVAL SUMMARY'); + console.log('========================================\n'); + + for (const ms of summary.perModel) { + const passRate = ms.totalRuns > 0 ? ((ms.passed / ms.totalRuns) * 100).toFixed(0) : '0'; + console.log(`Model: ${ms.model} (${ms.provider})`); + console.log(` Pass rate: ${ms.passed}/${ms.totalRuns} (${passRate}%)`); + console.log(` Total cost: $${ms.totalCostUsd.toFixed(6)}`); + console.log(` Cost/success: ${ms.passed > 0 ? 
'$' + ms.costPerSuccessUsd.toFixed(6) : 'N/A (0 passes)'}`); + console.log(` Avg latency: ${ms.avgLatencyMs.toFixed(0)}ms`); + console.log(` Total tokens: ${ms.totalTokens}`); + console.log(''); + } + + console.log(`Total scenarios: ${summary.totalScenarios}`); + console.log(`Overall cost: $${summary.totalCostUsd.toFixed(6)}`); + console.log(''); +} diff --git a/experiments/harness-eval/traces/.gitkeep b/experiments/harness-eval/traces/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/experiments/harness-eval/tsconfig.json b/experiments/harness-eval/tsconfig.json new file mode 100644 index 000000000..77538941b --- /dev/null +++ b/experiments/harness-eval/tsconfig.json @@ -0,0 +1,15 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "Node16", + "moduleResolution": "Node16", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "outDir": "dist", + "rootDir": ".", + "declaration": false, + "sourceMap": false + }, + "include": ["*.ts", "scenarios/*.ts"] +} diff --git a/experiments/harness-eval/types.ts b/experiments/harness-eval/types.ts new file mode 100644 index 000000000..213418f56 --- /dev/null +++ b/experiments/harness-eval/types.ts @@ -0,0 +1,249 @@ +/** + * Core types for the harness model evaluation suite. 
+ */
+
+// ---------------------------------------------------------------------------
+// Tool definitions (OpenAI function-calling format)
+// ---------------------------------------------------------------------------
+
+export interface ToolDefinition {
+  type: 'function';
+  function: {
+    name: string;
+    description: string;
+    parameters: Record<string, unknown>;
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Chat messages (OpenAI format)
+// ---------------------------------------------------------------------------
+
+export interface ChatMessage {
+  role: 'system' | 'user' | 'assistant' | 'tool';
+  content?: string | null;
+  tool_calls?: ToolCallMessage[];
+  tool_call_id?: string;
+  /** Gemma 4 returns reasoning traces in this field. */
+  reasoning?: string;
+}
+
+export interface ToolCallMessage {
+  id: string;
+  type: 'function';
+  function: { name: string; arguments: string };
+}
+
+// ---------------------------------------------------------------------------
+// API response (OpenAI chat completion format)
+// ---------------------------------------------------------------------------
+
+export interface ChatCompletionResponse {
+  id: string;
+  choices: Array<{
+    message: ChatMessage;
+    finish_reason: string;
+  }>;
+  model: string;
+  usage?: TokenUsage;
+}
+
+export interface TokenUsage {
+  prompt_tokens: number;
+  completion_tokens: number;
+  total_tokens: number;
+}
+
+// ---------------------------------------------------------------------------
+// Model configuration
+// ---------------------------------------------------------------------------
+
+export type GatewayPath = 'workers-ai' | 'unified';
+
+export interface ModelConfig {
+  /** Human-friendly display name */
+  displayName: string;
+  /** Model ID for the API request */
+  modelId: string;
+  /** Unified API model ID (provider/model) for unified path, raw model ID for workers-ai */
+  apiModelId: string;
+  /** Which AI Gateway endpoint path to use */
+  path: 
GatewayPath;
+  /** Cost per 1K input tokens (USD). Workers AI = estimated Cloudflare cost, not $0. */
+  costPer1kInput: number;
+  /** Cost per 1K output tokens (USD). Workers AI = estimated Cloudflare cost, not $0. */
+  costPer1kOutput: number;
+  /** Provider name */
+  provider: 'workers-ai' | 'anthropic' | 'openai';
+}
+
+// ---------------------------------------------------------------------------
+// Tool execution
+// ---------------------------------------------------------------------------
+
+/** Mock tool handler — returns a string result given parsed arguments. */
+export type ToolHandler = (args: Record<string, unknown>) => string;
+
+/** A tool with its definition and mock handler. */
+export interface EvalTool {
+  definition: ToolDefinition;
+  handler: ToolHandler;
+}
+
+// ---------------------------------------------------------------------------
+// Scenario definition
+// ---------------------------------------------------------------------------
+
+export interface EvalScenario {
+  /** Unique scenario identifier (kebab-case) */
+  id: string;
+  /** Human-friendly name */
+  name: string;
+  /** Category for grouping */
+  category: 'baseline' | 'coding' | 'error-handling';
+  /** What this scenario tests */
+  description: string;
+  /** System prompt for the model */
+  systemPrompt: string;
+  /** User prompt that starts the conversation */
+  userPrompt: string;
+  /** Tools available in this scenario */
+  tools: EvalTool[];
+  /** Maximum turns before declaring failure */
+  maxTurns: number;
+  /** Rubric: evaluate whether the run passed */
+  evaluate: (run: ScenarioRun) => RubricResult;
+}
+
+// ---------------------------------------------------------------------------
+// Run results
+// ---------------------------------------------------------------------------
+
+export interface ScenarioRun {
+  /** Which scenario was run */
+  scenarioId: string;
+  /** Which model was used */
+  model: ModelConfig;
+  /** All messages exchanged (full conversation) */
+  messages: 
ChatMessage[]; + /** Tool calls made during the run */ + toolCalls: ToolCallRecord[]; + /** Total token usage across all turns */ + totalUsage: TokenUsage; + /** Per-turn usage for detailed analysis */ + turnUsage: Array<{ turn: number; usage: TokenUsage | null }>; + /** Total wall-clock time in milliseconds */ + totalLatencyMs: number; + /** Per-turn latency */ + turnLatency: Array<{ turn: number; latencyMs: number }>; + /** How the run ended */ + stopReason: 'complete' | 'max_turns' | 'error'; + /** Number of turns used */ + turnsUsed: number; + /** Error message if the run failed with an exception */ + error?: string; +} + +export interface ToolCallRecord { + turn: number; + toolName: string; + arguments: Record; + result: string; + isError: boolean; +} + +// --------------------------------------------------------------------------- +// Rubric / scoring +// --------------------------------------------------------------------------- + +export interface RubricResult { + /** Whether the scenario passed */ + pass: boolean; + /** Human-readable explanation */ + reason: string; + /** Optional structured checks */ + checks?: Array<{ name: string; pass: boolean; detail?: string }>; +} + +// --------------------------------------------------------------------------- +// Trace (persisted to JSON) +// --------------------------------------------------------------------------- + +export interface EvalTrace { + /** Trace format version */ + version: '1.0'; + /** When the trace was generated */ + timestamp: string; + /** Suite-level metadata */ + suite: { + /** Git commit hash */ + commitHash: string; + /** Prompt/tool schema version (incremented when prompts or tool defs change) */ + schemaVersion: string; + }; + /** Individual scenario results */ + results: ScenarioResult[]; + /** Aggregate summary */ + summary: EvalSummary; +} + +export interface ScenarioResult { + scenarioId: string; + scenarioName: string; + category: string; + model: { + displayName: string; + modelId: 
string; + provider: string; + path: GatewayPath; + }; + /** Pass/fail and reason */ + rubric: RubricResult; + /** Token usage */ + usage: TokenUsage; + /** Derived cost in USD */ + costUsd: number; + /** Wall-clock latency in ms */ + latencyMs: number; + /** Number of turns used */ + turnsUsed: number; + /** How the run stopped */ + stopReason: string; + /** Full conversation (messages, tool calls, results) */ + conversation: ChatMessage[]; + /** Tool call records */ + toolCalls: ToolCallRecord[]; + /** Per-turn token usage */ + turnUsage: Array<{ turn: number; usage: TokenUsage | null }>; + /** Per-turn latency */ + turnLatency: Array<{ turn: number; latencyMs: number }>; + /** Error if the run errored */ + error?: string; +} + +export interface EvalSummary { + /** Total scenarios run */ + totalScenarios: number; + /** Scenarios that passed */ + passedScenarios: number; + /** Success rate (0-1) */ + successRate: number; + /** Total cost across all runs (USD) */ + totalCostUsd: number; + /** Cost per successful task (total cost / passed scenarios), Infinity if none passed */ + costPerSuccessUsd: number; + /** Average latency across all runs (ms) */ + avgLatencyMs: number; + /** Per-model breakdown */ + perModel: Array<{ + model: string; + provider: string; + totalRuns: number; + passed: number; + successRate: number; + totalCostUsd: number; + costPerSuccessUsd: number; + avgLatencyMs: number; + totalTokens: number; + }>; +} diff --git a/tasks/backlog/2026-05-06-harness-eval-suite.md b/tasks/archive/2026-05-06-harness-eval-suite.md similarity index 100% rename from tasks/backlog/2026-05-06-harness-eval-suite.md rename to tasks/archive/2026-05-06-harness-eval-suite.md
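
To make the `evaluate` hook and the cost fields concrete, here is a hedged sketch of how a rubric and the `costUsd` derivation might look. This is illustrative only: `evaluateWeatherBaseline`, its check names, and the Celsius regex are assumptions, not the shipped rubric, and the types are re-declared structurally so the snippet stands alone (TypeScript's structural typing means the real `ScenarioRun` and `ModelConfig` would satisfy these parameter shapes).

```typescript
// Sketch only: structural mirrors of RubricResult / ScenarioRun / ModelConfig.
interface Check { name: string; pass: boolean; detail?: string }
interface Rubric { pass: boolean; reason: string; checks?: Check[] }

// Hypothetical rubric for `weather-baseline`: both tools must have been
// called without error, and the final assistant message must mention Celsius.
function evaluateWeatherBaseline(run: {
  toolCalls: Array<{ toolName: string; isError: boolean }>;
  messages: Array<{ role: string; content?: string | null }>;
}): Rubric {
  const called = (name: string) =>
    run.toolCalls.some((c) => c.toolName === name && !c.isError);
  const finalText =
    run.messages.filter((m) => m.role === 'assistant').at(-1)?.content ?? '';
  const checks: Check[] = [
    { name: 'called get_weather', pass: called('get_weather') },
    { name: 'called calculate', pass: called('calculate') },
    { name: 'reported Celsius', pass: /celsius|°c/i.test(finalText) },
  ];
  const pass = checks.every((c) => c.pass);
  return {
    pass,
    reason: pass ? 'all checks passed' : 'one or more checks failed',
    checks,
  };
}

// Derived cost from ModelConfig's per-1K rates (assumed formula):
// tokens / 1000 * rate, summed over input and output.
function costUsd(
  usage: { prompt_tokens: number; completion_tokens: number },
  model: { costPer1kInput: number; costPer1kOutput: number },
): number {
  return (
    (usage.prompt_tokens / 1000) * model.costPer1kInput +
    (usage.completion_tokens / 1000) * model.costPer1kOutput
  );
}
```

Returning `checks` alongside the boolean keeps failed runs debuggable from the persisted trace: the summary can report pass/fail while the per-check detail explains which step the model missed.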