From cc28e3bb217eb99ac67b129176bf56b8cb9c4a66 Mon Sep 17 00:00:00 2001
From: Griffen Fargo <3642037+gfargo@users.noreply.github.com>
Date: Tue, 5 May 2026 11:09:25 -0400
Subject: [PATCH] feat(bench): measurement infrastructure for the diff-condensing pipeline (#845)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

First chunk of the #845 perf overhaul: a reproducible benchmark harness so every later PR can show concrete before/after numbers instead of hand-waving about "should be faster". Three pieces:

1. Telemetry persistence in `observability.ts`. When COCO_BENCH=1 is set (or any non-`0` value), every LLM call accumulates into a narrow `LlmBenchCall` buffer; `flushLlmBenchRun` writes the record to `<cwd>/.coco-bench.json` (overridable via COCO_BENCH_FILE). Best-effort: write failures are silent and the buffer self-clears after each successful flush.

2. Synthetic diff fixtures at `src/lib/parsers/default/__fixtures__/`. Three sizes:

   - tiny   ( 5 files,  ~790 tokens) — early-exit path
   - medium (25 files, ~36k tokens) — typical commit
   - large  (50 files, ~83k tokens) — initial-commit shape

   Content comes from a seeded LCG so before/after runs compare the same input. Each fixture exports a fully-populated DiffNode tree so `summarizeDiffs` runs without a real git repo.

3. `bin/benchmark.ts` runner (`npm run bench`). Plugs the fixtures into `summarizeDiffs` with a duck-typed mock chain that simulates per-call latency proportional to input size (deterministic so PR diffs are apples-to-apples, not real-world wall-clock). Captures stage timings + per-call telemetry. `--update` overwrites `.bench/baseline.json`; `--fixture=<name>` narrows the run to a single fixture for tighter feedback loops.

Baseline numbers committed at `.bench/baseline.json` against current `main`:

| fixture | wall-clock | llm calls | llm total ms | prompt tokens |
|---------|------------|-----------|--------------|---------------|
| tiny    | 2 ms       | 0         | 0 ms         | 0             |
| medium  | 30,213 ms  | 20        | 102,723 ms   | 91,766        |
| large   | 70,048 ms  | 41        | 236,818 ms   | 220,199       |

The 3.4× spread between the large fixture's wall-clock and total LLM time (236 s of model work in 70 s of wall) reflects the existing `maxConcurrent=6` parallelism. Subsequent PRs in the #845 sprint will move these numbers, and the deltas will land directly in PR descriptions.
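For orientation, the telemetry lifecycle a command is expected to follow looks roughly like the sketch below. This is a sketch only, not code in this patch: the wrapper function, the `work` callback, and the `command` value are illustrative, and the real wiring into the CLI commands lands with the later #845 PRs.

```ts
import {
  resetLlmTelemetry,
  flushLlmBenchRun,
} from '../src/lib/langchain/utils/observability' // illustrative import path

// Hypothetical wrapper; only the two observability calls are real exports from this patch.
async function runWithBenchTelemetry(work: () => Promise<void>): Promise<void> {
  resetLlmTelemetry() // drop calls buffered by a previous run
  const startedAt = Date.now()

  // LLM-backed work funnels through logLlmCall(), which buffers an
  // LlmBenchCall whenever COCO_BENCH is set to a non-`0` value.
  await work()

  // Appends one run record to <cwd>/.coco-bench.json (or COCO_BENCH_FILE).
  // Best-effort: a failed write comes back as { ok: false, error }, never a throw.
  flushLlmBenchRun({ command: 'commit', totalElapsedMs: Date.now() - startedAt })
}
```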
--- .bench/baseline.json | 40 +++ .gitignore | 7 + bin/benchmark.ts | 250 ++++++++++++++++++ package.json | 1 + src/lib/langchain/utils/observability.ts | 134 ++++++++++ src/lib/parsers/default/__fixtures__/index.ts | 219 +++++++++++++++ 6 files changed, 651 insertions(+) create mode 100644 .bench/baseline.json create mode 100644 bin/benchmark.ts create mode 100644 src/lib/parsers/default/__fixtures__/index.ts diff --git a/.bench/baseline.json b/.bench/baseline.json new file mode 100644 index 0000000..9fe755c --- /dev/null +++ b/.bench/baseline.json @@ -0,0 +1,40 @@ +{ + "capturedAt": "2026-05-05T15:06:12.102Z", + "node": "v22.13.0", + "platform": "darwin-arm64", + "options": { + "baseLatencyMs": 1500, + "perTokenMs": 2, + "maxConcurrent": 6, + "maxTokens": 2048 + }, + "results": [ + { + "fixture": "tiny", + "fileCount": 5, + "approxTokens": 790, + "durationMs": 2, + "llmCalls": 0, + "llmTotalMs": 0, + "llmTotalPromptTokens": 0 + }, + { + "fixture": "medium", + "fileCount": 25, + "approxTokens": 36150, + "durationMs": 30213, + "llmCalls": 20, + "llmTotalMs": 102723, + "llmTotalPromptTokens": 91766 + }, + { + "fixture": "large", + "fileCount": 50, + "approxTokens": 83410, + "durationMs": 70048, + "llmCalls": 41, + "llmTotalMs": 236818, + "llmTotalPromptTokens": 220199 + } + ] +} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1eb137f..33e3f90 100644 --- a/.gitignore +++ b/.gitignore @@ -59,3 +59,10 @@ commitlint.config.js # Internal specs, audits, and design docs specs/ + +# Diff-condensing benchmark output (#845). Per-run files are local +# noise; the committed baseline lives at .bench/baseline.json so PR +# perf claims have a reference point. Telemetry sidecar from +# COCO_BENCH=1 stays local too. +.bench/run-*.json +.coco-bench.json diff --git a/bin/benchmark.ts b/bin/benchmark.ts new file mode 100644 index 0000000..81a1a80 --- /dev/null +++ b/bin/benchmark.ts @@ -0,0 +1,250 @@ +#!/usr/bin/env tsx +/** + * Diff-condensing pipeline benchmark (#845). + * + * Runs `summarizeDiffs` against the synthetic fixtures in + * `src/lib/parsers/default/__fixtures__/index.ts` using a mock LLM + * chain that simulates latency proportional to input size. Captures + * stage timings and per-call telemetry, writes the result to + * `.bench/run-<stamp>.json`, and (when a baseline is present at + * `.bench/baseline.json`) prints a diff so PRs can show their wins + * concretely. + * + * Usage: + * npm run bench # run all fixtures, write bench file + * npm run bench -- --update # also overwrite the baseline + * npm run bench -- --fixture=medium # narrow to one fixture + * + * The mock chain uses a deterministic latency model so before/after + * runs compare apples to apples without paying for real API calls. + * Numbers don't reflect real-world wall-clock time; they reflect the + * pipeline's *behavior* (how many calls fire, how the stages fan + * out, where the bottlenecks are).
+ */ + +import * as fs from 'node:fs' +import * as path from 'node:path' +import * as os from 'node:os' + +import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters' +import { loadSummarizationChain } from '@langchain/classic/chains' +import type { Document } from '@langchain/classic/document' + +import { fileChangeParser } from '../src/lib/parsers/default' +import { summarizeDiffs } from '../src/lib/parsers/default/utils/summarizeDiffs' +import { allFixtures, DiffFixture } from '../src/lib/parsers/default/__fixtures__' +import { Logger } from '../src/lib/utils/logger' +import { getTokenCounter } from '../src/lib/utils/tokenizer' +import { + buildLlmBenchRun, + flushLlmBenchRun, + resetLlmTelemetry, +} from '../src/lib/langchain/utils/observability' + +// Silence the type checker about the unused `fileChangeParser` and +// `loadSummarizationChain` imports being present for future bench +// scenarios; the active runner uses `summarizeDiffs` directly so it +// can pass a pre-built DiffNode. +void fileChangeParser +void loadSummarizationChain + +const BENCH_DIR = path.join(process.cwd(), '.bench') +const BASELINE_PATH = path.join(BENCH_DIR, 'baseline.json') + +// The bench runner is the canonical "I want telemetry" entry point, +// so flip COCO_BENCH on in-process if the user didn't set it +// externally. `recordBenchCall` checks this env var to decide +// whether to retain per-call data. +if (!process.env.COCO_BENCH) { + process.env.COCO_BENCH = '1' +} + +type BenchOptions = { + baseLatencyMs: number + perTokenMs: number + maxConcurrent: number + maxTokens: number +} + +const DEFAULT_OPTIONS: BenchOptions = { + // Calibrated to roughly match user-reported wall-clock on + // gpt-4.1-nano: ~3-7s for small calls, scaling up to ~25-40s for + // multi-thousand-token inputs. Adjust if real-world timings drift. + baseLatencyMs: 1500, + perTokenMs: 2, + maxConcurrent: 6, + maxTokens: 2048, +} + +type BenchResult = { + fixture: string + fileCount: number + approxTokens: number + durationMs: number + llmCalls: number + llmTotalMs: number + llmTotalPromptTokens: number +} + +function mockChain(options: BenchOptions): unknown { + // Duck-typed chain that satisfies the .invoke() shape + // `summarize()` expects. Latency is deterministic so before/after + // runs are directly comparable. + return { + invoke: async (input: { input_documents: Document[] }) => { + const totalChars = input.input_documents.reduce( + (sum, doc) => sum + doc.pageContent.length, + 0 + ) + // Approximate token count from chars/4 — enough fidelity for + // the latency model. The pipeline's real tokenizer counts + // separately for telemetry. + const approxTokens = Math.floor(totalChars / 4) + const latencyMs = options.baseLatencyMs + Math.floor(approxTokens * options.perTokenMs) + await new Promise((resolve) => setTimeout(resolve, latencyMs)) + return { text: `[mock summary of ${input.input_documents.length} doc(s), ~${approxTokens} tokens]` } + }, + } +} + +function silentLogger(): Logger { + // Tests already use this pattern; keep verbose calls a no-op so the + // bench output stays clean while still funneling timer + spinner + // calls through the real Logger surface.
+ const logger = new Logger({ verbose: false } as never) + return logger +} + +async function runFixture( + fixture: DiffFixture, + options: BenchOptions +): Promise<BenchResult> { + resetLlmTelemetry() + + const tokenizer = await getTokenCounter('gpt-4.1-nano') + const textSplitter = new RecursiveCharacterTextSplitter({ + chunkSize: 10000, + chunkOverlap: 250, + }) + const chain = mockChain(options) as Parameters<typeof summarizeDiffs>[1]['chain'] + const logger = silentLogger() + + const startedAt = Date.now() + await summarizeDiffs(fixture.rootNode, { + tokenizer, + logger, + maxTokens: options.maxTokens, + minTokensForSummary: 400, + maxFileTokens: Math.floor(options.maxTokens * 0.25), + maxConcurrent: options.maxConcurrent, + textSplitter, + chain, + metadata: { command: 'benchmark', model: 'mock' }, + }) + const durationMs = Date.now() - startedAt + + const run = buildLlmBenchRun({ command: `bench:${fixture.name}`, totalElapsedMs: durationMs }) + + return { + fixture: fixture.name, + fileCount: fixture.fileCount, + approxTokens: fixture.approxTokens, + durationMs, + llmCalls: run.callCount, + llmTotalMs: run.totalLlmElapsedMs, + llmTotalPromptTokens: run.totalPromptTokens, + } +} + +function formatRow(label: string, value: string | number): string { + return ` ${label.padEnd(28)} ${value}` +} + +function printSummary(results: BenchResult[], baseline?: BenchResult[]): void { + console.log('\n=== diff-condensing benchmark ===\n') + for (const result of results) { + console.log(`Fixture: ${result.fixture} (${result.fileCount} files, ~${result.approxTokens} tokens)`) + console.log(formatRow('wall-clock duration', `${result.durationMs}ms`)) + console.log(formatRow('llm calls', result.llmCalls)) + console.log(formatRow('llm total time', `${result.llmTotalMs}ms`)) + console.log(formatRow('llm prompt tokens', result.llmTotalPromptTokens)) + if (baseline) { + const prior = baseline.find((entry) => entry.fixture === result.fixture) + if (prior) { + const deltaPct = (n: number, p: number) => + p === 0 ? 'n/a' : `${(((n - p) / p) * 100).toFixed(1)}%` + console.log(formatRow('Δ duration', `${result.durationMs - prior.durationMs}ms (${deltaPct(result.durationMs, prior.durationMs)})`)) + console.log(formatRow('Δ llm calls', `${result.llmCalls - prior.llmCalls} (${deltaPct(result.llmCalls, prior.llmCalls)})`)) + } + } + console.log('') + } +} + +function writeBenchFile(results: BenchResult[], updateBaseline: boolean): void { + if (!fs.existsSync(BENCH_DIR)) { + fs.mkdirSync(BENCH_DIR, { recursive: true }) + } + + const stamp = new Date().toISOString().replace(/[:.]/g, '-') + const runFile = path.join(BENCH_DIR, `run-${stamp}.json`) + const payload = { + capturedAt: new Date().toISOString(), + node: process.version, + platform: `${os.platform()}-${os.arch()}`, + options: DEFAULT_OPTIONS, + results, + } + fs.writeFileSync(runFile, JSON.stringify(payload, null, 2)) + console.log(`Wrote ${runFile}`) + + if (updateBaseline) { + fs.writeFileSync(BASELINE_PATH, JSON.stringify(payload, null, 2)) + console.log(`Updated baseline at ${BASELINE_PATH}`) + } +} + +function readBaseline(): BenchResult[] | undefined { + if (!fs.existsSync(BASELINE_PATH)) return undefined + try { + const raw = fs.readFileSync(BASELINE_PATH, 'utf8') + const parsed = JSON.parse(raw) + return Array.isArray(parsed.results) ?
parsed.results : undefined + } catch { + return undefined + } +} + +async function main(): Promise<void> { + const args = process.argv.slice(2) + const updateBaseline = args.includes('--update') + const fixtureArg = args.find((arg) => arg.startsWith('--fixture='))?.split('=')[1] + + const fixtures = fixtureArg + ? allFixtures.filter((fixture) => fixture.name === fixtureArg) + : allFixtures + if (fixtures.length === 0) { + console.error(`No fixture matched ${fixtureArg}; available: ${allFixtures.map((f) => f.name).join(', ')}`) + process.exitCode = 1 + return + } + + const results: BenchResult[] = [] + for (const fixture of fixtures) { + console.log(`Running fixture ${fixture.name}...`) + const result = await runFixture(fixture, DEFAULT_OPTIONS) + results.push(result) + } + + const baseline = updateBaseline ? undefined : readBaseline() + printSummary(results, baseline) + writeBenchFile(results, updateBaseline) + + // Flush any in-memory bench telemetry to a separate file when + // COCO_BENCH is set externally; lets devs capture the per-call + // data alongside the aggregated results. + flushLlmBenchRun({ command: 'benchmark' }) +} + +main().catch((error) => { + console.error(error) + process.exitCode = 1 +}) diff --git a/package.json b/package.json index 5fd5f7a..9144808 100644 --- a/package.json +++ b/package.json @@ -39,6 +39,7 @@ "test": "npm run test:jest && npm run test:publish", "test:publish": "npm run lint && npm run build && npm run test:cli && npm pack --dry-run", "test:cli": "tsx bin/smokeCli.ts", + "bench": "tsx bin/benchmark.ts", "pretest:jest": "npm run build:info", "test:jest": "jest", "test:jest:watch": "jest --watch", diff --git a/src/lib/langchain/utils/observability.ts b/src/lib/langchain/utils/observability.ts index 4fe6b86..ae9a468 100644 --- a/src/lib/langchain/utils/observability.ts +++ b/src/lib/langchain/utils/observability.ts @@ -1,3 +1,6 @@ +import * as fs from 'node:fs' +import * as path from 'node:path' + import { Logger } from '../../utils/logger' import { TokenCounter } from '../../utils/tokenizer' @@ -15,6 +18,27 @@ export type LlmCallMetadata = { inputChunks?: number } +/** + * Bench-mode call record (#845). Captured for every LLM call when + * `COCO_BENCH=1` (or any non-`0` value) is set, then flushed to disk + * by `flushLlmBenchRun` at the end of the command. The structure stays + * narrow on purpose — fields the runner actually compares before / + * after, nothing more — so different runs with different model / + * provider mixes can still diff against the baseline cleanly.
+ */ +type LlmBenchCall = { + task: string + command?: string + provider?: string + model?: string + promptTokens?: number + elapsedMs?: number + inputDocuments?: number + inputChunks?: number +} + +const benchCalls: LlmBenchCall[] = [] + type LlmTelemetrySummary = { calls: number promptTokens: number @@ -40,10 +64,29 @@ export function estimatePromptTokens( } } +function isBenchModeActive(): boolean { + return Boolean(process.env.COCO_BENCH && process.env.COCO_BENCH !== '0') +} + +function recordBenchCall(metadata: LlmCallMetadata): void { + if (!isBenchModeActive()) return + benchCalls.push({ + task: metadata.task, + command: metadata.command, + provider: metadata.provider, + model: metadata.model, + promptTokens: metadata.promptTokens, + elapsedMs: metadata.elapsedMs, + inputDocuments: metadata.inputDocuments, + inputChunks: metadata.inputChunks, + }) +} + export function logLlmCall(logger: Logger | undefined, metadata: LlmCallMetadata): void { if (!logger) return recordLlmTelemetry(metadata) + recordBenchCall(metadata) const fields = [ `task=${metadata.task}`, @@ -113,4 +156,95 @@ export function logLlmTelemetrySummary(logger: Logger | undefined, command: stri export function resetLlmTelemetry(): void { telemetryByCommand.clear() + benchCalls.length = 0 +} + +export type LlmBenchRunStage = { + name: string + elapsedMs: number +} + +export type LlmBenchRunRecord = { + command?: string + totalElapsedMs?: number + stages?: LlmBenchRunStage[] + callCount: number + totalLlmElapsedMs: number + totalPromptTokens: number + calls: LlmBenchCall[] +} + +/** + * Build the in-memory bench run record from accumulated calls. + * Pure (no I/O) so callers can inspect or assert the contents without + * touching disk — useful in tests + the in-process benchmark runner. + */ +export function buildLlmBenchRun( + options: { + command?: string + totalElapsedMs?: number + stages?: LlmBenchRunStage[] + } = {} +): LlmBenchRunRecord { + const calls = benchCalls.slice() + return { + command: options.command, + totalElapsedMs: options.totalElapsedMs, + stages: options.stages, + callCount: calls.length, + totalLlmElapsedMs: calls.reduce((sum, call) => sum + (call.elapsedMs || 0), 0), + totalPromptTokens: calls.reduce((sum, call) => sum + (call.promptTokens || 0), 0), + calls, + } +} + +/** + * Persist the current bench run to a JSON file. No-op when bench + * mode is inactive (so production runs don't pay for disk I/O). + * + * The file path comes from `COCO_BENCH_FILE` if set, otherwise + * defaults to `<cwd>/.coco-bench.json`. Each call appends to the + * `runs` array of the file (creates the file if missing) so a single + * benchmark session that triggers multiple commands ends up with one + * file containing the full sequence. + * + * Best-effort: write failures never throw; the failure mode is + * reported back to the caller via the return value.
+ */ +export function flushLlmBenchRun( + options: { + command?: string + totalElapsedMs?: number + stages?: LlmBenchRunStage[] + } = {} +): { ok: boolean; filePath?: string; error?: string } { + if (!isBenchModeActive()) { + return { ok: false, error: 'COCO_BENCH not set' } + } + + const record = buildLlmBenchRun(options) + const filePath = path.resolve(process.env.COCO_BENCH_FILE || path.join(process.cwd(), '.coco-bench.json')) + + try { + let existing: { runs: LlmBenchRunRecord[] } = { runs: [] } + if (fs.existsSync(filePath)) { + try { + const raw = fs.readFileSync(filePath, 'utf8') + const parsed = JSON.parse(raw) + if (parsed && Array.isArray(parsed.runs)) { + existing = parsed + } + } catch { + // Corrupt or pre-existing non-bench file: overwrite with a + // fresh structure. Bench mode is opt-in; collisions here are + // a developer-only concern. + } + } + existing.runs.push(record) + fs.writeFileSync(filePath, JSON.stringify(existing, null, 2)) + benchCalls.length = 0 + return { ok: true, filePath } + } catch (error) { + return { ok: false, error: (error as Error).message } + } } diff --git a/src/lib/parsers/default/__fixtures__/index.ts b/src/lib/parsers/default/__fixtures__/index.ts new file mode 100644 index 0000000..0a883ae --- /dev/null +++ b/src/lib/parsers/default/__fixtures__/index.ts @@ -0,0 +1,219 @@ +/** + * Synthetic diff fixtures for benchmarking the diff-condensing + * pipeline (#845). Each fixture is a fully-populated `DiffNode` tree + * so callers can invoke `summarizeDiffs` directly without standing + * up a git repo. + * + * Numbers are picked to mirror the user-reported 4-minute repro + * shape: + * - tiny: early-exit path (already under budget) + * - medium: typical real commit (~25 files, ~40k tokens) + * - large: initial-commit shape (~50 files, ~100k tokens) + * + * Determinism matters more than realism: the synthetic content is + * generated from a stable seed so before/after benchmark runs + * compare the same input. + */ + +import { DiffNode, FileDiff } from '../../../types' + +/** + * Tiny pseudo-LCG — keeps the synthetic content stable across runs + * without pulling in a seedable PRNG dep. The output is a repeating + * character pattern, not statistically random; that's fine for a + * bench fixture. + */ +function seededTextBlob(lengthChars: number, seed: number): string { + const corpus = 'abcdefghijklmnopqrstuvwxyz0123456789 \n' + let state = seed >>> 0 + let out = '' + for (let i = 0; i < lengthChars; i++) { + state = (state * 1664525 + 1013904223) >>> 0 + out += corpus[state % corpus.length] + } + return out } + +/** + * Build a synthetic file diff at approximately the requested token + * count. Token estimate uses chars/4, which is rough but consistent + * with how tiktoken behaves for prose-like content; the fixture's + * `tokenCount` carries this estimate, while the runner still hands + * the real token counter to `summarizeDiffs`, so telemetry token + * counts come from the real tokenizer.
+ */ +function buildFileDiff(file: string, approxTokens: number, seed: number): FileDiff { + const chars = approxTokens * 4 + const header = `diff --git a/${file} b/${file}\n--- a/${file}\n+++ b/${file}\n@@ -1,1 +1,${Math.max(1, Math.floor(approxTokens / 4))} @@\n` + const body = seededTextBlob(chars, seed) + .split('\n') + .map((line) => `+${line}`) + .join('\n') + return { + file, + diff: header + body, + summary: '', + tokenCount: approxTokens, + } +} + +type FixtureSpec = { + name: string + files: Array<{ path: string; tokens: number }> +} + +const TINY_SPEC: FixtureSpec = { + name: 'tiny', + files: [ + { path: 'src/index.ts', tokens: 200 }, + { path: 'src/util.ts', tokens: 150 }, + { path: 'README.md', tokens: 300 }, + { path: 'package.json', tokens: 80 }, + { path: 'tsconfig.json', tokens: 60 }, + ], +} + +const MEDIUM_SPEC: FixtureSpec = { + name: 'medium', + files: [ + { path: 'src/api.ts', tokens: 3500 }, + { path: 'src/auth.ts', tokens: 2400 }, + { path: 'src/cli.ts', tokens: 4800 }, + { path: 'src/parser.ts', tokens: 2900 }, + { path: 'src/utils/http.ts', tokens: 1200 }, + { path: 'src/utils/format.ts', tokens: 800 }, + { path: 'src/utils/logger.ts', tokens: 600 }, + { path: 'tests/api.test.ts', tokens: 1800 }, + { path: 'tests/auth.test.ts', tokens: 1400 }, + { path: 'tests/parser.test.ts', tokens: 1600 }, + { path: 'tests/utils/http.test.ts', tokens: 700 }, + { path: 'tests/fixtures/sample.json', tokens: 500 }, + { path: 'docs/ARCHITECTURE.md', tokens: 2300 }, + { path: 'docs/API.md', tokens: 1900 }, + { path: 'docs/CONTRIBUTING.md', tokens: 1100 }, + { path: 'README.md', tokens: 3000 }, + { path: 'CHANGELOG.md', tokens: 1800 }, + { path: '.github/workflows/ci.yml', tokens: 600 }, + { path: '.github/workflows/release.yml', tokens: 900 }, + { path: '.github/ISSUE_TEMPLATE/bug.md', tokens: 400 }, + { path: 'package.json', tokens: 700 }, + { path: 'tsconfig.json', tokens: 200 }, + { path: '.gitignore', tokens: 150 }, + { path: 'LICENSE', tokens: 300 }, + { path: 'pyproject.toml', tokens: 600 }, + ], +} + +const LARGE_SPEC: FixtureSpec = { + name: 'large', + files: [ + // Mirror of the user's 43-file initial commit shape, scaled up + // a bit (50 files / ~100k tokens) so we have headroom for both + // pre-process and consolidation phases to fire heavily. 
+ { path: 'humble_bundle_keys/api.py', tokens: 4400 }, + { path: 'humble_bundle_keys/auth.py', tokens: 2100 }, + { path: 'humble_bundle_keys/cli.py', tokens: 7600 }, + { path: 'humble_bundle_keys/diagnose.py', tokens: 6100 }, + { path: 'humble_bundle_keys/scraper.py', tokens: 5200 }, + { path: 'humble_bundle_keys/choice.py', tokens: 4500 }, + { path: 'humble_bundle_keys/browser_choice.py', tokens: 5500 }, + { path: 'humble_bundle_keys/exporter.py', tokens: 1300 }, + { path: 'humble_bundle_keys/models.py', tokens: 700 }, + { path: 'humble_bundle_keys/_browser_fetch.py', tokens: 1000 }, + { path: 'humble_bundle_keys/_orders_cache.py', tokens: 1200 }, + { path: 'humble_bundle_keys/__init__.py', tokens: 110 }, + { path: 'humble_bundle_keys/__main__.py', tokens: 110 }, + { path: 'tests/RUNBOOK.md', tokens: 1900 }, + { path: 'tests/test_api_parser.py', tokens: 1400 }, + { path: 'tests/test_browser_choice.py', tokens: 1200 }, + { path: 'tests/test_browser_fetch.py', tokens: 1100 }, + { path: 'tests/test_choice.py', tokens: 3000 }, + { path: 'tests/test_diagnose_sanitiser.py', tokens: 2300 }, + { path: 'tests/test_exporter.py', tokens: 1700 }, + { path: 'tests/test_parsers.py', tokens: 600 }, + { path: 'tests/__init__.py', tokens: 40 }, + { path: 'tests/fixtures/choice_claim/README.md', tokens: 400 }, + { path: 'tests/fixtures/choice_claim/analytics_get_game.json', tokens: 500 }, + { path: 'tests/fixtures/choice_claim/analytics_tile_click.json', tokens: 500 }, + { path: 'tests/fixtures/choice_claim/choosecontent.json', tokens: 600 }, + { path: 'tests/fixtures/choice_claim/redeemkey.json', tokens: 600 }, + { path: 'docs/ARCHITECTURE.md', tokens: 2300 }, + { path: 'docs/CHOICE_CLAIM_SPEC.md', tokens: 3900 }, + { path: 'docs/WHATS_CLAIMABLE.md', tokens: 1300 }, + { path: 'README.md', tokens: 3900 }, + { path: 'CHANGELOG.md', tokens: 3800 }, + { path: 'CONTRIBUTING.md', tokens: 1200 }, + { path: 'SECURITY.md', tokens: 1000 }, + { path: 'LICENSE', tokens: 300 }, + { path: 'pyproject.toml', tokens: 600 }, + { path: '.gitignore', tokens: 700 }, + { path: '.github/ISSUE_TEMPLATE/bug_report.md', tokens: 400 }, + { path: '.github/ISSUE_TEMPLATE/feature_request.md', tokens: 250 }, + { path: '.github/ISSUE_TEMPLATE/selector_broken.md', tokens: 500 }, + { path: '.github/ISSUE_TEMPLATE/config.yml', tokens: 200 }, + { path: '.github/workflows/ci.yml', tokens: 600 }, + { path: '.github/workflows/release.yml', tokens: 900 }, + { path: 'src/feature/a.ts', tokens: 1400 }, + { path: 'src/feature/b.ts', tokens: 1100 }, + { path: 'src/feature/c.ts', tokens: 900 }, + { path: 'src/feature/d.ts', tokens: 800 }, + { path: 'src/feature/e.ts', tokens: 700 }, + { path: 'src/feature/utils.ts', tokens: 600 }, + { path: 'src/feature/types.ts', tokens: 400 }, + ], +} + +/** + * Convert a flat fixture spec into a nested DiffNode tree, grouping + * by directory path. Mirrors `createDiffTree`'s behavior on real + * file lists. + */ +function buildDiffNode(spec: FixtureSpec): DiffNode { + const root: DiffNode = { path: '/', diffs: [], children: [] } + const dirIndex = new Map([['/', root]]) + + spec.files.forEach((file, index) => { + const segments = file.path.split('/') + const fileName = segments.pop() as string + const dirSegments = segments + + let node = root + let pathSoFar = '' + for (const segment of dirSegments) { + pathSoFar = pathSoFar ? 
`${pathSoFar}/${segment}` : segment + const cached = dirIndex.get(pathSoFar) + if (cached) { + node = cached + continue + } + const child: DiffNode = { path: segment, diffs: [], children: [] } + node.children.push(child) + dirIndex.set(pathSoFar, child) + node = child + } + + node.diffs.push(buildFileDiff(`${dirSegments.join('/')}${dirSegments.length ? '/' : ''}${fileName}`, file.tokens, index + 1)) + }) + + return root +} + +export type DiffFixture = { + name: string + fileCount: number + approxTokens: number + rootNode: DiffNode +} + +function asFixture(spec: FixtureSpec): DiffFixture { + return { + name: spec.name, + fileCount: spec.files.length, + approxTokens: spec.files.reduce((sum, file) => sum + file.tokens, 0), + rootNode: buildDiffNode(spec), + } +} + +export const tinyFixture: DiffFixture = asFixture(TINY_SPEC) +export const mediumFixture: DiffFixture = asFixture(MEDIUM_SPEC) +export const largeFixture: DiffFixture = asFixture(LARGE_SPEC) + +export const allFixtures: DiffFixture[] = [tinyFixture, mediumFixture, largeFixture]
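As a usage sketch (not part of the patch above): because `buildLlmBenchRun` is pure and `logLlmCall` already routes every call through `recordBenchCall`, the telemetry buffer can be exercised in an ordinary Jest test without touching disk. The test file path and the metadata values below are hypothetical, and the sketch assumes that `task` plus the optional numeric fields are enough to satisfy `LlmCallMetadata`:

```ts
// Hypothetical test, e.g. src/lib/langchain/utils/__tests__/observability.bench.test.ts
import {
  resetLlmTelemetry,
  buildLlmBenchRun,
  logLlmCall,
} from '../observability'
import { Logger } from '../../../utils/logger'

describe('bench telemetry buffer', () => {
  it('aggregates calls recorded while COCO_BENCH is active', () => {
    process.env.COCO_BENCH = '1'
    resetLlmTelemetry()

    // Same silent-logger pattern the bench runner uses.
    const logger = new Logger({ verbose: false } as never)

    // Illustrative metadata; task names and numbers are made up.
    logLlmCall(logger, { task: 'summarize', promptTokens: 1200, elapsedMs: 800 })
    logLlmCall(logger, { task: 'consolidate', promptTokens: 300, elapsedMs: 200 })

    const run = buildLlmBenchRun({ command: 'test' })
    expect(run.callCount).toBe(2)
    expect(run.totalPromptTokens).toBe(1500)
    expect(run.totalLlmElapsedMs).toBe(1000)
  })
})
```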