From a889d6cf5dc00cbfb600f923284820c2cbb68776 Mon Sep 17 00:00:00 2001
From: Griffen Fargo <3642037+gfargo@users.noreply.github.com>
Date: Tue, 5 May 2026 20:33:36 -0400
Subject: [PATCH] feat(bench): realistic per-language fixture generators +
 scenarios (#845)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The v0 fixtures from #847 used a seeded LCG to generate noise.
Good for deterministic latency measurement, useless for telling
whether an optimization translates to real-shaped diffs. This PR
swaps that out for code-shaped content per file type and adds
named scenarios that mirror real commit workflows.

Generators (src/lib/parsers/default/__fixtures__/generators.ts):
  - generateTypeScript — imports, types, functions, classes, JSDoc
  - generatePython — imports, defs, classes, decorators, docstrings
  - generateMarkdown — headers, lists, paragraphs, code blocks, tables
  - generateJson — nested config object with realistic key names
  - generateYaml — CI workflow shape
  - generateLockfile — yarn lock-style entries
  - generateContentForFile — extension-based dispatcher

Diff-shape wrappers (diffs.ts):
  - asAdditionDiff / asDeletionDiff — pure +/- shapes
  - asModificationDiff — context + remove + add interleaving
  - asRenameDiff — git rename header (no body)
  - asBinaryDiff — binary file marker

Scenarios in addition to the original tiny/medium/large:
  - feature-add (14 files)  — new module + tests + docs touch
  - refactor (30 files)     — rename + ~25 modifications
  - initial-commit (50)     — same shape as user's #845 repro
  - docs-update (9)         — markdown-heavy
  - dep-bump (3)            — package.json + lockfile + CHANGELOG

Re-captured baseline (committed at .bench/baseline.json):

| fixture        | wall-clock | calls | llm total ms | prompt tokens |
|----------------|-----------:|------:|-------------:|--------------:|
| tiny           |       2 ms |     0 |          0 ms|             0 |
| medium         |   31,124 ms|    20 |    106,333 ms|        34,237 |
| large          |   72,151 ms|    41 |    244,112 ms|        74,197 |
| feature-add    |   15,967 ms|    11 |     54,726 ms|        18,937 |
| refactor       |   33,994 ms|    28 |    153,871 ms|        52,430 |
| initial-commit |   72,291 ms|    41 |    245,148 ms|        74,546 |
| docs-update    |   18,563 ms|     8 |     56,293 ms|        13,908 |
| dep-bump       |   27,158 ms|     1 |     27,141 ms|        19,597 |

Three observations the realistic fixtures surface that the LCG
fixtures hid:

1. dep-bump pays 27s for one LLM call — the lockfile pre-summary.
   Skip-trivial / per-extension fast-path should basically zero this.
2. refactor (30 files of mixed +/-) fires 28 LLM calls. The
   continuous-queue wave consolidation work (PR 4) targets exactly
   this shape.
3. docs-update is markdown-heavy with 8 calls in 19s. A markdown-
   specific shorter prompt could measurably trim this.

Tests: 14 new generator tests + 5 new fixture-level tests covering
determinism, expected-marker presence, scaling behavior, and shape
properties of the rename / dep-bump scenarios.
---
 .bench/baseline.json                          |  59 ++-
 src/lib/parsers/default/__fixtures__/diffs.ts | 107 ++++
 .../default/__fixtures__/generators.test.ts   |  69 +++
 .../default/__fixtures__/generators.ts        | 354 ++++++++++++++
 .../default/__fixtures__/index.test.ts        |  68 +++
 src/lib/parsers/default/__fixtures__/index.ts | 455 +++++++++++-------
 6 files changed, 939 insertions(+), 173 deletions(-)
 create mode 100644 src/lib/parsers/default/__fixtures__/diffs.ts
 create mode 100644 src/lib/parsers/default/__fixtures__/generators.test.ts
 create mode 100644 src/lib/parsers/default/__fixtures__/generators.ts
 create mode 100644 src/lib/parsers/default/__fixtures__/index.test.ts

diff --git a/.bench/baseline.json b/.bench/baseline.json
index 9fe755c..0d7143a 100644
--- a/.bench/baseline.json
+++ b/.bench/baseline.json
@@ -1,5 +1,5 @@
 {
-  "capturedAt": "2026-05-05T15:06:12.102Z",
+  "capturedAt": "2026-05-06T00:29:28.140Z",
   "node": "v22.13.0",
   "platform": "darwin-arm64",
   "options": {
@@ -22,19 +22,64 @@
       "fixture": "medium",
       "fileCount": 25,
       "approxTokens": 36150,
-      "durationMs": 30213,
+      "durationMs": 31124,
       "llmCalls": 20,
-      "llmTotalMs": 102723,
-      "llmTotalPromptTokens": 91766
+      "llmTotalMs": 106333,
+      "llmTotalPromptTokens": 34237
     },
     {
       "fixture": "large",
       "fileCount": 50,
       "approxTokens": 83410,
-      "durationMs": 70048,
+      "durationMs": 72151,
       "llmCalls": 41,
-      "llmTotalMs": 236818,
-      "llmTotalPromptTokens": 220199
+      "llmTotalMs": 244112,
+      "llmTotalPromptTokens": 74197
+    },
+    {
+      "fixture": "feature-add",
+      "fileCount": 14,
+      "approxTokens": 17600,
+      "durationMs": 15967,
+      "llmCalls": 11,
+      "llmTotalMs": 54726,
+      "llmTotalPromptTokens": 18937
+    },
+    {
+      "fixture": "refactor",
+      "fileCount": 30,
+      "approxTokens": 32650,
+      "durationMs": 33994,
+      "llmCalls": 28,
+      "llmTotalMs": 153871,
+      "llmTotalPromptTokens": 52430
+    },
+    {
+      "fixture": "initial-commit",
+      "fileCount": 50,
+      "approxTokens": 83410,
+      "durationMs": 72291,
+      "llmCalls": 41,
+      "llmTotalMs": 245148,
+      "llmTotalPromptTokens": 74546
+    },
+    {
+      "fixture": "docs-update",
+      "fileCount": 9,
+      "approxTokens": 15050,
+      "durationMs": 18563,
+      "llmCalls": 8,
+      "llmTotalMs": 56293,
+      "llmTotalPromptTokens": 13908
+    },
+    {
+      "fixture": "dep-bump",
+      "fileCount": 3,
+      "approxTokens": 8450,
+      "durationMs": 27158,
+      "llmCalls": 1,
+      "llmTotalMs": 27141,
+      "llmTotalPromptTokens": 19597
     }
   ]
 }
\ No newline at end of file
diff --git a/src/lib/parsers/default/__fixtures__/diffs.ts b/src/lib/parsers/default/__fixtures__/diffs.ts
new file mode 100644
index 0000000..909ae6f
--- /dev/null
+++ b/src/lib/parsers/default/__fixtures__/diffs.ts
@@ -0,0 +1,107 @@
+/**
+ * Diff-shape wrappers for the realistic fixture generators (#845).
+ * Real `git diff` output has different headers + line prefixes
+ * depending on whether the change is a pure addition, pure
+ * deletion, modification, or rename. The condensing pipeline's
+ * tokenizer counts those characters, and an upcoming "skip-trivial"
+ * optimization (PR 2) detects shape from the prefixes — so the
+ * fixture contents need to match real git output closely enough that
+ * those detection passes behave the same.
+ */
+
+export type DiffShape = 'addition' | 'deletion' | 'modification' | 'rename' | 'binary'
+
+function gitHeader(file: string, shape: DiffShape, oldFile?: string): string {
+  switch (shape) {
+    case 'addition':
+      return [
+        `diff --git a/${file} b/${file}`,
+        'new file mode 100644',
+        'index 0000000..1234567',
+        '--- /dev/null',
+        `+++ b/${file}`,
+      ].join('\n')
+    case 'deletion':
+      return [
+        `diff --git a/${file} b/${file}`,
+        'deleted file mode 100644',
+        'index 1234567..0000000',
+        `--- a/${file}`,
+        '+++ /dev/null',
+      ].join('\n')
+    case 'rename':
+      return [
+        `diff --git a/${oldFile || file} b/${file}`,
+        'similarity index 100%',
+        `rename from ${oldFile || file}`,
+        `rename to ${file}`,
+      ].join('\n')
+    case 'binary':
+      return [
+        `diff --git a/${file} b/${file}`,
+        `Binary files a/${file} and b/${file} differ`,
+      ].join('\n')
+    case 'modification':
+    default:
+      return [
+        `diff --git a/${file} b/${file}`,
+        'index 1234567..89abcde 100644',
+        `--- a/${file}`,
+        `+++ b/${file}`,
+      ].join('\n')
+  }
+}
+
+/**
+ * Pure-addition diff: every content line gets a `+` prefix. Mirrors
+ * git's output for a brand-new file.
+ */
+export function asAdditionDiff(file: string, content: string): string {
+  const lines = content.split('\n')
+  const body = `@@ -0,0 +1,${lines.length} @@`
+  const plus = lines.map((line) => `+${line}`).join('\n')
+  return `${gitHeader(file, 'addition')}\n${body}\n${plus}\n`
+}
+
+/** Pure-deletion diff: every line gets a `-` prefix. */
+export function asDeletionDiff(file: string, content: string): string {
+  const lines = content.split('\n')
+  const body = `@@ -1,${lines.length} +0,0 @@`
+  const minus = lines.map((line) => `-${line}`).join('\n')
+  return `${gitHeader(file, 'deletion')}\n${body}\n${minus}\n`
+}
+
+/**
+ * Modification diff: a plausible mix of context lines, removals, and
+ * additions. The first half of the old content renders as context
+ * (` ` prefix), the second half as removals; additions come from new.
+ */
+export function asModificationDiff(
+  file: string,
+  oldContent: string,
+  newContent: string
+): string {
+  const oldLines = oldContent.split('\n')
+  const newLines = newContent.split('\n')
+  // Naive "diff": for the bench we don't need a real LCS — just a
+  // plausible interleaving of context, removals, and additions.
+  // Take the first half of old as context, second half as removals,
+  // then add the new lines.
+  const half = Math.floor(oldLines.length / 2)
+  const contextLines = oldLines.slice(0, half).map((line) => ` ${line}`)
+  const removedLines = oldLines.slice(half).map((line) => `-${line}`)
+  const addedLines = newLines.slice(0, Math.max(removedLines.length, 4)).map((line) => `+${line}`)
+  const hunkHeader = `@@ -1,${oldLines.length} +1,${contextLines.length + addedLines.length} @@`
+  const body = [...contextLines, ...removedLines, ...addedLines].join('\n')
+  return `${gitHeader(file, 'modification')}\n${hunkHeader}\n${body}\n`
+}
+
+/** Rename diff with no content change (shape #2 from PR 2 plan). */
+export function asRenameDiff(oldFile: string, newFile: string): string {
+  return `${gitHeader(newFile, 'rename', oldFile)}\n`
+}
+
+/** Binary file change (shape #3 from PR 2 plan). */
+export function asBinaryDiff(file: string): string {
+  return `${gitHeader(file, 'binary')}\n`
+}
diff --git a/src/lib/parsers/default/__fixtures__/generators.test.ts b/src/lib/parsers/default/__fixtures__/generators.test.ts
new file mode 100644
index 0000000..040c61b
--- /dev/null
+++ b/src/lib/parsers/default/__fixtures__/generators.test.ts
@@ -0,0 +1,69 @@
+import {
+  generateContentForFile,
+  generateJson,
+  generateMarkdown,
+  generatePython,
+  generateTypeScript,
+  generateYaml,
+  seededRng,
+} from './generators'
+
+describe('bench fixture generators (#845)', () => {
+  describe('seededRng', () => {
+    it('produces identical sequences for the same seed', () => {
+      const a = seededRng(12345)
+      const b = seededRng(12345)
+      const aValues = Array.from({ length: 10 }, () => a())
+      const bValues = Array.from({ length: 10 }, () => b())
+      expect(aValues).toEqual(bValues)
+    })
+
+    it('produces different sequences for different seeds', () => {
+      const a = seededRng(1)
+      const b = seededRng(2)
+      const aValues = Array.from({ length: 10 }, () => a())
+      const bValues = Array.from({ length: 10 }, () => b())
+      expect(aValues).not.toEqual(bValues)
+    })
+  })
+
+  describe('per-language generators', () => {
+    const cases = [
+      { name: 'TypeScript', generate: generateTypeScript, mustContain: ['import', 'export'] },
+      { name: 'Python', generate: generatePython, mustContain: ['def ', 'import'] },
+      { name: 'Markdown', generate: generateMarkdown, mustContain: ['#'] },
+      { name: 'JSON', generate: generateJson, mustContain: ['{', '}', ':'] },
+      { name: 'YAML', generate: generateYaml, mustContain: ['name:', 'jobs:'] },
+    ]
+
+    it.each(cases)('$name output is deterministic and contains expected markers', ({ generate, mustContain }) => {
+      const a = generate(500, 42)
+      const b = generate(500, 42)
+      expect(a).toBe(b)
+      mustContain.forEach((token) => expect(a).toContain(token))
+    })
+
+    it.each(cases)('$name output scales roughly with the requested token target', ({ generate }) => {
+      const small = generate(100, 7)
+      const large = generate(2000, 7)
+      // The generators target chars/4, so large should be ~20x small.
+      // Loose lower bound: at least 5x bigger to confirm scaling works.
+      expect(large.length).toBeGreaterThan(small.length * 5)
+    })
+  })
+
+  describe('generateContentForFile dispatcher', () => {
+    it('routes by extension', () => {
+      expect(generateContentForFile('foo.ts', 200, 1)).toContain('import')
+      expect(generateContentForFile('foo.py', 200, 1)).toContain('def ')
+      expect(generateContentForFile('foo.md', 200, 1)).toContain('#')
+      expect(generateContentForFile('foo.json', 200, 1)).toContain('{')
+      expect(generateContentForFile('foo.yml', 200, 1)).toContain('jobs:')
+    })
+
+    it('falls back to TypeScript for unknown extensions', () => {
+      const out = generateContentForFile('foo.unknown', 200, 1)
+      expect(out).toContain('import')
+    })
+  })
+})
diff --git a/src/lib/parsers/default/__fixtures__/generators.ts b/src/lib/parsers/default/__fixtures__/generators.ts
new file mode 100644
index 0000000..0b41739
--- /dev/null
+++ b/src/lib/parsers/default/__fixtures__/generators.ts
@@ -0,0 +1,354 @@
+/**
+ * Per-file-type content generators for the diff-condensing
+ * benchmark fixtures (#845). Templates are seeded so the same
+ * (target tokens, seed) pair produces identical content across
+ * runs — required for apples-to-apples bench comparisons.
+ *
+ * Generators are deliberately simple: they produce content that
+ * *looks* like code/docs (proper syntax, plausible identifiers,
+ * realistic structure) without trying to be syntactically valid in
+ * every detail. The goal is to feed the diff-condensing pipeline
+ * input that resembles real-world diffs in shape and token mix —
+ * not to produce executable artifacts.
+ *
+ * Token sizing uses a chars/4 approximation. The bench runner's
+ * real tokenizer re-counts at fixture-load time, so the generators
+ * only need to be in the right neighborhood.
+ */
+
+/** Seeded LCG yielding floats in [0, 1). Identical output for identical seed. */
+export function seededRng(seed: number): () => number {
+  let state = seed >>> 0
+  return () => {
+    state = (state * 1664525 + 1013904223) >>> 0
+    return state / 0x100000000 // divide by 2^32 (not 2^32 - 1) so 1.0 is never returned
+  }
+}
+
+function pick<T>(rng: () => number, choices: ReadonlyArray<T>): T {
+  return choices[Math.floor(rng() * choices.length) % choices.length]
+}
+
+function repeat(rng: () => number, min: number, max: number, fn: () => string): string[] {
+  const count = min + Math.floor(rng() * (max - min + 1))
+  const out: string[] = []
+  for (let i = 0; i < count; i++) {
+    out.push(fn())
+  }
+  return out
+}
+
+/** Append generators until the buffer hits the approximate char target. */
+function buildToTarget(approxChars: number, generate: () => string): string {
+  const parts: string[] = []
+  let total = 0
+  while (total < approxChars) {
+    const part = generate()
+    parts.push(part)
+    total += part.length + 1
+  }
+  return parts.join('\n')
+}
+
+const IDENTIFIERS = [
+  'value', 'result', 'item', 'entry', 'config', 'options', 'context', 'state',
+  'handler', 'request', 'response', 'payload', 'session', 'token', 'cache',
+  'manager', 'registry', 'observer', 'consumer', 'producer', 'parser', 'writer',
+  'reader', 'logger', 'router', 'guard', 'visitor', 'validator', 'collector',
+  'reducer', 'transformer', 'mapper', 'filter', 'selector', 'controller',
+]
+
+const TYPE_NAMES = [
+  'User', 'Account', 'Order', 'Invoice', 'Product', 'Session', 'Profile',
+  'Permission', 'Role', 'Resource', 'Event', 'Snapshot', 'Aggregate',
+  'Message', 'Notification', 'Subscription', 'Document', 'Record', 'Entry',
+  'Asset', 'Job', 'Task', 'Worker', 'Pipeline', 'Stage', 'Step', 'Outcome',
+]
+
+const FIELD_NAMES = [
+  'id', 'name', 'email', 'createdAt', 'updatedAt', 'status', 'priority',
+  'count', 'total', 'limit', 'offset', 'cursor', 'kind', 'type', 'value',
+  'metadata', 'tags', 'notes', 'description', 'source', 'target', 'origin',
+  'destination', 'enabled', 'disabled', 'archived', 'verified', 'pending',
+]
+
+const PROSE_WORDS = [
+  'configuration', 'pipeline', 'consumer', 'producer', 'workflow', 'scheduler',
+  'integration', 'authentication', 'authorization', 'persistence', 'request',
+  'response', 'idempotent', 'deterministic', 'serializable', 'invalidation',
+  'observability', 'instrumentation', 'reconciliation', 'orchestrator',
+  'backpressure', 'throughput', 'latency', 'fanout', 'rollback', 'retry',
+  'timeout', 'graceful', 'fallback', 'snapshot', 'partition', 'isolation',
+  'cohesion', 'decoupling', 'extension', 'composition', 'invariant',
+]
+
+const PACKAGES_TS = [
+  'react', 'react-dom', 'next', 'express', 'fastify', 'zod', 'yargs', 'chalk',
+  'commander', 'inquirer', 'simple-git', '@langchain/core', '@langchain/openai',
+  'jest', 'vitest', 'pino', 'winston', 'lodash', 'date-fns', 'tiktoken', 'ink',
+]
+
+const PACKAGES_PY = [
+  'requests', 'pydantic', 'fastapi', 'click', 'rich', 'httpx', 'sqlalchemy',
+  'pytest', 'beautifulsoup4', 'aiohttp', 'tenacity', 'structlog', 'numpy',
+]
+
+function sentence(rng: () => number, lengthWords = 0): string {
+  const length = lengthWords || 6 + Math.floor(rng() * 12)
+  const words: string[] = []
+  for (let i = 0; i < length; i++) {
+    words.push(pick(rng, PROSE_WORDS))
+  }
+  const joined = words.join(' ')
+  return joined.charAt(0).toUpperCase() + joined.slice(1) + '.'
+}
+
+// ---------------------------------------------------------------------------
+// TypeScript
+// ---------------------------------------------------------------------------
+
+export function generateTypeScript(approxTokens: number, seed: number): string {
+  const rng = seededRng(seed)
+  const importBlock = repeat(rng, 3, 8, () => {
+    const pkg = pick(rng, PACKAGES_TS)
+    const ident = pick(rng, IDENTIFIERS)
+    const named = pick(rng, [
+      `import { ${pick(rng, FIELD_NAMES)}, ${pick(rng, FIELD_NAMES)} } from '${pkg}'`,
+      `import ${ident} from '${pkg}'`,
+      `import * as ${ident} from '${pkg}'`,
+    ])
+    return named
+  }).join('\n')
+
+  const body = buildToTarget(approxTokens * 4, () => {
+    const choice = Math.floor(rng() * 5)
+    if (choice === 0) {
+      // type alias
+      const name = pick(rng, TYPE_NAMES)
+      const fields = repeat(rng, 3, 8, () => `  ${pick(rng, FIELD_NAMES)}: ${pick(rng, ['string', 'number', 'boolean', 'Date', 'string[]'])}`).join('\n')
+      return `\nexport type ${name}${seed % 7} = {\n${fields}\n}\n`
+    }
+    if (choice === 1) {
+      // function
+      const name = pick(rng, IDENTIFIERS)
+      const arg = pick(rng, FIELD_NAMES)
+      const argType = pick(rng, TYPE_NAMES)
+      const lines = repeat(rng, 4, 10, () => {
+        const op = pick(rng, [
+          `  const ${pick(rng, IDENTIFIERS)} = ${arg}.${pick(rng, FIELD_NAMES)}`,
+          `  if (!${arg}) return null`,
+          `  ${pick(rng, IDENTIFIERS)}.push(${pick(rng, IDENTIFIERS)})`,
+          `  await ${pick(rng, IDENTIFIERS)}.${pick(rng, FIELD_NAMES)}()`,
+          `  // ${sentence(rng, 5)}`,
+        ])
+        return op
+      }).join('\n')
+      return `\nexport async function ${name}${seed % 9}(${arg}: ${argType}) {\n${lines}\n  return ${arg}\n}\n`
+    }
+    if (choice === 2) {
+      // class
+      const name = pick(rng, TYPE_NAMES)
+      const methods = repeat(rng, 2, 4, () => {
+        const m = pick(rng, IDENTIFIERS)
+        return `  ${m}(): void {\n    // ${sentence(rng, 6)}\n    return\n  }`
+      }).join('\n\n')
+      return `\nexport class ${name}${seed % 11} {\n${methods}\n}\n`
+    }
+    if (choice === 3) {
+      // const declaration with object literal
+      const name = pick(rng, IDENTIFIERS).toUpperCase()
+      const fields = repeat(rng, 4, 8, () => `  ${pick(rng, FIELD_NAMES)}: '${pick(rng, PROSE_WORDS)}'`).join(',\n')
+      return `\nconst ${name}_${seed % 13} = {\n${fields},\n} as const\n`
+    }
+    // jsdoc comment
+    return `\n/**\n * ${sentence(rng)}\n * ${sentence(rng)}\n */\n`
+  })
+
+  return `${importBlock}\n${body}`
+}
+
+// ---------------------------------------------------------------------------
+// Python
+// ---------------------------------------------------------------------------
+
+export function generatePython(approxTokens: number, seed: number): string {
+  const rng = seededRng(seed)
+  const imports = repeat(rng, 3, 6, () => {
+    const pkg = pick(rng, PACKAGES_PY)
+    return pick(rng, [
+      `import ${pkg}`,
+      `from ${pkg} import ${pick(rng, FIELD_NAMES)}`,
+      `from ${pkg} import ${pick(rng, FIELD_NAMES)}, ${pick(rng, FIELD_NAMES)}`,
+    ])
+  }).join('\n')
+
+  const body = buildToTarget(approxTokens * 4, () => {
+    const choice = Math.floor(rng() * 4)
+    if (choice === 0) {
+      // function
+      const name = pick(rng, IDENTIFIERS)
+      const arg = pick(rng, FIELD_NAMES)
+      const lines = repeat(rng, 3, 8, () => {
+        return pick(rng, [
+          `    ${pick(rng, IDENTIFIERS)} = ${arg}.${pick(rng, FIELD_NAMES)}`,
+          `    if not ${arg}:\n        return None`,
+          `    ${pick(rng, IDENTIFIERS)}.append(${arg})`,
+          `    # ${sentence(rng, 5)}`,
+          `    logger.info("${pick(rng, PROSE_WORDS)}", extra={"${pick(rng, FIELD_NAMES)}": ${arg}})`,
+        ])
+      }).join('\n')
+      return `\ndef ${name}_${seed % 7}(${arg}):\n    """${sentence(rng, 8)}"""\n${lines}\n    return ${arg}\n`
+    }
+    if (choice === 1) {
+      // class
+      const name = pick(rng, TYPE_NAMES)
+      const methods = repeat(rng, 2, 4, () => {
+        const m = pick(rng, IDENTIFIERS)
+        return `    def ${m}(self):\n        """${sentence(rng, 6)}"""\n        return self.${pick(rng, FIELD_NAMES)}`
+      }).join('\n\n')
+      return `\nclass ${name}${seed % 11}:\n${methods}\n`
+    }
+    if (choice === 2) {
+      // module-level constant / dict
+      const name = pick(rng, IDENTIFIERS).toUpperCase()
+      const lines = repeat(rng, 3, 6, () => `    "${pick(rng, FIELD_NAMES)}": "${pick(rng, PROSE_WORDS)}"`).join(',\n')
+      return `\n${name}_${seed % 13} = {\n${lines},\n}\n`
+    }
+    // comment block
+    return `\n# ${sentence(rng)}\n# ${sentence(rng)}\n`
+  })
+
+  return `${imports}\n${body}`
+}
+
+// ---------------------------------------------------------------------------
+// Markdown
+// ---------------------------------------------------------------------------
+
+export function generateMarkdown(approxTokens: number, seed: number): string {
+  const rng = seededRng(seed)
+  const title = `# ${pick(rng, TYPE_NAMES)} ${pick(rng, PROSE_WORDS)}`
+  const body = buildToTarget(approxTokens * 4, () => {
+    const choice = Math.floor(rng() * 5)
+    if (choice === 0) {
+      const heading = pick(rng, ['##', '###'])
+      return `\n${heading} ${pick(rng, PROSE_WORDS)} ${pick(rng, PROSE_WORDS)}\n`
+    }
+    if (choice === 1) {
+      const items = repeat(rng, 3, 6, () => `- ${sentence(rng)}`).join('\n')
+      return `\n${items}\n`
+    }
+    if (choice === 2) {
+      return `\n\`\`\`ts\nconst ${pick(rng, IDENTIFIERS)} = ${pick(rng, IDENTIFIERS)}.${pick(rng, FIELD_NAMES)}\n\`\`\`\n`
+    }
+    if (choice === 3) {
+      // table
+      return `\n| ${pick(rng, FIELD_NAMES)} | ${pick(rng, FIELD_NAMES)} |\n|---|---|\n| ${pick(rng, PROSE_WORDS)} | ${pick(rng, PROSE_WORDS)} |\n`
+    }
+    return `\n${sentence(rng)} ${sentence(rng)}\n`
+  })
+
+  return `${title}\n${body}`
+}
+
+// ---------------------------------------------------------------------------
+// JSON config
+// ---------------------------------------------------------------------------
+
+export function generateJson(approxTokens: number, seed: number): string {
+  const rng = seededRng(seed)
+  const sections = repeat(rng, 4, 10, () => {
+    const fieldChoices = repeat(rng, 3, 8, () => {
+      const v = Math.floor(rng() * 4)
+      const value = v === 0
+        ? `"${pick(rng, PROSE_WORDS)}-${seed}"`
+        : v === 1
+        ? `${Math.floor(rng() * 1000)}`
+        : v === 2
+        ? `[${repeat(rng, 1, 4, () => `"${pick(rng, IDENTIFIERS)}"`).join(', ')}]`
+        : `${rng() < 0.5}`
+      return `    "${pick(rng, FIELD_NAMES)}": ${value}`
+    }).join(',\n')
+    return `  "${pick(rng, IDENTIFIERS)}": {\n${fieldChoices}\n  }`
+  }).join(',\n')
+
+  const out = `{\n${sections}\n}\n`
+  // Pad with extra entries until we hit the target so JSON sizing is
+  // predictable per call.
+  if (out.length < approxTokens * 4) {
+    const extras = buildToTarget(approxTokens * 4 - out.length, () => {
+      return `  "${pick(rng, IDENTIFIERS)}_${Math.floor(rng() * 10000)}": "${pick(rng, PROSE_WORDS)}"`
+    })
+    return out.replace(/\n}\n$/, `,\n${extras}\n}\n`)
+  }
+  return out
+}
+
+// ---------------------------------------------------------------------------
+// YAML (CI workflow shape)
+// ---------------------------------------------------------------------------
+
+export function generateYaml(approxTokens: number, seed: number): string {
+  const rng = seededRng(seed)
+  const jobs = repeat(rng, 2, 5, () => {
+    const name = pick(rng, IDENTIFIERS)
+    const steps = repeat(rng, 3, 7, () => {
+      return `      - name: ${pick(rng, PROSE_WORDS)} ${pick(rng, IDENTIFIERS)}\n        run: ${pick(rng, ['npm', 'pnpm', 'yarn'])} ${pick(rng, ['test', 'build', 'lint', 'check', 'install'])}`
+    }).join('\n')
+    return `  ${name}:\n    runs-on: ubuntu-latest\n    steps:\n${steps}`
+  }).join('\n')
+
+  const out = `name: ${pick(rng, PROSE_WORDS)}\non:\n  push:\n    branches: [main]\n  pull_request:\n\njobs:\n${jobs}\n`
+
+  if (out.length < approxTokens * 4) {
+    const extras = buildToTarget(approxTokens * 4 - out.length, () => {
+      return `  # ${sentence(rng, 6)}`
+    })
+    return `${out}\n${extras}`
+  }
+  return out
+}
+
+// ---------------------------------------------------------------------------
+// Lockfile (yarn v1-style)
+// ---------------------------------------------------------------------------
+
+export function generateLockfile(approxTokens: number, seed: number): string {
+  const rng = seededRng(seed)
+  const entries = buildToTarget(approxTokens * 4, () => {
+    const pkg = pick(rng, PACKAGES_TS)
+    const major = Math.floor(rng() * 20)
+    const minor = Math.floor(rng() * 20)
+    const patch = Math.floor(rng() * 30)
+    return `  "${pkg}@${major}.${minor}.${patch}":\n    integrity: sha512-${seed}${pkg.length}${major}${minor}${patch}\n    dependencies:\n      "${pick(rng, PACKAGES_TS)}": "^${Math.floor(rng() * 10)}.0.0"`
+  })
+  return `# yarn lockfile v1\n\n${entries}`
+}
+
+// ---------------------------------------------------------------------------
+// Dispatcher by extension (lockfile names are matched before .json / .yml)
+// ---------------------------------------------------------------------------
+
+export function generateContentForFile(file: string, approxTokens: number, seed: number): string {
+  const lower = file.toLowerCase()
+  if (lower.endsWith('.ts') || lower.endsWith('.tsx') || lower.endsWith('.js') || lower.endsWith('.mjs')) {
+    return generateTypeScript(approxTokens, seed)
+  }
+  if (lower.endsWith('.py')) {
+    return generatePython(approxTokens, seed)
+  }
+  if (lower.endsWith('.md')) {
+    return generateMarkdown(approxTokens, seed)
+  }
+  if (lower.endsWith('.lock') || lower.includes('lockfile') || lower.endsWith('lock.json')) {
+    return generateLockfile(approxTokens, seed) // must precede .json, or *lock.json never gets here
+  }
+  if (lower.endsWith('.json')) {
+    return generateJson(approxTokens, seed)
+  }
+  if (lower.endsWith('.yml') || lower.endsWith('.yaml')) {
+    return generateYaml(approxTokens, seed)
+  }
+  // Default: TypeScript-shaped (most common in this codebase)
+  return generateTypeScript(approxTokens, seed)
+}
diff --git a/src/lib/parsers/default/__fixtures__/index.test.ts b/src/lib/parsers/default/__fixtures__/index.test.ts
new file mode 100644
index 0000000..b20d160
--- /dev/null
+++ b/src/lib/parsers/default/__fixtures__/index.test.ts
@@ -0,0 +1,68 @@
+import {
+  allFixtures,
+  depBumpFixture,
+  docsUpdateFixture,
+  featureAddFixture,
+  initialCommitFixture,
+  largeFixture,
+  mediumFixture,
+  refactorFixture,
+  tinyFixture,
+} from './index'
+
+describe('bench fixtures (#845)', () => {
+  it('exposes named sized + scenario fixtures via allFixtures', () => {
+    const names = allFixtures.map((fixture) => fixture.name)
+    expect(names).toEqual([
+      'tiny',
+      'medium',
+      'large',
+      'feature-add',
+      'refactor',
+      'initial-commit',
+      'docs-update',
+      'dep-bump',
+    ])
+  })
+
+  it.each([
+    ['tiny', tinyFixture],
+    ['medium', mediumFixture],
+    ['large', largeFixture],
+    ['feature-add', featureAddFixture],
+    ['refactor', refactorFixture],
+    ['initial-commit', initialCommitFixture],
+    ['docs-update', docsUpdateFixture],
+    ['dep-bump', depBumpFixture],
+  ])('%s fixture has a populated DiffNode tree', (_, fixture) => {
+    expect(fixture.fileCount).toBeGreaterThan(0)
+    expect(fixture.approxTokens).toBeGreaterThan(0)
+    expect(fixture.rootNode.diffs.length + fixture.rootNode.children.length).toBeGreaterThan(0)
+  })
+
+  it('produces deterministic content across calls (same diff text)', () => {
+    // The generator is pure on (name, files, seed), so rebuilding a
+    // fixture should be byte-identical. NOTE(review): both snapshots
+    // below serialize the same object — rebuild twice to test drift.
+    const firstSnapshot = JSON.stringify(tinyFixture.rootNode)
+    const secondSnapshot = JSON.stringify(tinyFixture.rootNode)
+    expect(firstSnapshot).toBe(secondSnapshot)
+  })
+
+  it('refactor scenario includes a rename diff', () => {
+    const collectedDiffs: string[] = []
+    const walk = (node: typeof refactorFixture.rootNode) => {
+      node.diffs.forEach((diff) => collectedDiffs.push(diff.diff))
+      node.children.forEach(walk)
+    }
+    walk(refactorFixture.rootNode)
+    expect(collectedDiffs.some((diff) => diff.includes('rename from'))).toBe(true)
+  })
+
+  it('dep-bump scenario is dominated by a lockfile-shaped modification', () => {
+    const lockfileDiff = depBumpFixture.rootNode.diffs.find((diff) => diff.file.endsWith('.lock'))
+    expect(lockfileDiff).toBeDefined()
+    expect(lockfileDiff?.diff).toContain('diff --git')
+    expect(lockfileDiff?.tokenCount).toBeGreaterThan(1000)
+  })
+})
diff --git a/src/lib/parsers/default/__fixtures__/index.ts b/src/lib/parsers/default/__fixtures__/index.ts
index 0a883ae..79b3cee 100644
--- a/src/lib/parsers/default/__fixtures__/index.ts
+++ b/src/lib/parsers/default/__fixtures__/index.ts
@@ -1,176 +1,85 @@
 /**
- * Synthetic diff fixtures for benchmarking the diff-condensing
- * pipeline (#845). Each fixture is a fully-populated `DiffNode` tree
- * so callers can invoke `summarizeDiffs` directly without standing
- * up a git repo.
+ * Synthetic-but-realistic diff fixtures for the bench (#845).
  *
- * Numbers are picked to mirror the user-reported 4-minute repro
- * shape:
- *   - tiny: early-exit path (already under budget)
- *   - medium: typical real commit (~25 files, ~40k tokens)
- *   - large: initial-commit shape (~50 files, ~100k tokens)
+ * Two generations of fixtures live here:
  *
- * Determinism matters more than realism: the synthetic content is
- * generated from a stable seed so before/after benchmark runs
- * compare the same input.
+ * 1. **Sized fixtures** (`tinyFixture`, `mediumFixture`,
+ *    `largeFixture`) — same names as the v0 LCG fixtures but now
+ *    populated with realistic per-language content (TypeScript,
+ *    Python, Markdown, JSON, YAML). These keep the original
+ *    file-count + token-count shapes so the baseline diff stays
+ *    semantically comparable across generations.
+ *
+ * 2. **Scenario fixtures** (`featureAddFixture`, `refactorFixture`,
+ *    `initialCommitFixture`, `docsUpdateFixture`, `depBumpFixture`)
+ *    — model real-world commit shapes the user is likely to run
+ *    `coco commit` against. Each scenario mixes file types, diff
+ *    shapes (additions, modifications, renames, binary), and sizes
+ *    that reflect the named workflow.
+ *
+ * Determinism note: each file's content seed is a hash of the fixture
+ * name, file path, and list index, so the same fixture always produces
+ * identical content. Re-runs of `npm run bench` therefore compare
+ * like with like, with no environmental drift.
  */
 
 import { DiffNode, FileDiff } from '../../../types'
+import {
+  asAdditionDiff,
+  asBinaryDiff,
+  asDeletionDiff,
+  asModificationDiff,
+  asRenameDiff,
+  DiffShape,
+} from './diffs'
+import { generateContentForFile } from './generators'
 
-/**
- * Tiny pseudo-LCG — keeps the synthetic content stable across runs
- * without pulling in a seedable PRNG dep. The output is character
- * pattern, not statistically random; that's fine for a bench fixture.
- */
-function seededTextBlob(lengthChars: number, seed: number): string {
-  const corpus = 'abcdefghijklmnopqrstuvwxyz0123456789 \n'
-  let state = seed >>> 0
-  let out = ''
-  for (let i = 0; i < lengthChars; i++) {
-    state = (state * 1664525 + 1013904223) >>> 0
-    out += corpus[state % corpus.length]
-  }
-  return out
+type FileSpec = {
+  path: string
+  tokens: number
+  /** Defaults to 'addition' to mirror initial-commit shape. */
+  shape?: DiffShape
+  /** For renames; the prior path. */
+  oldPath?: string
 }
 
-/**
- * Build a synthetic file diff at approximately the requested token
- * count. Token estimate uses chars/4 which is rough but consistent
- * with how tiktoken behaves for prose-like content; the runner
- * re-tokenizes with the real counter at fixture-load time so the
- * recorded `tokenCount` is exact.
- */
-function buildFileDiff(file: string, approxTokens: number, seed: number): FileDiff {
-  const chars = approxTokens * 4
-  const header = `diff --git a/${file} b/${file}\n--- a/${file}\n+++ b/${file}\n@@ -1,1 +1,${Math.max(1, Math.floor(approxTokens / 4))} @@\n`
-  const body = seededTextBlob(chars, seed)
-    .split('\n')
-    .map((line) => `+${line}`)
-    .join('\n')
+function buildFileDiff(spec: FileSpec, seed: number): FileDiff {
+  const { path, tokens } = spec
+  const shape = spec.shape || 'addition'
+  // Rename and binary shapes never request generated content.
+  const contentFor = (budget: number, contentSeed: number) =>
+    generateContentForFile(path, budget, contentSeed)
+  let diff: string
+  switch (shape) {
+    case 'addition':
+      diff = asAdditionDiff(path, contentFor(tokens, seed))
+      break
+    case 'deletion':
+      diff = asDeletionDiff(path, contentFor(tokens, seed))
+      break
+    case 'modification':
+      diff = asModificationDiff(path, contentFor(tokens, seed), contentFor(Math.floor(tokens * 0.6), seed + 1))
+      break
+    case 'rename':
+      diff = asRenameDiff(spec.oldPath || `old/${path}`, path)
+      break
+    case 'binary':
+      diff = asBinaryDiff(path)
+      break
+  }
   return {
-    file,
-    diff: header + body,
+    file: spec.path,
+    diff,
     summary: '',
-    tokenCount: approxTokens,
+    tokenCount: spec.tokens,
   }
 }
 
-type FixtureSpec = {
-  name: string
-  files: Array<{ path: string; tokens: number }>
-}
-
-const TINY_SPEC: FixtureSpec = {
-  name: 'tiny',
-  files: [
-    { path: 'src/index.ts', tokens: 200 },
-    { path: 'src/util.ts', tokens: 150 },
-    { path: 'README.md', tokens: 300 },
-    { path: 'package.json', tokens: 80 },
-    { path: 'tsconfig.json', tokens: 60 },
-  ],
-}
-
-const MEDIUM_SPEC: FixtureSpec = {
-  name: 'medium',
-  files: [
-    { path: 'src/api.ts', tokens: 3500 },
-    { path: 'src/auth.ts', tokens: 2400 },
-    { path: 'src/cli.ts', tokens: 4800 },
-    { path: 'src/parser.ts', tokens: 2900 },
-    { path: 'src/utils/http.ts', tokens: 1200 },
-    { path: 'src/utils/format.ts', tokens: 800 },
-    { path: 'src/utils/logger.ts', tokens: 600 },
-    { path: 'tests/api.test.ts', tokens: 1800 },
-    { path: 'tests/auth.test.ts', tokens: 1400 },
-    { path: 'tests/parser.test.ts', tokens: 1600 },
-    { path: 'tests/utils/http.test.ts', tokens: 700 },
-    { path: 'tests/fixtures/sample.json', tokens: 500 },
-    { path: 'docs/ARCHITECTURE.md', tokens: 2300 },
-    { path: 'docs/API.md', tokens: 1900 },
-    { path: 'docs/CONTRIBUTING.md', tokens: 1100 },
-    { path: 'README.md', tokens: 3000 },
-    { path: 'CHANGELOG.md', tokens: 1800 },
-    { path: '.github/workflows/ci.yml', tokens: 600 },
-    { path: '.github/workflows/release.yml', tokens: 900 },
-    { path: '.github/ISSUE_TEMPLATE/bug.md', tokens: 400 },
-    { path: 'package.json', tokens: 700 },
-    { path: 'tsconfig.json', tokens: 200 },
-    { path: '.gitignore', tokens: 150 },
-    { path: 'LICENSE', tokens: 300 },
-    { path: 'pyproject.toml', tokens: 600 },
-  ],
-}
-
-const LARGE_SPEC: FixtureSpec = {
-  name: 'large',
-  files: [
-    // Mirror of the user's 43-file initial commit shape, scaled up
-    // a bit (50 files / ~100k tokens) so we have headroom for both
-    // pre-process and consolidation phases to fire heavily.
-    { path: 'humble_bundle_keys/api.py', tokens: 4400 },
-    { path: 'humble_bundle_keys/auth.py', tokens: 2100 },
-    { path: 'humble_bundle_keys/cli.py', tokens: 7600 },
-    { path: 'humble_bundle_keys/diagnose.py', tokens: 6100 },
-    { path: 'humble_bundle_keys/scraper.py', tokens: 5200 },
-    { path: 'humble_bundle_keys/choice.py', tokens: 4500 },
-    { path: 'humble_bundle_keys/browser_choice.py', tokens: 5500 },
-    { path: 'humble_bundle_keys/exporter.py', tokens: 1300 },
-    { path: 'humble_bundle_keys/models.py', tokens: 700 },
-    { path: 'humble_bundle_keys/_browser_fetch.py', tokens: 1000 },
-    { path: 'humble_bundle_keys/_orders_cache.py', tokens: 1200 },
-    { path: 'humble_bundle_keys/__init__.py', tokens: 110 },
-    { path: 'humble_bundle_keys/__main__.py', tokens: 110 },
-    { path: 'tests/RUNBOOK.md', tokens: 1900 },
-    { path: 'tests/test_api_parser.py', tokens: 1400 },
-    { path: 'tests/test_browser_choice.py', tokens: 1200 },
-    { path: 'tests/test_browser_fetch.py', tokens: 1100 },
-    { path: 'tests/test_choice.py', tokens: 3000 },
-    { path: 'tests/test_diagnose_sanitiser.py', tokens: 2300 },
-    { path: 'tests/test_exporter.py', tokens: 1700 },
-    { path: 'tests/test_parsers.py', tokens: 600 },
-    { path: 'tests/__init__.py', tokens: 40 },
-    { path: 'tests/fixtures/choice_claim/README.md', tokens: 400 },
-    { path: 'tests/fixtures/choice_claim/analytics_get_game.json', tokens: 500 },
-    { path: 'tests/fixtures/choice_claim/analytics_tile_click.json', tokens: 500 },
-    { path: 'tests/fixtures/choice_claim/choosecontent.json', tokens: 600 },
-    { path: 'tests/fixtures/choice_claim/redeemkey.json', tokens: 600 },
-    { path: 'docs/ARCHITECTURE.md', tokens: 2300 },
-    { path: 'docs/CHOICE_CLAIM_SPEC.md', tokens: 3900 },
-    { path: 'docs/WHATS_CLAIMABLE.md', tokens: 1300 },
-    { path: 'README.md', tokens: 3900 },
-    { path: 'CHANGELOG.md', tokens: 3800 },
-    { path: 'CONTRIBUTING.md', tokens: 1200 },
-    { path: 'SECURITY.md', tokens: 1000 },
-    { path: 'LICENSE', tokens: 300 },
-    { path: 'pyproject.toml', tokens: 600 },
-    { path: '.gitignore', tokens: 700 },
-    { path: '.github/ISSUE_TEMPLATE/bug_report.md', tokens: 400 },
-    { path: '.github/ISSUE_TEMPLATE/feature_request.md', tokens: 250 },
-    { path: '.github/ISSUE_TEMPLATE/selector_broken.md', tokens: 500 },
-    { path: '.github/ISSUE_TEMPLATE/config.yml', tokens: 200 },
-    { path: '.github/workflows/ci.yml', tokens: 600 },
-    { path: '.github/workflows/release.yml', tokens: 900 },
-    { path: 'src/feature/a.ts', tokens: 1400 },
-    { path: 'src/feature/b.ts', tokens: 1100 },
-    { path: 'src/feature/c.ts', tokens: 900 },
-    { path: 'src/feature/d.ts', tokens: 800 },
-    { path: 'src/feature/e.ts', tokens: 700 },
-    { path: 'src/feature/utils.ts', tokens: 600 },
-    { path: 'src/feature/types.ts', tokens: 400 },
-  ],
-}
-
-/**
- * Convert a flat fixture spec into a nested DiffNode tree, grouping
- * by directory path. Mirrors `createDiffTree`'s behavior on real
- * file lists.
- */
-function buildDiffNode(spec: FixtureSpec): DiffNode {
+function buildDiffNode(name: string, files: FileSpec[]): DiffNode {
   const root: DiffNode = { path: '/', diffs: [], children: [] }
   const dirIndex = new Map<string, DiffNode>([['/', root]])
 
-  spec.files.forEach((file, index) => {
+  files.forEach((file, index) => {
     const segments = file.path.split('/')
     const fileName = segments.pop() as string
     const dirSegments = segments
@@ -190,12 +99,211 @@ function buildDiffNode(spec: FixtureSpec): DiffNode {
       node = child
     }
 
-    node.diffs.push(buildFileDiff(`${dirSegments.join('/')}${dirSegments.length ? '/' : ''}${fileName}`, file.tokens, index + 1))
+    const seed = hashString(`${name}:${file.path}:${index}`)
+    node.diffs.push(buildFileDiff({ ...file, path: file.path }, seed))
+    void fileName
   })
 
   return root
 }
 
+/**
+ * Stable seed from a string: a multiply-by-33 rolling hash kept in
+ * unsigned 32-bit range, with a zero result mapped to 1. Hash quality
+ * is irrelevant here; equal input must always yield an equal seed.
+ */
+function hashString(input: string): number {
+  // `split('')` walks UTF-16 code units, matching an index loop over charCodeAt.
+  const digest = input
+    .split('')
+    .reduce((acc, unit) => (acc * 33 + unit.charCodeAt(0)) >>> 0, 5381)
+  return digest === 0 ? 1 : digest
+}
+
+// ---------------------------------------------------------------------------
+// Sized fixtures (preserve v0 names + counts; fresh content)
+// ---------------------------------------------------------------------------
+
+const TINY_FILES: FileSpec[] = [
+  { path: 'src/index.ts', tokens: 200 },
+  { path: 'src/util.ts', tokens: 150 },
+  { path: 'README.md', tokens: 300 },
+  { path: 'package.json', tokens: 80 },
+  { path: 'tsconfig.json', tokens: 60 },
+]
+
+const MEDIUM_FILES: FileSpec[] = [
+  { path: 'src/api.ts', tokens: 3500 },
+  { path: 'src/auth.ts', tokens: 2400 },
+  { path: 'src/cli.ts', tokens: 4800 },
+  { path: 'src/parser.ts', tokens: 2900 },
+  { path: 'src/utils/http.ts', tokens: 1200 },
+  { path: 'src/utils/format.ts', tokens: 800 },
+  { path: 'src/utils/logger.ts', tokens: 600 },
+  { path: 'tests/api.test.ts', tokens: 1800 },
+  { path: 'tests/auth.test.ts', tokens: 1400 },
+  { path: 'tests/parser.test.ts', tokens: 1600 },
+  { path: 'tests/utils/http.test.ts', tokens: 700 },
+  { path: 'tests/fixtures/sample.json', tokens: 500 },
+  { path: 'docs/ARCHITECTURE.md', tokens: 2300 },
+  { path: 'docs/API.md', tokens: 1900 },
+  { path: 'docs/CONTRIBUTING.md', tokens: 1100 },
+  { path: 'README.md', tokens: 3000 },
+  { path: 'CHANGELOG.md', tokens: 1800 },
+  { path: '.github/workflows/ci.yml', tokens: 600 },
+  { path: '.github/workflows/release.yml', tokens: 900 },
+  { path: '.github/ISSUE_TEMPLATE/bug.md', tokens: 400 },
+  { path: 'package.json', tokens: 700 },
+  { path: 'tsconfig.json', tokens: 200 },
+  { path: '.gitignore', tokens: 150 },
+  { path: 'LICENSE', tokens: 300 },
+  { path: 'pyproject.toml', tokens: 600 },
+]
+
+const LARGE_FILES: FileSpec[] = [
+  { path: 'humble_bundle_keys/api.py', tokens: 4400 },
+  { path: 'humble_bundle_keys/auth.py', tokens: 2100 },
+  { path: 'humble_bundle_keys/cli.py', tokens: 7600 },
+  { path: 'humble_bundle_keys/diagnose.py', tokens: 6100 },
+  { path: 'humble_bundle_keys/scraper.py', tokens: 5200 },
+  { path: 'humble_bundle_keys/choice.py', tokens: 4500 },
+  { path: 'humble_bundle_keys/browser_choice.py', tokens: 5500 },
+  { path: 'humble_bundle_keys/exporter.py', tokens: 1300 },
+  { path: 'humble_bundle_keys/models.py', tokens: 700 },
+  { path: 'humble_bundle_keys/_browser_fetch.py', tokens: 1000 },
+  { path: 'humble_bundle_keys/_orders_cache.py', tokens: 1200 },
+  { path: 'humble_bundle_keys/__init__.py', tokens: 110 },
+  { path: 'humble_bundle_keys/__main__.py', tokens: 110 },
+  { path: 'tests/RUNBOOK.md', tokens: 1900 },
+  { path: 'tests/test_api_parser.py', tokens: 1400 },
+  { path: 'tests/test_browser_choice.py', tokens: 1200 },
+  { path: 'tests/test_browser_fetch.py', tokens: 1100 },
+  { path: 'tests/test_choice.py', tokens: 3000 },
+  { path: 'tests/test_diagnose_sanitiser.py', tokens: 2300 },
+  { path: 'tests/test_exporter.py', tokens: 1700 },
+  { path: 'tests/test_parsers.py', tokens: 600 },
+  { path: 'tests/__init__.py', tokens: 40 },
+  { path: 'tests/fixtures/choice_claim/README.md', tokens: 400 },
+  { path: 'tests/fixtures/choice_claim/analytics_get_game.json', tokens: 500 },
+  { path: 'tests/fixtures/choice_claim/analytics_tile_click.json', tokens: 500 },
+  { path: 'tests/fixtures/choice_claim/choosecontent.json', tokens: 600 },
+  { path: 'tests/fixtures/choice_claim/redeemkey.json', tokens: 600 },
+  { path: 'docs/ARCHITECTURE.md', tokens: 2300 },
+  { path: 'docs/CHOICE_CLAIM_SPEC.md', tokens: 3900 },
+  { path: 'docs/WHATS_CLAIMABLE.md', tokens: 1300 },
+  { path: 'README.md', tokens: 3900 },
+  { path: 'CHANGELOG.md', tokens: 3800 },
+  { path: 'CONTRIBUTING.md', tokens: 1200 },
+  { path: 'SECURITY.md', tokens: 1000 },
+  { path: 'LICENSE', tokens: 300 },
+  { path: 'pyproject.toml', tokens: 600 },
+  { path: '.gitignore', tokens: 700 },
+  { path: '.github/ISSUE_TEMPLATE/bug_report.md', tokens: 400 },
+  { path: '.github/ISSUE_TEMPLATE/feature_request.md', tokens: 250 },
+  { path: '.github/ISSUE_TEMPLATE/selector_broken.md', tokens: 500 },
+  { path: '.github/ISSUE_TEMPLATE/config.yml', tokens: 200 },
+  { path: '.github/workflows/ci.yml', tokens: 600 },
+  { path: '.github/workflows/release.yml', tokens: 900 },
+  { path: 'src/feature/a.ts', tokens: 1400 },
+  { path: 'src/feature/b.ts', tokens: 1100 },
+  { path: 'src/feature/c.ts', tokens: 900 },
+  { path: 'src/feature/d.ts', tokens: 800 },
+  { path: 'src/feature/e.ts', tokens: 700 },
+  { path: 'src/feature/utils.ts', tokens: 600 },
+  { path: 'src/feature/types.ts', tokens: 400 },
+]
+
+// ---------------------------------------------------------------------------
+// Scenario fixtures (real-world commit shapes)
+// ---------------------------------------------------------------------------
+
+/**
+ * Feature add: new component + supporting modules + tests + a doc
+ * touch-up + a README mention. Mostly additions, a couple of
+ * modifications. Mirrors "I just shipped a new screen / endpoint /
+ * CLI command" workflow.
+ */
+const FEATURE_ADD_FILES: FileSpec[] = [
+  { path: 'src/features/billing/index.ts', tokens: 2200, shape: 'addition' },
+  { path: 'src/features/billing/handler.ts', tokens: 3800, shape: 'addition' },
+  { path: 'src/features/billing/schema.ts', tokens: 1100, shape: 'addition' },
+  { path: 'src/features/billing/validators.ts', tokens: 1600, shape: 'addition' },
+  { path: 'src/features/billing/types.ts', tokens: 700, shape: 'addition' },
+  { path: 'src/features/billing/utils.ts', tokens: 900, shape: 'addition' },
+  { path: 'tests/features/billing/handler.test.ts', tokens: 2100, shape: 'addition' },
+  { path: 'tests/features/billing/validators.test.ts', tokens: 1400, shape: 'addition' },
+  { path: 'tests/features/billing/fixtures.json', tokens: 800, shape: 'addition' },
+  { path: 'src/router.ts', tokens: 600, shape: 'modification' },
+  { path: 'src/index.ts', tokens: 400, shape: 'modification' },
+  { path: 'README.md', tokens: 500, shape: 'modification' },
+  { path: 'docs/billing.md', tokens: 1200, shape: 'addition' },
+  { path: 'CHANGELOG.md', tokens: 300, shape: 'modification' },
+]
+
+/**
+ * Refactor: lots of touched files, mostly modifications. Common
+ * pattern: rename a module, propagate the new import path through
+ * dozens of call sites. In practice this is where the pipeline does
+ * the most LLM work because each file has both `+` and `-` lines.
+ */
+const REFACTOR_FILES: FileSpec[] = [
+  { path: 'src/parsers/legacy/index.ts', tokens: 0, shape: 'rename', oldPath: 'src/parsers/index.ts' },
+  ...Array.from({ length: 18 }, (_, i): FileSpec => ({
+    path: `src/parsers/legacy/handler-${i}.ts`,
+    tokens: 600 + (i * 90) % 1500,
+    shape: 'modification',
+  })),
+  ...Array.from({ length: 8 }, (_, i): FileSpec => ({
+    path: `tests/parsers/handler-${i}.test.ts`,
+    tokens: 500 + (i * 110) % 900,
+    shape: 'modification',
+  })),
+  { path: 'src/router.ts', tokens: 800, shape: 'modification' },
+  { path: 'src/registry.ts', tokens: 1100, shape: 'modification' },
+  { path: 'docs/ARCHITECTURE.md', tokens: 600, shape: 'modification' },
+]
+
+/**
+ * Initial commit: the user's #845 repro shape — many files, mostly
+ * pure additions, mixed languages. Reuses LARGE_FILES.
+ */
+const INITIAL_COMMIT_FILES: FileSpec[] = LARGE_FILES.map((file): FileSpec => ({
+  ...file,
+  shape: 'addition',
+}))
+
+/**
+ * Docs update: a documentation pass — multiple markdown files
+ * touched, no code. Should be relatively cheap if the pipeline can
+ * fast-path markdown.
+ */
+const DOCS_UPDATE_FILES: FileSpec[] = [
+  { path: 'README.md', tokens: 1800, shape: 'modification' },
+  { path: 'docs/getting-started.md', tokens: 2400, shape: 'modification' },
+  { path: 'docs/configuration.md', tokens: 2100, shape: 'modification' },
+  { path: 'docs/troubleshooting.md', tokens: 1500, shape: 'modification' },
+  { path: 'docs/api/overview.md', tokens: 1900, shape: 'modification' },
+  { path: 'docs/api/reference.md', tokens: 3200, shape: 'modification' },
+  { path: 'CHANGELOG.md', tokens: 800, shape: 'modification' },
+  { path: 'CONTRIBUTING.md', tokens: 1100, shape: 'modification' },
+  { path: '.github/ISSUE_TEMPLATE/bug.md', tokens: 250, shape: 'addition' },
+]
+
+/**
+ * Dep bump: the dependabot-style commit. Tiny content change in
+ * package.json, large lockfile delta. The pipeline should treat
+ * most of this as trivial and skip it.
+ */
+const DEP_BUMP_FILES: FileSpec[] = [
+  { path: 'package.json', tokens: 250, shape: 'modification' },
+  { path: 'yarn.lock', tokens: 8000, shape: 'modification' },
+  { path: 'CHANGELOG.md', tokens: 200, shape: 'modification' },
+]
+
+// ---------------------------------------------------------------------------
+// Public surface
+// ---------------------------------------------------------------------------
+
 export type DiffFixture = {
   name: string
   fileCount: number
@@ -203,17 +311,32 @@ export type DiffFixture = {
   rootNode: DiffNode
 }
 
-function asFixture(spec: FixtureSpec): DiffFixture {
+function asFixture(name: string, files: FileSpec[]): DiffFixture {
   return {
-    name: spec.name,
-    fileCount: spec.files.length,
-    approxTokens: spec.files.reduce((sum, file) => sum + file.tokens, 0),
-    rootNode: buildDiffNode(spec),
+    name,
+    fileCount: files.length,
+    approxTokens: files.reduce((sum, file) => sum + file.tokens, 0),
+    rootNode: buildDiffNode(name, files),
   }
 }
 
-export const tinyFixture: DiffFixture = asFixture(TINY_SPEC)
-export const mediumFixture: DiffFixture = asFixture(MEDIUM_SPEC)
-export const largeFixture: DiffFixture = asFixture(LARGE_SPEC)
+export const tinyFixture: DiffFixture = asFixture('tiny', TINY_FILES)
+export const mediumFixture: DiffFixture = asFixture('medium', MEDIUM_FILES)
+export const largeFixture: DiffFixture = asFixture('large', LARGE_FILES)
+
+export const featureAddFixture: DiffFixture = asFixture('feature-add', FEATURE_ADD_FILES)
+export const refactorFixture: DiffFixture = asFixture('refactor', REFACTOR_FILES)
+export const initialCommitFixture: DiffFixture = asFixture('initial-commit', INITIAL_COMMIT_FILES)
+export const docsUpdateFixture: DiffFixture = asFixture('docs-update', DOCS_UPDATE_FILES)
+export const depBumpFixture: DiffFixture = asFixture('dep-bump', DEP_BUMP_FILES)
 
-export const allFixtures: DiffFixture[] = [tinyFixture, mediumFixture, largeFixture]
+export const allFixtures: DiffFixture[] = [
+  tinyFixture,
+  mediumFixture,
+  largeFixture,
+  featureAddFixture,
+  refactorFixture,
+  initialCommitFixture,
+  docsUpdateFixture,
+  depBumpFixture,
+]