From 59e9a8e319ff5e0e45e4b3798a137e888c69876b Mon Sep 17 00:00:00 2001
From: Ben Vinegar <ben@benv.ca>
Date: Fri, 29 May 2026 18:53:44 -0400
Subject: [PATCH 1/5] feat: add CI performance benchmark suite

---
 .github/workflows/benchmarks.yml   |  75 ++++++--
 CHANGELOG.md                       |   2 +
 benchmarks/README.md               | 100 ++++++++--
 benchmarks/changeset-parse.ts      |  59 ++++++
 benchmarks/comment-pr.ts           |  82 ++++++++
 benchmarks/compare.ts              | 293 +++++++++++++++++++++++++++++
 benchmarks/competitors.ts          | 112 +++++++++++
 benchmarks/large-stream-fixture.ts |  67 +++----
 benchmarks/large-stream-profile.ts |  27 +--
 benchmarks/large-stream.ts         |  23 +--
 benchmarks/lib/benchmark-result.ts | 123 ++++++++++++
 benchmarks/lib/fixtures.ts         | 136 +++++++++++++
 benchmarks/memory.ts               |  72 +++++++
 benchmarks/render-layout.ts        |  78 ++++++++
 benchmarks/run.ts                  | 188 ++++++++++++++++++
 benchmarks/working-tree-load.ts    |  68 +++++++
 package.json                       |   8 +
 17 files changed, 1411 insertions(+), 102 deletions(-)
 create mode 100644 benchmarks/changeset-parse.ts
 create mode 100644 benchmarks/comment-pr.ts
 create mode 100644 benchmarks/compare.ts
 create mode 100644 benchmarks/competitors.ts
 create mode 100644 benchmarks/lib/benchmark-result.ts
 create mode 100644 benchmarks/lib/fixtures.ts
 create mode 100644 benchmarks/memory.ts
 create mode 100644 benchmarks/render-layout.ts
 create mode 100644 benchmarks/run.ts
 create mode 100644 benchmarks/working-tree-load.ts

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 510e966d..50820f6e 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -9,10 +9,22 @@ on:
       - "docs/**"
       - "assets/**"
       - "LICENSE"
+  pull_request:
+    paths-ignore:
+      - "**/*.md"
+      - "docs/**"
+      - "assets/**"
+      - "LICENSE"
   workflow_dispatch:
 
+permissions:
+  contents: read
+  issues: write
+  pull-requests: write
+
 env:
   SKIP_INSTALL_SIMPLE_GIT_HOOKS: "1"
+  HUNK_BENCHMARK_SAMPLES: "3"
 
 concurrency:
   group: benchmarks-${{ github.workflow }}-${{ github.ref }}
@@ -25,6 +37,8 @@ jobs:
     steps:
       - name: Check out repository
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          fetch-depth: 0
 
       - name: Set up Bun
         uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2.2.0
@@ -34,36 +48,70 @@ jobs:
       - name: Install dependencies
         run: bun install --frozen-lockfile
 
-      - name: Run bootstrap benchmark
+      - name: Run head benchmarks
         run: |
+          # Steps without an explicit `shell:` run under `bash -e` without pipefail, so a
+          # failing benchmark run would otherwise be hidden by tee's exit status.
+          set -o pipefail
           mkdir -p benchmark-results
-          bun run bench:bootstrap-load | tee benchmark-results/bootstrap-load.txt
+          bun run bench -- --samples "$HUNK_BENCHMARK_SAMPLES" --include-competitors --out benchmark-results/head.json \
+            | tee benchmark-results/head.txt
 
-      - name: Run highlight prefetch benchmark
+      - name: Run base benchmarks
+        if: github.event_name == 'pull_request'
         run: |
-          bun run bench:highlight-prefetch | tee benchmark-results/highlight-prefetch.txt
+          # Propagate benchmark failures through the tee pipeline below.
+          set -o pipefail
+          git fetch origin main
+          git worktree add ../hunk-benchmark-base origin/main
+          rm -rf ../hunk-benchmark-base/benchmarks
+          cp -R benchmarks ../hunk-benchmark-base/benchmarks
+          cd ../hunk-benchmark-base
+          bun install --frozen-lockfile
+          bun run benchmarks/run.ts --samples "$HUNK_BENCHMARK_SAMPLES" --include-competitors --out "$GITHUB_WORKSPACE/benchmark-results/base.json" \
+            | tee "$GITHUB_WORKSPACE/benchmark-results/base.txt"
 
-      - name: Run large stream benchmark
+      - name: Compare benchmark results
+        id: compare
+        if: github.event_name == 'pull_request'
+        continue-on-error: true
         run: |
-          bun run bench:large-stream | tee benchmark-results/large-stream.txt
+          bun run bench:compare -- \
+            --base benchmark-results/base.json \
+            --head benchmark-results/head.json \
+            --out benchmark-results/comparison.json \
+            --markdown benchmark-results/summary.md
 
       - name: Publish benchmark summary
+        if: always()
         run: |
-          {
-            echo '## Benchmark results'
-            echo
-            for file in benchmark-results/*.txt; do
-              echo "### $(basename "$file")"
+          if [ -f benchmark-results/summary.md ]; then
+            cat benchmark-results/summary.md >> "$GITHUB_STEP_SUMMARY"
+          else
+            {
+              echo '## Benchmark results'
+              echo
               echo '```text'
-              cat "$file"
+              cat benchmark-results/head.txt
               echo '```'
-              echo
-            done
-          } >> "$GITHUB_STEP_SUMMARY"
+            } >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+      - name: Comment benchmark summary on PR
+        if: always() && github.event_name == 'pull_request' && hashFiles('benchmark-results/summary.md') != ''
+        continue-on-error: true
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: bun run bench:comment-pr -- --body benchmark-results/summary.md
+
+      - name: Fail on benchmark regression
+        if: github.event_name == 'pull_request' && steps.compare.outcome == 'failure'
+        run: exit 1
 
       - name: Upload benchmark artifacts
+        if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: benchmark-results
-          path: benchmark-results/*.txt
+          path: benchmark-results/*
           if-no-files-found: error
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7f62d9f5..723c75af 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,8 @@ All notable user-visible changes to Hunk are documented in this file.
 
 ### Added
 
+- Added CI performance benchmarks with PR comparison comments to guard Hunk startup, loading, rendering, highlighting, navigation, and memory costs.
+
 ### Changed
 
 ### Fixed
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 1bb5e78d..91595cc6 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,28 +1,104 @@
 # Benchmarks
 
-Benchmark scripts, shared fixtures, and local result artifacts live here.
+Benchmark scripts, shared fixtures, and local result artifacts live here. These benchmarks protect Hunk's core promise: fast loading, fast first render, fast navigation, and predictable memory use on large diffs.
 
-## Scripts
+## Running locally
 
-- `bootstrap-load.ts` — measures bootstrap and git-loader cost on a synthetic large repo
-- `highlight-prefetch.ts` — measures selected-file highlight startup and adjacent prefetch readiness
-- `large-stream.ts` — measures large split-stream first-frame and scroll cost, including note-enabled cases
-- `large-stream-profile.ts` — profiles the main pure planning stages behind the large split-stream benchmark
-- `large-stream-fixture.ts` — shared synthetic diff fixture used by the large-stream benchmarks
+Run the full benchmark suite with one JSON result file:
 
-## Running
+```bash
+bun run bench -- --samples 3 --include-competitors --out benchmarks/results/head.json
+```
 
-From the project root:
+Run focused scripts while iterating:
 
 ```bash
 bun run bench:bootstrap-load
+bun run bench:working-tree-load
+bun run bench:changeset-parse
+bun run bench:render-layout
 bun run bench:highlight-prefetch
 bun run bench:large-stream
 bun run bench:large-stream-profile
+bun run bench:memory
+bun run bench:competitors
 ```
 
-## Results
+Compare two JSON result files:
+
+```bash
+bun run bench:compare -- \
+  --base benchmarks/results/base.json \
+  --head benchmarks/results/head.json \
+  --markdown benchmarks/results/summary.md
+```
+
+## Scripts
+
+- `bootstrap-load.ts` — measures bootstrap and git-loader cost on a synthetic large repo, including file-pair bootstrap.
+- `working-tree-load.ts` — measures git working-tree loads across small, medium, large, many-untracked, and few-large-untracked repos.
+- `changeset-parse.ts` — measures patch normalization, Pierre parsing, patch chunking, and normalized `DiffFile` construction for many-small-files, balanced, and large-single-file patches.
+- `render-layout.ts` — measures pure split/stack row building, section geometry, and review-plan construction for many-small-files, balanced, and large-single-file streams.
+- `highlight-prefetch.ts` — measures selected-file highlight startup and adjacent prefetch readiness.
+- `large-stream.ts` — measures large split-stream first-frame and scroll cost.
+- `large-stream-profile.ts` — profiles the main pure planning stages behind the large split-stream benchmark.
+- `memory.ts` — records RSS/heap after fixture loading, planning, first frame, and next-hunk navigation.
+- `competitors.ts` — optional informational comparisons against `git diff --no-ext-diff`, `delta`, `difftastic`, and `diff-so-fancy` when installed.
+- `large-stream-fixture.ts` and `lib/fixtures.ts` — shared deterministic synthetic fixtures.
+
+## Output format
+
+Each script prints `METRIC name=value` lines. `benchmarks/run.ts` repeats scripts, aggregates samples, and writes JSON:
+
+```json
+{
+  "version": 1,
+  "samplesPerBenchmark": 3,
+  "results": [
+    {
+      "name": "large-stream/cold_first_frame_ms",
+      "unit": "ms",
+      "samples": [61.2, 60.8, 62.1],
+      "median": 61.2,
+      "p75": 62.1,
+      "p95": 62.1,
+      "threshold": {
+        "maxRegressionRatio": 1.15,
+        "minAbsoluteRegression": 5
+      },
+      "comparable": true
+    }
+  ]
+}
+```
+
+## CI policy
+
+`.github/workflows/benchmarks.yml` runs the suite on `main`, pull requests, and manual dispatch. On pull requests it:
+
+1. Runs benchmarks on the PR revision.
+2. Checks out `origin/main` in a sibling worktree.
+3. Copies the PR benchmark harness into that base worktree so new benchmarks can compare base code during the PR that introduces them.
+4. Runs the same benchmarks on base.
+5. Compares medians and fails if comparable metrics regress beyond threshold or go missing from the PR run.
+6. Uploads raw JSON/text artifacts.
+7. Posts or updates one PR comment with a curated key-benchmark table, always including regressions and hiding noisy supporting metrics.
+
+Initial thresholds:
+
+- Time metrics (`*_ms`): fail when PR median is more than 15% slower **and** at least 5ms slower.
+- Memory metrics (`rss`/`heap`): fail when PR median is more than 20% higher **and** at least 8MiB higher.
+- Counts, fixture sizes, availability flags, and competitor metrics are informational.
+
+Competitor comparisons are intentionally non-failing because installed tool versions and feature parity vary by environment.
+
+## Updating thresholds
+
+Prefer fixing regressions first. If a maintainer accepts an intentional tradeoff, update the threshold in `benchmarks/lib/benchmark-result.ts` and mention why in the PR. Keep thresholds broad enough for CI variability but tight enough to catch visible slowdowns.
 
-Use `benchmarks/results/` for local benchmark output, notes, or captured runs.
+## Noise troubleshooting
 
-The folder stays in the repo so the convention is discoverable, but local result files inside it are ignored by default.
+- Re-run failed jobs before investigating tiny deltas; thresholds include absolute tolerances to avoid failing on sub-5ms noise.
+- PTY/renderer-adjacent metrics are noisier than pure parsing/planning metrics.
+- Use `--samples 5` locally when validating borderline changes.
+- Inspect uploaded raw samples before changing thresholds.
diff --git a/benchmarks/changeset-parse.ts b/benchmarks/changeset-parse.ts
new file mode 100644
index 00000000..5925623d
--- /dev/null
+++ b/benchmarks/changeset-parse.ts
@@ -0,0 +1,70 @@
+// Benchmark raw patch parsing and normalized DiffFile construction for several diff shapes.
+import { performance } from "perf_hooks";
+import { parsePatchFiles } from "@pierre/diffs";
+import { buildDiffFile } from "../src/core/diffFile";
+import { findPatchChunk, splitPatchIntoFileChunks } from "../src/core/patch/chunks";
+import { normalizePatchText } from "../src/core/patch/normalize";
+import { createSyntheticPatch } from "./lib/fixtures";
+
+/** One synthetic patch to time through the parse pipeline. */
+interface Scenario {
+  /** Prefix for every METRIC line printed for this scenario. */
+  name: string;
+  /** Patch text to measure. */
+  patch: string;
+}
+
+// Three diff shapes: many small files, a mid-sized changeset, and a single very large file.
+const scenarios: Scenario[] = [
+  {
+    name: "many_small_files",
+    patch: createSyntheticPatch({ fileCount: 240, lines: 48, changedLines: 8 }),
+  },
+  {
+    name: "balanced_changeset",
+    patch: createSyntheticPatch({ fileCount: 96, lines: 220, changedLines: 48 }),
+  },
+  {
+    name: "large_single_file",
+    patch: createSyntheticPatch({ fileCount: 1, lines: 18_000, changedLines: 2_000 }),
+  },
+];
+
+/**
+ * Time each pipeline stage once for a scenario and print the results as METRIC lines:
+ * normalize, parse, split into file chunks, and DiffFile construction, followed by the
+ * resulting file count and the byte length of the normalized patch.
+ */
+function measureScenario({ name, patch }: Scenario) {
+  const normalizeStart = performance.now();
+  const normalized = normalizePatchText(patch);
+  const normalizeMs = performance.now() - normalizeStart;
+
+  const parseStart = performance.now();
+  const parsed = parsePatchFiles(normalized, "patch", true);
+  const parseMs = performance.now() - parseStart;
+
+  const splitStart = performance.now();
+  const chunks = splitPatchIntoFileChunks(normalized);
+  const splitMs = performance.now() - splitStart;
+
+  // Flattening the parse result happens before the timer starts, so it is not in the build timing.
+  const files = parsed.flatMap((entry) => entry.files);
+  const buildStart = performance.now();
+  const diffFiles = files.map((metadata, index) =>
+    buildDiffFile(metadata, findPatchChunk(metadata, chunks, index), index, name, null),
+  );
+  const buildMs = performance.now() - buildStart;
+
+  console.log(`METRIC ${name}_normalize_patch_ms=${normalizeMs.toFixed(2)}`);
+  console.log(`METRIC ${name}_parse_patch_ms=${parseMs.toFixed(2)}`);
+  console.log(`METRIC ${name}_split_chunks_ms=${splitMs.toFixed(2)}`);
+  console.log(`METRIC ${name}_build_diff_files_ms=${buildMs.toFixed(2)}`);
+  console.log(`METRIC ${name}_files=${diffFiles.length}`);
+  console.log(`METRIC ${name}_patch_bytes=${Buffer.byteLength(normalized)}`);
+}
+
+// Scenarios run sequentially in declaration order within this one process.
+for (const scenario of scenarios) {
+  measureScenario(scenario);
+}
diff --git a/benchmarks/comment-pr.ts b/benchmarks/comment-pr.ts
new file mode 100644
index 00000000..687afcd3
--- /dev/null
+++ b/benchmarks/comment-pr.ts
@@ -0,0 +1,117 @@
+#!/usr/bin/env bun
+import { readFileSync } from "node:fs";
+
+/** Hidden HTML marker identifying the benchmark comment so later runs can update it in place. */
+const marker = "<!-- hunk-benchmark-comment -->";
+
+/** Page size used when listing PR comments. */
+const commentsPerPage = 100;
+
+/** The subset of an issue comment's fields this script reads. */
+interface IssueComment {
+  id: number;
+  body?: string;
+}
+
+/** Return the named environment variable, throwing when it is unset or empty. */
+function requireEnv(name: string) {
+  const value = process.env[name];
+  if (!value) {
+    throw new Error(`Missing ${name}`);
+  }
+  return value;
+}
+
+/** Extract the path given after `--body`; throws a usage error when the flag is absent. */
+function parseArgs(args: string[]) {
+  for (let index = 0; index < args.length; index += 1) {
+    if (args[index] === "--body") {
+      const value = args[index + 1];
+      if (!value) {
+        throw new Error("Missing value for --body");
+      }
+      return { bodyPath: value };
+    }
+  }
+
+  throw new Error("Usage: bun run benchmarks/comment-pr.ts --body benchmark-results/summary.md");
+}
+
+/**
+ * Send an authenticated request to the GitHub REST API using `GITHUB_TOKEN`.
+ *
+ * Throws with the status and response text when the response is not OK; returns `null` for a
+ * 204 response and the parsed JSON body otherwise.
+ */
+async function githubRequest(path: string, init: RequestInit = {}) {
+  const token = requireEnv("GITHUB_TOKEN");
+  const response = await fetch(`https://api.github.com${path}`, {
+    ...init,
+    headers: {
+      Accept: "application/vnd.github+json",
+      Authorization: `Bearer ${token}`,
+      "X-GitHub-Api-Version": "2022-11-28",
+      ...init.headers,
+    },
+  });
+
+  if (!response.ok) {
+    const text = await response.text();
+    throw new Error(
+      `GitHub API ${init.method ?? "GET"} ${path} failed: ${response.status} ${text}`,
+    );
+  }
+
+  return response.status === 204 ? null : response.json();
+}
+
+/**
+ * Find the previously posted benchmark comment on the pull request, if any.
+ *
+ * Walks every page of comments rather than only the first, so the marker comment is still found
+ * (and updated instead of duplicated) on pull requests with more than one page of comments.
+ */
+async function findBenchmarkComment(repository: string, pullRequestNumber: number) {
+  for (let page = 1; ; page += 1) {
+    const comments = (await githubRequest(
+      `/repos/${repository}/issues/${pullRequestNumber}/comments?per_page=${commentsPerPage}&page=${page}`,
+    )) as IssueComment[];
+    const match = comments.find((comment) => comment.body?.includes(marker));
+
+    // A short page is the last page; stop there whether or not the marker was found.
+    if (match || comments.length < commentsPerPage) {
+      return match;
+    }
+  }
+}
+
+const { bodyPath } = parseArgs(Bun.argv.slice(2));
+const repository = requireEnv("GITHUB_REPOSITORY");
+const eventPath = requireEnv("GITHUB_EVENT_PATH");
+const event = JSON.parse(readFileSync(eventPath, "utf8")) as { pull_request?: { number: number } };
+const pullRequestNumber = event.pull_request?.number;
+
+if (!pullRequestNumber) {
+  console.log("No pull request in event payload; skipping benchmark comment.");
+  process.exit(0);
+}
+
+const body = readFileSync(bodyPath, "utf8");
+const existing = await findBenchmarkComment(repository, pullRequestNumber);
+
+if (existing) {
+  await githubRequest(`/repos/${repository}/issues/comments/${existing.id}`, {
+    method: "PATCH",
+    body: JSON.stringify({ body }),
+  });
+  console.log(`Updated benchmark comment ${existing.id}.`);
+} else {
+  const created = (await githubRequest(
+    `/repos/${repository}/issues/${pullRequestNumber}/comments`,
+    {
+      method: "POST",
+      body: JSON.stringify({ body }),
+    },
+  )) as { id: number };
+  console.log(`Created benchmark comment ${created.id}.`);
+}
diff --git a/benchmarks/compare.ts b/benchmarks/compare.ts
new file mode 100644
index 00000000..157c6349
--- /dev/null
+++ b/benchmarks/compare.ts
@@ -0,0 +1,319 @@
+#!/usr/bin/env bun
+import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
+import { dirname, resolve } from "node:path";
+import type {
+  BenchmarkComparisonResult,
+  BenchmarkComparisonRow,
+  BenchmarkMetricResult,
+  BenchmarkRunResult,
+} from "./lib/benchmark-result";
+
+/** Command-line options accepted by the comparison script. */
+interface CompareOptions {
+  /** Path to the base run's JSON results. */
+  base: string;
+  /** Path to the head run's JSON results. */
+  head: string;
+  /** Where to write the comparison JSON, if requested. */
+  out?: string;
+  /** Where to write the markdown summary, if requested. */
+  markdown?: string;
+}
+
+/** Return the value that follows the flag at `index`, throwing when none was supplied. */
+function readArgValue(args: string[], index: number) {
+  const value = args[index + 1];
+  if (!value) {
+    throw new Error(`Missing value for ${args[index]}`);
+  }
+  return value;
+}
+
+/** Parse `--base`, `--head`, `--out`, and `--markdown`; any other argument is an error. */
+function parseArgs(args: string[]): CompareOptions {
+  const options: Partial<CompareOptions> = {};
+
+  for (let index = 0; index < args.length; index += 1) {
+    const arg = args[index]!;
+
+    if (arg === "--base") {
+      options.base = readArgValue(args, index);
+      index += 1;
+      continue;
+    }
+
+    if (arg === "--head") {
+      options.head = readArgValue(args, index);
+      index += 1;
+      continue;
+    }
+
+    if (arg === "--out") {
+      options.out = readArgValue(args, index);
+      index += 1;
+      continue;
+    }
+
+    if (arg === "--markdown") {
+      options.markdown = readArgValue(args, index);
+      index += 1;
+      continue;
+    }
+
+    throw new Error(`Unknown benchmark compare argument: ${arg}`);
+  }
+
+  if (!options.base || !options.head) {
+    throw new Error(
+      "Usage: bun run benchmarks/compare.ts --base base.json --head head.json [--out compare.json] [--markdown summary.md]",
+    );
+  }
+
+  return options as CompareOptions;
+}
+
+/** Read and JSON-parse a benchmark run file; the shape is asserted, not validated. */
+function readRun(path: string): BenchmarkRunResult {
+  return JSON.parse(readFileSync(path, "utf8")) as BenchmarkRunResult;
+}
+
+/**
+ * Build the comparison row for one metric that is present in base, head, or both.
+ *
+ * Informational metrics are classified before the missing-side checks so they can never fail
+ * the comparison, e.g. when an optional competitor tool ran on only one side. A missing side
+ * contributes a median of 0.
+ */
+function compareMetric(
+  base: BenchmarkMetricResult | undefined,
+  head: BenchmarkMetricResult | undefined,
+) {
+  if (!base && !head) {
+    throw new Error("Cannot compare two missing metrics");
+  }
+
+  const metric = head ?? base!;
+  const baseMedian = base?.median ?? 0;
+  const headMedian = head?.median ?? 0;
+  const absoluteDelta = headMedian - baseMedian;
+  const relativeDelta = baseMedian === 0 ? 0 : absoluteDelta / baseMedian;
+
+  let status: BenchmarkComparisonRow["status"] = "pass";
+  if (!metric.comparable || metric.informational || metric.name.includes("competitor_")) {
+    status = "informational";
+  } else if (!base) {
+    status = "missing-base";
+  } else if (!head) {
+    status = "missing-head";
+  } else if (
+    metric.threshold &&
+    headMedian > baseMedian * metric.threshold.maxRegressionRatio &&
+    absoluteDelta > metric.threshold.minAbsoluteRegression
+  ) {
+    status = "fail";
+  }
+
+  return {
+    name: metric.name,
+    unit: metric.unit,
+    baseMedian,
+    headMedian,
+    absoluteDelta,
+    relativeDelta,
+    threshold: metric.threshold,
+    status,
+    source: metric.source,
+  } satisfies BenchmarkComparisonRow;
+}
+
+/** Render a value for the markdown tables according to its unit. */
+function formatNumber(value: number, unit: BenchmarkComparisonRow["unit"]) {
+  if (unit === "bytes") {
+    const mib = value / (1024 * 1024);
+    return `${mib.toFixed(1)} MiB`;
+  }
+
+  if (unit === "ms") {
+    // Use the magnitude so negative deltas get the same precision as positive ones.
+    return `${value.toFixed(Math.abs(value) >= 100 ? 1 : 2)} ms`;
+  }
+
+  if (unit === "boolean") {
+    return value ? "yes" : "no";
+  }
+
+  return value.toFixed(Number.isInteger(value) ? 0 : 2);
+}
+
+/** Render absolute and relative change; the percentage is "n/a" when the base median is 0. */
+function formatDelta(row: BenchmarkComparisonRow) {
+  const sign = row.absoluteDelta >= 0 ? "+" : "";
+  const relative = row.baseMedian === 0 ? "n/a" : `${sign}${(row.relativeDelta * 100).toFixed(1)}%`;
+  return `${sign}${formatNumber(row.absoluteDelta, row.unit)} (${relative})`;
+}
+
+/** Render the failure threshold as "+ratio% and +absolute", or an em dash when there is none. */
+function formatThreshold(row: BenchmarkComparisonRow) {
+  if (!row.threshold) {
+    return "—";
+  }
+
+  return `+${((row.threshold.maxRegressionRatio - 1) * 100).toFixed(0)}% and +${formatNumber(row.threshold.minAbsoluteRegression, row.unit)}`;
+}
+
+/** Map a comparison status to the emoji shown in the Status column. */
+function statusIcon(status: BenchmarkComparisonRow["status"]) {
+  switch (status) {
+    case "pass":
+      return "✅";
+    case "fail":
+      return "❌";
+    case "informational":
+      return "ℹ️";
+    case "missing-base":
+    case "missing-head":
+      return "⚠️";
+  }
+}
+
+/** Metrics that are always shown in the PR comment's key table when present. */
+const keyBenchmarkNames = new Set([
+  "bootstrap-load/git_bootstrap_ms",
+  "bootstrap-load/file_pair_bootstrap_ms",
+  "working-tree-load/small_worktree_load_ms",
+  "working-tree-load/medium_worktree_load_ms",
+  "working-tree-load/large_worktree_load_ms",
+  "working-tree-load/untracked_many_small_load_ms",
+  "working-tree-load/untracked_few_large_load_ms",
+  "changeset-parse/many_small_files_parse_patch_ms",
+  "changeset-parse/balanced_changeset_parse_patch_ms",
+  "changeset-parse/large_single_file_parse_patch_ms",
+  "render-layout/many_small_files_review_plan_ms",
+  "render-layout/balanced_stream_review_plan_ms",
+  "render-layout/large_single_file_review_plan_ms",
+  "large-stream/cold_first_frame_ms",
+  "large-stream/warm_first_frame_ms",
+  "large-stream/windowed_scroll_ticks_ms",
+  "large-stream-profile/section_geometry_ms",
+  "large-stream-profile/review_plan_ms",
+  "highlight-prefetch/selected_startup_ms",
+  "highlight-prefetch/next_file_ready_ms",
+  "memory/first_frame_ms",
+  "memory/next_hunk_navigation_ms",
+  "memory/after_first_frame_rss_bytes",
+  "memory/after_navigation_rss_bytes",
+]);
+
+/** Keep PR comments readable while all metrics remain enforced and available as artifacts. */
+function selectDisplayedComparableRows(rows: BenchmarkComparisonRow[]) {
+  const displayed = new Map<string, BenchmarkComparisonRow>();
+
+  for (const row of rows) {
+    if (row.status === "fail" || row.status === "missing-head" || keyBenchmarkNames.has(row.name)) {
+      displayed.set(row.name, row);
+    }
+  }
+
+  return [...displayed.values()].sort((left, right) => left.name.localeCompare(right.name));
+}
+
+/** Select informational competitor timing rows (`/competitor_` names ending in `_ms`). */
+function competitorTimingRows(rows: BenchmarkComparisonRow[]) {
+  return rows.filter(
+    (row) =>
+      row.status === "informational" &&
+      row.name.includes("/competitor_") &&
+      row.name.endsWith("_ms"),
+  );
+}
+
+/** Render the PR comment body: verdict, SHAs, key-metric table, and optional competitor table. */
+function buildMarkdown(comparison: BenchmarkComparisonResult) {
+  const comparableRows = comparison.rows.filter((row) => row.status !== "informational");
+  const displayedComparableRows = selectDisplayedComparableRows(comparableRows);
+  const hiddenComparableCount = comparableRows.length - displayedComparableRows.length;
+  const displayedCompetitorRows = competitorTimingRows(comparison.rows);
+  const lines = [
+    // Marker used to find this comment for in-place updates; keep in sync with comment-pr.ts.
+    "<!-- hunk-benchmark-comment -->",
+    "## Hunk benchmark results",
+    "",
+    comparison.failed
+      ? "❌ One or more benchmarks regressed beyond the configured threshold."
+      : "✅ Benchmarks are within the configured thresholds.",
+    "",
+    `Base: \`${comparison.baseSha?.slice(0, 12) ?? "unknown"}\` · Head: \`${comparison.headSha?.slice(0, 12) ?? "unknown"}\``,
+    "",
+    "### Key Hunk benchmarks",
+    "",
+    "| Benchmark | Base median | PR median | Delta | Threshold | Status |",
+    "|---|---:|---:|---:|---:|:---:|",
+  ];
+
+  for (const row of displayedComparableRows) {
+    lines.push(
+      `| ${row.name} | ${formatNumber(row.baseMedian, row.unit)} | ${formatNumber(row.headMedian, row.unit)} | ${formatDelta(row)} | ${formatThreshold(row)} | ${statusIcon(row.status)} |`,
+    );
+  }
+
+  if (hiddenComparableCount > 0) {
+    lines.push(
+      "",
+      `${hiddenComparableCount} additional comparable Hunk metrics were checked but hidden to keep this comment readable. See the workflow artifacts for full JSON and text output.`,
+    );
+  }
+
+  if (displayedCompetitorRows.length > 0) {
+    lines.push("", "### Informational competitor comparison", "");
+    lines.push("| Benchmark | Base median | PR median | Delta | Status |");
+    lines.push("|---|---:|---:|---:|:---:|");
+    for (const row of displayedCompetitorRows) {
+      lines.push(
+        `| ${row.name} | ${formatNumber(row.baseMedian, row.unit)} | ${formatNumber(row.headMedian, row.unit)} | ${formatDelta(row)} | ${statusIcon(row.status)} |`,
+      );
+    }
+  }
+
+  lines.push("", "Raw JSON and text logs are available in the benchmark workflow artifacts.", "");
+  return lines.join("\n");
+}
+
+// Script entry point: load both runs, compare every metric seen in either, then emit outputs.
+const options = parseArgs(Bun.argv.slice(2));
+const base = readRun(options.base);
+const head = readRun(options.head);
+const baseByName = new Map(base.results.map((result) => [result.name, result]));
+const headByName = new Map(head.results.map((result) => [result.name, result]));
+const names = new Set([...baseByName.keys(), ...headByName.keys()]);
+const rows = [...names]
+  .map((name) => compareMetric(baseByName.get(name), headByName.get(name)))
+  .sort((left, right) => left.name.localeCompare(right.name));
+
+const comparison: BenchmarkComparisonResult = {
+  version: 1,
+  generatedAt: new Date().toISOString(),
+  baseSha: base.gitSha,
+  headSha: head.gitSha,
+  failed: rows.some((row) => row.status === "fail" || row.status === "missing-head"),
+  rows,
+};
+const markdown = buildMarkdown(comparison);
+
+console.log(markdown);
+
+if (options.out) {
+  const outPath = resolve(options.out);
+  mkdirSync(dirname(outPath), { recursive: true });
+  writeFileSync(outPath, `${JSON.stringify(comparison, null, 2)}\n`);
+}
+
+if (options.markdown) {
+  const markdownPath = resolve(options.markdown);
+  mkdirSync(dirname(markdownPath), { recursive: true });
+  writeFileSync(markdownPath, markdown);
+}
+
+// Signal failure via the exit code only after all outputs above have been written.
+if (comparison.failed) {
+  process.exitCode = 1;
+}
diff --git a/benchmarks/competitors.ts b/benchmarks/competitors.ts
new file mode 100644
index 00000000..63ab61c9
--- /dev/null
+++ b/benchmarks/competitors.ts
@@ -0,0 +1,122 @@
+// Optional informational comparisons against diff-oriented CLI tools when installed.
+import { writeFileSync } from "node:fs";
+import { join } from "node:path";
+import { performance } from "perf_hooks";
+import {
+  createChangedRepo,
+  createSyntheticPatch,
+  createSyntheticSource,
+  createTemporaryDirectory,
+  git,
+} from "./lib/fixtures";
+
+/** One external command to time. */
+interface ToolScenario {
+  /** Metric name prefix; `_ms` and `_available` suffixes are appended on output. */
+  metric: string;
+  /** Executable and arguments to spawn. */
+  command: string[];
+  /** Text piped to the command's stdin; stdin is ignored when omitted. */
+  stdin?: string;
+  /** Working directory for the command. */
+  cwd?: string;
+}
+
+/** Report whether `command` resolves to an executable on PATH. */
+function commandExists(command: string) {
+  // Bun.which avoids spawning a shell and interpolating the name into a command string.
+  return Bun.which(command) !== null;
+}
+
+/**
+ * Run one command synchronously and report its wall-clock duration.
+ *
+ * On a zero exit code prints `<metric>_ms` and `<metric>_available=1`. On any other exit code
+ * prints only `<metric>_available=0` (no timing) and forwards the command's stderr as a warning.
+ */
+function measureTool({ metric, command, stdin, cwd }: ToolScenario) {
+  const start = performance.now();
+  const proc = Bun.spawnSync(command, {
+    cwd,
+    stdin: stdin === undefined ? "ignore" : Buffer.from(stdin),
+    stdout: "ignore",
+    stderr: "pipe",
+    env: { ...process.env, NO_COLOR: "1", TERM: "xterm-256color" },
+  });
+  const duration = performance.now() - start;
+
+  if (proc.exitCode !== 0) {
+    const stderr = Buffer.from(proc.stderr).toString("utf8").trim();
+    console.log(`METRIC ${metric}_available=0`);
+    if (stderr) {
+      console.warn(`${command.join(" ")} failed: ${stderr}`);
+    }
+    return;
+  }
+
+  console.log(`METRIC ${metric}_ms=${duration.toFixed(2)}`);
+  console.log(`METRIC ${metric}_available=1`);
+}
+
+const patch = createSyntheticPatch({ fileCount: 96, lines: 180, changedLines: 36 });
+const patchFixture = createTemporaryDirectory("hunk-competitor-patch-");
+const repoFixture = createChangedRepo({ fileCount: 96, lines: 180, changedLines: 36 });
+
+try {
+  const patchPath = join(patchFixture.path, "large.patch");
+  const beforePath = join(patchFixture.path, "before.ts");
+  const afterPath = join(patchFixture.path, "after.ts");
+  writeFileSync(patchPath, patch);
+  writeFileSync(
+    beforePath,
+    createSyntheticSource(1, false, { lines: 12_000, changedLines: 2_000 }),
+  );
+  writeFileSync(afterPath, createSyntheticSource(1, true, { lines: 12_000, changedLines: 2_000 }));
+
+  measureTool({
+    metric: "competitor_git_diff_no_ext_diff",
+    command: ["git", "diff", "--no-ext-diff", "--no-color"],
+    cwd: repoFixture.path,
+  });
+
+  // NOTE: this runs after the timed `git diff` above, so it cannot warm anything for that
+  // metric; presumably it just sanity-checks that the fixture repository is usable.
+  git(repoFixture.path, "status", "--short");
+
+  if (commandExists("delta")) {
+    measureTool({
+      metric: "competitor_delta_patch_stdin",
+      command: ["delta", "--no-gitconfig", "--paging=never"],
+      stdin: patch,
+    });
+  } else {
+    console.log("METRIC competitor_delta_patch_stdin_available=0");
+  }
+
+  if (commandExists("difft")) {
+    measureTool({
+      metric: "competitor_difftastic_file_pair",
+      command: ["difft", "--color=never", beforePath, afterPath],
+    });
+  } else if (commandExists("difftastic")) {
+    measureTool({
+      metric: "competitor_difftastic_file_pair",
+      command: ["difftastic", "--color=never", beforePath, afterPath],
+    });
+  } else {
+    console.log("METRIC competitor_difftastic_file_pair_available=0");
+  }
+
+  if (commandExists("diff-so-fancy")) {
+    measureTool({
+      metric: "competitor_diff_so_fancy_patch_stdin",
+      command: ["diff-so-fancy"],
+      stdin: patch,
+    });
+  } else {
+    console.log("METRIC competitor_diff_so_fancy_patch_stdin_available=0");
+  }
+} finally {
+  patchFixture.cleanup();
+  repoFixture.cleanup();
+}
diff --git a/benchmarks/large-stream-fixture.ts b/benchmarks/large-stream-fixture.ts
index 0a95c184..386a3e31 100644
--- a/benchmarks/large-stream-fixture.ts
+++ b/benchmarks/large-stream-fixture.ts
@@ -3,37 +3,19 @@ import type { AppBootstrap, DiffFile } from "../src/core/types";
 
 export const DEFAULT_FILE_COUNT = 180;
 export const DEFAULT_LINES_PER_FILE = 120;
-export const DEFAULT_NOTES_PER_FILE = 2;
-
 interface LargeSplitStreamFixtureOptions {
   fileCount?: number;
   linesPerFile?: number;
-  notesPerFile?: number;
-}
-
-function createAgentAnnotations(index: number, notesPerFile: number) {
-  if (notesPerFile <= 0) {
-    return [];
-  }
-
-  return Array.from({ length: notesPerFile }, (_, noteIndex) => {
-    const startLine = 40 + noteIndex * 12;
-    const endLine = startLine + 5;
-    return {
-      id: `note:${index}:${noteIndex}`,
-      newRange: [startLine, endLine] as [number, number],
-      summary: `Explain the split-mode refactor in file ${index}, hunk note ${noteIndex + 1}.`,
-      rationale:
-        "Synthetic benchmark note to exercise inline note placement, guide rows, and note-enabled full-stream rendering.",
-    };
-  });
+  changedStartLine?: number;
+  changedEndLine?: number;
 }
 
 export function createLargeSplitDiffFile(
   index: number,
   {
     linesPerFile = DEFAULT_LINES_PER_FILE,
-    notesPerFile = 0,
+    changedStartLine = 37,
+    changedEndLine = 84,
   }: Omit<LargeSplitStreamFixtureOptions, "fileCount"> = {},
 ): DiffFile {
   const path = `src/stream${index}.ts`;
@@ -44,7 +26,7 @@ export function createLargeSplitDiffFile(
 
   const after = Array.from({ length: linesPerFile }, (_, lineIndex) => {
     const line = lineIndex + 1;
-    if (lineIndex >= 36 && lineIndex < 84) {
+    if (line >= changedStartLine && line <= changedEndLine) {
       return `export function stream${index}_${line}(value: number) { return value * ${line} + ${index}; }\n`;
     }
 
@@ -66,40 +48,40 @@ export function createLargeSplitDiffFile(
     true,
   );
 
-  const annotations = createAgentAnnotations(index, notesPerFile);
-
   return {
     id: `stream:${index}`,
     path,
     patch: "",
     language: "typescript",
-    stats: { additions: 48, deletions: 48 },
+    stats: {
+      additions: Math.max(0, Math.min(changedEndLine, linesPerFile) - changedStartLine + 1),
+      deletions: Math.max(0, Math.min(changedEndLine, linesPerFile) - changedStartLine + 1),
+    }, // counts only changed lines that exist: the range end is clamped to linesPerFile
     metadata,
-    agent:
-      annotations.length > 0
-        ? {
-            path,
-            summary: `Synthetic note-heavy benchmark context for ${path}`,
-            annotations,
-          }
-        : null,
+    agent: null,
   };
 }
 
 export function createLargeSplitStreamFiles({
   fileCount = DEFAULT_FILE_COUNT,
   linesPerFile = DEFAULT_LINES_PER_FILE,
-  notesPerFile = 0,
+  changedStartLine,
+  changedEndLine,
 }: LargeSplitStreamFixtureOptions = {}) {
   return Array.from({ length: fileCount }, (_, index) =>
-    createLargeSplitDiffFile(index + 1, { linesPerFile, notesPerFile }),
+    createLargeSplitDiffFile(index + 1, {
+      linesPerFile,
+      changedStartLine,
+      changedEndLine,
+    }),
   );
 }
 
 export function createLargeSplitStreamBootstrap({
   fileCount = DEFAULT_FILE_COUNT,
   linesPerFile = DEFAULT_LINES_PER_FILE,
-  notesPerFile = 0,
+  changedStartLine,
+  changedEndLine,
 }: LargeSplitStreamFixtureOptions = {}): AppBootstrap {
   return {
     input: {
@@ -110,13 +92,18 @@ export function createLargeSplitStreamBootstrap({
       },
     },
     changeset: {
-      id: `changeset:large-split-stream:${fileCount}:${linesPerFile}:${notesPerFile}`,
+      id: `changeset:large-split-stream:${fileCount}:${linesPerFile}`,
       sourceLabel: "repo",
       title: "repo working tree",
-      files: createLargeSplitStreamFiles({ fileCount, linesPerFile, notesPerFile }),
+      files: createLargeSplitStreamFiles({
+        fileCount,
+        linesPerFile,
+        changedStartLine,
+        changedEndLine,
+      }),
     },
     initialMode: "split",
     initialTheme: "midnight",
-    initialShowAgentNotes: notesPerFile > 0,
+    initialShowAgentNotes: false,
   };
 }
diff --git a/benchmarks/large-stream-profile.ts b/benchmarks/large-stream-profile.ts
index aa8ee4b7..fa77419a 100644
--- a/benchmarks/large-stream-profile.ts
+++ b/benchmarks/large-stream-profile.ts
@@ -9,20 +9,10 @@ import {
   createLargeSplitStreamFiles,
   DEFAULT_FILE_COUNT,
   DEFAULT_LINES_PER_FILE,
-  DEFAULT_NOTES_PER_FILE,
 } from "./large-stream-fixture";
 
 const theme = resolveTheme("midnight", null);
-const windowedFiles = createLargeSplitStreamFiles({ notesPerFile: 0 });
-const noteFiles = createLargeSplitStreamFiles({ notesPerFile: DEFAULT_NOTES_PER_FILE });
-
-function visibleAgentNotesForFile(file: (typeof noteFiles)[number]) {
-  const annotations = file.agent?.annotations ?? [];
-  return annotations.map((annotation, index) => ({
-    id: `annotation:${file.id}:${annotation.id ?? index}`,
-    annotation,
-  }));
-}
+const windowedFiles = createLargeSplitStreamFiles();
 
 function measureMs(run: () => void) {
   const start = performance.now();
@@ -43,24 +33,23 @@ const splitRowsMs = measureMs(() => {
   });
 });
 
-let notePlannedRows = 0;
-const noteReviewPlanMs = measureMs(() => {
-  noteFiles.forEach((file) => {
+let plannedRows = 0;
+const reviewPlanMs = measureMs(() => {
+  windowedFiles.forEach((file) => {
     const rows = buildSplitRows(file, null, theme);
-    notePlannedRows += buildReviewRenderPlan({
+    plannedRows += buildReviewRenderPlan({
       fileId: file.id,
       rows,
       showHunkHeaders: true,
-      visibleAgentNotes: visibleAgentNotesForFile(file),
+      visibleAgentNotes: [],
     }).length;
   });
 });
 
 console.log(`METRIC section_geometry_ms=${sectionGeometryMs.toFixed(2)}`);
 console.log(`METRIC split_rows_ms=${splitRowsMs.toFixed(2)}`);
-console.log(`METRIC note_review_plan_ms=${noteReviewPlanMs.toFixed(2)}`);
+console.log(`METRIC review_plan_ms=${reviewPlanMs.toFixed(2)}`);
 console.log(`METRIC split_rows=${windowedRows}`);
-console.log(`METRIC note_planned_rows=${notePlannedRows}`);
+console.log(`METRIC planned_rows=${plannedRows}`);
 console.log(`METRIC files=${DEFAULT_FILE_COUNT}`);
 console.log(`METRIC lines_per_file=${DEFAULT_LINES_PER_FILE}`);
-console.log(`METRIC notes_per_file=${DEFAULT_NOTES_PER_FILE}`);
diff --git a/benchmarks/large-stream.ts b/benchmarks/large-stream.ts
index 21c739c3..117cd6e3 100644
--- a/benchmarks/large-stream.ts
+++ b/benchmarks/large-stream.ts
@@ -1,5 +1,4 @@
-// Benchmark split-mode startup and scroll behaviour on very large review streams,
-// including note-enabled cases that disable the placeholder windowing path.
+// Benchmark split-mode startup and scroll behaviour on very large review streams.
 import { performance } from "perf_hooks";
 import React from "react";
 import { testRender } from "@opentui/react/test-utils";
@@ -9,7 +8,6 @@ import {
   createLargeSplitStreamBootstrap,
   DEFAULT_FILE_COUNT,
   DEFAULT_LINES_PER_FILE,
-  DEFAULT_NOTES_PER_FILE,
 } from "./large-stream-fixture";
 
 const VIEWPORT = {
@@ -67,10 +65,10 @@ async function destroyRenderer(setup: BenchmarkRenderer) {
   });
 }
 
-async function measureFirstFrameMs(notesPerFile: number) {
+async function measureFirstFrameMs() {
   const setup = await testRender(
     React.createElement(AppHost, {
-      bootstrap: createLargeSplitStreamBootstrap({ notesPerFile }),
+      bootstrap: createLargeSplitStreamBootstrap(),
     }),
     VIEWPORT,
   );
@@ -85,10 +83,10 @@ async function measureFirstFrameMs(notesPerFile: number) {
   }
 }
 
-async function measureScrollTicksMs(notesPerFile: number) {
+async function measureScrollTicksMs() {
   const setup = await testRender(
     React.createElement(AppHost, {
-      bootstrap: createLargeSplitStreamBootstrap({ notesPerFile }),
+      bootstrap: createLargeSplitStreamBootstrap(),
     }),
     VIEWPORT,
   );
@@ -112,18 +110,13 @@ async function measureScrollTicksMs(notesPerFile: number) {
   }
 }
 
-const coldFirstFrameMs = await measureFirstFrameMs(0);
-const warmFirstFrameMs = await measureFirstFrameMs(0);
-const noteFirstFrameMs = await measureFirstFrameMs(DEFAULT_NOTES_PER_FILE);
-const windowedScrollMs = await measureScrollTicksMs(0);
-const noteScrollMs = await measureScrollTicksMs(DEFAULT_NOTES_PER_FILE);
+const coldFirstFrameMs = await measureFirstFrameMs();
+const warmFirstFrameMs = await measureFirstFrameMs();
+const windowedScrollMs = await measureScrollTicksMs();
 
 console.log(`METRIC cold_first_frame_ms=${coldFirstFrameMs.toFixed(2)}`);
 console.log(`METRIC warm_first_frame_ms=${warmFirstFrameMs.toFixed(2)}`);
-console.log(`METRIC note_first_frame_ms=${noteFirstFrameMs.toFixed(2)}`);
 console.log(`METRIC windowed_scroll_ticks_ms=${windowedScrollMs.toFixed(2)}`);
-console.log(`METRIC note_scroll_ticks_ms=${noteScrollMs.toFixed(2)}`);
 console.log(`METRIC scroll_ticks=${SCROLL_TICKS}`);
 console.log(`METRIC files=${DEFAULT_FILE_COUNT}`);
 console.log(`METRIC lines_per_file=${DEFAULT_LINES_PER_FILE}`);
-console.log(`METRIC notes_per_file=${DEFAULT_NOTES_PER_FILE}`);
diff --git a/benchmarks/lib/benchmark-result.ts b/benchmarks/lib/benchmark-result.ts
new file mode 100644
index 00000000..8c1bc3fc
--- /dev/null
+++ b/benchmarks/lib/benchmark-result.ts
@@ -0,0 +1,123 @@
+export interface BenchmarkThreshold {
+  maxRegressionRatio: number; // relative limit, e.g. 1.15 for `_ms` metrics (see classifyMetric)
+  minAbsoluteRegression: number; // absolute limit in the metric's own unit, e.g. 5 for `_ms`
+}
+
+export interface BenchmarkMetricResult {
+  name: string; // aggregateMetric builds this as `${source}/${name}`
+  unit: "ms" | "bytes" | "count" | "ratio" | "boolean";
+  samples: number[]; // raw per-run values, in the order they were collected
+  median: number;
+  p75: number;
+  p95: number;
+  min: number;
+  max: number;
+  threshold?: BenchmarkThreshold; // present when classifyMetric assigns a regression limit
+  comparable: boolean;
+  informational?: boolean; // NOTE(review): not populated by aggregateMetric — confirm where set
+  source: string; // label of the emitting benchmark script
+}
+
+export interface BenchmarkRunResult {
+  version: 1;
+  generatedAt: string; // timestamp string; run.ts writes `new Date().toISOString()`
+  gitSha?: string; // run.ts leaves this undefined when `git rev-parse HEAD` fails
+  samplesPerBenchmark: number;
+  results: BenchmarkMetricResult[];
+}
+
+export interface BenchmarkComparisonRow {
+  name: string;
+  unit: BenchmarkMetricResult["unit"];
+  baseMedian: number;
+  headMedian: number;
+  absoluteDelta: number; // presumably headMedian - baseMedian — TODO confirm in compare.ts
+  relativeDelta: number; // NOTE(review): ratio vs. fraction is not visible here — confirm
+  threshold?: BenchmarkThreshold;
+  status: "pass" | "fail" | "missing-base" | "missing-head" | "informational";
+  source: string;
+}
+
+export interface BenchmarkComparisonResult {
+  version: 1;
+  generatedAt: string;
+  baseSha?: string;
+  headSha?: string;
+  failed: boolean; // presumably true when any row has status "fail" — TODO confirm in compare.ts
+  rows: BenchmarkComparisonRow[];
+}
+
+/** Nearest-rank percentile of `samples`; an empty sample list yields 0. */
+export function percentile(samples: number[], percentileValue: number) {
+  const count = samples.length;
+  if (count === 0) {
+    return 0;
+  }
+
+  const ascending = samples.slice().sort((a, b) => a - b);
+  // Nearest-rank positions are 1-based; shift to 0-based and clamp into the array.
+  const rank = Math.ceil((percentileValue / 100) * count) - 1;
+  const index = Math.min(count - 1, Math.max(0, rank));
+  return ascending[index]!;
+}
+
+/** Infer display and comparison metadata from the metric name emitted by a script. */
+export function classifyMetric(
+  name: string,
+): Pick<BenchmarkMetricResult, "unit" | "comparable" | "threshold"> {
+  if (name.endsWith("_ms")) {
+    // Competitor timings are informational only: third-party tool speed must never gate a PR.
+    if (name.startsWith("competitor_")) {
+      return { unit: "ms", comparable: false };
+    }
+    return {
+      unit: "ms",
+      comparable: true,
+      threshold: { maxRegressionRatio: 1.15, minAbsoluteRegression: 5 },
+    };
+  }
+
+  if (
+    name.startsWith("is_") ||
+    name.endsWith("_ready_before_move") ||
+    name.endsWith("_available")
+  ) {
+    return { unit: "boolean", comparable: false };
+  }
+
+  if (name.includes("rss") || name.includes("heap")) {
+    return {
+      unit: "bytes",
+      comparable: true,
+      threshold: { maxRegressionRatio: 1.2, minAbsoluteRegression: 8 * 1024 * 1024 },
+    };
+  }
+
+  if (name.endsWith("_bytes")) {
+    return { unit: "bytes", comparable: false };
+  }
+
+  return { unit: name.startsWith("competitor_") ? "ms" : "count", comparable: false };
+}
+
+/** Summarize one metric's raw samples into order statistics plus its name-based classification. */
+export function aggregateMetric(
+  source: string,
+  name: string,
+  samples: number[],
+): BenchmarkMetricResult {
+  const ascending = samples.slice().sort((a, b) => a - b);
+  const pick = (rank: number) => percentile(ascending, rank);
+
+  return {
+    name: `${source}/${name}`,
+    source,
+    samples,
+    median: pick(50),
+    p75: pick(75),
+    p95: pick(95),
+    min: ascending[0] ?? 0,
+    max: ascending[ascending.length - 1] ?? 0,
+    ...classifyMetric(name),
+  };
+}
diff --git a/benchmarks/lib/fixtures.ts b/benchmarks/lib/fixtures.ts
new file mode 100644
index 00000000..f7258e4e
--- /dev/null
+++ b/benchmarks/lib/fixtures.ts
@@ -0,0 +1,136 @@
+import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { dirname, join } from "node:path";
+import { createTwoFilesPatch } from "diff";
+
+export interface SyntheticFileOptions {
+  lines: number; // total number of lines generated per file
+  changedStart?: number; // 0-based index of the first changed line; defaults to floor(lines / 3)
+  changedLines?: number; // length of the changed run; defaults to max(4, floor(lines / 6))
+  extension?: string; // file extension without the dot; the patch/repo helpers default to "ts"
+}
+
+export interface SyntheticPatchOptions extends SyntheticFileOptions {
+  fileCount: number; // number of files to generate, numbered from 1
+  prefix?: string; // path text before the file number (createSyntheticPatch default: "src/bench")
+}
+
+export interface TemporaryDirectory {
+  path: string; // location of the directory created by mkdtempSync
+  cleanup: () => void; // recursively force-removes `path`
+}
+
+/** Make a uniquely named directory under the OS temp dir and pair it with a removal callback. */
+export function createTemporaryDirectory(prefix: string): TemporaryDirectory {
+  const directory = mkdtempSync(join(tmpdir(), prefix));
+
+  // `force` keeps cleanup from throwing when the directory is already gone.
+  const cleanup = () => rmSync(directory, { recursive: true, force: true });
+  return { path: directory, cleanup };
+}
+
+/** Execute `git <cmd>` synchronously inside `cwd`; returns stdout, throws on a non-zero exit. */
+export function git(cwd: string, ...cmd: string[]) {
+  const result = Bun.spawnSync(["git", ...cmd], {
+    cwd,
+    stdin: "ignore",
+    stdout: "pipe",
+    stderr: "pipe",
+  });
+
+  if (result.exitCode === 0) {
+    return Buffer.from(result.stdout).toString("utf8");
+  }
+  // Prefer git's own diagnostics; fall back to naming the command when stderr is empty.
+  const diagnostics = Buffer.from(result.stderr).toString("utf8").trim();
+  throw new Error(diagnostics || `git ${cmd.join(" ")} failed`);
+}
+
+/** Produce deterministic TypeScript-like file text; `changed` rewrites one contiguous run of lines. */
+export function createSyntheticSource(
+  fileIndex: number,
+  changed: boolean,
+  options: SyntheticFileOptions,
+) {
+  const { lines } = options;
+  const firstChanged = options.changedStart ?? Math.floor(lines / 3);
+  const changedCount = options.changedLines ?? Math.max(4, Math.floor(lines / 6));
+  // Exclusive end of the 0-based index window that receives the "changed" body.
+  const changedStop = firstChanged + changedCount;
+  const rows = Array.from({ length: lines }, (_, at) => {
+    const line = at + 1;
+    return changed && at >= firstChanged && at < changedStop
+      ? `export function bench${fileIndex}_${line}(value: number) { return value * ${line} + ${fileIndex}; }\n`
+      : `export function bench${fileIndex}_${line}(value: number) { return value + ${line}; }\n`;
+  });
+  return rows.join("");
+}
+
+/** Concatenate one unified diff per synthetic file into a single deterministic patch string. */
+export function createSyntheticPatch({
+  fileCount,
+  lines,
+  changedStart,
+  changedLines,
+  extension = "ts",
+  prefix = "src/bench",
+}: SyntheticPatchOptions) {
+  const sections = Array.from({ length: fileCount }, (_, offset) => {
+    const fileIndex = offset + 1;
+    const path = `${prefix}${fileIndex}.${extension}`;
+    const oldText = createSyntheticSource(fileIndex, false, { lines, changedStart, changedLines });
+    const newText = createSyntheticSource(fileIndex, true, { lines, changedStart, changedLines });
+    const raw = createTwoFilesPatch(path, path, oldText, newText, "", "", { context: 3 });
+    // Pierre's patch parser expects unified/git hunks; drop the diff package's `Index:` banner.
+    return raw.replace(/^Index: .*\n=+\n/, "").trimEnd();
+  });
+  return sections.join("\n");
+}
+
+/** Create a git repo with committed files plus unstaged edits; the caller must run cleanup(). */
+export function createChangedRepo({
+  fileCount,
+  lines,
+  changedStart,
+  changedLines,
+  extension = "ts",
+}: SyntheticPatchOptions) {
+  const fixture = createTemporaryDirectory("hunk-benchmark-repo-");
+  const sourceOptions = { lines, changedStart, changedLines };
+  // Writes every fixture file in either its committed (false) or its modified (true) form.
+  const writeAll = (changed: boolean) => {
+    for (let index = 1; index <= fileCount; index += 1) {
+      const absolutePath = join(fixture.path, "src", `bench${index}.${extension}`);
+      mkdirSync(dirname(absolutePath), { recursive: true });
+      writeFileSync(absolutePath, createSyntheticSource(index, changed, sourceOptions));
+    }
+  };
+
+  try {
+    git(fixture.path, "init");
+    git(fixture.path, "config", "user.name", "Benchmark User");
+    git(fixture.path, "config", "user.email", "benchmark@example.com");
+    // A developer's global signing setup must not fail or prompt during the fixture commit.
+    git(fixture.path, "config", "commit.gpgsign", "false");
+    writeAll(false);
+    git(fixture.path, "add", ".");
+    git(fixture.path, "commit", "-m", "initial benchmark fixture");
+    // Rewrite tracked files after the commit so the working tree carries unstaged edits.
+    writeAll(true);
+  } catch (error) {
+    // Do not leak the temporary repository when setup fails part-way.
+    fixture.cleanup();
+    throw error;
+  }
+  return fixture;
+}
+
+/** Write `fileCount` deterministic files under `<repoDir>/untracked` without adding them to git. */
+export function addUntrackedFiles(repoDir: string, fileCount: number, lines: number) {
+  const targetDir = join(repoDir, "untracked");
+  for (let n = 1; n <= fileCount; n += 1) {
+    // Created inside the loop so a zero-file request leaves the repository untouched.
+    mkdirSync(targetDir, { recursive: true });
+    writeFileSync(join(targetDir, `new${n}.ts`), createSyntheticSource(n, true, { lines }));
+  }
+}
diff --git a/benchmarks/memory.ts b/benchmarks/memory.ts
new file mode 100644
index 00000000..7bbb58d0
--- /dev/null
+++ b/benchmarks/memory.ts
@@ -0,0 +1,72 @@
+// Track heap/RSS pressure for loading, planning, rendering, and navigating a large diff.
+import { performance } from "perf_hooks";
+import React from "react";
+import { testRender } from "@opentui/react/test-utils";
+import { act } from "react";
+import { buildSplitRows } from "../src/ui/diff/pierre";
+import { buildReviewRenderPlan } from "../src/ui/diff/reviewRenderPlan";
+import { resolveTheme } from "../src/ui/themes";
+import { AppHost } from "../src/ui/AppHost";
+import { createLargeSplitStreamBootstrap } from "./large-stream-fixture";
+
+const viewport = { width: 240, height: 28 } as const;
+
+function printMemory(prefix: string) {
+  const { rss, heapUsed } = process.memoryUsage(); // one snapshot feeds both metric lines
+  console.log(`METRIC ${prefix}_rss_bytes=${rss}`);
+  console.log(`METRIC ${prefix}_heap_used_bytes=${heapUsed}`);
+}
+
+async function renderOnce(setup: Awaited<ReturnType<typeof testRender>>) {
+  await act(async () => {
+    await setup.renderOnce();
+    await Bun.sleep(0); // presumably lets work queued by the render settle inside act() — confirm
+  });
+}
+
+const bootstrapStart = performance.now();
+const bootstrap = createLargeSplitStreamBootstrap({
+  fileCount: 120, // fewer files than the fixture default of 180
+  linesPerFile: 120,
+});
+console.log(`METRIC bootstrap_fixture_ms=${(performance.now() - bootstrapStart).toFixed(2)}`);
+printMemory("after_bootstrap"); // snapshot 1: fixture built, nothing planned or rendered yet
+
+const theme = resolveTheme("midnight", null);
+let plannedRows = 0;
+const planningStart = performance.now(); // timer covers row building plus review-plan construction
+for (const file of bootstrap.changeset.files) {
+  const rows = buildSplitRows(file, null, theme);
+  plannedRows += buildReviewRenderPlan({
+    fileId: file.id,
+    rows,
+    showHunkHeaders: true,
+    visibleAgentNotes: [],
+  }).length;
+}
+console.log(`METRIC planning_ms=${(performance.now() - planningStart).toFixed(2)}`);
+console.log(`METRIC planned_rows=${plannedRows}`);
+printMemory("after_planning"); // snapshot 2
+
+const setup = await testRender(React.createElement(AppHost, { bootstrap }), viewport);
+try {
+  const firstFrameStart = performance.now(); // started after testRender resolves, so setup is excluded
+  await renderOnce(setup);
+  console.log(`METRIC first_frame_ms=${(performance.now() - firstFrameStart).toFixed(2)}`);
+  printMemory("after_first_frame"); // snapshot 3
+
+  const navigationStart = performance.now();
+  for (let index = 0; index < 6; index += 1) {
+    await act(async () => {
+      await setup.mockInput.typeText("]"); // the metric name labels this key as next-hunk navigation
+      await setup.renderOnce();
+      await Bun.sleep(0);
+    });
+  }
+  console.log(`METRIC next_hunk_navigation_ms=${(performance.now() - navigationStart).toFixed(2)}`);
+  printMemory("after_navigation"); // snapshot 4
+} finally {
+  await act(async () => {
+    setup.renderer.destroy(); // runs even when a measurement above throws
+  });
+}
diff --git a/benchmarks/render-layout.ts b/benchmarks/render-layout.ts
new file mode 100644
index 00000000..0b805b06
--- /dev/null
+++ b/benchmarks/render-layout.ts
@@ -0,0 +1,78 @@
+// Benchmark pure diff row/layout planning across split, stack, and size-shape cases.
+import { performance } from "perf_hooks";
+import { buildSplitRows, buildStackRows } from "../src/ui/diff/pierre";
+import { buildReviewRenderPlan } from "../src/ui/diff/reviewRenderPlan";
+import { measureDiffSectionGeometry } from "../src/ui/diff/diffSectionGeometry";
+import { resolveTheme } from "../src/ui/themes";
+import { createLargeSplitStreamFiles } from "./large-stream-fixture";
+
+const theme = resolveTheme("midnight", null);
+
+function measureMs(run: () => void) {
+  const startedAt = performance.now(); // `run` is timed synchronously, in milliseconds
+  run();
+  return performance.now() - startedAt;
+}
+
+function measureScenario(name: string, files: ReturnType<typeof createLargeSplitStreamFiles>) {
+  let splitRows = 0; // the three row counters are printed as non-timing metrics below
+  let stackRows = 0;
+  let plannedRows = 0;
+
+  const splitRowsMs = measureMs(() => {
+    for (const file of files) {
+      splitRows += buildSplitRows(file, null, theme).length;
+    }
+  });
+
+  const stackRowsMs = measureMs(() => {
+    for (const file of files) {
+      stackRows += buildStackRows(file, null, theme).length;
+    }
+  });
+
+  const geometryMs = measureMs(() => {
+    for (const file of files) {
+      measureDiffSectionGeometry(file, "split", true, theme); // result discarded; only the call is timed
+    }
+  });
+
+  const reviewPlanMs = measureMs(() => {
+    for (const file of files) {
+      const rows = buildSplitRows(file, null, theme); // rebuilt inside the timer, so review_plan_ms includes it
+      plannedRows += buildReviewRenderPlan({
+        fileId: file.id,
+        rows,
+        showHunkHeaders: true,
+        visibleAgentNotes: [],
+      }).length;
+    }
+  });
+
+  console.log(`METRIC ${name}_split_rows_ms=${splitRowsMs.toFixed(2)}`);
+  console.log(`METRIC ${name}_stack_rows_ms=${stackRowsMs.toFixed(2)}`);
+  console.log(`METRIC ${name}_geometry_ms=${geometryMs.toFixed(2)}`);
+  console.log(`METRIC ${name}_review_plan_ms=${reviewPlanMs.toFixed(2)}`);
+  console.log(`METRIC ${name}_files=${files.length}`);
+  console.log(`METRIC ${name}_split_rows=${splitRows}`);
+  console.log(`METRIC ${name}_stack_rows=${stackRows}`);
+  console.log(`METRIC ${name}_planned_rows=${plannedRows}`);
+}
+
+measureScenario(
+  "many_small_files",
+  createLargeSplitStreamFiles({ fileCount: 360, linesPerFile: 48 }), // default range 37-84 is cut off at line 48
+);
+measureScenario(
+  "balanced_stream",
+  createLargeSplitStreamFiles({ fileCount: 180, linesPerFile: 120 }), // same shape as the fixture defaults
+);
+measureScenario(
+  "large_single_file",
+  createLargeSplitStreamFiles({
+    fileCount: 1,
+    linesPerFile: 18_000,
+    changedStartLine: 1_000, // 1-based and inclusive, so 16,001 lines change
+    changedEndLine: 17_000,
+  }),
+);
diff --git a/benchmarks/run.ts b/benchmarks/run.ts
new file mode 100644
index 00000000..dc83d052
--- /dev/null
+++ b/benchmarks/run.ts
@@ -0,0 +1,188 @@
+#!/usr/bin/env bun
+import { mkdirSync, writeFileSync } from "node:fs";
+import { dirname, resolve } from "node:path";
+import { aggregateMetric, type BenchmarkRunResult } from "./lib/benchmark-result";
+
+const defaultScripts = [
+  "bootstrap-load.ts",
+  "working-tree-load.ts",
+  "changeset-parse.ts",
+  "render-layout.ts",
+  "highlight-prefetch.ts",
+  "large-stream.ts",
+  "large-stream-profile.ts",
+  "memory.ts",
+];
+
+interface RunOptions {
+  samples: number; // how many times each script is executed
+  out?: string; // optional path for the JSON result file
+  includeCompetitors: boolean; // when true, competitors.ts is appended to the script list
+  scripts: string[]; // explicit --script selections; empty means defaultScripts is used
+}
+
+function readArgValue(args: string[], index: number) {
+  const next = args[index + 1]; // the token that follows the flag at `index`
+  if (next) {
+    return next;
+  }
+  throw new Error(`Missing value for ${args[index]}`);
+}
+
+/**
+ * Parse runner CLI flags; throws on unknown flags, missing values, or a non-integer sample count.
+ */
+function parseArgs(args: string[]): RunOptions {
+  const options: RunOptions = {
+    samples: Number(process.env.HUNK_BENCHMARK_SAMPLES ?? 3),
+    includeCompetitors: false,
+    scripts: [],
+  };
+
+  for (let index = 0; index < args.length; index += 1) {
+    const arg = args[index]!;
+
+    switch (arg) {
+      case "--samples":
+        options.samples = Number(readArgValue(args, index));
+        index += 1; // skip the value that was just consumed
+        break;
+      case "--out":
+        options.out = readArgValue(args, index);
+        index += 1;
+        break;
+      case "--include-competitors":
+        options.includeCompetitors = true;
+        break;
+      case "--script":
+        options.scripts.push(readArgValue(args, index));
+        index += 1;
+        break;
+      default:
+        throw new Error(`Unknown benchmark runner argument: ${arg}`);
+    }
+  }
+
+  // The sampling loop counts whole runs, so a fractional value such as 2.5 would silently run
+  // two samples while reporting "2.5"; reject anything that is not a positive integer.
+  if (!Number.isInteger(options.samples) || options.samples < 1) {
+    throw new Error("--samples must be a positive integer");
+  }
+
+  return options;
+}
+
+function gitSha() {
+  const revParse = Bun.spawnSync(["git", "rev-parse", "HEAD"], {
+    stdin: "ignore",
+    stdout: "pipe",
+    stderr: "ignore",
+  });
+
+  // A failing rev-parse simply leaves the SHA undefined instead of aborting the run.
+  if (revParse.exitCode === 0) {
+    return Buffer.from(revParse.stdout).toString("utf8").trim();
+  }
+  return undefined;
+}
+
+/** Collect `METRIC name=value` lines from script output; a repeated name keeps its last value. */
+function parseMetrics(output: string) {
+  const metricPattern = /^METRIC\s+([A-Za-z0-9_.:-]+)=(-?\d+(?:\.\d+)?)$/;
+  const parsed = new Map<string, number>();
+
+  output.split(/\r?\n/).forEach((rawLine) => {
+    const [, name, value] = metricPattern.exec(rawLine.trim()) ?? [];
+    // Lines that fail the pattern leave `name` undefined and are ignored.
+    if (name !== undefined) {
+      parsed.set(name, Number(value));
+    }
+  });
+
+  return parsed;
+}
+
+async function runScript(script: string) {
+  const proc = Bun.spawn(["bun", "run", `benchmarks/${script}`], {
+    stdout: "pipe",
+    stderr: "pipe",
+    stdin: "ignore",
+    env: { ...process.env, CI: process.env.CI ?? "1" }, // CI defaults to "1" unless already set
+  });
+
+  const [stdout, stderr, exitCode] = await Promise.all([
+    new Response(proc.stdout).text(), // both pipes are read concurrently with waiting for exit
+    new Response(proc.stderr).text(),
+    proc.exited,
+  ]);
+
+  if (stderr.trim()) {
+    console.warn(stderr.trim()); // surfaced even when the script succeeds
+  }
+
+  if (exitCode !== 0) {
+    throw new Error(`${script} failed with exit code ${exitCode}\n${stderr}`);
+  }
+
+  process.stdout.write(stdout); // raw output is echoed only after a successful exit
+  return parseMetrics(stdout);
+}
+
+/** Render a number for the aggregated summary lines. */
+function formatValue(value: number) {
+  // One decimal for magnitudes of 100 or more, two decimals below that.
+  const digits = Math.abs(value) >= 100 ? 1 : 2;
+  return value.toFixed(digits);
+}
+
+const options = parseArgs(Bun.argv.slice(2));
+const scripts = options.scripts.length > 0 ? options.scripts : [...defaultScripts];
+if (options.includeCompetitors) {
+  scripts.push("competitors.ts"); // appended last, also when --script selections were given
+}
+
+const samplesByMetric = new Map<string, { source: string; metric: string; samples: number[] }>();
+
+for (const script of scripts) {
+  const source = script.replace(/\.ts$/, ""); // script name without its extension labels the metrics
+  console.log(`\n## ${source}`);
+
+  for (let sample = 1; sample <= options.samples; sample += 1) {
+    console.log(`\n# sample ${sample}/${options.samples}`);
+    const metrics = await runScript(script); // scripts run one at a time, never in parallel
+
+    for (const [metric, value] of metrics) {
+      const key = `${source}/${metric}`;
+      const entry = samplesByMetric.get(key) ?? { source, metric, samples: [] };
+      entry.samples.push(value);
+      samplesByMetric.set(key, entry);
+    }
+  }
+}
+
+const results = [...samplesByMetric.values()]
+  .map(({ source, metric, samples }) => aggregateMetric(source, metric, samples))
+  .sort((left, right) => left.name.localeCompare(right.name)); // stable name order in the output
+
+const runResult: BenchmarkRunResult = {
+  version: 1,
+  generatedAt: new Date().toISOString(),
+  gitSha: gitSha(),
+  samplesPerBenchmark: options.samples,
+  results,
+};
+
+console.log("\n## Aggregated benchmark medians");
+for (const result of results) {
+  const suffix = result.unit === "ms" ? "ms" : result.unit === "bytes" ? " bytes" : ""; // other units print bare
+  console.log(
+    `${result.name}: median=${formatValue(result.median)}${suffix} p95=${formatValue(result.p95)}${suffix}`,
+  );
+}
+
+if (options.out) {
+  const outPath = resolve(options.out);
+  mkdirSync(dirname(outPath), { recursive: true }); // create missing parent directories first
+  writeFileSync(outPath, `${JSON.stringify(runResult, null, 2)}\n`);
+  console.log(`\nWrote ${outPath}`);
+}
diff --git a/benchmarks/working-tree-load.ts b/benchmarks/working-tree-load.ts
new file mode 100644
index 00000000..8476113e
--- /dev/null
+++ b/benchmarks/working-tree-load.ts
@@ -0,0 +1,68 @@
+// Benchmark git-backed working-tree loading, including untracked file handling.
+import { performance } from "perf_hooks";
+import { loadAppBootstrap } from "../src/core/loaders";
+import { addUntrackedFiles, createChangedRepo } from "./lib/fixtures";
+
+interface Scenario {
+  name: string; // prefix for every METRIC line the scenario prints
+  fileCount: number; // tracked files that are committed and then modified
+  lines: number; // lines per tracked file
+  untrackedFiles?: number; // extra untracked files; none are added when omitted or 0
+  untrackedLines?: number; // lines per untracked file; measureScenario falls back to 40
+}
+
+const scenarios: Scenario[] = [
+  { name: "small_worktree", fileCount: 16, lines: 80 },
+  { name: "medium_worktree", fileCount: 96, lines: 180 },
+  { name: "large_worktree", fileCount: 240, lines: 220 },
+  {
+    name: "untracked_many_small",
+    fileCount: 16,
+    lines: 80,
+    untrackedFiles: 120,
+    untrackedLines: 36,
+  },
+  {
+    name: "untracked_few_large",
+    fileCount: 8,
+    lines: 80,
+    untrackedFiles: 6,
+    untrackedLines: 5_000,
+  },
+];
+
+/** Build one fixture repo, time a working-tree bootstrap load, print METRIC lines, clean up. */
+async function measureScenario(scenario: Scenario) {
+  const { name, fileCount, lines, untrackedFiles, untrackedLines } = scenario;
+  const repo = createChangedRepo({ fileCount, lines });
+
+  try {
+    if (untrackedFiles) {
+      addUntrackedFiles(repo.path, untrackedFiles, untrackedLines ?? 40);
+    }
+
+    const startedAt = performance.now();
+    const bootstrap = await loadAppBootstrap(
+      { kind: "vcs", staged: false, options: { mode: "auto" } },
+      { cwd: repo.path },
+    );
+    const elapsedMs = performance.now() - startedAt;
+    // Totals are summed after the timer stops, so they are not part of the load metric.
+    let additions = 0;
+    let deletions = 0;
+    for (const file of bootstrap.changeset.files) {
+      additions += file.stats.additions;
+      deletions += file.stats.deletions;
+    }
+    console.log(`METRIC ${name}_load_ms=${elapsedMs.toFixed(2)}`);
+    console.log(`METRIC ${name}_files=${bootstrap.changeset.files.length}`);
+    console.log(`METRIC ${name}_additions=${additions}`);
+    console.log(`METRIC ${name}_deletions=${deletions}`);
+  } finally {
+    repo.cleanup();
+  }
+}
+
+for (const scenario of scenarios) {
+  await measureScenario(scenario);
+}
diff --git a/package.json b/package.json
index c4366970..9ea41a6f 100644
--- a/package.json
+++ b/package.json
@@ -68,10 +68,18 @@
     "publish:prebuilt:npm": "bun run ./scripts/publish-prebuilt-npm.ts",
     "update:homebrew-formula": "bun run ./scripts/update-homebrew-formula.ts",
     "prepack": "bun run build:npm",
+    "bench": "bun run benchmarks/run.ts",
+    "bench:compare": "bun run benchmarks/compare.ts",
+    "bench:comment-pr": "bun run benchmarks/comment-pr.ts",
     "bench:bootstrap-load": "bun run benchmarks/bootstrap-load.ts",
+    "bench:working-tree-load": "bun run benchmarks/working-tree-load.ts",
+    "bench:changeset-parse": "bun run benchmarks/changeset-parse.ts",
+    "bench:render-layout": "bun run benchmarks/render-layout.ts",
     "bench:highlight-prefetch": "bun run benchmarks/highlight-prefetch.ts",
     "bench:large-stream": "bun run benchmarks/large-stream.ts",
     "bench:large-stream-profile": "bun run benchmarks/large-stream-profile.ts",
+    "bench:memory": "bun run benchmarks/memory.ts",
+    "bench:competitors": "bun run benchmarks/competitors.ts",
     "nix:update-lock": "nix run .#update-bun-lock"
   },
   "dependencies": {

From ca0092bb0a928782a9d8e7e1155a922c2754a541 Mon Sep 17 00:00:00 2001
From: Ben Vinegar <ben@benv.ca>
Date: Fri, 29 May 2026 19:02:13 -0400
Subject: [PATCH 2/5] perf: trim default benchmark suite

---
 .github/workflows/benchmarks.yml |  4 ++--
 benchmarks/README.md             | 12 +++++++-----
 benchmarks/large-stream.ts       |  2 +-
 benchmarks/run.ts                |  2 --
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 50820f6e..c3a9ff76 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -51,7 +51,7 @@ jobs:
       - name: Run head benchmarks
         run: |
           mkdir -p benchmark-results
-          bun run bench -- --samples "$HUNK_BENCHMARK_SAMPLES" --include-competitors --out benchmark-results/head.json \
+          bun run bench -- --samples "$HUNK_BENCHMARK_SAMPLES" --out benchmark-results/head.json \
             | tee benchmark-results/head.txt
 
       - name: Run base benchmarks
@@ -63,7 +63,7 @@ jobs:
           cp -R benchmarks ../hunk-benchmark-base/benchmarks
           cd ../hunk-benchmark-base
           bun install --frozen-lockfile
-          bun run benchmarks/run.ts --samples "$HUNK_BENCHMARK_SAMPLES" --include-competitors --out "$GITHUB_WORKSPACE/benchmark-results/base.json" \
+          bun run benchmarks/run.ts --samples "$HUNK_BENCHMARK_SAMPLES" --out "$GITHUB_WORKSPACE/benchmark-results/base.json" \
             | tee "$GITHUB_WORKSPACE/benchmark-results/base.txt"
 
       - name: Compare benchmark results
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 91595cc6..c150f735 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -7,7 +7,7 @@ Benchmark scripts, shared fixtures, and local result artifacts live here. These
 Run the full benchmark suite with one JSON result file:
 
 ```bash
-bun run bench -- --samples 3 --include-competitors --out benchmarks/results/head.json
+bun run bench -- --samples 3 --out benchmarks/results/head.json
 ```
 
 Run focused scripts while iterating:
@@ -41,9 +41,9 @@ bun run bench:compare -- \
 - `render-layout.ts` — measures pure split/stack row building, section geometry, and review-plan construction for many-small-files, balanced, and large-single-file streams.
 - `highlight-prefetch.ts` — measures selected-file highlight startup and adjacent prefetch readiness.
 - `large-stream.ts` — measures large split-stream first-frame and scroll cost.
-- `large-stream-profile.ts` — profiles the main pure planning stages behind the large split-stream benchmark.
-- `memory.ts` — records RSS/heap after fixture loading, planning, first frame, and next-hunk navigation.
-- `competitors.ts` — optional informational comparisons against `git diff --no-ext-diff`, `delta`, `difftastic`, and `diff-so-fancy` when installed.
+- `large-stream-profile.ts` — optional local profiler for the main pure planning stages behind the large split-stream benchmark.
+- `memory.ts` — optional local profiler that records RSS/heap after fixture loading, planning, first frame, and next-hunk navigation.
+- `competitors.ts` — optional local informational comparisons against `git diff --no-ext-diff`, `delta`, `difftastic`, and `diff-so-fancy` when installed.
 - `large-stream-fixture.ts` and `lib/fixtures.ts` — shared deterministic synthetic fixtures.
 
 ## Output format
@@ -84,11 +84,13 @@ Each script prints `METRIC name=value` lines. `benchmarks/run.ts` repeats script
 6. Uploads raw JSON/text artifacts.
 7. Posts or updates one PR comment with a curated key-benchmark table, always including regressions and hiding noisy supporting metrics.
 
+The default CI suite intentionally excludes optional memory profiling, pure-planning profiling, and competitor comparisons to keep PR feedback fast. Run `bun run bench -- --include-competitors` or focused scripts locally when deeper diagnostics are needed.
+
 Initial thresholds:
 
 - Time metrics (`*_ms`): fail when PR median is more than 15% slower **and** at least 5ms slower.
 - Memory metrics (`rss`/`heap`): fail when PR median is more than 20% higher **and** at least 8MiB higher.
-- Counts, fixture sizes, availability flags, and competitor metrics are informational.
+- Counts, fixture sizes, availability flags, and optional competitor metrics are informational.
 
 Competitor comparisons are intentionally non-failing because installed tool versions and feature parity vary by environment.
 
diff --git a/benchmarks/large-stream.ts b/benchmarks/large-stream.ts
index 117cd6e3..13c5d802 100644
--- a/benchmarks/large-stream.ts
+++ b/benchmarks/large-stream.ts
@@ -14,7 +14,7 @@ const VIEWPORT = {
   width: 240,
   height: 28,
 } as const;
-const SCROLL_TICKS = 18;
+const SCROLL_TICKS = 8;
 const SCROLL_TARGET = {
   x: 170,
   y: 12,
diff --git a/benchmarks/run.ts b/benchmarks/run.ts
index dc83d052..d8560fd9 100644
--- a/benchmarks/run.ts
+++ b/benchmarks/run.ts
@@ -10,8 +10,6 @@ const defaultScripts = [
   "render-layout.ts",
   "highlight-prefetch.ts",
   "large-stream.ts",
-  "large-stream-profile.ts",
-  "memory.ts",
 ];
 
 interface RunOptions {

From 16e0f801a39ce20361e9452730c962ec6398e474 Mon Sep 17 00:00:00 2001
From: Ben Vinegar <ben@benv.ca>
Date: Fri, 29 May 2026 19:05:09 -0400
Subject: [PATCH 3/5] fix: harden benchmark metadata handling

---
 benchmarks/comment-pr.ts           | 21 ++++++++++++++++++---
 benchmarks/compare.ts              |  2 +-
 benchmarks/lib/benchmark-result.ts |  9 ++++-----
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/benchmarks/comment-pr.ts b/benchmarks/comment-pr.ts
index 687afcd3..387df039 100644
--- a/benchmarks/comment-pr.ts
+++ b/benchmarks/comment-pr.ts
@@ -47,6 +47,23 @@ async function githubRequest(path: string, init: RequestInit = {}) {
   return response.status === 204 ? null : response.json();
 }
 
+/** Collect all issue comments on the pull request, requesting 100 per page until a page comes back short, so the marker lookup also sees older comments. */
+async function fetchAllComments(repository: string, pullRequestNumber: number) {
+  const comments: Array<{ id: number; body?: string }> = [];
+
+  for (let page = 1; ; page += 1) {
+    const batch = (await githubRequest(
+      `/repos/${repository}/issues/${pullRequestNumber}/comments?per_page=100&page=${page}`,
+    )) as Array<{ id: number; body?: string }>;
+
+    comments.push(...batch);
+
+    if (batch.length < 100) {
+      return comments;
+    }
+  }
+}
+
 const { bodyPath } = parseArgs(Bun.argv.slice(2));
 const repository = requireEnv("GITHUB_REPOSITORY");
 const eventPath = requireEnv("GITHUB_EVENT_PATH");
@@ -59,9 +76,7 @@ if (!pullRequestNumber) {
 }
 
 const body = readFileSync(bodyPath, "utf8");
-const comments = (await githubRequest(
-  `/repos/${repository}/issues/${pullRequestNumber}/comments?per_page=100`,
-)) as Array<{ id: number; body?: string }>;
+const comments = await fetchAllComments(repository, pullRequestNumber);
 const existing = comments.find((comment) => comment.body?.includes(marker));
 
 if (existing) {
diff --git a/benchmarks/compare.ts b/benchmarks/compare.ts
index 157c6349..f66fd45d 100644
--- a/benchmarks/compare.ts
+++ b/benchmarks/compare.ts
@@ -88,7 +88,7 @@ function compareMetric(
     status = "missing-base";
   } else if (!head) {
     status = "missing-head";
-  } else if (!metric.comparable || metric.informational || metric.name.includes("competitor_")) {
+  } else if (!metric.comparable || metric.name.includes("competitor_")) {
     status = "informational";
   } else if (
     metric.threshold &&
diff --git a/benchmarks/lib/benchmark-result.ts b/benchmarks/lib/benchmark-result.ts
index 8c1bc3fc..dc0e8a26 100644
--- a/benchmarks/lib/benchmark-result.ts
+++ b/benchmarks/lib/benchmark-result.ts
@@ -14,7 +14,6 @@ export interface BenchmarkMetricResult {
   max: number;
   threshold?: BenchmarkThreshold;
   comparable: boolean;
-  informational?: boolean;
   source: string;
 }
 
@@ -65,6 +64,10 @@ export function percentile(samples: number[], percentileValue: number) {
 export function classifyMetric(
   name: string,
 ): Pick<BenchmarkMetricResult, "unit" | "comparable" | "threshold"> {
+  if (name.startsWith("competitor_")) {
+    return { unit: "ms", comparable: false };
+  }
+
   if (name.endsWith("_ms")) {
     return {
       unit: "ms",
@@ -93,10 +96,6 @@ export function classifyMetric(
     return { unit: "bytes", comparable: false };
   }
 
-  if (name.startsWith("competitor_")) {
-    return { unit: "ms", comparable: false };
-  }
-
   return { unit: "count", comparable: false };
 }
 

From 5e816a50fc9e1813ca6bae82a9ce14e14ecb3d16 Mon Sep 17 00:00:00 2001
From: Ben Vinegar <ben@benv.ca>
Date: Fri, 29 May 2026 20:39:29 -0400
Subject: [PATCH 4/5] ci: speed up pull request benchmarks

---
 .github/workflows/benchmarks.yml | 2 +-
 benchmarks/README.md             | 2 +-
 benchmarks/large-stream.ts       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index c3a9ff76..d2cded6d 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -24,7 +24,7 @@ permissions:
 
 env:
   SKIP_INSTALL_SIMPLE_GIT_HOOKS: "1"
-  HUNK_BENCHMARK_SAMPLES: "3"
+  HUNK_BENCHMARK_SAMPLES: ${{ github.event_name == 'pull_request' && '1' || '3' }}
 
 concurrency:
   group: benchmarks-${{ github.workflow }}-${{ github.ref }}
diff --git a/benchmarks/README.md b/benchmarks/README.md
index c150f735..ec6b675f 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -84,7 +84,7 @@ Each script prints `METRIC name=value` lines. `benchmarks/run.ts` repeats script
 6. Uploads raw JSON/text artifacts.
 7. Posts or updates one PR comment with a curated key-benchmark table, always including regressions and hiding noisy supporting metrics.
 
-The default CI suite intentionally excludes optional memory profiling, pure-planning profiling, and competitor comparisons to keep PR feedback fast. Run `bun run bench -- --include-competitors` or focused scripts locally when deeper diagnostics are needed.
+The default CI suite intentionally excludes optional memory profiling, pure-planning profiling, and competitor comparisons to keep PR feedback fast. Pull requests use one sample per benchmark so the benchmark job should finish around the normal CI runtime; `main` runs keep three samples for a more stable history. Run `bun run bench -- --include-competitors` or focused scripts locally when deeper diagnostics are needed.
 
 Initial thresholds:
 
diff --git a/benchmarks/large-stream.ts b/benchmarks/large-stream.ts
index 13c5d802..3777ad30 100644
--- a/benchmarks/large-stream.ts
+++ b/benchmarks/large-stream.ts
@@ -14,7 +14,7 @@ const VIEWPORT = {
   width: 240,
   height: 28,
 } as const;
-const SCROLL_TICKS = 8;
+const SCROLL_TICKS = 4;
 const SCROLL_TARGET = {
   x: 170,
   y: 12,

From fe94ea13d9653492f864335267f1596667203f3b Mon Sep 17 00:00:00 2001
From: Ben Vinegar <ben@benv.ca>
Date: Fri, 29 May 2026 20:44:47 -0400
Subject: [PATCH 5/5] ci: make PR benchmarks non-blocking

---
 .github/workflows/benchmarks.yml | 4 ----
 benchmarks/README.md             | 4 ++--
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index d2cded6d..5aeb6e97 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -99,10 +99,6 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: bun run bench:comment-pr -- --body benchmark-results/summary.md
 
-      - name: Fail on benchmark regression
-        if: github.event_name == 'pull_request' && steps.compare.outcome == 'failure'
-        run: exit 1
-
       - name: Upload benchmark artifacts
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
diff --git a/benchmarks/README.md b/benchmarks/README.md
index ec6b675f..7027d596 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -80,11 +80,11 @@ Each script prints `METRIC name=value` lines. `benchmarks/run.ts` repeats script
 2. Checks out `origin/main` in a sibling worktree.
 3. Copies the PR benchmark harness into that base worktree so new benchmarks can compare base code during the PR that introduces them.
 4. Runs the same benchmarks on base.
-5. Compares medians and fails if comparable metrics regress beyond threshold.
+5. Compares medians and marks regressions in the PR summary without blocking the PR.
 6. Uploads raw JSON/text artifacts.
 7. Posts or updates one PR comment with a curated key-benchmark table, always including regressions and hiding noisy supporting metrics.
 
-The default CI suite intentionally excludes optional memory profiling, pure-planning profiling, and competitor comparisons to keep PR feedback fast. Pull requests use one sample per benchmark so the benchmark job should finish around the normal CI runtime; `main` runs keep three samples for a more stable history. Run `bun run bench -- --include-competitors` or focused scripts locally when deeper diagnostics are needed.
+The default CI suite intentionally excludes optional memory profiling, pure-planning profiling, and competitor comparisons to keep PR feedback fast. Pull requests use one sample per benchmark and are informational/non-blocking, so on PRs the thresholds below only mark regressions in the PR summary; `main` runs keep three samples for a more stable history. Run `bun run bench -- --include-competitors` or focused scripts locally when deeper diagnostics are needed.
 
 Initial thresholds: