modem-dev · benvinegar · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
diff --git a/.github/workflows/release-prebuilt-npm.yml b/.github/workflows/release-prebuilt-npm.yml
@@ -13,6 +13,16 @@ on:
         required: true
         default: latest
         type: string
+      allow_benchmark_regression:
+        description: Allow a manual release despite a material benchmark regression
+        required: true
+        default: false
+        type: boolean
+      benchmark_regression_reason:
+        description: Required reason when allowing a benchmark regression
+        required: false
+        default: ""
+        type: string
   push:
     tags:
       - "v*"
@@ -25,8 +35,62 @@ concurrency:
   cancel-in-progress: false
 
 jobs:
+  release-benchmark-gate:
+    name: Release benchmark gate
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Set up Bun
+        uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2.2.0
+        with:
+          bun-version: 1.3.10
+
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+
+      - name: Verify tag matches package version
+        if: github.event_name == 'push'
+        run: bun run ./scripts/check-release-version.ts "$GITHUB_REF_NAME" # runner-set env var, so the tag name is never spliced into the script text
+
+      - name: Compare release benchmark snapshot
+        id: benchmark-gate
+        continue-on-error: ${{ github.event_name == 'workflow_dispatch' && inputs.allow_benchmark_regression }}
+        run: |
+          mkdir -p dist/release
+          bun run bench:release:compare -- \
+            --out dist/release/benchmark-comparison.json \
+            --summary "$GITHUB_STEP_SUMMARY"
+
+      - name: Require benchmark override reason
+        if: ${{ steps.benchmark-gate.outcome == 'failure' && github.event_name == 'workflow_dispatch' && inputs.allow_benchmark_regression }}
+        env:
+          BENCHMARK_REGRESSION_REASON: ${{ inputs.benchmark_regression_reason }}
+        run: |
+          if [ -z "${BENCHMARK_REGRESSION_REASON//[[:space:]]/}" ]; then # whitespace-only reasons count as empty
+            echo "benchmark_regression_reason is required when allow_benchmark_regression is true." >&2
+            exit 1
+          fi
+          {
+            echo
+            echo "## Benchmark regression override"
+            echo
+            echo "Manual override reason: $BENCHMARK_REGRESSION_REASON"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload benchmark comparison
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: release-benchmark-comparison
+          path: dist/release/benchmark-comparison.json
+          if-no-files-found: ignore
+
   build-binaries:
     name: Build ${{ matrix.package_name }}
+    needs:
+      - release-benchmark-gate
     runs-on: ${{ matrix.runner }}
     strategy:
       fail-fast: false

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,8 @@ All notable user-visible changes to Hunk are documented in this file.
 
 ### Added
 
+- Added release benchmark snapshots and a release workflow gate that blocks publishing when committed benchmark results show material performance regressions.
+
 ### Changed
 
 ### Fixed

diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -22,6 +22,13 @@ Include the opt-in huge fixture tier (~1k files / 300k+ diff lines plus one ~50k
 bun run bench -- --samples 1 --include-huge --out benchmarks/results/local-with-huge.json
 ```
 
+During release prep, generate the release benchmark snapshot to commit, then compare it against the previous release snapshot:
+
+```bash
+bun run bench:release
+bun run bench:release:compare
+```
+
 Run focused scripts while iterating:
 
 ```bash

diff --git a/benchmarks/lib/benchmark-result.ts b/benchmarks/lib/benchmark-result.ts
@@ -17,10 +17,18 @@ export interface BenchmarkMetricResult {
   source: string;
 }
 
+export interface BenchmarkRuntimeInfo {
+  bunVersion?: string;
+  platform: string;
+  arch: string;
+}
+
 export interface BenchmarkRunResult {
   version: 1;
   generatedAt: string;
   gitSha?: string;
+  packageVersion?: string;
+  runtime?: BenchmarkRuntimeInfo;
   samplesPerBenchmark: number;
   results: BenchmarkMetricResult[];
 }

diff --git a/benchmarks/release/.gitkeep b/benchmarks/release/.gitkeep
diff --git a/benchmarks/release/README.md b/benchmarks/release/README.md
@@ -0,0 +1,38 @@
+# Release benchmark snapshots
+
+Committed files in this directory are the performance baselines used by the release workflow. They are intentionally versioned so a release can be audited after publishing.
+
+## Release prep
+
+Before pushing a release tag, run the benchmark suite for the version in `package.json`:
+
+```bash
+bun run bench:release
+```
+
+This writes:
+
+```text
+benchmarks/release/bench-x.y.z.json
+```
+
+Then compare it against the most recent stable release snapshot with a lower version number:
+
+```bash
+bun run bench:release:compare
+```
+
+Commit the new `bench-x.y.z.json` file with the release-prep change. The tag-triggered release workflow validates that this file exists, and it fails before publishing npm packages if the comparison finds a material regression.
+
+## Regression policy
+
+The gate compares benchmark medians and only fails on regressions that exceed both the relative and absolute thresholds embedded in the benchmark result metadata:
+
+- timing metrics: default `+15%` and at least `+5ms`
+- memory metrics: default `+20%` and at least `+8MiB`
+
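+New metrics that have no baseline in the previous snapshot are informational only; they start gating once a committed release snapshot contains them. Metrics that were comparable in the previous snapshot but are missing from the new one fail the gate, because the gate can no longer protect that measurement.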
+
+## Backfilling
+
+When adding this gate or restoring a missing baseline, check out the release tag and generate the snapshot with the same Bun version and runner class used for current release prep. Commit backfilled snapshots before relying on the release gate.
diff --git a/benchmarks/run.ts b/benchmarks/run.ts
@@ -1,4 +1,5 @@
 #!/usr/bin/env bun
+import os from "node:os";
 import { mkdirSync, writeFileSync } from "node:fs";
 import { dirname, resolve } from "node:path";
 import { aggregateMetric, type BenchmarkRunResult } from "./lib/benchmark-result";
@@ -94,6 +95,15 @@ function gitSha() {
   return Buffer.from(proc.stdout).toString("utf8").trim();
 }
 
+async function packageVersion() {
+  try {
+    const packageJson = JSON.parse(await Bun.file("package.json").text()) as { version?: string };
+    return packageJson.version;
+  } catch {
+    return undefined;
+  }
+}
+
 function parseMetrics(output: string) {
   const metrics = new Map<string, number>();
   const metricPattern = /^METRIC\s+([A-Za-z0-9_.:-]+)=(-?\d+(?:\.\d+)?)$/;
@@ -179,6 +189,12 @@ const runResult: BenchmarkRunResult = {
   version: 1,
   generatedAt: new Date().toISOString(),
   gitSha: gitSha(),
+  packageVersion: await packageVersion(),
+  runtime: {
+    bunVersion: Bun.version,
+    platform: os.platform(),
+    arch: os.arch(),
+  },
   samplesPerBenchmark: options.samples,
   results,
 };

diff --git a/package.json b/package.json
@@ -69,6 +69,8 @@
     "update:homebrew-formula": "bun run ./scripts/update-homebrew-formula.ts",
     "prepack": "bun run build:npm",
     "bench": "bun run benchmarks/run.ts",
+    "bench:release": "bun run ./scripts/run-release-benchmark.ts",
+    "bench:release:compare": "bun run ./scripts/compare-release-benchmarks.ts",
     "bench:bootstrap-load": "bun run benchmarks/bootstrap-load.ts",
     "bench:working-tree-load": "bun run benchmarks/working-tree-load.ts",
     "bench:changeset-parse": "bun run benchmarks/changeset-parse.ts",

diff --git a/scripts/compare-release-benchmarks.test.ts b/scripts/compare-release-benchmarks.test.ts
@@ -0,0 +1,163 @@
+import { mkdtempSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import { afterEach, describe, expect, test } from "bun:test";
+import type { BenchmarkMetricResult, BenchmarkRunResult } from "../benchmarks/lib/benchmark-result";
+import {
+  compareBenchmarkRuns,
+  findPreviousReleaseBenchmark,
+  formatComparisonMarkdown,
+  isMaterialRegression,
+} from "./compare-release-benchmarks";
+import { parseRunReleaseBenchmarkArgs } from "./run-release-benchmark";
+
+let tempRoot: string | undefined;
+
+function createTempReleaseDir() {
+  tempRoot = mkdtempSync(path.join(os.tmpdir(), "hunk-release-benchmarks-"));
+  const releaseDir = path.join(tempRoot, "benchmarks", "release");
+  mkdirSync(releaseDir, { recursive: true });
+  return releaseDir;
+}
+
+function metric(overrides: Partial<BenchmarkMetricResult>): BenchmarkMetricResult {
+  return {
+    name: "large-stream/cold_first_frame_ms",
+    source: "large-stream",
+    unit: "ms",
+    samples: [100, 101, 99],
+    median: 100,
+    p75: 101,
+    p95: 101,
+    min: 99,
+    max: 101,
+    comparable: true,
+    threshold: { maxRegressionRatio: 1.15, minAbsoluteRegression: 5 },
+    ...overrides,
+  };
+}
+
+function runResult(results: BenchmarkMetricResult[]): BenchmarkRunResult {
+  return {
+    version: 1,
+    generatedAt: "2026-06-13T00:00:00.000Z",
+    gitSha: "abc1234",
+    samplesPerBenchmark: 3,
+    results,
+  };
+}
+
+afterEach(() => {
+  if (tempRoot) {
+    rmSync(tempRoot, { recursive: true, force: true });
+    tempRoot = undefined;
+  }
+});
+
+describe("findPreviousReleaseBenchmark", () => {
+  test("selects the latest lower stable release benchmark", () => {
+    const releaseDir = createTempReleaseDir();
+    for (const version of ["0.14.1", "0.15.0", "0.15.3-beta.1", "0.15.3"]) {
+      writeFileSync(path.join(releaseDir, `bench-${version}.json`), "{}\n");
+    }
+
+    expect(findPreviousReleaseBenchmark("0.15.4", releaseDir)).toMatchObject({
+      version: "0.15.3",
+    });
+  });
+});
+
+describe("isMaterialRegression", () => {
+  test("requires both relative and absolute timing thresholds", () => {
+    const threshold = { maxRegressionRatio: 1.15, minAbsoluteRegression: 5 };
+
+    expect(isMaterialRegression(100, 116, threshold)).toBe(true);
+    expect(isMaterialRegression(100, 104, threshold)).toBe(false);
+    expect(isMaterialRegression(10, 12, threshold)).toBe(false);
+    expect(isMaterialRegression(100, 90, threshold)).toBe(false);
+  });
+});
+
+describe("parseRunReleaseBenchmarkArgs", () => {
+  test("keeps an explicit output path when --version appears later", async () => {
+    const outPath = path.join(os.tmpdir(), "custom-release-benchmark.json");
+
+    await expect(
+      parseRunReleaseBenchmarkArgs(["--out", outPath, "--version", "0.16.0"]),
+    ).resolves.toMatchObject({
+      version: "0.16.0",
+      out: outPath,
+    });
+  });
+});
+
+describe("compareBenchmarkRuns", () => {
+  test("fails material comparable regressions", () => {
+    const comparison = compareBenchmarkRuns(
+      runResult([metric({ median: 100 })]),
+      runResult([metric({ median: 120 })]),
+    );
+
+    expect(comparison.failed).toBe(true);
+    expect(comparison.rows[0]?.status).toBe("fail");
+  });
+
+  test("passes comparable changes inside the material threshold", () => {
+    const comparison = compareBenchmarkRuns(
+      runResult([metric({ median: 100 })]),
+      runResult([metric({ median: 110 })]),
+    );
+
+    expect(comparison.failed).toBe(false);
+    expect(comparison.rows[0]?.status).toBe("pass");
+  });
+
+  test("treats new comparable metrics as informational until a baseline exists", () => {
+    const comparison = compareBenchmarkRuns(runResult([]), runResult([metric({ median: 100 })]));
+
+    expect(comparison.failed).toBe(false);
+    expect(comparison.rows[0]?.status).toBe("missing-base");
+  });
+
+  test("fails when a previously comparable metric disappears", () => {
+    const comparison = compareBenchmarkRuns(runResult([metric({ median: 100 })]), runResult([]));
+
+    expect(comparison.failed).toBe(true);
+    expect(comparison.rows[0]?.status).toBe("missing-head");
+  });
+});
+
+describe("formatComparisonMarkdown", () => {
+  test("shows absolute threshold units", () => {
+    const comparison = compareBenchmarkRuns(
+      runResult([
+        metric({ median: 100 }),
+        metric({
+          name: "memory/rss_bytes",
+          source: "memory",
+          unit: "bytes",
+          median: 100 * 1024 * 1024,
+          threshold: { maxRegressionRatio: 1.2, minAbsoluteRegression: 8 * 1024 * 1024 },
+        }),
+      ]),
+      runResult([
+        metric({ median: 110 }),
+        metric({
+          name: "memory/rss_bytes",
+          source: "memory",
+          unit: "bytes",
+          median: 105 * 1024 * 1024,
+          threshold: { maxRegressionRatio: 1.2, minAbsoluteRegression: 8 * 1024 * 1024 },
+        }),
+      ]),
+    );
+
+    const markdown = formatComparisonMarkdown(comparison, {
+      baseLabel: "0.15.1",
+      headLabel: "0.15.2",
+    });
+
+    expect(markdown).toContain("+15% and +5.00 ms");
+    expect(markdown).toContain("+20% and +8.00 MiB");
+  });
+});