tigrbl · MichaelDecent · Apr 1, 2026 · Apr 1, 2026
diff --git a/.github/workflows/benchmark-pr.yml b/.github/workflows/benchmark-pr.yml
@@ -0,0 +1,102 @@
+name: benchmark-pr
+
+# Runs the performance matrix on pull request updates and reports the results
+# in a single PR comment that is updated in place.
+on:
+  pull_request:
+    types: [opened, synchronize]
+
+permissions:
+  pull-requests: write
+  contents: read
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install uv
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      - name: Install dependencies
+        run: uv pip install --system -e ".[certification,dev]"
+
+      # The runner's exit code is recorded instead of failing this step, so the
+      # comment and artifact steps still run; the last step turns it into a failure.
+      - name: Run benchmarks
+        id: bench
+        run: |
+          mkdir -p .perf-artifacts
+          set +e
+          # Keep stderr out of runner_output.json so warnings cannot corrupt the JSON.
+          PYTHONPATH=src python tools/run_perf_matrix.py \
+            --establish-baseline \
+            --shuffle \
+            --artifact-root .perf-artifacts \
+            > .perf-artifacts/runner_output.json \
+            2> .perf-artifacts/runner_stderr.log
+          EXIT_CODE=$?
+          set -e
+          cat .perf-artifacts/runner_stderr.log >&2
+          echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT"
+
+      - name: Format benchmark comment
+        env:
+          BENCH_EXIT_CODE: ${{ steps.bench.outputs.exit_code }}
+        run: |
+          if [ -f .perf-artifacts/summary.json ]; then
+            PYTHONPATH=src python tools/format_benchmark_comment.py \
+              .perf-artifacts/summary.json > .perf-artifacts/comment.md
+          else
+            # No summary.json was written; report that instead of failing here.
+            {
+              echo "## Benchmark Results"
+              echo
+              echo ":x: The benchmark runner exited with code ${BENCH_EXIT_CODE}"
+              echo "without writing a summary; see the uploaded artifacts."
+            } > .perf-artifacts/comment.md
+          fi
+
+      # Pull requests from forks get a read-only GITHUB_TOKEN, so commenting is
+      # only attempted when the head branch lives in this repository.
+      - name: Post or update PR comment
+        if: github.event.pull_request.head.repo.full_name == github.repository
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+        run: |
+          COMMENT_TAG="<!-- benchmark-results -->"
+          BODY=$(cat .perf-artifacts/comment.md)
+          FULL_BODY="${COMMENT_TAG}
+          ${BODY}"
+
+          # --paginate walks every page of comments; the first tagged one is reused.
+          EXISTING=$(gh api --paginate "repos/${GITHUB_REPOSITORY}/issues/${PR_NUMBER}/comments" \
+            --jq '.[] | select(.body | startswith("<!-- benchmark-results -->")) | .id' \
+            2>/dev/null | head -n 1 || true)
+
+          if [ -n "$EXISTING" ]; then
+            gh api "repos/${GITHUB_REPOSITORY}/issues/comments/${EXISTING}" \
+              -X PATCH -f body="$FULL_BODY"
+          else
+            gh api "repos/${GITHUB_REPOSITORY}/issues/${PR_NUMBER}/comments" \
+              -f body="$FULL_BODY"
+          fi
+
+      - name: Upload benchmark artifacts
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: benchmark-artifacts-${{ github.event.pull_request.number }}
+          path: .perf-artifacts/
+
+      - name: Fail if benchmarks failed
+        if: steps.bench.outputs.exit_code != '0'
+        run: exit 1
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 *.pyc
 /src/tigrcorn.egg-info
+.perf-artifacts/
diff --git a/src/tigrcorn/compat/perf_runner.py b/src/tigrcorn/compat/perf_runner.py
@@ -3,6 +3,7 @@
 import json
 import os
 import platform
+import random
 import subprocess
 import sys
 import time
@@ -67,6 +68,8 @@ class PerfRunSummary:
     passed: int
     failed: int
     profiles: list[PerfProfileResult]
+    shuffle_seed: int | None = None
+    execution_order: list[str] | None = None
 
 
 class PerfRunnerError(RuntimeError):
@@ -115,12 +118,19 @@ def run_performance_matrix(
     baseline_root: str | Path | None = None,
     profile_ids: list[str] | None = None,
     establish_baseline: bool = False,
+    shuffle: bool = False,
+    seed: int | None = None,
 ) -> PerfRunSummary:
     source_root = Path(source_root)
     matrix_file = source_root / (Path(matrix_path) if matrix_path is not None else DEFAULT_PERFORMANCE_MATRIX_PATH)
     matrix = load_performance_matrix(matrix_file)
     selected_ids = set(profile_ids or [profile.profile_id for profile in matrix.profiles])
     selected_profiles = [profile for profile in matrix.profiles if profile.profile_id in selected_ids]
+    effective_seed: int | None = None
+    if shuffle:
+        effective_seed = seed if seed is not None else random.randint(0, 2**32 - 1)
+        rng = random.Random(effective_seed)
+        rng.shuffle(selected_profiles)
     if not selected_profiles:
         raise PerfRunnerError('no performance profiles selected')
 
@@ -199,8 +209,15 @@ def run_performance_matrix(
         passed=sum(1 for result in results if result.passed),
         failed=sum(1 for result in results if not result.passed),
         profiles=results,
+        shuffle_seed=effective_seed if shuffle else None,
+        execution_order=[p.profile_id for p in selected_profiles] if shuffle else None,
     )
-    _write_run_summary(artifact_root, summary, environment, profiles=selected_profiles)
+    shuffle_meta = {
+        'enabled': True,
+        'seed': effective_seed,
+        'execution_order': [p.profile_id for p in selected_profiles],
+    } if shuffle else None
+    _write_run_summary(artifact_root, summary, environment, profiles=selected_profiles, shuffle_metadata=shuffle_meta)
     return summary
 
 
@@ -600,7 +617,7 @@ def _write_samples_csv(path: Path, samples: list[Any]) -> None:
     path.write_text('\n'.join(lines) + '\n', encoding='utf-8')
 
 
-def _write_run_summary(artifact_root: Path, summary: PerfRunSummary, environment: Mapping[str, Any], *, profiles: list[PerfProfile]) -> None:
+def _write_run_summary(artifact_root: Path, summary: PerfRunSummary, environment: Mapping[str, Any], *, profiles: list[PerfProfile], shuffle_metadata: dict[str, Any] | None = None) -> None:
     lane_counts: dict[str, int] = {}
     for profile in profiles:
         lane_counts[profile.lane] = lane_counts.get(profile.lane, 0) + 1
@@ -625,6 +642,8 @@ def _write_run_summary(artifact_root: Path, summary: PerfRunSummary, environment
         ],
         'generated_at_epoch': environment.get('generated_at_epoch'),
     }
+    if shuffle_metadata is not None:
+        payload['shuffle'] = shuffle_metadata
     (artifact_root / 'summary.json').write_text(json.dumps(_jsonable(payload), indent=2, sort_keys=True) + '\n', encoding='utf-8')
     (artifact_root / 'index.json').write_text(json.dumps(_jsonable(payload), indent=2, sort_keys=True) + '\n', encoding='utf-8')
 

diff --git a/tests/perf/test_shuffle_order.py b/tests/perf/test_shuffle_order.py
@@ -0,0 +1,82 @@
+from __future__ import annotations
+
+import json
+import tempfile
+from pathlib import Path
+
+from tigrcorn.compat.perf_runner import load_performance_matrix, run_performance_matrix
+
+ROOT = Path(__file__).resolve().parents[2]
+MATRIX_PATH = 'docs/review/performance/performance_matrix.json'
+
+
+def _default_profile_order(profile_ids: list[str]) -> list[str]:
+    """Return the requested ids in the order they appear in ``matrix.profiles``."""
+    wanted = set(profile_ids)  # built once rather than once per matrix profile
+    matrix = load_performance_matrix(ROOT / MATRIX_PATH)
+    return [p.profile_id for p in matrix.profiles if p.profile_id in wanted]
+
+
+def test_same_seed_produces_identical_order():
+    """Two shuffled runs that share a seed report the same execution order."""
+    profile_ids = ['http11_baseline', 'http11_keepalive', 'ws_http11', 'tls_handshake']
+    with tempfile.TemporaryDirectory() as tmp1, tempfile.TemporaryDirectory() as tmp2:
+        s1 = run_performance_matrix(
+            ROOT, artifact_root=Path(tmp1) / 'perf', profile_ids=profile_ids,
+            establish_baseline=True, shuffle=True, seed=12345,
+        )
+        s2 = run_performance_matrix(
+            ROOT, artifact_root=Path(tmp2) / 'perf', profile_ids=profile_ids,
+            establish_baseline=True, shuffle=True, seed=12345,
+        )
+        # None == None would satisfy the equality below, so first require that
+        # an order was recorded and that it is a permutation of the selection.
+        assert s1.execution_order is not None
+        assert sorted(s1.execution_order) == sorted(_default_profile_order(profile_ids))
+        assert s1.execution_order == s2.execution_order
+
+
+def test_shuffle_changes_order():
+    """At least one of ten fixed seeds reorders the selected profiles."""
+    profile_ids = ['http11_baseline', 'http11_keepalive', 'ws_http11', 'tls_handshake']
+    original_order = _default_profile_order(profile_ids)
+    found_different = False
+    for seed in range(10):
+        with tempfile.TemporaryDirectory() as tmp:
+            s = run_performance_matrix(
+                ROOT, artifact_root=Path(tmp) / 'perf', profile_ids=profile_ids,
+                establish_baseline=True, shuffle=True, seed=seed,
+            )
+            # A missing order must not count as "different" from the default.
+            assert s.execution_order is not None
+            if s.execution_order != original_order:
+                found_different = True
+                break
+    assert found_different, 'shuffle never produced a different order across 10 seeds'
+
+
+def test_seed_recorded_in_artifact():
+    """The seed and execution order are written to ``summary.json``."""
+    seed = 99999
+    with tempfile.TemporaryDirectory() as tmp:
+        summary = run_performance_matrix(
+            ROOT, artifact_root=Path(tmp) / 'perf', profile_ids=['http11_baseline'],
+            establish_baseline=True, shuffle=True, seed=seed,
+        )
+        assert summary.shuffle_seed == seed
+        artifact = json.loads((Path(tmp) / 'perf' / 'summary.json').read_text(encoding='utf-8'))
+        assert artifact['shuffle']['seed'] == seed
+        assert artifact['shuffle']['enabled'] is True
+        assert artifact['shuffle']['execution_order'] == [summary.profiles[0].profile_id]
+
+
+def test_no_shuffle_omits_metadata():
+    """Without shuffling, neither the summary nor the artifact carries shuffle data."""
+    with tempfile.TemporaryDirectory() as tmp:
+        summary = run_performance_matrix(
+            ROOT, artifact_root=Path(tmp) / 'perf', profile_ids=['http11_baseline'],
+            establish_baseline=True, shuffle=False,
+        )
+        assert summary.shuffle_seed is None
+        assert summary.execution_order is None
+        artifact = json.loads((Path(tmp) / 'perf' / 'summary.json').read_text(encoding='utf-8'))
+        assert 'shuffle' not in artifact
diff --git a/tools/format_benchmark_comment.py b/tools/format_benchmark_comment.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""Format benchmark artifacts into a GitHub PR comment (markdown)."""
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+# Rendered in place of a metric that the artifacts do not provide.
+_MISSING = 'n/a'
+
+
+def load_json(path: Path) -> dict:
+    """Read ``path`` as UTF-8 text and return the decoded JSON document."""
+    return json.loads(path.read_text(encoding='utf-8'))
+
+
+def fmt_throughput(value: float | None) -> str:
+    """Render a throughput with thousands separators and two decimals."""
+    return _MISSING if value is None else f'{value:,.2f}'
+
+
+def fmt_latency(value: float | None) -> str:
+    """Render a latency with three decimals."""
+    return _MISSING if value is None else f'{value:.3f}'
+
+
+def fmt_error_rate(value: float | None) -> str:
+    """Render an error rate with three decimals."""
+    return _MISSING if value is None else f'{value:.3f}'
+
+
+def _md_cell(text: object) -> str:
+    """Return ``text`` with pipes escaped and newlines flattened for a table cell."""
+    return str(text).replace('|', '\\|').replace('\r\n', ' ').replace('\n', ' ')
+
+
+def format_comment(summary: dict, artifact_root: Path) -> str:
+    """Build the markdown body of the benchmark PR comment.
+
+    ``summary`` is the decoded run-level ``summary.json``. Per-profile details
+    are read from ``artifact_root/<profile_id>/summary.json`` when that file
+    exists; otherwise the profile is listed under the ``unknown`` lane with
+    its metrics shown as ``n/a``. Keys that are absent or null fall back to
+    placeholders so an incomplete summary still renders.
+    """
+    total = summary.get('total', 0)
+    passed = summary.get('passed', 0)
+    failed = summary.get('failed', 0)
+    # ``or`` also covers keys that are present but null, which ``get`` defaults miss.
+    commit = str(summary.get('commit_hash') or 'unknown')
+    platform = summary.get('certification_platform') or 'unknown'
+    seed = (summary.get('shuffle') or {}).get('seed')
+
+    lines: list[str] = []
+    lines.append('## Benchmark Results\n')
+
+    status_icon = ':white_check_mark:' if failed == 0 else ':x:'
+    lines.append(f'**Status:** {status_icon} {passed}/{total} passed, {failed} failed  |  **Commit:** `{commit[:8]}`')
+    meta = f'**Platform:** {platform}'
+    if seed is not None:
+        meta += f'  |  **Shuffle seed:** {seed}'
+    lines.append(meta)
+    lines.append('')
+
+    # Collect per-profile data grouped by lane
+    profiles_by_lane: dict[str, list[dict]] = {}
+    failures: list[dict] = []
+
+    for entry in summary.get('profiles') or []:
+        pid = entry['profile_id']
+        profile_summary_path = artifact_root / pid / 'summary.json'
+        if profile_summary_path.exists():
+            profile_data = load_json(profile_summary_path)
+        else:
+            profile_data = {'lane': 'unknown', 'metrics': {}}
+
+        entry_passed = entry.get('passed', True)
+        profile_data['_profile_id'] = pid
+        profile_data['_failure_reasons'] = entry.get('failure_reasons') or []
+        profile_data['_passed'] = entry_passed
+
+        # A null lane would break the sort below, so it is folded into 'unknown'.
+        lane = profile_data.get('lane') or 'unknown'
+        profiles_by_lane.setdefault(lane, []).append(profile_data)
+
+        if not entry_passed:
+            failures.append(profile_data)
+
+    # Failures section
+    if failures:
+        lines.append('### Failures\n')
+        lines.append('| Profile | Reasons |')
+        lines.append('|---------|---------|')
+        for f in failures:
+            reasons = '; '.join(str(r) for r in f['_failure_reasons']) or 'unknown'
+            lines.append(f'| {_md_cell(f["_profile_id"])} | {_md_cell(reasons)} |')
+        lines.append('')
+
+    # Results by lane
+    lines.append('### Results\n')
+
+    for lane, profiles in sorted(profiles_by_lane.items()):
+        count = len(profiles)
+        noun = 'profile' if count == 1 else 'profiles'
+        lines.append(f'<details><summary>{lane} ({count} {noun})</summary>\n')
+        lines.append('| Profile | Status | Throughput (ops/s) | p99 (ms) | p99.9 (ms) | Error Rate |')
+        lines.append('|---------|--------|--------------------|----------|------------|------------|')
+
+        for p in profiles:
+            pid = _md_cell(p['_profile_id'])
+            icon = ':white_check_mark:' if p['_passed'] else ':x:'
+            # Missing metrics render as 'n/a' rather than a misleading zero.
+            m = p.get('metrics') or {}
+            throughput = fmt_throughput(m.get('throughput_ops_per_sec'))
+            p99 = fmt_latency(m.get('p99_ms'))
+            p99_9 = fmt_latency(m.get('p99_9_ms'))
+            err = fmt_error_rate(m.get('error_rate'))
+            lines.append(f'| {pid} | {icon} | {throughput} | {p99} | {p99_9} | {err} |')
+
+        lines.append('\n</details>\n')
+
+    return '\n'.join(lines)
+
+
+def main() -> int:
+    """Print the comment for the summary file named in ``sys.argv[1]``.
+
+    Returns 0 on success and 1 when the argument is missing or the file
+    does not exist.
+    """
+    if len(sys.argv) < 2:
+        print('Usage: format_benchmark_comment.py <summary.json>', file=sys.stderr)
+        return 1
+
+    summary_path = Path(sys.argv[1])
+    if not summary_path.exists():
+        print(f'Error: {summary_path} not found', file=sys.stderr)
+        return 1
+
+    artifact_root = summary_path.parent
+    summary = load_json(summary_path)
+    print(format_comment(summary, artifact_root))
+    return 0
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())
diff --git a/tools/run_perf_matrix.py b/tools/run_perf_matrix.py
@@ -33,6 +33,8 @@ def build_parser() -> argparse.ArgumentParser:
     parser.add_argument('--list-profiles', action='store_true', help='List profile ids and exit.')
     parser.add_argument('--list-lanes', action='store_true', help='List matrix lanes and their profile ids, then exit.')
     parser.add_argument('--validate', action='store_true', help='Validate an existing artifact root instead of running benchmarks.')
+    parser.add_argument('--shuffle', action='store_true', help='Randomize profile execution order.')
+    parser.add_argument('--seed', type=int, default=None, help='Random seed for reproducible shuffle (implies --shuffle).')
     return parser
 
 
@@ -71,19 +73,25 @@ def main(argv: list[str] | None = None) -> int:
         baseline_root=None if ns.establish_baseline else ns.baseline_root,
         profile_ids=ns.profiles,
         establish_baseline=ns.establish_baseline,
+        shuffle=ns.shuffle or ns.seed is not None,
+        seed=ns.seed,
     )
     lane_counts: dict[str, int] = {}
     for profile in matrix.profiles:
         lane_counts[profile.lane] = lane_counts.get(profile.lane, 0) + 1
-    print(json.dumps({
+    output = {
         'matrix_name': summary.matrix_name,
         'artifact_root': summary.artifact_root,
         'baseline_root': summary.baseline_root,
         'passed': summary.passed,
         'failed': summary.failed,
         'total': summary.total,
         'lane_counts': lane_counts,
-    }, indent=2, sort_keys=True))
+    }
+    if summary.shuffle_seed is not None:
+        output['shuffle_seed'] = summary.shuffle_seed
+        output['execution_order'] = summary.execution_order
+    print(json.dumps(output, indent=2, sort_keys=True))
     return 0 if summary.failed == 0 else 1