chrishayuk · chrishayuk · May 4, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 25, 2026
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,9 @@
+target/
+output/
+.git/
+.claude/
+knowledge/
+experiments/
+docs/
+*.vindex
+/tmp/
diff --git a/.github/workflows/bench-regress.yml b/.github/workflows/bench-regress.yml
@@ -0,0 +1,98 @@
+# Bench regression detector — runs `make bench-check` on every PR
+# against a baseline saved on `main`. Fails the workflow if any cell
+# in the criterion bench suite regresses past Criterion's noise
+# threshold.
+#
+# Surface covered (`make bench` = `make bench-quant + bench-matmul + bench-linalg`):
+#   - `quant_matvec`: Q4_0 / Q4_K / Q4_KF / Q6_K × 3 shapes × cpu/metal
+#   - `matmul`: f32 matmul + f32_gemv (lm-head) — cpu vs metal
+#   - `linalg`: cholesky + ridge solve (cpu only)
+#
+# That's the surface where the next throughput cliff would show up
+# first. The 75 %-row drop in `q4_matvec_v4` would have shown as a 4×
+# regression at `quant_matvec_q4_0/metal/lm_head_262144` weeks before
+# goldens caught it.
+
+name: bench-regress
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+  # Manual trigger so a maintainer can re-baseline after intentional
+  # perf changes without waiting for the next merge to main.
+  workflow_dispatch: {}
+
+jobs:
+  bench:
+    # macos-14 = Apple Silicon (M1+). Required for the metal cells —
+    # without it, drop --features metal from FEATURES to skip them
+    # and run only the CPU surface on any runner.
+    runs-on: macos-14
+    timeout-minutes: 90
+
+    steps:
+      - uses: actions/checkout@v4
+
+      # Cargo deps are big and stable across PRs — separate cache.
+      - name: Cache cargo deps
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-cargo-bench-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-bench-
+
+      # Criterion baselines: write-through on main, read-only on PRs.
+      # Keyed by the run number so each main push refreshes the cache.
+      - name: Cache criterion baseline (main only)
+        if: github.ref == 'refs/heads/main'
+        uses: actions/cache@v4
+        with:
+          path: target/criterion
+          key: ${{ runner.os }}-criterion-baseline-${{ github.run_number }}
+          restore-keys: |
+            ${{ runner.os }}-criterion-baseline-
+
+      - name: Restore criterion baseline (PRs + manual off-main)
+        if: github.ref != 'refs/heads/main'
+        uses: actions/cache/restore@v4
+        with:
+          path: target/criterion
+          key: ${{ runner.os }}-criterion-baseline-
+          restore-keys: |
+            ${{ runner.os }}-criterion-baseline-
+
+      - name: Save baseline (main only)
+        if: github.ref == 'refs/heads/main'
+        run: make bench-save
+
+      - name: Check vs baseline (PRs + manual)
+        if: github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch'
+        run: |
+          # Cold cache → the script prints "no baseline found" and exits 2;
+          # treat that as neutral. Run the script behind `make bench-check`
+          # directly: make exits 2 on ANY recipe error, masking regressions.
+          set +e
+          bash scripts/bench-regress.sh check
+          rc=$?
+          set -e
+          if [ "$rc" -eq 2 ]; then
+            echo "::warning::no criterion baseline cached; skipping regression check"
+            exit 0
+          fi
+          exit "$rc"
+
+      # On regression, attach the criterion HTML report so reviewers
+      # can see the per-cell delta without re-running locally.
+      - name: Upload criterion report on failure
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: criterion-report
+          path: target/criterion/
+          retention-days: 14
diff --git a/.github/workflows/larql-models.yml b/.github/workflows/larql-models.yml
@@ -0,0 +1,68 @@
+# larql-models cross-platform CI
+#
+# Runs check + clippy + tests + bench test-mode on Linux, Windows, and macOS
+# for every change to the larql-models crate. Validates cross-platform compatibility:
+#   - Linux  (x86_64-unknown-linux-gnu)
+#   - Windows (x86_64-pc-windows-msvc) — HF cache path, mmap, path separators
+#   - macOS  (aarch64-apple-darwin)    — NEON SIMD paths
+
+name: larql-models
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'crates/larql-models/**'
+      - 'Cargo.toml'
+      - 'Cargo.lock'
+      - '.github/workflows/larql-models.yml'
+  pull_request:
+    branches: [main]
+    paths:
+      - 'crates/larql-models/**'
+      - 'Cargo.toml'
+      - 'Cargo.lock'
+      - '.github/workflows/larql-models.yml'
+  workflow_dispatch: {}
+
+jobs:
+  test:
+    name: test · ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 20
+
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-14]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install stable Rust
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          components: clippy
+
+      - name: Cache cargo registry + build artefacts
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-cargo-models-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-models-
+
+      - name: Check (all targets)
+        run: cargo check -p larql-models --all-targets
+
+      - name: Clippy (warnings as errors)
+        run: cargo clippy -p larql-models --all-targets -- -D warnings
+
+      - name: Test
+        run: cargo test -p larql-models
+
+      - name: Test benches
+        run: cargo test -p larql-models --benches
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: build release test check clean fmt lint demos
+.PHONY: build release test test-fast test-full test-integration test-models check clean fmt lint ci demos bench bench-save bench-check bench-core bench-inference bench-vindex bench-vindex-scaling bench-all coverage coverage-summary
 
 # Build
 build:
@@ -8,9 +8,31 @@ release:
 	cargo build --release -p larql-cli
 
 # Test
-test:
+#
+# Default test target is intentionally fast: no integration binaries, no
+# model-backed ignored tests. Use `test-full` for the historical full
+# workspace run, and `test-models` for real-model/vindex checks.
+test: test-fast
+
+test-fast:
+	cargo test --workspace --lib --bins
+
+test-full:
 	cargo test --workspace
 
+test-integration:
+	cargo test --workspace --tests
+
+test-models:
+	cargo test -p larql-inference --test test_arch_golden -- --ignored
+	cargo test -p larql-inference --test test_logits_goldens -- --ignored
+	cargo test -p larql-inference --test test_gemma3_smoke -- --ignored
+	cargo test -p larql-inference --test test_generate_q4k_cpu -- --ignored
+	cargo test -p larql-inference --test bench_probe_latency -- --ignored --nocapture
+	cargo test -p larql-inference --test test_llm_dispatch -- --ignored --nocapture
+	cargo test -p larql-inference --test test_constrained_dispatch -- --ignored --nocapture
+	cargo test -p larql-inference --test test_trie_dispatch -- --ignored --nocapture
+
 # Check (compile without building)
 check:
 	cargo check --workspace
@@ -26,12 +48,29 @@ lint:
 	cargo clippy --workspace --tests -- -D warnings
 
 # All quality checks
-ci: fmt-check lint test
+ci: fmt-check lint test-full
 
 # Clean
 clean:
 	cargo clean
 
+# Benchmarks
+#
+# `bench` runs the full quant_matvec suite and writes HTML reports under
+# `target/criterion/`. `bench-save` records a baseline named `main`;
+# `bench-check` re-runs and fails if any cell regresses past Criterion's
+# default noise threshold. Plug `bench-check` into CI to catch the next
+# 4× throughput cliff (the kind the q4_matvec_v4 row-drop bug caused) at
+# PR time, not at goldens-fail time weeks later.
+bench:
+	cargo bench -p larql-compute --bench quant_matvec --features metal
+
+bench-save:
+	bash scripts/bench-regress.sh save
+
+bench-check:
+	bash scripts/bench-regress.sh check
+
 # Demos
 demos:
 	cargo run --release -p larql-models --example architecture_demo
@@ -52,7 +91,43 @@ bench-core:
 bench-inference:
 	cargo run --release -p larql-inference --example bench_inference
 
-bench-all: bench-core bench-inference
+# Vindex micro-benches — synthetic, fast, safe under load.
+bench-vindex:
+	cargo bench -p larql-vindex --bench vindex_ops
+
+# Vindex production-dim scaling bench. Refuses if larql-server / router
+# are alive (they distort 1-2 GB matmuls). Run alone, on a cool host;
+# results feed PERFORMANCE.md.
+bench-vindex-scaling:
+	@if pgrep -fl 'larql-(server|router)' >/dev/null 2>&1; then \
+		echo "Refusing bench-vindex-scaling: larql daemons running. Stop them first."; \
+		pgrep -fl 'larql-(server|router)'; \
+		exit 2; \
+	fi
+	cargo bench -p larql-vindex --bench vindex_scaling
+
+bench-all: bench-core bench-inference bench-vindex
+
+# Coverage — uses cargo-llvm-cov (install with `cargo install cargo-llvm-cov`).
+# Writes an HTML report to coverage/ that can be opened in a browser.
+# Scoped to larql-vindex by default since the audit owner cares about
+# that crate; pass COVERAGE_CRATE=… to scope elsewhere.
+COVERAGE_CRATE ?= larql-vindex
+coverage:
+	@if ! command -v cargo-llvm-cov >/dev/null 2>&1; then \
+		echo "cargo-llvm-cov not installed. Install with:"; \
+		echo "  cargo install cargo-llvm-cov"; \
+		exit 1; \
+	fi
+	cargo llvm-cov --package $(COVERAGE_CRATE) --html --output-dir coverage
+	@echo "Report: coverage/html/index.html"
+
+coverage-summary:
+	@if ! command -v cargo-llvm-cov >/dev/null 2>&1; then \
+		echo "cargo-llvm-cov not installed."; \
+		exit 1; \
+	fi
+	cargo llvm-cov --package $(COVERAGE_CRATE) --summary-only
 
 # Python extension (managed via uv)
 python-setup: