diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index f8f779b5..e7750e98 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -12,3 +12,30 @@ updates:
     github-actions:
       patterns:
       - '*'
+
+# Pinned ``[benchmarks]`` extra in pyproject.toml. One PR per dep bump
+# → CodSpeed CI runs and attributes any perf delta to that specific
+# bump. Keeps the cross-version ``sweep`` baseline (lockfile-pinned)
+# stable while still surfacing upstream perf changes per-PR with
+# eyes-open review. Loose ``[project.dependencies]`` (numpy, scipy, ...)
+# have no version specifier so Dependabot leaves them alone — only the
+# ``==`` pins in ``[benchmarks]`` produce PRs.
+- package-ecosystem: pip
+  directory: /
+  schedule:
+    interval: monthly
+  open-pull-requests-limit: 5
+  groups:
+    # Measurement scaffolding + CLI/notebook tooling. Perf-irrelevant —
+    # they don't move CodSpeed signal, so batching into one PR cuts
+    # review noise. Perf-relevant deps (numpy, xarray, highspy, …) stay
+    # un-grouped so each gets its own attributed CodSpeed delta.
+    benchmark-tooling:
+      patterns:
+        - pytest
+        - pytest-benchmark
+        - pytest-memray
+        - pytest-codspeed
+        - nbconvert
+        - typer
+        - plotly
diff --git a/.github/workflows/benchmark-smoke.yml b/.github/workflows/benchmark-smoke.yml
new file mode 100644
index 00000000..689eaf35
--- /dev/null
+++ b/.github/workflows/benchmark-smoke.yml
@@ -0,0 +1,39 @@
+name: Benchmark smoke
+
+# Builds every spec and fires every phase once (--benchmark-disable):
+# a "did a refactor break a spec?" check, not timing.
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ '**' ]  # '*' alone would skip base branches containing '/'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  smoke:
+    name: Benchmark smoke (quick)
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        fetch-depth: 0  # setuptools_scm
+
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v6
+      with:
+        python-version: "3.12"
+
+    - name: Install package and benchmark dependencies
+      run: |
+        python -m pip install uv
+        uv pip install --system -e ".[dev,benchmarks]"
+
+    - name: Run benchmark smoke
+      # Every (spec, size) case runs each phase exactly once, no timings.
+      run: |
+        pytest benchmarks/ --benchmark-disable -q
diff --git a/.github/workflows/codspeed-macro.yml b/.github/workflows/codspeed-macro.yml
new file mode 100644
index 00000000..2d6ae3dd
--- /dev/null
+++ b/.github/workflows/codspeed-macro.yml
@@ -0,0 +1,62 @@
+name: CodSpeed (walltime macro)
+
+# Wall-clock benchmarks on CodSpeed's dedicated bare-metal macro runners — the
+# mode that reflects the real cost of dense-vs-sparse work (cache, allocation,
+# native numpy/scipy), which instruction counting under-weights.
+#
+# Master push (updates the walltime baseline) + manual dispatch + opt-in per-PR
+# via the ``trigger:benchmark`` label. Off every *unlabelled* PR: macro-runner
+# minutes are metered (600/month free), and self-hosted bare-metal shouldn't run
+# arbitrary PR code — the label is a maintainer-controlled gate, so only apply it
+# to trusted (same-repo) PRs; once labelled, every later push re-runs it too.
+#
+# Requires the repo under a GitHub org (macro runners are org-only) with the
+# CodSpeed app connected to the repo (OIDC auth — no token secret needed).
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    types: [ labeled, synchronize ]
+    branches: [ master ]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  macro:
+    name: CodSpeed walltime (macro runner)
+    # Always on master push / dispatch; on PRs only when explicitly labelled.
+    if: >-
+      ${{ github.event_name != 'pull_request' ||
+          contains(github.event.pull_request.labels.*.name, 'trigger:benchmark') }}
+    runs-on: codspeed-macro
+    # Non-gating until the CodSpeed app is connected to the repo (OIDC auth).
+    continue-on-error: true
+    permissions:
+      contents: read   # actions/checkout
+      id-token: write  # OIDC auth with CodSpeed — no token secret
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        fetch-depth: 0  # setuptools_scm
+
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v6
+      with:
+        python-version: "3.12"
+
+    - name: Install pinned benchmark environment
+      # Pinned ``[benchmarks]`` extra so Dependabot bumps → one CodSpeed delta each.
+      run: |
+        python -m pip install uv
+        uv pip install --system -e ".[dev,benchmarks]"
+
+    - name: Run benchmarks under CodSpeed (walltime)
+      uses: CodSpeedHQ/action@v4
+      with:
+        mode: walltime
+        run: |
+          pytest benchmarks/ --codspeed
diff --git a/.github/workflows/codspeed-memory.yml b/.github/workflows/codspeed-memory.yml
new file mode 100644
index 00000000..25df1b33
--- /dev/null
+++ b/.github/workflows/codspeed-memory.yml
@@ -0,0 +1,48 @@
+name: CodSpeed (memory)
+
+# Heap-allocation tracking — the always-on signal for this sparsity/memory fork.
+# Fast (~2 min) and free on a GitHub runner, so it runs on master (baseline) and
+# every PR. A solo instrument on ubuntu: its one upload per (commit, env) never
+# clashes with the walltime run, which is a separate bare-metal environment.
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  memory:
+    name: CodSpeed memory
+    runs-on: ubuntu-latest
+    # Non-gating: informational, never blocks a merge.
+    continue-on-error: true
+    permissions:
+      contents: read   # actions/checkout
+      id-token: write  # OIDC auth with CodSpeed — no token secret
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        fetch-depth: 0  # setuptools_scm
+
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v6
+      with:
+        python-version: "3.12"
+
+    - name: Install pinned benchmark environment
+      run: |
+        python -m pip install uv
+        uv pip install --system -e ".[dev,benchmarks]"
+
+    - name: Run benchmarks under CodSpeed (memory)
+      uses: CodSpeedHQ/action@v4
+      with:
+        mode: memory
+        run: |
+          pytest benchmarks/ --codspeed
diff --git a/.gitignore b/.gitignore
index 8b369aea..7e6d63e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,10 @@ benchmark/scripts/__pycache__
 benchmark/scripts/benchmarks-pypsa-eur/__pycache__
 benchmark/scripts/leftovers/
 
+# Benchmarks (internal suite): regenerable .ipynb viewing artifacts
+benchmarks/walkthrough.ipynb
+benchmarks/.ipynb_checkpoints/
+
 # IDE
 .idea/
 
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 22ac73ce..1362cb84 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,94 +1,74 @@
 # Internal Performance Benchmarks
 
-Measures linopy's own performance (build time, LP write speed, memory usage) across problem sizes using [pytest-benchmark](https://pytest-benchmark.readthedocs.io/) and [pytest-memray](https://pytest-memray.readthedocs.io/). Use these to check whether a code change introduces a regression or improvement.
+End-to-end performance tracking for `linopy` — build → matrix generation →
+LP / netCDF (de)serialization → solver handoff → a fixed PyPSA model. Solver
+algorithm runtime is out of scope.
 
-> **Note:** The `benchmark/` directory (singular) contains *external* benchmarks comparing linopy against other modeling frameworks. This directory (`benchmarks/`) is for *internal* performance tracking only.
+The suite is a set of `pytest-benchmark` tests driven by a model registry.
+**CodSpeed** measures them in CI (walltime on dedicated runners, memory on every
+PR); locally you just run `pytest`.
 
-## Setup
+> `benchmark/` (singular) is the legacy external-framework suite.
+> `benchmarks/` (plural) is this internal suite.
 
-```bash
-pip install -e ".[benchmarks]"
-```
+## Models vs patterns
 
-## Running benchmarks
+Two kinds of benchmark spec, same harness and same phases, distinguished by
+their sweep axis:
 
-```bash
-# Quick smoke test (small sizes only)
-pytest benchmarks/ --quick
+- **Models** (`models/`, `REGISTRY`) — whole `linopy.Model`s swept over
+  `size` (axis `n`): "how does cost scale with the problem?"
+- **Patterns** (`patterns/`, `PATTERNS`) — fragments of realistic modelling
+  code (a balance constraint, a KVL contraction) swept over `severity`
+  (0–100, axis `severity`): "how does cost respond as one data shape goes
+  from benign to pathological?"
 
-# Full timing benchmarks
-pytest benchmarks/test_build.py benchmarks/test_lp_write.py benchmarks/test_matrices.py
+Both kinds build a complete `linopy.Model`, so both run the **same phases** and
+share the phase drivers (`test_build.py`, `test_matrices.py`, …) — they're just
+more `(spec, value)` rows, tagged by `axis`. There is no separate pattern
+driver. Running a pattern through `build` *and* `to_lp` shows whether a
+dense-`_term` blow-up propagates to export or collapses.
 
-# Run a specific model
-pytest benchmarks/test_build.py -k basic
-```
+Patterns target the operations where the dense-`_term` representation forces
+materialisation — `groupby().sum()` padding, sparse `@` densification — so a
+`severity` sweep draws the cost cliff. Adding either kind is one file: drop it
+in `models/` or `patterns/`, call `register(...)` / `register_pattern(...)`, and (for a model) import it in `models/__init__.py`.
 
-## Comparing timing between branches
+## Install
 
 ```bash
-# Save baseline results on master
-git checkout master
-pytest benchmarks/test_build.py --benchmark-save=master
-
-# Switch to feature branch and compare
-git checkout my-feature
-pytest benchmarks/test_build.py --benchmark-save=my-feature --benchmark-compare=0001_master
-
-# Compare saved results without re-running
-pytest-benchmark compare 0001_master 0002_my-feature --columns=median,iqr
+uv sync --extra dev --extra benchmarks
+source .venv/bin/activate
 ```
 
-Results are stored in `.benchmarks/` (gitignored).
+`pypsa` is optional — `pypsa_scigrid` and `test_pypsa_carbon_management.py`
+skip gracefully without it. To run them: `uv pip install pypsa`.
 
-## Memory benchmarks
+The `[benchmarks]` extra in `pyproject.toml` pins every direct dep that affects
+measurement (`numpy`, `scipy`, `xarray`, `pandas`, `polars`, `dask`, …) so
+run-to-run deltas reflect linopy changes, not dependency bumps.
 
-`memory.py` runs each test in a separate process with pytest-memray to get accurate per-test peak memory (including C/numpy allocations). Results are saved as JSON and can be compared across branches.
-
-By default, only the build phase (`test_build.py`) is measured. Unlike timing benchmarks where `benchmark()` isolates the measured function, memray tracks all allocations within a test — including model construction in setup. This means LP write and matrix tests would report build + phase memory combined, making the phase-specific contribution impossible to isolate. Since model construction dominates memory usage, measuring build alone gives the most actionable numbers.
+## Running
 
 ```bash
-# Save baseline on master
-git checkout master
-python benchmarks/memory.py save master
-
-# Save feature branch
-git checkout my-feature
-python benchmarks/memory.py save my-feature
-
-# Compare
-python benchmarks/memory.py compare master my-feature
-
-# Quick mode (smaller sizes, faster)
-python benchmarks/memory.py save master --quick
-
-# Measure a specific phase (includes build overhead)
-python benchmarks/memory.py save master --test-path benchmarks/test_lp_write.py
+pytest benchmarks/                       # the suite
+pytest benchmarks/ --benchmark-disable -q   # smoke: every spec builds once
+pytest benchmarks/ --pipeline            # + the opt-in end-to-end pipeline test
 ```
 
-Results are stored in `.benchmarks/memory/` (gitignored). Requires Linux or macOS (memray is not available on Windows).
-
-> **Note:** Small tests (~5 MiB) are near the import-overhead floor and may show noise of ~1 MiB between runs. Focus on larger tests for meaningful memory comparisons. Do not combine `--memray` with timing benchmarks — memray adds ~2x overhead that invalidates timing results.
-
-## Models
-
-| Model | Description | Sizes |
-|-------|-------------|-------|
-| `basic` | Dense N*N model, 2*N^2 vars/cons | 10 — 1600 |
-| `knapsack` | N binary variables, 1 constraint | 100 — 1M |
-| `expression_arithmetic` | Broadcasting, scaling, summation across dims | 10 — 1000 |
-| `sparse_network` | Ring network with mismatched bus/line coords | 10 — 1000 |
-| `pypsa_scigrid` | Real power system (requires `pypsa`) | 10 — 200 snapshots |
-
-## Phases
+Each spec declares one `sizes` (models) / `severities` (patterns) tuple — a
+small representative set, kept tight because CodSpeed measures it on every PR.
+Need a scaling curve? That's a local pytest-benchmem job, not this suite.
 
-| Phase | File | What it measures |
-|-------|------|------------------|
-| Build | `test_build.py` | Model construction (add_variables, add_constraints, add_objective) |
-| LP write | `test_lp_write.py` | Writing the model to an LP file |
-| Matrices | `test_matrices.py` | Generating sparse matrices (A, b, c, bounds) from the model |
+## CI
 
-## Adding a new model
+- **Smoke** (`benchmark-smoke.yml`) — every PR: every spec builds and every
+  phase fires once under `--benchmark-disable`. A "did a refactor break a
+  spec?" check, not timing.
+- **CodSpeed memory** (`codspeed-memory.yml`) — every PR: heap-allocation
+  tracking, informational, non-gating.
+- **CodSpeed walltime** (`codspeed-macro.yml`) — on `master` or a PR labelled
+  `trigger:benchmark`: wall-clock on dedicated bare-metal runners.
 
-1. Create `benchmarks/models/my_model.py` with a `build_my_model(n)` function and a `SIZES` list
-2. Add parametrized tests in the relevant `test_*.py` files
-3. Add a quick threshold in `conftest.py`
+Activating CodSpeed upstream needs a maintainer to connect the repo to the
+CodSpeed app (OIDC auth, no token secret); the workflows are already wired.
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index 6bf202cc..48c26ef0 100644
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -1 +1,15 @@
-"""Linopy benchmark suite — run with ``pytest benchmarks/`` (use ``--quick`` for smaller sizes)."""
+"""
+Linopy benchmark suite — run with ``pytest benchmarks/``.
+
+The model registry it drives is reusable on its own::
+
+    from benchmarks import REGISTRY
+    model = REGISTRY["basic"].build(100)
+"""
+
+# Importing the models / patterns packages triggers each module's
+# ``register(...)`` / ``register_pattern(...)`` call at import time.
+from benchmarks import models, patterns  # noqa: F401
+from benchmarks.registry import PATTERNS, REGISTRY
+
+__all__ = ["PATTERNS", "REGISTRY"]
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
index 6f9a9467..b9ef6014 100644
--- a/benchmarks/conftest.py
+++ b/benchmarks/conftest.py
@@ -1,30 +1,104 @@
-"""Benchmark configuration and shared fixtures."""
+"""Benchmark configuration and shared test helpers."""
 
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import pytest
 
-QUICK_THRESHOLD = {
-    "basic": 100,
-    "knapsack": 10_000,
-    "pypsa_scigrid": 50,
-    "expression_arithmetic": 100,
-    "sparse_network": 100,
-}
+from benchmarks.registry import iter_params, spec_param_id
+
+if TYPE_CHECKING:
+    import linopy
+    from benchmarks.registry import BenchSpec
 
+# Test modules the CodSpeed instruments measure (edit to change coverage).
+# build + the two export paths: to_lp (LP text) and to_solver (direct handoff,
+# which also exercises matrix-gen). matrices is dropped — a subset of to_solver;
+# netcdf excluded — disk I/O, noisy. All still run under the smoke job.
+CODSPEED_MODULES = (
+    "test_build",
+    "test_to_lp",
+    "test_to_solver",
+)
 
-def pytest_addoption(parser):
+
+def pytest_addoption(parser: pytest.Parser) -> None:
     parser.addoption(
-        "--quick",
+        "--pipeline",
         action="store_true",
         default=False,
-        help="Use smaller problem sizes for quick benchmarking",
+        help=(
+            "Include the opt-in end-to-end pipeline benchmark (build → matrices "
+            "→ lp in one measured region). Off by default — it re-runs the "
+            "per-phase work and includes the build."
+        ),
+    )
+
+
+def pytest_collection_modifyitems(
+    config: pytest.Config, items: list[pytest.Item]
+) -> None:
+    """
+    Deselect ``test_pipeline`` (end-to-end, opt-in) unless ``--pipeline`` is given.
+    Under ``--codspeed`` keep only ``CODSPEED_MODULES``; deselect every other module.
+    """
+    if not config.getoption("--pipeline"):
+        dropped = [i for i in items if i.path.stem == "test_pipeline"]
+        if dropped:
+            config.hook.pytest_deselected(items=dropped)
+            items[:] = [i for i in items if i.path.stem != "test_pipeline"]
+
+    if getattr(config.option, "codspeed", False):
+        deselected = [i for i in items if i.path.stem not in CODSPEED_MODULES]
+        if deselected:
+            config.hook.pytest_deselected(items=deselected)
+            items[:] = [i for i in items if i.path.stem in CODSPEED_MODULES]
+
+
+def cases(phase: str) -> pytest.MarkDecorator:
+    """Parametrize a phase driver over every ``(spec, n)`` that phase runs."""
+    params = list(iter_params(phase))  # materialised: walked twice (values + ids)
+    return pytest.mark.parametrize(
+        ("spec", "n"),
+        params,
+        ids=[spec_param_id(s.name, s.axis, v) for s, v in params],
     )
 
 
-def skip_if_quick(request, model: str, size: int):
-    """Skip large sizes when --quick is passed."""
-    if request.config.getoption("--quick"):
-        threshold = QUICK_THRESHOLD.get(model, float("inf"))
-        if size > threshold:
-            pytest.skip(f"--quick: skipping {model} size {size}")
+def require(spec: BenchSpec) -> None:
+    """Skip the current test unless every module in ``spec.requires`` imports."""
+    for mod in spec.requires:
+        pytest.importorskip(mod)
+
+
+def build_model(spec: BenchSpec, n: int) -> linopy.Model:
+    """Untimed setup: ``require(spec)`` (may skip), then return ``spec.build(n)``."""
+    require(spec)
+    return spec.build(n)
+
+
+@pytest.fixture(autouse=True)
+def _benchmem_dims(request: pytest.FixtureRequest, benchmark: object) -> None:
+    """
+    Mirror each case's ``spec``/``phase``/``axis`` into pytest-benchmark
+    ``extra_info`` as analysis dims, so a ``--benchmark-json`` run plots cleanly
+    under pytest-benchmem — which reads dims from ``params``/``extra_info`` and
+    can see neither the (unserialisable) spec param nor the phase, which lives in
+    the test-function name. The numeric ``n`` is already a clean param. No-op for
+    tests without a ``spec`` param and under CodSpeed (fixture has no ``extra_info``).
+    """
+    callspec = getattr(request.node, "callspec", None)  # None if not parametrized
+    info = getattr(benchmark, "extra_info", None)  # None under CodSpeed's fixture
+    func = getattr(request, "function", None)
+    if (
+        callspec is None
+        or info is None
+        or func is None
+        or "spec" not in callspec.params
+    ):
+        return
+    spec = callspec.params["spec"]
+    info.update(
+        spec=spec.name, phase=func.__name__.removeprefix("test_"), axis=spec.axis
+    )
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
deleted file mode 100644
index 20af4b8a..00000000
--- a/benchmarks/memory.py
+++ /dev/null
@@ -1,199 +0,0 @@
-#!/usr/bin/env python
-"""
-Measure and compare peak memory using pytest-memray.
-
-Usage:
-    # Save a baseline (on master)
-    python benchmarks/memory.py save master
-
-    # Save current branch
-    python benchmarks/memory.py save my-feature
-
-    # Compare two saved runs
-    python benchmarks/memory.py compare master my-feature
-
-    # Quick mode (smaller sizes)
-    python benchmarks/memory.py save master --quick
-
-Results are stored in .benchmarks/memory/.
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import platform
-import re
-import subprocess
-import sys
-from pathlib import Path
-
-if platform.system() == "Windows":
-    raise RuntimeError(
-        "memory.py requires pytest-memray which is not available on Windows. "
-        "Run memory benchmarks on Linux or macOS."
-    )
-
-RESULTS_DIR = Path(".benchmarks/memory")
-MEMORY_RE = re.compile(
-    r"Allocation results for (.+?) at the high watermark\s+"
-    r"📦 Total memory allocated: ([\d.]+)(MiB|KiB|GiB|B)",
-)
-# Only the build phase is measured by default. Unlike timing benchmarks (where
-# pytest-benchmark isolates the measured function), memray tracks all allocations
-# within a test — including model construction in setup. This means LP write and
-# matrix tests would report build + phase memory combined, making the phase-specific
-# contribution hard to isolate. Since model construction dominates memory usage,
-# measuring build alone gives the most accurate and actionable numbers.
-DEFAULT_TEST_PATHS = [
-    "benchmarks/test_build.py",
-]
-
-
-def _to_mib(value: float, unit: str) -> float:
-    factors = {"B": 1 / 1048576, "KiB": 1 / 1024, "MiB": 1, "GiB": 1024}
-    return value * factors[unit]
-
-
-def _collect_test_ids(test_paths: list[str], quick: bool) -> list[str]:
-    """Collect test IDs without running them."""
-    cmd = [
-        sys.executable,
-        "-m",
-        "pytest",
-        *test_paths,
-        "--collect-only",
-        "-q",
-    ]
-    if quick:
-        cmd.append("--quick")
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    return [
-        line.strip()
-        for line in result.stdout.splitlines()
-        if "::" in line and not line.startswith(("=", "-", " "))
-    ]
-
-
-def save(label: str, quick: bool = False, test_paths: list[str] | None = None) -> Path:
-    """Run each benchmark in a separate process for accurate memory measurement."""
-    if test_paths is None:
-        test_paths = DEFAULT_TEST_PATHS
-    test_ids = _collect_test_ids(test_paths, quick)
-    if not test_ids:
-        print("No tests collected.", file=sys.stderr)
-        sys.exit(1)
-
-    print(f"Running {len(test_ids)} tests (each in a separate process)...")
-    entries = {}
-    for i, test_id in enumerate(test_ids, 1):
-        short = test_id.split("::")[-1]
-        print(f"  [{i}/{len(test_ids)}] {short}...", end=" ", flush=True)
-
-        cmd = [
-            sys.executable,
-            "-m",
-            "pytest",
-            test_id,
-            "--memray",
-            "--benchmark-disable",
-            "-v",
-            "--tb=short",
-            "-q",
-        ]
-        result = subprocess.run(cmd, capture_output=True, text=True)
-        output = result.stdout + result.stderr
-
-        match = MEMORY_RE.search(output)
-        if match:
-            value = float(match.group(2))
-            unit = match.group(3)
-            mib = round(_to_mib(value, unit), 3)
-            entries[test_id] = mib
-            print(f"{mib:.1f} MiB")
-        elif "SKIPPED" in output or "skipped" in output:
-            print("skipped")
-        else:
-            print(
-                "WARNING: no memray data (pytest-memray output format may have changed)",
-                file=sys.stderr,
-            )
-
-    if not entries:
-        print("No memray results found. Is pytest-memray installed?", file=sys.stderr)
-        sys.exit(1)
-
-    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
-    out_path = RESULTS_DIR / f"{label}.json"
-    out_path.write_text(json.dumps({"label": label, "peak_mib": entries}, indent=2))
-    print(f"\nSaved {len(entries)} results to {out_path}")
-    return out_path
-
-
-def compare(label_a: str, label_b: str) -> None:
-    """Compare two saved memory results."""
-    path_a = RESULTS_DIR / f"{label_a}.json"
-    path_b = RESULTS_DIR / f"{label_b}.json"
-    for p in (path_a, path_b):
-        if not p.exists():
-            print(f"Not found: {p}. Run 'save {p.stem}' first.", file=sys.stderr)
-            sys.exit(1)
-
-    data_a = json.loads(path_a.read_text())["peak_mib"]
-    data_b = json.loads(path_b.read_text())["peak_mib"]
-
-    all_tests = sorted(set(data_a) | set(data_b))
-
-    print(f"\n{'Test':<60} {label_a:>10} {label_b:>10} {'Change':>10}")
-    print("-" * 94)
-
-    for test in all_tests:
-        a = data_a.get(test)
-        b = data_b.get(test)
-        a_str = f"{a:.1f}" if a is not None else "—"
-        b_str = f"{b:.1f}" if b is not None else "—"
-        if a is not None and b is not None and a > 0:
-            pct = (b - a) / a * 100
-            change = f"{pct:+.1f}%"
-        else:
-            change = "—"
-        # Shorten test name for readability
-        short = test.split("::")[-1] if "::" in test else test
-        print(f"{short:<60} {a_str:>10} {b_str:>10} {change:>10}")
-
-    print()
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
-    )
-    sub = parser.add_subparsers(dest="cmd", required=True)
-
-    p_save = sub.add_parser("save", help="Run benchmarks and save memory results")
-    p_save.add_argument(
-        "label", help="Label for this run (e.g. 'master', 'my-feature')"
-    )
-    p_save.add_argument(
-        "--quick", action="store_true", help="Use smaller problem sizes"
-    )
-    p_save.add_argument(
-        "--test-path",
-        nargs="+",
-        default=None,
-        help="Test file(s) to run (default: all phases)",
-    )
-
-    p_cmp = sub.add_parser("compare", help="Compare two saved runs")
-    p_cmp.add_argument("label_a", help="First run label (baseline)")
-    p_cmp.add_argument("label_b", help="Second run label")
-
-    args = parser.parse_args()
-    if args.cmd == "save":
-        save(args.label, quick=args.quick, test_paths=args.test_path)
-    elif args.cmd == "compare":
-        compare(args.label_a, args.label_b)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmarks/models/__init__.py b/benchmarks/models/__init__.py
index fcff9caf..66c9a7c7 100644
--- a/benchmarks/models/__init__.py
+++ b/benchmarks/models/__init__.py
@@ -1,21 +1,25 @@
-"""Model builders for benchmarks."""
+"""
+Model builders for benchmarks.
 
-from benchmarks.models.basic import SIZES as BASIC_SIZES
-from benchmarks.models.basic import build_basic
-from benchmarks.models.expression_arithmetic import SIZES as EXPR_SIZES
-from benchmarks.models.expression_arithmetic import build_expression_arithmetic
-from benchmarks.models.knapsack import SIZES as KNAPSACK_SIZES
-from benchmarks.models.knapsack import build_knapsack
-from benchmarks.models.sparse_network import SIZES as SPARSE_SIZES
-from benchmarks.models.sparse_network import build_sparse_network
+Importing this package triggers every submodule's ``register(...)`` call,
+populating :data:`benchmarks.registry.REGISTRY`. Each submodule exposes a
+``build_<name>(size) -> linopy.Model`` callable and a module-level ``SPEC``
+:class:`~benchmarks.registry.BenchSpec`. The documented access path is
+``REGISTRY["<name>"]``; submodule re-exports are intentionally not exposed
+here so that adding a new model is one new file plus one import below.
+"""
 
-__all__ = [
-    "BASIC_SIZES",
-    "EXPR_SIZES",
-    "KNAPSACK_SIZES",
-    "SPARSE_SIZES",
-    "build_basic",
-    "build_expression_arithmetic",
-    "build_knapsack",
-    "build_sparse_network",
-]
+# Side-effect imports — each module calls ``register(...)`` at import time.
+from benchmarks.models import (  # noqa: F401
+    basic,
+    expression_arithmetic,
+    knapsack,
+    masked,
+    milp,
+    piecewise,
+    pypsa_scigrid,
+    qp,
+    sos,
+    sparse_network,
+    storage,
+)
diff --git a/benchmarks/models/basic.py b/benchmarks/models/basic.py
index 2aea49d9..554ad05e 100644
--- a/benchmarks/models/basic.py
+++ b/benchmarks/models/basic.py
@@ -1,10 +1,11 @@
-"""Basic benchmark model: 2*N^2 variables and constraints."""
+"""Basic benchmark model: 2*N^2 variables and constraints (continuous LP)."""
 
 from __future__ import annotations
 
 import linopy
+from benchmarks.registry import BenchSpec, register
 
-SIZES = [10, 50, 100, 250, 500, 1000, 1600]
+SIZES = (10, 250)
 
 
 def build_basic(n: int) -> linopy.Model:
@@ -16,3 +17,12 @@ def build_basic(n: int) -> linopy.Model:
     m.add_constraints(x - y >= -5, name="lower")
     m.add_objective(x.sum() + 2 * y.sum())
     return m
+
+
+SPEC = register(
+    BenchSpec(
+        name="basic",
+        build=build_basic,
+        sweep=SIZES,
+    )
+)
diff --git a/benchmarks/models/expression_arithmetic.py b/benchmarks/models/expression_arithmetic.py
index 339c651d..0d5af581 100644
--- a/benchmarks/models/expression_arithmetic.py
+++ b/benchmarks/models/expression_arithmetic.py
@@ -5,8 +5,9 @@
 import numpy as np
 
 import linopy
+from benchmarks.registry import BenchSpec, register
 
-SIZES = [10, 50, 100, 250, 500, 1000]
+SIZES = (10, 250)
 
 
 def build_expression_arithmetic(n: int) -> linopy.Model:
@@ -28,3 +29,12 @@ def build_expression_arithmetic(n: int) -> linopy.Model:
     m.add_constraints(expr1.sum("j") >= -10, name="row_sum")
     m.add_objective(combined.sum())
     return m
+
+
+SPEC = register(
+    BenchSpec(
+        name="expression_arithmetic",
+        build=build_expression_arithmetic,
+        sweep=SIZES,
+    )
+)
diff --git a/benchmarks/models/knapsack.py b/benchmarks/models/knapsack.py
index 83ce7394..fe01ad8b 100644
--- a/benchmarks/models/knapsack.py
+++ b/benchmarks/models/knapsack.py
@@ -1,12 +1,13 @@
-"""Knapsack benchmark model: N binary variables, 1 constraint."""
+"""Knapsack benchmark model: N binary variables, 1 constraint (MILP, binary)."""
 
 from __future__ import annotations
 
 import numpy as np
 
 import linopy
+from benchmarks.registry import DEFAULT_PHASES, BenchSpec, register
 
-SIZES = [100, 1_000, 10_000, 100_000, 1_000_000]
+SIZES = (100, 10_000)
 
 
 def build_knapsack(n: int) -> linopy.Model:
@@ -21,3 +22,13 @@ def build_knapsack(n: int) -> linopy.Model:
     m.add_constraints((x * weights).sum() <= capacity, name="capacity")
     m.add_objective(-(x * values).sum())
     return m
+
+
+SPEC = register(
+    BenchSpec(
+        name="knapsack",
+        build=build_knapsack,
+        sweep=SIZES,
+        phases=DEFAULT_PHASES,  # HiGHS handles binary; matrices handles MILP
+    )
+)
diff --git a/benchmarks/models/masked.py b/benchmarks/models/masked.py
new file mode 100644
index 00000000..eb9255fb
--- /dev/null
+++ b/benchmarks/models/masked.py
@@ -0,0 +1,86 @@
+"""
+Masked-variables benchmark: transportation with sparse allowed routes.
+
+A standard transportation LP, but only a sparse subset of (origin, dest) pairs
+are valid routes. The ``mask=`` keyword on ``add_variables`` skips the rest,
+keeping the variable count sub-quadratic.
+
+Decision variables:
+    x[origin, dest] >= 0   continuous, only created for allowed routes
+
+Constraints:
+    sum_dest x[o, .]   <= supply[o]
+    sum_orig x[., d]   == demand[d]
+
+Objective:
+    minimize  sum cost[o, d] * x[o, d]
+
+The mask is dense at small sizes and sparser at large sizes, mimicking
+real-world transport networks where each origin only serves a fixed
+fan-out regardless of total node count.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import xarray as xr
+
+import linopy
+from benchmarks.registry import (
+    DEFAULT_PHASES,
+    BenchSpec,
+    register,
+)
+
+SIZES = (10, 100)
+
+
+def build_masked(n: int) -> linopy.Model:
+    rng = np.random.default_rng(42)
+    origins = np.arange(n)
+    dests = np.arange(n)
+
+    # Each origin serves exactly min(20, n) distinct destinations.
+    fan_out = min(20, n)
+    mask_np = np.zeros((n, n), dtype=bool)
+    for o in range(n):
+        # Seeded draw without replacement: same n, same route mask.
+        targets = rng.choice(n, size=fan_out, replace=False)
+        mask_np[o, targets] = True
+
+    mask = xr.DataArray(mask_np, coords=[("origin", origins), ("dest", dests)])
+    cost = xr.DataArray(
+        rng.uniform(1, 10, size=(n, n)),
+        coords=[("origin", origins), ("dest", dests)],
+    )
+
+    # Supply scaled so the problem stays feasible at any size: each origin
+    # can ship ``demand_per_dest * n`` units, i.e. the whole network's demand.
+    demand_per_dest = 5.0
+    supply_per_origin = demand_per_dest * n  # plenty of slack
+    supply = xr.DataArray(np.full(n, supply_per_origin), coords=[("origin", origins)])
+    demand = xr.DataArray(np.full(n, demand_per_dest), coords=[("dest", dests)])
+
+    m = linopy.Model()
+    x = m.add_variables(
+        lower=0,
+        coords=[("origin", origins), ("dest", dests)],
+        mask=mask,
+        name="x",
+    )
+
+    m.add_constraints(x.sum("dest") <= supply, name="supply", mask=mask.any("dest"))
+    m.add_constraints(x.sum("origin") == demand, name="demand", mask=mask.any("origin"))
+
+    m.add_objective((cost * x).sum())
+    return m
+
+
+SPEC = register(
+    BenchSpec(
+        name="masked",
+        build=build_masked,
+        sweep=SIZES,
+        phases=DEFAULT_PHASES,
+    )
+)
diff --git a/benchmarks/models/milp.py b/benchmarks/models/milp.py
new file mode 100644
index 00000000..f6058cc8
--- /dev/null
+++ b/benchmarks/models/milp.py
@@ -0,0 +1,75 @@
+"""
+MILP benchmark: capacitated facility location with general integers.
+
+Decision variables:
+    y_f  in {0,1,...,K}      integer "modules" to open at facility f
+    x_{f,c} >= 0             continuous flow from facility f to customer c
+
+Constraints:
+    sum_c x_{f,c}  <=  cap * y_f       (capacity per facility)
+    sum_f x_{f,c}  ==  d_c             (demand at each customer)
+
+Objective:
+    minimize  sum_{f,c} t_{f,c} * x_{f,c}  +  sum_f f_f * y_f
+
+The general-integer ``y`` exercises the matrix accessor's MIP integer-section
+path and the LP-writer's general-integer block — neither the binary knapsack
+nor the continuous LPs hit those paths.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+import linopy
+from benchmarks.registry import (
+    DEFAULT_PHASES,
+    BenchSpec,
+    register,
+)
+
+SIZES = (10, 50)
+
+
+def build_milp(n: int) -> linopy.Model:
+    rng = np.random.default_rng(42)  # fixed seed — same data on every build
+    facilities = np.arange(n)
+    customers = np.arange(n)
+
+    cap = 100.0  # capacity per module
+    Y_MAX = 5  # max modules per facility
+    transport = rng.uniform(1, 20, size=(n, n))  # per-unit shipping cost
+    fixed = rng.uniform(50, 200, size=n)  # cost per facility module
+    demand = rng.uniform(20, 80, size=n)  # demand at each customer
+
+    m = linopy.Model()
+    y = m.add_variables(
+        lower=0,
+        upper=Y_MAX,
+        coords=[facilities],
+        dims=["facility"],
+        integer=True,  # general integer in [0, Y_MAX], not binary
+        name="y",
+    )
+    x = m.add_variables(
+        lower=0,
+        coords=[facilities, customers],
+        dims=["facility", "customer"],
+        name="x",
+    )
+
+    m.add_constraints(x.sum("customer") - cap * y <= 0, name="capacity")
+    m.add_constraints(x.sum("facility") == demand, name="demand")
+
+    m.add_objective((transport * x).sum() + (fixed * y).sum())
+    return m
+
+
+SPEC = register(
+    BenchSpec(
+        name="milp",
+        build=build_milp,
+        sweep=SIZES,
+        phases=DEFAULT_PHASES,
+    )
+)
diff --git a/benchmarks/models/piecewise.py b/benchmarks/models/piecewise.py
new file mode 100644
index 00000000..895e854a
--- /dev/null
+++ b/benchmarks/models/piecewise.py
@@ -0,0 +1,89 @@
+"""
+Piecewise-linear benchmark: generation with piecewise fuel-cost curves.
+
+Each generator has a piecewise fuel cost curve pinned via
+``add_piecewise_formulation``. The default ``method="auto"`` picks an
+SOS2 or incremental expansion, generating auxiliary variables and
+constraints — that overhead is what we want to measure.
+
+Decision variables:
+    power[gen]  in [0, 100]      (continuous)
+    fuel[gen]   in [0, inf)      (continuous, pinned to piecewise curve)
+
+Constraints:
+    sum_gen power[gen]  >=  demand
+    piecewise:  fuel[gen] = f(power[gen])    for each gen
+
+Objective:
+    minimize  sum_gen fuel[gen]
+"""
+
+from __future__ import annotations
+
+import warnings
+
+import linopy
+from benchmarks.registry import (
+    DEFAULT_PHASES,
+    BenchSpec,
+    register,
+)
+
+SIZES = (10, 1_000)
+
+_API_AVAILABLE = hasattr(linopy.Model, "add_piecewise_formulation") and hasattr(
+    linopy, "EvolvingAPIWarning"
+)
+
+
+def build_piecewise(n_gens: int) -> linopy.Model:
+    # Shared breakpoints, broadcast across generators.
+    x_pts = [0.0, 30.0, 60.0, 100.0]
+    y_pts = [0.0, 36.0, 84.0, 170.0]  # convex: segment slopes 1.2, 1.6, 2.15
+
+    m = linopy.Model()
+    power = m.add_variables(
+        lower=0,
+        upper=100,
+        coords=[range(n_gens)],
+        dims=["gen"],
+        name="power",
+    )
+    fuel = m.add_variables(
+        lower=0,
+        coords=[range(n_gens)],
+        dims=["gen"],
+        name="fuel",
+    )
+
+    demand = 0.5 * n_gens * x_pts[-1]  # half of the fleet's maximum output
+    m.add_constraints(power.sum() >= demand, name="demand")
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=linopy.EvolvingAPIWarning)
+        m.add_piecewise_formulation(
+            (power, x_pts),
+            (fuel, y_pts),
+        )
+
+    m.add_objective(fuel.sum())
+    return m
+
+
+# ``add_piecewise_formulation`` is a recent (still-evolving) API. Skip
+# registration silently on older linopy so the rest of the suite stays usable.
+SPEC: BenchSpec | None
+if _API_AVAILABLE:
+    SPEC = register(
+        BenchSpec(
+            name="piecewise",
+            build=build_piecewise,
+            sweep=SIZES,
+            # Monotonic breakpoints + ``method="auto"`` → incremental
+            # reformulation (pure MILP with binaries), which every supported
+            # solver handles.
+            phases=DEFAULT_PHASES,
+        )
+    )
+else:
+    SPEC = None
diff --git a/benchmarks/models/pypsa_scigrid.py b/benchmarks/models/pypsa_scigrid.py
index 2fcce217..bb6e8653 100644
--- a/benchmarks/models/pypsa_scigrid.py
+++ b/benchmarks/models/pypsa_scigrid.py
@@ -1,20 +1,36 @@
-"""PyPSA SciGrid-DE benchmark model."""
+"""PyPSA SciGrid-DE benchmark model (requires pypsa)."""
 
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
+from benchmarks.registry import BenchSpec, register
+
 if TYPE_CHECKING:
     import linopy
 
-SIZES = [10, 50, 100, 200]
+SIZES = (10, 50)  # small networks — PyPSA import already dominates the cost
 
 
 def build_pypsa_scigrid(snapshots: int = 100) -> linopy.Model:
     """Build PyPSA SciGrid model. Requires pypsa to be installed."""
     import pypsa
+    import pytest
 
-    n = pypsa.examples.scigrid_de()
+    try:
+        n = pypsa.examples.scigrid_de()
+    except Exception as exc:  # network / example-data drift, not a linopy signal
+        pytest.skip(f"pypsa example data unavailable: {exc}")
     n.set_snapshots(n.snapshots[:snapshots])
-    n.optimize.create_model()
+    n.optimize.create_model()  # the linopy build under benchmark — unguarded
     return n.model
+
+
+SPEC = register(
+    BenchSpec(
+        name="pypsa_scigrid",
+        build=build_pypsa_scigrid,
+        sweep=SIZES,
+        requires=("pypsa",),
+    )
+)
diff --git a/benchmarks/models/qp.py b/benchmarks/models/qp.py
new file mode 100644
index 00000000..50e39e7b
--- /dev/null
+++ b/benchmarks/models/qp.py
@@ -0,0 +1,61 @@
+"""
+QP benchmark: continuous quadratic objective on a portfolio-style model.
+
+Decision variables:
+    x_i  >= 0   (weight on asset i, continuous)
+
+Constraints:
+    sum_i x_i  == 1
+    x_i        <= 0.3        (30% per-asset cap; set as the variable's upper bound)
+
+Objective:
+    minimize  sum_i q_i * x_i^2  -  sum_i r_i * x_i
+
+A pure diagonal quadratic — enough to exercise the QP build / write / matrix
+paths without paying for cross-terms. Cross-term coupling needs single-term
+factors on both sides (see ``LinearExpression._multiply_by_linear_expression``),
+which is awkward to set up cleanly via the public API.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+import linopy
+from benchmarks.registry import (
+    DEFAULT_PHASES,
+    BenchSpec,
+    register,
+)
+
+SIZES = (10, 1_000)
+
+
+def build_qp(n_assets: int) -> linopy.Model:
+    rng = np.random.default_rng(42)  # fixed seed — same data on every build
+    q = rng.uniform(0.5, 2.0, size=n_assets)
+    r = rng.uniform(0.05, 0.15, size=n_assets)
+
+    m = linopy.Model()
+    x = m.add_variables(
+        lower=0,
+        upper=0.3,  # per-asset cap, expressed as a bound (no constraint row)
+        coords=[range(n_assets)],
+        dims=["asset"],
+        name="x",
+    )
+
+    m.add_constraints(x.sum() == 1, name="budget")  # feasible for n_assets >= 4
+
+    m.add_objective((q * x**2).sum() - (r * x).sum())
+    return m
+
+
+SPEC = register(
+    BenchSpec(
+        name="qp",
+        build=build_qp,
+        sweep=SIZES,
+        phases=DEFAULT_PHASES,
+    )
+)
diff --git a/benchmarks/models/sos.py b/benchmarks/models/sos.py
new file mode 100644
index 00000000..3c1e2db8
--- /dev/null
+++ b/benchmarks/models/sos.py
@@ -0,0 +1,96 @@
+"""
+SOS1 benchmark: multi-mode generation with at-most-one-mode-per-generator.
+
+Each generator has ``n_modes`` operating modes (different cap/cost tradeoff).
+SOS1 over the ``mode`` dimension enforces that each generator picks at most
+one mode.
+
+Decision variables:
+    y[gen, mode]  >= 0     continuous output per (generator, mode)
+
+Constraints:
+    y[gen, mode]  <= cap[mode]
+    sum_{gen,mode} y  >= demand_total
+    SOS1 over "mode" for each gen
+
+This benchmark exercises ``Model.add_sos_constraints`` (commits be6d3a3 /
+8aa8d0c) and the LP-writer's SOS section. In linopy, native SOS support is
+declared by Gurobi / Cplex / Xpress only (see ``SolverFeature.SOS_CONSTRAINTS``).
+HiGHS and Mosek would need ``apply_sos_reformulation()`` first.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import xarray as xr
+
+import linopy
+from benchmarks.registry import (
+    BUILD,
+    FROM_NETCDF,
+    MATRICES,
+    TO_GUROBIPY,
+    TO_LP,
+    TO_NETCDF,
+    TO_XPRESS,
+    BenchSpec,
+    register,
+)
+
+SIZES = (10, 1_000)
+
+_N_MODES = 5
+_API_AVAILABLE = hasattr(linopy.Model, "add_sos_constraints")
+
+
+def build_sos(n_gens: int) -> linopy.Model:
+    modes = np.arange(_N_MODES)
+    cap = xr.DataArray(np.linspace(20.0, 100.0, _N_MODES), coords=[("mode", modes)])
+    cost = xr.DataArray(np.linspace(1.0, 8.0, _N_MODES), coords=[("mode", modes)])
+
+    m = linopy.Model()
+    y = m.add_variables(
+        lower=0,
+        upper=float(cap.max()),  # loosest cap; "mode_cap" tightens it per mode
+        coords=[range(n_gens), modes],
+        dims=["gen", "mode"],
+        name="y",
+    )
+
+    m.add_constraints(y <= cap, name="mode_cap")
+    demand_total = 0.4 * n_gens * float(cap.max())  # 40% of top-mode capacity
+    m.add_constraints(y.sum() >= demand_total, name="demand")
+
+    m.add_sos_constraints(y, sos_type=1, sos_dim="mode")
+
+    m.add_objective((cost * y).sum())
+    return m
+
+
+# ``add_sos_constraints`` is a recent API. On older linopy we silently skip
+# registering this model — the rest of the suite stays usable.
+SPEC: BenchSpec | None
+if _API_AVAILABLE:
+    SPEC = register(
+        BenchSpec(
+            name="sos",
+            build=build_sos,
+            sweep=SIZES,
+            # HiGHS / Mosek lack native SOS in linopy — would need
+            # ``reformulate_sos=True``, which mutates the model and defeats
+            # the benchmark. Only solvers with native SOS appear here.
+            phases=frozenset(
+                {
+                    BUILD,
+                    MATRICES,
+                    TO_LP,
+                    TO_NETCDF,
+                    FROM_NETCDF,
+                    TO_GUROBIPY,
+                    TO_XPRESS,
+                }
+            ),
+        )
+    )
+else:
+    SPEC = None
diff --git a/benchmarks/models/sparse_network.py b/benchmarks/models/sparse_network.py
index afc6be06..13d6c3ad 100644
--- a/benchmarks/models/sparse_network.py
+++ b/benchmarks/models/sparse_network.py
@@ -7,8 +7,9 @@
 import xarray as xr
 
 import linopy
+from benchmarks.registry import BenchSpec, register
 
-SIZES = [10, 50, 100, 250, 500, 1000]
+SIZES = (10, 250)
 
 
 def build_sparse_network(n_buses: int) -> linopy.Model:
@@ -48,3 +49,12 @@ def build_sparse_network(n_buses: int) -> linopy.Model:
 
     m.add_objective(gen.sum())
     return m
+
+
+SPEC = register(
+    BenchSpec(
+        name="sparse_network",
+        build=build_sparse_network,
+        sweep=SIZES,
+    )
+)
diff --git a/benchmarks/models/storage.py b/benchmarks/models/storage.py
new file mode 100644
index 00000000..5e841728
--- /dev/null
+++ b/benchmarks/models/storage.py
@@ -0,0 +1,53 @@
+"""
+Storage state-of-charge model — intertemporal coupling via ``.shift()``.
+
+A fleet of storage units, each with a bidiagonal SoC recursion
+``soc[t] - decay*soc[t-1] - eff*charge[t] + discharge[t]/eff == 0`` built with
+``soc.shift(time=1)`` (``t=0`` falls off as the boundary). This is the one op
+family no other model exercises — the ``.shift()``/``.isel()`` intertemporal
+coupling that PyPSA's SoC and flixopt's ``charge_state.isel`` recursion lean on.
+
+It is a *model*, not a pattern: each balance row has a fixed ~4 terms regardless
+of horizon or unit count, so it scales with ``size`` (units × timesteps) and has
+no benign→worst data-shape dial. ``size`` is the number of storage units.
+"""
+
+from __future__ import annotations
+
+import pandas as pd
+
+import linopy
+from benchmarks.registry import BenchSpec, register
+
+SIZES = (10, 250)
+N_TIME = 168
+DECAY = 0.99
+ETA = 0.95
+
+
+def build_storage(n_storage: int) -> linopy.Model:
+    storages = pd.RangeIndex(n_storage, name="storage")
+    time = pd.RangeIndex(N_TIME, name="time")
+
+    m = linopy.Model()
+    soc = m.add_variables(lower=0, upper=100, coords=[storages, time], name="soc")
+    charge = m.add_variables(lower=0, upper=50, coords=[storages, time], name="charge")
+    discharge = m.add_variables(
+        lower=0, upper=50, coords=[storages, time], name="discharge"
+    )
+
+    prev = soc.shift(time=1)  # soc[t-1]; t=0 shifted out (initial-SoC boundary)
+    m.add_constraints(
+        soc - DECAY * prev - ETA * charge + discharge / ETA == 0, name="soc_balance"
+    )
+    m.add_objective((charge + discharge).sum())  # objective: total throughput
+    return m
+
+
+SPEC = register(
+    BenchSpec(
+        name="storage",
+        build=build_storage,
+        sweep=SIZES,
+    )
+)
diff --git a/benchmarks/patterns/__init__.py b/benchmarks/patterns/__init__.py
new file mode 100644
index 00000000..09097674
--- /dev/null
+++ b/benchmarks/patterns/__init__.py
@@ -0,0 +1,22 @@
+"""
+Benchmark *patterns* — realistic modelling idioms swept over a severity dial.
+
+A pattern is a fragment of real modelling code (a balance constraint, a KVL
+contraction), not a whole model and not an isolated method call. Each is
+measured the same way a model is — time and peak memory, through the shared
+phases — but parametrised by ``severity`` (0–100, how pathological the data
+shape is) instead of ``size``. See :class:`benchmarks.registry.BenchSpec`.
+
+Importing this package registers every idiom into
+:data:`benchmarks.registry.PATTERNS` (mirrors :mod:`benchmarks.models`); adding
+a pattern is one new file plus one import below.
+"""
+
+# Side-effect imports — each module calls ``register_pattern(...)`` at import.
+from benchmarks.patterns import (  # noqa: F401
+    cumsum,
+    kvl_cycles,
+    merge_balance,
+    nodal_balance,
+    rolling,
+)
diff --git a/benchmarks/patterns/cumsum.py b/benchmarks/patterns/cumsum.py
new file mode 100644
index 00000000..212e96e7
--- /dev/null
+++ b/benchmarks/patterns/cumsum.py
@@ -0,0 +1,44 @@
+"""
+Cumulative-sum fold — ``.cumsum(dim)`` stacks a growing window into ``_term``.
+
+A running total over time — cumulative energy, a rolling budget:
+``(1 * x).cumsum("time")``. linopy currently routes ``cumsum`` through
+``rolling(window=full_dim)`` (``expressions.py``), so its ``_term`` grows
+triangularly to the dim size. It is benchmarked as its own op — not folded into
+``rolling`` — because it is a distinct public op and a natural de-densification
+target (a prefix sum need not materialise the triangle), so this is the
+instrument that would show such a kernel change land. ``severity`` dials the
+size of the cumulated dimension.
+"""
+
+from __future__ import annotations
+
+import pandas as pd
+
+import linopy
+from benchmarks.registry import SEVERITIES, BenchSpec, register_pattern
+
+N_ROW = 64  # broadcast/volume dim — the triangular fold is on t, not row
+DIM_MAX = 200
+
+
+def build_cumsum(severity: int) -> linopy.Model:
+    rows = pd.RangeIndex(N_ROW, name="row")
+    n = max(2, round(severity / 100 * DIM_MAX))  # severity 0 → 2, 100 → DIM_MAX
+
+    m = linopy.Model()
+    x = m.add_variables(coords=[rows, pd.RangeIndex(n, name="t")], name="x")
+    running = (1 * x).cumsum("t")  # (row, t); _term grows triangularly to n
+    m.add_constraints(running == 0, name="cumulative")
+    m.add_objective((1 * x).sum())
+    return m
+
+
+SPEC = register_pattern(
+    BenchSpec(
+        name="cumsum",
+        build=build_cumsum,
+        sweep=SEVERITIES,
+        axis="severity",
+    )
+)
diff --git a/benchmarks/patterns/kvl_cycles.py b/benchmarks/patterns/kvl_cycles.py
new file mode 100644
index 00000000..5657eedd
--- /dev/null
+++ b/benchmarks/patterns/kvl_cycles.py
@@ -0,0 +1,73 @@
+"""
+KVL-cycles pattern — sparse ``@`` densifies the result to a full ``_term`` (#748).
+
+The idiom: contract a per-branch flow against a (branch × cycle) cycle matrix —
+Kirchhoff's voltage law, ``flow @ C``. ``__matmul__`` is ``(flow * C).sum(...)``,
+which stacks *every* branch into ``_term`` regardless of whether ``C`` is zero
+there. ``severity`` dials ``C``'s sparsity: at 0 it is dense (every branch in
+every cycle — nothing to gain), at 100 only ~3 branches per cycle carry a
+nonzero (the real grid shape), yet the current kernel still produces
+``_term == n_branch``. So the *cost is flat* across severity on today's kernel
+— the win from a sparse-aware ``@`` is what grows with it.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+import linopy
+from benchmarks.registry import SEVERITIES, BenchSpec, register_pattern
+
+N_BRANCH = 300
+N_CYCLE = 100
+N_TIME = 168  # snapshot horizon — sets the always-paid flat level (the
+# densification width is branch; severity dials C's sparsity, which today's
+# kernel ignores, so memory stays flat across severity)
+MIN_PER_CYCLE = 3
+
+
+def _cycle_matrix(severity: int, branches: pd.Index, cycles: pd.Index) -> xr.DataArray:
+    """
+    Build a ±1 branch×cycle matrix that thins out as ``severity`` grows.
+
+    - ``severity == 0``  → dense: every branch participates in every cycle.
+    - ``severity == 100`` → ``MIN_PER_CYCLE`` branches per cycle (real KVL).
+
+    The nonzero count per cycle drops linearly from ``len(branches)`` to
+    ``MIN_PER_CYCLE``; rows and signs are drawn from a fixed-seed generator.
+    """
+    gen = np.random.default_rng(0)
+    n_rows, n_cols = len(branches), len(cycles)
+    span = n_rows - MIN_PER_CYCLE
+    nnz = max(MIN_PER_CYCLE, round(n_rows - severity / 100 * span))
+    dense = np.zeros((n_rows, n_cols))
+    for j in range(n_cols):
+        rows = gen.choice(n_rows, size=nnz, replace=False)
+        dense[rows, j] = gen.choice([-1.0, 1.0], size=nnz)
+    return xr.DataArray(dense, coords=[branches, cycles])
+
+
+def build_kvl_cycles(severity: int) -> linopy.Model:
+    branches = pd.RangeIndex(N_BRANCH, name="branch")
+    cycles = pd.RangeIndex(N_CYCLE, name="cycle")
+    time = pd.RangeIndex(N_TIME, name="time")
+
+    m = linopy.Model()
+    flow = m.add_variables(lower=-100, upper=100, coords=[time, branches], name="flow")
+    cycle_matrix = _cycle_matrix(severity, branches, cycles)
+    kvl = (flow * cycle_matrix).sum("branch")  # ``flow @ C`` written out
+    m.add_constraints(kvl == 0.0, name="kvl")
+    m.add_objective(flow.sum())
+    return m
+
+
+SPEC = register_pattern(
+    BenchSpec(
+        name="kvl_cycles",
+        build=build_kvl_cycles,
+        sweep=SEVERITIES,
+        axis="severity",
+    )
+)
diff --git a/benchmarks/patterns/merge_balance.py b/benchmarks/patterns/merge_balance.py
new file mode 100644
index 00000000..84bb1d91
--- /dev/null
+++ b/benchmarks/patterns/merge_balance.py
@@ -0,0 +1,57 @@
+"""
+Ragged merge — concat of mixed-width blocks pads all to the global max (#749).
+
+The documented build peak: a balance assembled by merging sub-expressions of
+*different* ``_term`` widths along a shared dim. PyPSA's nodal balance does
+``merge(gen + storage + lines + links, join="outer")`` (the single largest
+allocation in a SciGRID build); flixopt's bus balance is the sibling
+``sum([flow_rate for flow in flows])``. Merging along a non-``_term`` dim makes
+linopy align the ``_term`` axes by padding every block to the widest one — so
+one fat block leaves the narrow blocks mostly fill. ``severity`` dials the
+widest block's term count.
+"""
+
+from __future__ import annotations
+
+import pandas as pd
+
+import linopy
+from benchmarks.registry import SEVERITIES, BenchSpec, register_pattern
+
+N_BLOCKS = 30
+N_ROW = 128  # broadcast/volume dim — the ragged padding is on _term, not row
+NARROW = 3
+WIDE = 200
+
+
+def _block(
+    m: linopy.Model, rows: pd.Index, name: str, width: int
+) -> linopy.LinearExpression:
+    """Add a ``(row, k)`` variable and sum out ``k``: ``width`` terms per row."""
+    fold = f"k_{name}"
+    var = m.add_variables(coords=[rows, pd.RangeIndex(width, name=fold)], name=name)
+    return (1 * var).sum(fold)
+
+
+def build_merge_balance(severity: int) -> linopy.Model:
+    rows = pd.RangeIndex(N_ROW, name="row")
+    widest = max(NARROW, round(NARROW + severity / 100 * (WIDE - NARROW)))
+
+    m = linopy.Model()
+    blocks = [_block(m, rows, f"narrow{i}", NARROW) for i in range(N_BLOCKS - 1)]
+    blocks.append(_block(m, rows, "wide", widest))  # the single fat block
+
+    lhs = linopy.merge(blocks, dim="block", join="outer")
+    m.add_constraints(lhs == 0, name="balance")
+    m.add_objective(blocks[0])  # objective uses the first narrow block only
+    return m
+
+
+SPEC = register_pattern(
+    BenchSpec(
+        name="merge_balance",
+        build=build_merge_balance,
+        sweep=SEVERITIES,
+        axis="severity",
+    )
+)
diff --git a/benchmarks/patterns/nodal_balance.py b/benchmarks/patterns/nodal_balance.py
new file mode 100644
index 00000000..458df39a
--- /dev/null
+++ b/benchmarks/patterns/nodal_balance.py
@@ -0,0 +1,72 @@
+"""
+Nodal-balance pattern — grouped-sum padding under bus-connectivity skew (#745).
+
+The idiom: sum each bus's generators (``groupby(bus).sum()``) and balance the
+result against demand. ``LinearExpression.groupby(...).sum()`` pads every group
+to the largest group's term count, so as generators concentrate on one hub the
+result's ``_term`` axis blows up — most of it fill. ``severity`` dials that
+skew; the build's peak memory is expected to climb steeply with it on the
+current (dense) kernel.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+import linopy
+from benchmarks.registry import SEVERITIES, BenchSpec, register_pattern
+
+N_GEN = 2000
+N_BUS = 50
+N_TIME = 8  # broadcast/volume dim — the groupby pathology is on gen, not time
+
+
+def _bus_of_gen(severity: int) -> np.ndarray:
+    """
+    Map every generator to a bus, crowding them onto bus 0 as ``severity`` rises.
+
+    - ``severity == 0``  → round-robin: every bus holds ~``N_GEN / N_BUS``.
+    - ``severity == 100`` → bus 0 holds all but the other buses' anchors.
+
+    Generators ``0 .. N_BUS - 1`` are anchors that never move, so each bus keeps
+    at least one generator: the constraint keeps ``N_BUS`` rows across the sweep
+    and only the per-group term count (the padding) changes.
+    """
+    rng = np.random.default_rng(0)
+    gen_ids = np.arange(N_GEN)
+    assignment = gen_ids % N_BUS  # round-robin baseline
+    # Anchors are ids < N_BUS; each other id jumps to the hub w.p. severity/100.
+    to_hub = (gen_ids >= N_BUS) & (rng.random(N_GEN) < severity / 100)
+    assignment[to_hub] = 0
+    return assignment
+
+
+def build_nodal_balance(severity: int) -> linopy.Model:
+    gens = pd.RangeIndex(N_GEN, name="gen")
+    time = pd.RangeIndex(N_TIME, name="time")
+    buses = pd.RangeIndex(N_BUS, name="bus")
+    rng = np.random.default_rng(1)  # demand values only
+
+    m = linopy.Model()
+    gen = m.add_variables(lower=0, coords=[gens, time], name="gen")
+
+    bus_of_gen = pd.Series(_bus_of_gen(severity), index=gens, name="bus")
+    supply = (1 * gen).groupby(bus_of_gen).sum()  # the padded grouped sum
+    demand = xr.DataArray(
+        rng.uniform(10.0, 100.0, size=(N_BUS, N_TIME)), coords=[buses, time]
+    )
+    m.add_constraints(supply == demand, name="balance")
+    m.add_objective(gen.sum())
+    return m
+
+
+SPEC = register_pattern(
+    BenchSpec(
+        name="nodal_balance",
+        build=build_nodal_balance,
+        sweep=SEVERITIES,
+        axis="severity",
+    )
+)
diff --git a/benchmarks/patterns/rolling.py b/benchmarks/patterns/rolling.py
new file mode 100644
index 00000000..30065179
--- /dev/null
+++ b/benchmarks/patterns/rolling.py
@@ -0,0 +1,46 @@
+"""
+Rolling-window coupling — ``rolling(K).sum()`` stacks K terms into ``_term``.
+
+The *windowed* form of intertemporal coupling (unlike the 1-step storage SoC,
+this one has a real density dial): minimum up/down time and windowed energy /
+ramp limits sum a variable over a sliding window of K timesteps
+(PyPSA ``status.rolling(K).sum()`` for min-up-time, ``constraints.py:450``).
+``rolling(K).sum()`` builds a result with **K terms per row** — so the window
+width is a clean severity dial. ``severity`` dials K from a single step to the
+full horizon.
+"""
+
+from __future__ import annotations
+
+import pandas as pd
+
+import linopy
+from benchmarks.registry import SEVERITIES, BenchSpec, register_pattern
+
+N_UNIT = 8  # broadcast dim — the window densification is on time, not unit
+N_TIME = 1000
+MIN_WINDOW = 1
+
+
+def build_rolling(severity: int) -> linopy.Model:
+    units = pd.RangeIndex(N_UNIT, name="unit")
+    time = pd.RangeIndex(N_TIME, name="time")
+    window = max(MIN_WINDOW, round(MIN_WINDOW + severity / 100 * (N_TIME - MIN_WINDOW)))
+
+    m = linopy.Model()
+    status = m.add_variables(lower=0, upper=1, coords=[units, time], name="status")
+    # min-up-time shape: each K-step window sums to <= K (slack: status <= 1).
+    windowed = status.rolling(time=window).sum()
+    m.add_constraints(windowed <= window, name="window_limit")
+    m.add_objective(status.sum())
+    return m
+
+
+SPEC = register_pattern(
+    BenchSpec(
+        name="rolling",
+        build=build_rolling,
+        sweep=SEVERITIES,
+        axis="severity",
+    )
+)
diff --git a/benchmarks/phases.py b/benchmarks/phases.py
new file mode 100644
index 00000000..983fb9c1
--- /dev/null
+++ b/benchmarks/phases.py
@@ -0,0 +1,74 @@
+"""
+The measured operations — what each benchmark phase *does to a model*.
+
+The ``test_<phase>.py`` drivers wrap these verbs in ``benchmark(...)``; setup
+(building the model, scratch files) stays in the driver, only the verb itself
+lives here.
+"""
+
+from __future__ import annotations
+
+import inspect
+from collections.abc import Callable
+from pathlib import Path
+
+import linopy
+import linopy.io as lio
+from benchmarks.registry import TO_GUROBIPY, TO_HIGHSPY, TO_MOSEK, TO_XPRESS
+from linopy import read_netcdf
+
+# linopy <0.4.1's ``to_file`` doesn't accept ``progress``. Checked once at import
+# so the suite stays runnable against older linopy (e.g. cross-version sweeps),
+# and the benchmark loop stays branchless.
+_TO_FILE_HAS_PROGRESS = "progress" in inspect.signature(linopy.Model.to_file).parameters
+
+# Re-export so a driver can ``from benchmarks.phases import read_netcdf``.
+__all__ = [
+    "SOLVER_HANDOFFS",
+    "read_netcdf",
+    "touch_matrices",
+    "write_lp",
+    "write_netcdf",
+]
+
+
+def touch_matrices(m: linopy.Model) -> None:
+    """Read each matrix attribute once so it materialises — the measured work."""
+    accessor = m.matrices
+    for name in ("A", "b", "c", "lb", "ub", "sense", "vlabels", "clabels"):
+        getattr(accessor, name)
+    if m.is_quadratic:
+        getattr(accessor, "Q")  # only read for quadratic models
+
+
+def write_lp(m: linopy.Model, path: Path) -> None:
+    """
+    Write the model to ``path`` as an LP file via ``Model.to_file``.
+
+    ``progress=False`` is passed whenever the installed ``to_file`` accepts it,
+    keeping progress-bar overhead out of the timing; otherwise it is omitted.
+    """
+    if not _TO_FILE_HAS_PROGRESS:
+        m.to_file(path)
+        return
+    m.to_file(path, progress=False)
+
+
+def write_netcdf(m: linopy.Model, path: Path) -> None:
+    m.to_netcdf(path)  # thin verb: forwards to ``Model.to_netcdf`` unchanged
+
+
+# (solver_name, registry phase tag, wrapper) — consumed by test_to_solver.py.
+# Each wrapper is fetched via ``getattr`` so the tuple silently drops any wrapper
+# missing from the installed linopy (e.g. ``to_xpress`` is absent before linopy
+# 0.7.1) — keeping the suite runnable on older releases for cross-version sweeps.
+SOLVER_HANDOFFS: tuple[tuple[str, str, Callable[[linopy.Model], object]], ...] = tuple(
+    (name, tag, wrapper)
+    for name, tag, wrapper in (
+        ("highs", TO_HIGHSPY, getattr(lio, "to_highspy", None)),
+        ("gurobi", TO_GUROBIPY, getattr(lio, "to_gurobipy", None)),
+        ("mosek", TO_MOSEK, getattr(lio, "to_mosek", None)),
+        ("xpress", TO_XPRESS, getattr(lio, "to_xpress", None)),
+    )
+    if wrapper is not None
+)
diff --git a/benchmarks/registry.py b/benchmarks/registry.py
new file mode 100644
index 00000000..5f3f98ef
--- /dev/null
+++ b/benchmarks/registry.py
@@ -0,0 +1,137 @@
+"""
+Registry of benchmark models and patterns.
+
+A :class:`BenchSpec` declares how to build a model and which values (sizes for a
+model, ``axis="n"``; severities for a pattern, ``axis="severity"``) and phases
+it runs; ``register`` / ``register_pattern`` add it to :data:`REGISTRY` /
+:data:`PATTERNS`::
+
+    from benchmarks import REGISTRY
+    model = REGISTRY["basic"].build(100)
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable, Iterable
+from dataclasses import dataclass
+
+import linopy
+
+# --- Phase tags -------------------------------------------------------------
+
+BUILD = "build"
+MATRICES = "matrices"
+TO_LP = "to_lp"
+TO_NETCDF = "to_netcdf"
+FROM_NETCDF = "from_netcdf"
+TO_HIGHSPY = "to_highspy"
+TO_GUROBIPY = "to_gurobipy"
+TO_MOSEK = "to_mosek"
+TO_XPRESS = "to_xpress"
+
+ALL_PHASES = frozenset(
+    {
+        BUILD,
+        MATRICES,
+        TO_LP,
+        TO_NETCDF,
+        FROM_NETCDF,
+        TO_HIGHSPY,
+        TO_GUROBIPY,
+        TO_MOSEK,
+        TO_XPRESS,
+    }
+)
+
+# The default phase set; a spec overrides with a narrower one when the default
+# solvers can't ingest it natively (e.g. native SOS for HiGHS).
+DEFAULT_PHASES = ALL_PHASES
+
+# The severity sweep every pattern runs (axis "severity").
+SEVERITIES: tuple[int, ...] = (0, 50, 100)
+
+
+@dataclass(frozen=True, repr=False)
+class BenchSpec:
+    """
+    One benchmark spec. A model is swept over ``sweep`` sizes (``axis="n"``); a
+    pattern over a 0–100 severity dial (``axis="severity"``). Both build a
+    :class:`linopy.Model` from one integer and run the same ``phases`` — the
+    model-vs-pattern distinction lives in :func:`register` vs
+    :func:`register_pattern` (and the ``models/`` vs ``patterns/`` dirs).
+    """
+
+    name: str  # registry key; must be unique within REGISTRY or PATTERNS
+    build: Callable[[int], linopy.Model]  # maps one sweep value to a model
+    sweep: tuple[int, ...]  # values passed to ``build`` (sizes or severities)
+    axis: str = "n"  # label of the swept quantity; patterns require "severity"
+    phases: frozenset[str] = DEFAULT_PHASES  # phase tags the spec takes part in
+    requires: tuple[str, ...] = ()  # NOTE(review): presumably optional deps — confirm
+
+    def applies_to(self, phase: str) -> bool:  # is ``phase`` among ``self.phases``?
+        return phase in self.phases
+
+    def __repr__(self) -> str:  # short form: omits build, phases and requires
+        return f"BenchSpec({self.name!r}, axis={self.axis!r}, sweep={self.sweep})"
+
+
+REGISTRY: dict[str, BenchSpec] = {}
+PATTERNS: dict[str, BenchSpec] = {}
+
+
+def _validate(spec: BenchSpec, registry: dict[str, BenchSpec], kind: str) -> None:
+    if spec.name in registry:  # a name may be registered once per registry
+        raise ValueError(f"{kind} {spec.name!r} already registered")
+    unknown = spec.phases - ALL_PHASES  # tags that are not in ALL_PHASES
+    if unknown:
+        raise ValueError(f"{kind} {spec.name!r}: unknown phases {sorted(unknown)}")
+
+
+def register(spec: BenchSpec) -> BenchSpec:
+    """Add a model ``spec`` to :data:`REGISTRY`. Returns it for chaining."""
+    _validate(spec, REGISTRY, "model")  # ValueError: duplicate name or unknown phase
+    REGISTRY[spec.name] = spec
+    return spec
+
+
+def register_pattern(spec: BenchSpec) -> BenchSpec:
+    """Add a pattern ``spec`` (``axis="severity"``) to :data:`PATTERNS`."""
+    _validate(spec, PATTERNS, "pattern")  # duplicate name / unknown phase check
+    if spec.axis != "severity" or not all(0 <= s <= 100 for s in spec.sweep):
+        raise ValueError(
+            f"pattern {spec.name!r}: needs axis='severity' and sweep in [0, 100], "
+            f"got axis={spec.axis!r} sweep={spec.sweep}"
+        )
+    PATTERNS[spec.name] = spec  # stored only once every check above has passed
+    return spec
+
+
+def all_specs() -> list[BenchSpec]:
+    """Every spec in the suite — models then patterns."""
+    return [*REGISTRY.values(), *PATTERNS.values()]  # a new list on every call
+
+
+def iter_params(
+    phase: str, specs: Iterable[BenchSpec] | None = None
+) -> list[tuple[BenchSpec, int]]:
+    """
+    Expand the specs that run ``phase`` into ``(spec, value)`` pairs, one per
+    sweep value. Passing ``specs=None`` means "use :func:`all_specs`".
+    """
+    candidates = all_specs() if specs is None else specs
+    pairs: list[tuple[BenchSpec, int]] = []
+    for candidate in candidates:
+        # A spec that opted out of this phase contributes no pairs.
+        if candidate.applies_to(phase):
+            pairs.extend((candidate, point) for point in candidate.sweep)
+    return pairs
+
+
+def spec_param_id(name: str, axis: str, value: object) -> str:
+    """
+    Render the ``<name>-<axis>=<value>`` text that sits inside a test id's
+    ``[...]``. Both the pytest param ids and the solver-handoff ids are built
+    from it, so the id shape is defined in exactly one place.
+    """
+    setting = f"{axis}={value}"
+    return f"{name}-{setting}"
diff --git a/benchmarks/test_build.py b/benchmarks/test_build.py
index f657715e..8d6e536a 100644
--- a/benchmarks/test_build.py
+++ b/benchmarks/test_build.py
@@ -2,52 +2,17 @@
 
 from __future__ import annotations
 
-import pytest
+from collections.abc import Callable
+from typing import TYPE_CHECKING
 
-from benchmarks.conftest import skip_if_quick
-from benchmarks.models import (
-    BASIC_SIZES,
-    EXPR_SIZES,
-    KNAPSACK_SIZES,
-    SPARSE_SIZES,
-    build_basic,
-    build_expression_arithmetic,
-    build_knapsack,
-    build_sparse_network,
-)
-from benchmarks.models.pypsa_scigrid import SIZES as PYPSA_SIZES
+from benchmarks.conftest import cases, require
+from benchmarks.registry import BUILD
 
+if TYPE_CHECKING:
+    from benchmarks.registry import BenchSpec
 
-@pytest.mark.parametrize("n", BASIC_SIZES, ids=[f"n={n}" for n in BASIC_SIZES])
-def test_build_basic(benchmark, n, request):
-    skip_if_quick(request, "basic", n)
-    benchmark(build_basic, n)
 
-
-@pytest.mark.parametrize("n", KNAPSACK_SIZES, ids=[f"n={n}" for n in KNAPSACK_SIZES])
-def test_build_knapsack(benchmark, n, request):
-    skip_if_quick(request, "knapsack", n)
-    benchmark(build_knapsack, n)
-
-
-@pytest.mark.parametrize("n", EXPR_SIZES, ids=[f"n={n}" for n in EXPR_SIZES])
-def test_build_expression_arithmetic(benchmark, n, request):
-    skip_if_quick(request, "expression_arithmetic", n)
-    benchmark(build_expression_arithmetic, n)
-
-
-@pytest.mark.parametrize("n", SPARSE_SIZES, ids=[f"n={n}" for n in SPARSE_SIZES])
-def test_build_sparse_network(benchmark, n, request):
-    skip_if_quick(request, "sparse_network", n)
-    benchmark(build_sparse_network, n)
-
-
-@pytest.mark.parametrize(
-    "snapshots", PYPSA_SIZES, ids=[f"snapshots={s}" for s in PYPSA_SIZES]
-)
-def test_build_pypsa_scigrid(benchmark, snapshots, request):
-    pytest.importorskip("pypsa")
-    skip_if_quick(request, "pypsa_scigrid", snapshots)
-    from benchmarks.models.pypsa_scigrid import build_pypsa_scigrid
-
-    benchmark(build_pypsa_scigrid, snapshots)
+@cases(BUILD)
+def test_build(benchmark: Callable[..., object], spec: BenchSpec, n: int) -> None:
+    require(spec)  # NOTE(review): presumably skips if ``spec.requires`` is unmet
+    benchmark(lambda: spec.build(n))  # the build call itself is what gets timed
diff --git a/benchmarks/test_lp_write.py b/benchmarks/test_lp_write.py
deleted file mode 100644
index 6442ccd6..00000000
--- a/benchmarks/test_lp_write.py
+++ /dev/null
@@ -1,63 +0,0 @@
-"""Benchmarks for LP file writing speed."""
-
-from __future__ import annotations
-
-import pytest
-
-from benchmarks.conftest import skip_if_quick
-from benchmarks.models import (
-    BASIC_SIZES,
-    EXPR_SIZES,
-    KNAPSACK_SIZES,
-    SPARSE_SIZES,
-    build_basic,
-    build_expression_arithmetic,
-    build_knapsack,
-    build_sparse_network,
-)
-from benchmarks.models.pypsa_scigrid import SIZES as PYPSA_SIZES
-
-
-@pytest.mark.parametrize("n", BASIC_SIZES, ids=[f"n={n}" for n in BASIC_SIZES])
-def test_lp_write_basic(benchmark, n, request, tmp_path):
-    skip_if_quick(request, "basic", n)
-    m = build_basic(n)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
-
-
-@pytest.mark.parametrize("n", KNAPSACK_SIZES, ids=[f"n={n}" for n in KNAPSACK_SIZES])
-def test_lp_write_knapsack(benchmark, n, request, tmp_path):
-    skip_if_quick(request, "knapsack", n)
-    m = build_knapsack(n)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
-
-
-@pytest.mark.parametrize("n", EXPR_SIZES, ids=[f"n={n}" for n in EXPR_SIZES])
-def test_lp_write_expression_arithmetic(benchmark, n, request, tmp_path):
-    skip_if_quick(request, "expression_arithmetic", n)
-    m = build_expression_arithmetic(n)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
-
-
-@pytest.mark.parametrize("n", SPARSE_SIZES, ids=[f"n={n}" for n in SPARSE_SIZES])
-def test_lp_write_sparse_network(benchmark, n, request, tmp_path):
-    skip_if_quick(request, "sparse_network", n)
-    m = build_sparse_network(n)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
-
-
-@pytest.mark.parametrize(
-    "snapshots", PYPSA_SIZES, ids=[f"snapshots={s}" for s in PYPSA_SIZES]
-)
-def test_lp_write_pypsa_scigrid(benchmark, snapshots, request, tmp_path):
-    pytest.importorskip("pypsa")
-    skip_if_quick(request, "pypsa_scigrid", snapshots)
-    from benchmarks.models.pypsa_scigrid import build_pypsa_scigrid
-
-    m = build_pypsa_scigrid(snapshots)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
diff --git a/benchmarks/test_matrices.py b/benchmarks/test_matrices.py
index 352844fb..a7e61b05 100644
--- a/benchmarks/test_matrices.py
+++ b/benchmarks/test_matrices.py
@@ -2,48 +2,18 @@
 
 from __future__ import annotations
 
-import pytest
+from collections.abc import Callable
+from typing import TYPE_CHECKING
 
-from benchmarks.conftest import skip_if_quick
-from benchmarks.models import (
-    BASIC_SIZES,
-    EXPR_SIZES,
-    SPARSE_SIZES,
-    build_basic,
-    build_expression_arithmetic,
-    build_sparse_network,
-)
+from benchmarks.conftest import build_model, cases
+from benchmarks.phases import touch_matrices
+from benchmarks.registry import MATRICES
 
+if TYPE_CHECKING:
+    from benchmarks.registry import BenchSpec
 
-def _access_matrices(m):
-    """Access all matrix properties to force computation."""
-    matrices = m.matrices
-    _ = matrices.A
-    _ = matrices.b
-    _ = matrices.c
-    _ = matrices.lb
-    _ = matrices.ub
-    _ = matrices.sense
-    _ = matrices.vlabels
-    _ = matrices.clabels
 
-
-@pytest.mark.parametrize("n", BASIC_SIZES, ids=[f"n={n}" for n in BASIC_SIZES])
-def test_matrices_basic(benchmark, n, request):
-    skip_if_quick(request, "basic", n)
-    m = build_basic(n)
-    benchmark(_access_matrices, m)
-
-
-@pytest.mark.parametrize("n", EXPR_SIZES, ids=[f"n={n}" for n in EXPR_SIZES])
-def test_matrices_expression_arithmetic(benchmark, n, request):
-    skip_if_quick(request, "expression_arithmetic", n)
-    m = build_expression_arithmetic(n)
-    benchmark(_access_matrices, m)
-
-
-@pytest.mark.parametrize("n", SPARSE_SIZES, ids=[f"n={n}" for n in SPARSE_SIZES])
-def test_matrices_sparse_network(benchmark, n, request):
-    skip_if_quick(request, "sparse_network", n)
-    m = build_sparse_network(n)
-    benchmark(_access_matrices, m)
+@cases(MATRICES)
+def test_matrices(benchmark: Callable[..., object], spec: BenchSpec, n: int) -> None:
+    m = build_model(spec, n)  # setup — untimed
+    benchmark(lambda: touch_matrices(m))  # the same model is passed on every call
diff --git a/benchmarks/test_netcdf.py b/benchmarks/test_netcdf.py
new file mode 100644
index 00000000..3764e31d
--- /dev/null
+++ b/benchmarks/test_netcdf.py
@@ -0,0 +1,39 @@
+"""
+Benchmarks for the netCDF persistence round-trip.
+
+We track ``to_netcdf`` and ``read_netcdf`` separately because the cost split
+matters in practice: distributed workflows tend to do many reads of a single
+written artifact.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import TYPE_CHECKING
+
+from benchmarks.conftest import build_model, cases
+from benchmarks.phases import read_netcdf, write_netcdf
+from benchmarks.registry import FROM_NETCDF, TO_NETCDF
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from benchmarks.registry import BenchSpec
+
+
+@cases(TO_NETCDF)
+def test_to_netcdf(
+    benchmark: Callable[..., object], spec: BenchSpec, n: int, tmp_path: Path
+) -> None:
+    m, path = build_model(spec, n), tmp_path / "model.nc"  # setup — untimed
+    benchmark(lambda: write_netcdf(m, path))  # only the netCDF write is timed
+
+
+@cases(FROM_NETCDF)
+def test_from_netcdf(
+    benchmark: Callable[..., object], spec: BenchSpec, n: int, tmp_path: Path
+) -> None:
+    m = build_model(spec, n)
+    path = tmp_path / "model.nc"
+    write_netcdf(m, path)  # setup — untimed
+    benchmark(lambda: read_netcdf(path))  # only reading the file back is timed
diff --git a/benchmarks/test_pipeline.py b/benchmarks/test_pipeline.py
new file mode 100644
index 00000000..1033ad2e
--- /dev/null
+++ b/benchmarks/test_pipeline.py
@@ -0,0 +1,38 @@
+"""
+End-to-end pipeline benchmark: build → matrices → LP write in one region.
+
+Opt-in (deselected unless ``--pipeline``): it re-runs the per-phase work and,
+unlike the individual phase benchmarks, *includes the model build* — so it
+captures the end-to-end cost/peak a real build-then-export session hits, which
+can't be recovered by summing the marginal per-phase numbers. Parametrized over
+the ``to_lp`` specs (it ends in an LP write).
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import TYPE_CHECKING
+
+from benchmarks.conftest import cases, require
+from benchmarks.phases import touch_matrices, write_lp
+from benchmarks.registry import TO_LP
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from benchmarks.registry import BenchSpec
+
+
+@cases(TO_LP)
+def test_pipeline(
+    benchmark: Callable[..., object], spec: BenchSpec, n: int, tmp_path: Path
+) -> None:
+    require(spec)  # NOTE(review): presumably skips if ``spec.requires`` is unmet
+    path = tmp_path / "model.lp"
+
+    def pipeline() -> None:
+        m = spec.build(n)  # built inside the timed callable on purpose
+        touch_matrices(m)
+        write_lp(m, path)
+
+    benchmark(pipeline)
diff --git a/benchmarks/test_pypsa_carbon_management.py b/benchmarks/test_pypsa_carbon_management.py
index 7f29a52e..209416ba 100644
--- a/benchmarks/test_pypsa_carbon_management.py
+++ b/benchmarks/test_pypsa_carbon_management.py
@@ -1,43 +1,60 @@
-import pypsa
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import Any
+
 import pytest
 
 import linopy as lp
 
+# pypsa is an optional benchmark dep. Skip the whole module if it's missing
+# so the rest of the suite stays collectable without it.
+pypsa = pytest.importorskip("pypsa")
+
 
 @pytest.fixture(scope="module")
-def network():
-    return pypsa.examples.carbon_management()
+def network() -> Any:
+    try:
+        return pypsa.examples.carbon_management()
+    except Exception as exc:  # network / example-data drift, not a linopy signal
+        pytest.skip(f"pypsa example data unavailable: {exc}")
 
 
-def test_create_model_frozen(benchmark, network):
+def test_create_model_frozen(benchmark: Callable[..., object], network: Any) -> None:
     benchmark(network.optimize.create_model, freeze_constraints=True)
 
 
-def test_create_model_mutable(benchmark, network):
+def test_create_model_mutable(benchmark: Callable[..., object], network: Any) -> None:
     benchmark(network.optimize.create_model, freeze_constraints=False)
 
 
 @pytest.fixture(scope="module")
-def model_frozen(network):
+def model_frozen(network: Any) -> Any:
     return network.optimize.create_model(freeze_constraints=True)
 
 
 @pytest.fixture(scope="module")
-def model_mutable(network):
+def model_mutable(network: Any) -> Any:
     return network.optimize.create_model(freeze_constraints=False)
 
 
-def test_to_highspy_frozen(benchmark, model_frozen):
+def test_to_highspy_frozen(benchmark: Callable[..., object], model_frozen: Any) -> None:
     benchmark(lp.io.to_highspy, model_frozen)
 
 
-def test_to_highspy_mutable(benchmark, model_mutable):
+def test_to_highspy_mutable(
+    benchmark: Callable[..., object], model_mutable: Any
+) -> None:
     benchmark(lp.io.to_highspy, model_mutable)
 
 
-def test_to_highspy_mutable_no_names(benchmark, model_mutable):
+def test_to_highspy_mutable_no_names(
+    benchmark: Callable[..., object], model_mutable: Any
+) -> None:
     benchmark(lp.io.to_highspy, model_mutable, set_names=False)
 
 
-def test_to_highspy_frozen_no_names(benchmark, model_frozen):
+def test_to_highspy_frozen_no_names(
+    benchmark: Callable[..., object], model_frozen: Any
+) -> None:
     benchmark(lp.io.to_highspy, model_frozen, set_names=False)
diff --git a/benchmarks/test_to_lp.py b/benchmarks/test_to_lp.py
new file mode 100644
index 00000000..2303d7cb
--- /dev/null
+++ b/benchmarks/test_to_lp.py
@@ -0,0 +1,24 @@
+"""Benchmarks for LP file writing speed."""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import TYPE_CHECKING
+
+from benchmarks.conftest import build_model, cases
+from benchmarks.phases import write_lp
+from benchmarks.registry import TO_LP
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from benchmarks.registry import BenchSpec
+
+
+@cases(TO_LP)
+def test_to_lp(
+    benchmark: Callable[..., object], spec: BenchSpec, n: int, tmp_path: Path
+) -> None:
+    m = build_model(spec, n)  # setup — untimed
+    path = tmp_path / "model.lp"
+    benchmark(lambda: write_lp(m, path))  # every call writes to the same path
diff --git a/benchmarks/test_to_solver.py b/benchmarks/test_to_solver.py
new file mode 100644
index 00000000..defb14d2
--- /dev/null
+++ b/benchmarks/test_to_solver.py
@@ -0,0 +1,50 @@
+"""
+Benchmarks for solver handoff (model -> native solver instance).
+
+Times each ``linopy.io.to_<solver>`` wrapper. These wrappers delegate to the
+same direct-API build path as the new stateful Solver API
+(``Solver.from_name(name, model, io_api="direct")``), so the numbers serve
+double duty: regression tracking for the wrappers, *and* for the underlying
+``Solver._build_direct`` paths. They've also been available for many releases
+— using them keeps the suite runnable on older linopy versions.
+
+The actual ``Solver.solve()`` runtime (i.e. solver-side algorithm time) is
+intentionally not benchmarked.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import TYPE_CHECKING
+
+import pytest
+
+from benchmarks.conftest import build_model
+from benchmarks.phases import SOLVER_HANDOFFS
+from benchmarks.registry import iter_params, spec_param_id
+from linopy.solvers import available_solvers
+
+if TYPE_CHECKING:
+    from benchmarks.registry import BenchSpec
+
+# One case per (available solver wrapper) × (spec, value) it applies to.
+_PARAMS = [
+    (name, wrapper, spec, n)
+    for name, tag, wrapper in SOLVER_HANDOFFS
+    for spec, n in iter_params(tag)
+]
+_IDS = [f"{name}-{spec_param_id(s.name, s.axis, v)}" for name, _w, s, v in _PARAMS]
+
+
+@pytest.mark.parametrize(("name", "wrapper", "spec", "n"), _PARAMS, ids=_IDS)
+def test_to_solver(
+    benchmark: Callable[..., object],
+    name: str,
+    wrapper: Callable[..., object],
+    spec: BenchSpec,
+    n: int,
+) -> None:
+    if name not in available_solvers:  # checked at run time, so the case is skipped
+        pytest.skip(f"{name} not installed")
+    m = build_model(spec, n)  # setup — untimed
+    benchmark(lambda: wrapper(m))  # only the handoff call is timed
diff --git a/pyproject.toml b/pyproject.toml
index 19d0abb3..90434190 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,11 +82,21 @@ dev = [
     "highspy",
     "jupyter",
 ]
+# Perf-relevant deps pinned exactly so run-to-run deltas reflect linopy
+# changes, not dependency bumps.
 benchmarks = [
-    "pytest-benchmark",
-    "pypsa",
-    "highspy>=1.7.1",
-    "pytest-memray",
+    "highspy==1.13.1",
+    "netcdf4==1.7.4",
+    "numpy==1.26.4",
+    "scipy==1.16.3",
+    "xarray==2025.1.2",
+    "pandas==2.3.3",
+    "polars==1.35.2",
+    "dask==2025.11.0",
+    "pytest==9.0.3",
+    "pytest-benchmark==5.2.3",
+    "pytest-memray==1.8.0",
+    "pytest-codspeed==5.0.3",
 ]
 solvers = [
     "gurobipy",
@@ -139,7 +149,7 @@ omit = ["test/*"]
 exclude_also = ["if TYPE_CHECKING:"]
 
 [tool.mypy]
-exclude = ['dev/*', 'examples/*', 'benchmark/*', 'benchmarks/*', 'doc/*']
+exclude = ['dev/*', 'examples/*', '^benchmark/', 'doc/*']
 ignore_missing_imports = true
 no_implicit_optional = true
 warn_unused_ignores = true