diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index d87050dc..c2cc3000 100644
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -34,8 +34,11 @@
     from benchmarks.snapshot import Metric
 
 # Importing the models / patterns packages triggers each module's
-# ``register(...)`` / ``register_pattern(...)`` call at import time.
-from benchmarks import bench, models, patterns  # noqa: F401, E402
+# ``register(...)`` / ``register_pattern(...)`` call at import time. ``bench``
+# (analytics) is intentionally NOT eagerly imported — that keeps ``import
+# benchmarks`` / the CodSpeed CI baseline free of the local-only layer;
+# ``from benchmarks import bench`` still works (submodule import on demand).
+from benchmarks import models, patterns  # noqa: F401, E402
 
 
 def load_long_df(
diff --git a/benchmarks/_tests/test_memory_id_alignment.py b/benchmarks/_tests/test_memory_id_alignment.py
deleted file mode 100644
index 5d2377c8..00000000
--- a/benchmarks/_tests/test_memory_id_alignment.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"""
-Guard test for the timing ↔ memory test-id seam.
-
-``memory.py`` hand-rolls f-strings to label each measurement with the
-same node id pytest-benchmark produces (e.g.
-``benchmarks/test_matrices.py::test_matrices[basic-n=10]``). If a
-benchmark test function gets renamed and the matching f-string in
-``memory.py`` isn't updated, ``plot`` would silently end up with
-non-overlapping timing and memory sets — no error, just missing data.
-
-This test exercises both sides once and asserts every memory-emitted
-id is present in pytest's collection.
-"""
-
-from __future__ import annotations
-
-import re
-import subprocess
-import sys
-from pathlib import Path
-
-from benchmarks.memory import MEMORY_PHASES, _measurements
-from benchmarks.registry import REGISTRY
-
-
-def _collect_benchmark_ids() -> set[str]:
-    """Return the set of node ids pytest collects under ``benchmarks/``."""
-    repo_root = Path(__file__).resolve().parents[2]
-    result = subprocess.run(
-        [
-            sys.executable,
-            "-m",
-            "pytest",
-            "benchmarks/",
-            "--collect-only",
-            "-q",
-            "--no-header",
-            "--co",
-        ],
-        capture_output=True,
-        text=True,
-        check=True,
-        cwd=repo_root,
-    )
-    # pytest -q --co emits one node id per line; trailing summary lines
-    # like "N tests collected" can be ignored.
-    return {
-        line.strip()
-        for line in result.stdout.splitlines()
-        if re.match(r"^benchmarks/.*::.*\[.*\]$", line.strip())
-    }
-
-
-def test_memory_node_ids_match_pytest_collection() -> None:
-    collected = _collect_benchmark_ids()
-    assert collected, "pytest collected zero benchmark node ids — sanity broken"
-
-    # ``basic`` at its smallest size is cheap and declares every default
-    # phase, so it exercises every node-id format ``_measurements`` emits.
-    spec = REGISTRY["basic"]
-    size = spec.sizes[0]
-
-    mem_ids: set[str] = set()
-    for phase in MEMORY_PHASES:
-        for test_id, _ in _measurements(phase, spec, size):
-            mem_ids.add(test_id)
-
-    missing = mem_ids - collected
-    assert not missing, (
-        "memory.py emits node ids that pytest doesn't collect "
-        "(test rename drift?):\n" + "\n".join(f"  {m}" for m in sorted(missing))
-    )
diff --git a/benchmarks/cli/_base.py b/benchmarks/cli/_base.py
index c0170777..730ca4a5 100644
--- a/benchmarks/cli/_base.py
+++ b/benchmarks/cli/_base.py
@@ -44,7 +44,7 @@ class Measure(StrEnum):
 )
 
 PhaseName = Literal[
-    "build", "matrices", "to_lp", "to_netcdf", "from_netcdf", "to_solver"
+    "build", "matrices", "to_lp", "to_netcdf", "from_netcdf", "to_solver", "pipeline"
 ]
 SpecKind = Literal["all", "models", "patterns"]
 
@@ -56,6 +56,7 @@ class Measure(StrEnum):
     "to_netcdf": "benchmarks/test_netcdf.py::test_to_netcdf",
     "from_netcdf": "benchmarks/test_netcdf.py::test_from_netcdf",
     "to_solver": "benchmarks/test_to_solver.py",
+    "pipeline": "benchmarks/test_pipeline.py",
 }
 
 # pytest args that constitute a "smoke" run — quick sizes, no timings.
diff --git a/benchmarks/cli/run.py b/benchmarks/cli/run.py
index 6e42b990..f88b691c 100644
--- a/benchmarks/cli/run.py
+++ b/benchmarks/cli/run.py
@@ -163,6 +163,8 @@ def run(
     def _timing() -> None:
         args: list[str] = []
         args.append(_PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/")
+        if phase == "pipeline":
+            args.append("--pipeline")
         if quick:
             args.append("--quick")
         elif long:
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
index eda9ed6c..2eb3be2a 100644
--- a/benchmarks/conftest.py
+++ b/benchmarks/conftest.py
@@ -2,10 +2,17 @@
 
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import pytest
 
 from benchmarks.registry import BenchSpec, skip_reason
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from benchmarks.phases import PhaseCase
+
 # Test modules the CodSpeed instruments measure (edit to change coverage).
 # build + the two export paths: to_lp (LP text) and to_solver (direct handoff,
 # which also exercises matrix-gen). matrices is dropped — a subset of to_solver;
@@ -55,6 +62,16 @@ def pytest_addoption(parser: pytest.Parser) -> None:
             "--quick/--long for patterns, leaving models on the prevailing tier."
         ),
     )
+    parser.addoption(
+        "--pipeline",
+        action="store_true",
+        default=False,
+        help=(
+            "Include the opt-in end-to-end pipeline benchmark (build → matrices "
+            "→ lp in one measured region). Off by default — it re-runs the "
+            "per-phase work and includes the build."
+        ),
+    )
 
 
 def pytest_collection_modifyitems(
@@ -63,6 +80,7 @@ def pytest_collection_modifyitems(
     """
     ``--quick`` drops the PyPSA end-to-end test (~30s; minutes under cachegrind).
     ``--codspeed`` narrows the run to ``CODSPEED_MODULES`` (drops netcdf/matrices).
+    ``test_pipeline`` (end-to-end) is opt-in — deselected unless ``--pipeline``.
     """
     if config.getoption("--quick"):
         skip = pytest.mark.skip(reason="--quick: pypsa end-to-end skipped")
@@ -70,6 +88,12 @@ def pytest_collection_modifyitems(
             if "test_pypsa_carbon_management" in item.nodeid:
                 item.add_marker(skip)
 
+    if not config.getoption("--pipeline"):
+        dropped = [i for i in items if i.path.stem == "test_pipeline"]
+        if dropped:
+            config.hook.pytest_deselected(items=dropped)
+            items[:] = [i for i in items if i.path.stem != "test_pipeline"]
+
     if getattr(config.option, "codspeed", False):
         deselected = [i for i in items if i.path.stem not in CODSPEED_MODULES]
         if deselected:
@@ -105,3 +129,22 @@ def maybe_skip(request: pytest.FixtureRequest, spec: BenchSpec, size: int) -> No
     )
     if reason:
         pytest.skip(reason)
+
+
+def run_case(
+    benchmark: Callable[..., object],
+    case: PhaseCase,
+    request: pytest.FixtureRequest,
+) -> None:
+    """
+    Shared pytest-benchmark driver body for one :class:`PhaseCase`.
+
+    Honours the case's own ``skip`` (e.g. solver not installed) and the size
+    tiers (via :func:`maybe_skip`), then runs the case's measured action under
+    ``benchmark`` inside the case's setup/teardown context.
+    """
+    if case.skip:
+        pytest.skip(case.skip)
+    maybe_skip(request, case.spec, case.value)
+    with case.run() as action:
+        benchmark(action)
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index 0f03de53..e22dfcc8 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -15,8 +15,8 @@
 The per-phase peaks are *marginal* (each tracker sees only its own phase's
 allocations), so the end-to-end OOM ceiling can't be recovered from them: the
 opt-in ``pipeline`` phase (``--phase pipeline``) instead measures
-build → matrices → to_lp under one tracker, keyed by a bare
-``pipeline[<spec>-<axis>=<value>]`` id.
+build → matrices → to_lp under one tracker, keyed by the same node id as the
+timing pipeline test (``benchmarks/test_pipeline.py::test_pipeline[...]``).
 """
 
 from __future__ import annotations
@@ -28,11 +28,11 @@
 import subprocess
 import sys
 import tempfile
-from collections.abc import Callable, Iterator
+from collections.abc import Callable
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-from benchmarks.snapshot import spec_param_id, write_memory_snapshot
+from benchmarks.snapshot import write_memory_snapshot
 
 if TYPE_CHECKING:
     from benchmarks.registry import BenchSpec
@@ -73,28 +73,6 @@ def _require_memray() -> None:
 ALL_MEMORY_PHASES: tuple[str, ...] = (*MEMORY_PHASES, "pipeline")
 
 
-def _phase_tag(phase: str) -> str:
-    """Map a phase name to the registry phase tag used by ``spec.applies_to``."""
-    from benchmarks.registry import (
-        BUILD,
-        FROM_NETCDF,
-        MATRICES,
-        TO_HIGHSPY,
-        TO_LP,
-        TO_NETCDF,
-    )
-
-    return {
-        "build": BUILD,
-        "matrices": MATRICES,
-        "to_lp": TO_LP,
-        "to_netcdf": TO_NETCDF,
-        "from_netcdf": FROM_NETCDF,
-        "to_solver": TO_HIGHSPY,  # we always measure the highs handoff
-        "pipeline": BUILD,
-    }[phase]
-
-
 def measure_peak(action: Callable[[], object], repeats: int = 1) -> float:
     """
     Run ``action()`` under ``memray.Tracker`` and return peak MiB.
@@ -138,116 +116,14 @@ def measure_peak(action: Callable[[], object], repeats: int = 1) -> float:
 _measure_peak = measure_peak
 
 
-def _measurements(
-    phase: str, spec: BenchSpec, size: int
-) -> Iterator[tuple[str, Callable[[], object]]]:
-    """
-    Yield ``(test_id, action)`` pairs for one ``(phase, spec, size)``.
-
-    ``action`` is a zero-arg callable; the caller runs it inside a tracker.
-    For non-build phases, the model is built once up front (outside the
-    tracker) and the action closes over it so only the phase work is
-    counted. ``size`` is the swept value along ``spec.axis`` (model size or
-    pattern severity); the test ids match the shared phase drivers either way.
-    """
-    name = spec.name
-    axis = spec.axis
-
-    if phase == "build":
-        yield (
-            f"benchmarks/test_build.py::test_build[{spec_param_id(name, axis, size)}]",
-            lambda: spec.build(size),
-        )
-        return
-
-    if phase == "pipeline":
-        from benchmarks.phases import touch_matrices, write_lp
-
-        tmpdir = tempfile.TemporaryDirectory()
-        lp_path = Path(tmpdir.name) / "m.lp"
-
-        def run_pipeline() -> None:
-            built = spec.build(size)
-            touch_matrices(built)
-            write_lp(built, lp_path)
-
+def _deps_available(spec: BenchSpec) -> bool:
+    """True if every module in ``spec.requires`` imports (e.g. pypsa)."""
+    for mod in spec.requires:
         try:
-            yield (f"pipeline[{spec_param_id(name, axis, size)}]", run_pipeline)
-        finally:
-            tmpdir.cleanup()
-        return
-
-    m = spec.build(size)
-
-    if phase == "matrices":
-        from benchmarks.phases import touch_matrices
-
-        yield (
-            f"benchmarks/test_matrices.py::test_matrices[{spec_param_id(name, axis, size)}]",
-            lambda: touch_matrices(m),
-        )
-
-    elif phase == "to_lp":
-        from benchmarks.phases import write_lp
-
-        tmpdir = tempfile.TemporaryDirectory()
-        lp_path = Path(tmpdir.name) / "m.lp"
-        try:
-            yield (
-                f"benchmarks/test_to_lp.py::test_to_lp[{spec_param_id(name, axis, size)}]",
-                lambda: write_lp(m, lp_path),
-            )
-        finally:
-            tmpdir.cleanup()
-
-    elif phase == "to_netcdf":
-        from benchmarks.phases import write_netcdf
-
-        tmpdir = tempfile.TemporaryDirectory()
-        nc_path = Path(tmpdir.name) / "m.nc"
-        try:
-            yield (
-                f"benchmarks/test_netcdf.py::test_to_netcdf[{spec_param_id(name, axis, size)}]",
-                lambda: write_netcdf(m, nc_path),
-            )
-        finally:
-            tmpdir.cleanup()
-
-    elif phase == "from_netcdf":
-        from benchmarks.phases import read_netcdf, write_netcdf
-
-        tmpdir = tempfile.TemporaryDirectory()
-        nc_path = Path(tmpdir.name) / "m.nc"
-        write_netcdf(m, nc_path)  # setup: written outside the tracker
-        try:
-            yield (
-                f"benchmarks/test_netcdf.py::test_from_netcdf[{spec_param_id(name, axis, size)}]",
-                lambda: read_netcdf(nc_path),
-            )
-        finally:
-            tmpdir.cleanup()
-
-    elif phase == "to_solver":
-        from benchmarks.phases import SOLVER_HANDOFFS
-
-        # Memory currently tracks only HiGHS — look it up by name so a
-        # reordering of SOLVER_HANDOFFS doesn't silently swap solvers.
-        # Older linopy releases without ``to_highspy`` skip the phase
-        # silently rather than emitting an id with no possible match.
-        highs = next((w for n, _, w in SOLVER_HANDOFFS if n == "highs"), None)
-        if highs is None:
-            return
-
-        yield (
-            (
-                f"benchmarks/test_to_solver.py::test_to_solver"
-                f"[highs-{spec_param_id(name, axis, size)}]"
-            ),
-            lambda: highs(m),
-        )
-
-    else:
-        raise ValueError(f"unknown phase: {phase!r}")
+            __import__(mod)
+        except ImportError:
+            return False
+    return True
 
 
 def run_phase(
@@ -260,67 +136,50 @@ def run_phase(
     severities: tuple[int, ...] = (),
 ) -> dict[str, float]:
     """
-    Measure peak memory for every applicable ``(spec, size)`` under one phase.
-
-    Returns a ``{test_id: peak_mib}`` mapping. Invoked once per phase as a
-    subprocess by :func:`measure` for isolation. ``repeats`` is forwarded to
-    :func:`measure_peak` so callers can dial up signal-to-noise. ``filter_expr``
-    keeps only specs whose ``<name>-<axis>=<value>`` key contains it — e.g.
-    ``"nodal_balance"`` (one spec), ``"severity"`` (patterns), ``"n="`` (models).
-    Size selection (``quick`` / ``long`` / ``sizes`` / ``severities``) shares
-    :func:`benchmarks.registry.skip_reason` with pytest so the two never drift.
+    Measure peak memory for every applicable case under one phase.
+
+    Returns a ``{test_id: peak_mib}`` mapping. The work, ids and size selection
+    come from :func:`benchmarks.phases.phase_cases` / ``skip_reason`` — the same
+    source the pytest drivers consume, so the two layers can't drift. Invoked
+    once per phase as a subprocess by :func:`measure` for isolation.
+    ``filter_expr`` keeps only cases whose id-suffix contains it (e.g.
+    ``"nodal_balance"``, ``"severity"``, ``"n="``); ``repeats`` is forwarded to
+    :func:`measure_peak`.
     """
     _require_memray()
 
-    from benchmarks.registry import all_specs, skip_reason
+    from benchmarks.phases import PHASE_NODE, phase_cases
+    from benchmarks.registry import skip_reason
 
-    tag = _phase_tag(phase)
+    node = PHASE_NODE[phase]
     results: dict[str, float] = {}
 
-    for spec in all_specs():
-        if not spec.applies_to(tag):
+    for case in phase_cases(phase):
+        if case.skip:
+            continue
+        if not _deps_available(case.spec):
+            continue
+        if skip_reason(
+            case.spec,
+            case.value,
+            quick=quick,
+            long=long,
+            sizes=sizes,
+            severities=severities,
+        ):
+            continue
+        if filter_expr and filter_expr not in case.id:
             continue
 
-        # Optional-dep gate (e.g. pypsa_scigrid needs pypsa).
-        for mod in spec.requires:
-            try:
-                __import__(mod)
-            except ImportError:
-                break
-        else:
-            for value in spec.sweep:
-                if skip_reason(
-                    spec,
-                    value,
-                    quick=quick,
-                    long=long,
-                    sizes=sizes,
-                    severities=severities,
-                ):
-                    continue
-                key = spec_param_id(spec.name, spec.axis, value)
-                if filter_expr and filter_expr not in key:
-                    continue
-                try:
-                    for test_id, action in _measurements(phase, spec, value):
-                        try:
-                            results[test_id] = _measure_peak(action, repeats=repeats)
-                            print(
-                                f"  {test_id} → {results[test_id]:.1f} MiB",
-                                file=sys.stderr,
-                            )
-                        except Exception as exc:  # noqa: BLE001
-                            print(
-                                f"  skip {test_id}: {type(exc).__name__}: {exc}",
-                                file=sys.stderr,
-                            )
-                except Exception as exc:  # noqa: BLE001
-                    print(
-                        f"  setup failed {spec.name}/{value}: "
-                        f"{type(exc).__name__}: {exc}",
-                        file=sys.stderr,
-                    )
-                gc.collect()
+        test_id = f"{node}[{case.id}]"
+        try:
+            with case.run() as action:
+                peak = measure_peak(action, repeats=repeats)
+            results[test_id] = peak
+            print(f"  {test_id} → {peak:.1f} MiB", file=sys.stderr)
+        except Exception as exc:  # noqa: BLE001
+            print(f"  skip {test_id}: {type(exc).__name__}: {exc}", file=sys.stderr)
+        gc.collect()
 
     return results
 
diff --git a/benchmarks/phases.py b/benchmarks/phases.py
index 8a60fc81..6de89f55 100644
--- a/benchmarks/phases.py
+++ b/benchmarks/phases.py
@@ -14,13 +14,31 @@
 from __future__ import annotations
 
 import inspect
-from collections.abc import Callable
+import tempfile
+from collections.abc import Callable, Iterator
+from contextlib import AbstractContextManager, contextmanager
+from functools import partial
 from pathlib import Path
+from typing import NamedTuple
 
 import linopy
 import linopy.io as lio
-from benchmarks.registry import TO_GUROBIPY, TO_HIGHSPY, TO_MOSEK, TO_XPRESS
+from benchmarks.registry import (
+    BUILD,
+    FROM_NETCDF,
+    MATRICES,
+    TO_GUROBIPY,
+    TO_HIGHSPY,
+    TO_LP,
+    TO_MOSEK,
+    TO_NETCDF,
+    TO_XPRESS,
+    BenchSpec,
+    iter_params,
+    spec_param_id,
+)
 from linopy import read_netcdf
+from linopy.solvers import available_solvers
 
 # linopy <0.4.1's ``to_file`` doesn't accept ``progress``. Check once
 # at import so the benchmark loop stays branchless on the hot path.
@@ -84,3 +102,142 @@ def write_netcdf(m: linopy.Model, path: Path) -> None:
     )
     if wrapper is not None
 )
+
+
+Action = Callable[[], object]
+CaseFactory = Callable[[], AbstractContextManager[Action]]
+# Per-phase builder: ``(spec, value)`` → context manager yielding the action.
+_CaseBuilder = Callable[[BenchSpec, int], AbstractContextManager[Action]]
+
+PIPELINE = "pipeline"
+
+# Phase name → pytest node id; a case's id-suffix fills the trailing ``[...]``.
+PHASE_NODE: dict[str, str] = {
+    BUILD: "benchmarks/test_build.py::test_build",
+    MATRICES: "benchmarks/test_matrices.py::test_matrices",
+    TO_LP: "benchmarks/test_to_lp.py::test_to_lp",
+    TO_NETCDF: "benchmarks/test_netcdf.py::test_to_netcdf",
+    FROM_NETCDF: "benchmarks/test_netcdf.py::test_from_netcdf",
+    "to_solver": "benchmarks/test_to_solver.py::test_to_solver",
+    PIPELINE: "benchmarks/test_pipeline.py::test_pipeline",
+}
+
+
+class PhaseCase(NamedTuple):
+    """One parametrization of a phase — what both drivers consume."""
+
+    spec: BenchSpec
+    value: int  # swept value handed to ``spec.build``
+    id: str  # id-suffix that fills the test id's ``[...]``
+    run: CaseFactory  # opens setup/teardown and yields the measured action
+    skip: str | None  # skip reason, or ``None`` when the case should run
+
+
+@contextmanager
+def _build_case(spec: BenchSpec, value: int) -> Iterator[Action]:
+    """The build itself is the measured action; nothing is set up or torn down."""
+    yield lambda: spec.build(value)
+
+
+@contextmanager
+def _matrices_case(spec: BenchSpec, value: int) -> Iterator[Action]:
+    """Build the model up front; the action only calls ``touch_matrices``."""
+    m = spec.build(value)
+    yield lambda: touch_matrices(m)
+
+
+@contextmanager
+def _to_lp_case(spec: BenchSpec, value: int) -> Iterator[Action]:
+    """Build up front; the action writes the LP into a temp dir removed on exit."""
+    m = spec.build(value)
+    with tempfile.TemporaryDirectory() as d:
+        path = Path(d) / "model.lp"
+        yield lambda: write_lp(m, path)
+
+
+@contextmanager
+def _to_netcdf_case(spec: BenchSpec, value: int) -> Iterator[Action]:
+    """Build up front; the action writes netCDF into a temp dir removed on exit."""
+    m = spec.build(value)
+    with tempfile.TemporaryDirectory() as d:
+        path = Path(d) / "model.nc"
+        yield lambda: write_netcdf(m, path)
+
+
+@contextmanager
+def _from_netcdf_case(spec: BenchSpec, value: int) -> Iterator[Action]:
+    """Build and write the netCDF file up front; the action only reads it back."""
+    m = spec.build(value)
+    with tempfile.TemporaryDirectory() as d:
+        path = Path(d) / "model.nc"
+        write_netcdf(m, path)  # setup: written before the action is handed out
+        yield lambda: read_netcdf(path)
+
+
+@contextmanager
+def _solver_case(
+    spec: BenchSpec, value: int, wrapper: Callable[[linopy.Model], object]
+) -> Iterator[Action]:
+    """Build up front; the action only hands the built model to ``wrapper``."""
+    m = spec.build(value)
+    yield lambda: wrapper(m)
+
+
+@contextmanager
+def _pipeline_case(spec: BenchSpec, value: int) -> Iterator[Action]:
+    """One action covering build → ``touch_matrices`` → LP write (build included)."""
+    with tempfile.TemporaryDirectory() as d:
+        path = Path(d) / "model.lp"
+
+        def action() -> None:
+            m = spec.build(value)
+            touch_matrices(m)
+            write_lp(m, path)
+
+        yield action
+
+
+# Phase name → (tag passed to ``iter_params``, case builder). The pipeline phase
+# sweeps the ``TO_LP`` params (its action ends in ``write_lp``).
+_PHASE_CASE: dict[str, tuple[str, _CaseBuilder]] = {
+    BUILD: (BUILD, _build_case),
+    MATRICES: (MATRICES, _matrices_case),
+    TO_LP: (TO_LP, _to_lp_case),
+    TO_NETCDF: (TO_NETCDF, _to_netcdf_case),
+    FROM_NETCDF: (FROM_NETCDF, _from_netcdf_case),
+    PIPELINE: (TO_LP, _pipeline_case),
+}
+
+
+def phase_cases(phase: str) -> Iterator[PhaseCase]:
+    """
+    Yield every parametrization of one phase as a runnable case.
+
+    The single source of truth for "what runs + its id", shared by the pytest
+    drivers and the memray engine.
+
+    ``to_solver`` yields one case per ``SOLVER_HANDOFFS`` entry and
+    ``(spec, value)`` pair, with the solver name prefixed to the id; cases for
+    solvers that aren't installed are still yielded, with ``skip`` set to the
+    reason. Every other phase yields one case per ``(spec, value)`` pair and
+    leaves ``skip`` as ``None``.
+
+    Raises :class:`ValueError` for an unknown ``phase`` — on first iteration,
+    since this is a generator.
+    """
+    if phase == "to_solver":
+        for name, tag, wrapper in SOLVER_HANDOFFS:
+            skip = None if name in available_solvers else f"{name} not installed"
+            for spec, value in iter_params(tag):
+                sfx = f"{name}-{spec_param_id(spec.name, spec.axis, value)}"
+                run = partial(_solver_case, spec, value, wrapper)
+                yield PhaseCase(spec, value, sfx, run, skip)
+        return
+
+    try:
+        tag, factory = _PHASE_CASE[phase]
+    except KeyError:
+        raise ValueError(f"unknown phase: {phase!r}") from None
+    for spec, value in iter_params(tag):
+        sfx = spec_param_id(spec.name, spec.axis, value)
+        yield PhaseCase(spec, value, sfx, partial(factory, spec, value), None)
diff --git a/benchmarks/registry.py b/benchmarks/registry.py
index a1ad3d41..87d5c5fe 100644
--- a/benchmarks/registry.py
+++ b/benchmarks/registry.py
@@ -244,9 +244,18 @@ def iter_params(
     ]
 
 
-def param_ids(params: list[tuple[BenchSpec, int]]) -> list[str]:
-    from benchmarks.snapshot import spec_param_id
+def spec_param_id(name: str, axis: str, value: object) -> str:
+    """
+    The ``<name>-<axis>=<value>`` fragment that fills a test id's ``[...]``.
+
+    Single source of truth for the parametrize-id shape — pytest param ids
+    (:func:`param_ids`), memory test ids and solver-handoff ids build on it. Keep it
+    in lock-step with :func:`benchmarks.snapshot.parse_test_id`, which reads it back.
+    """
+    return f"{name}-{axis}={value}"
+
 
+def param_ids(params: list[tuple[BenchSpec, int]]) -> list[str]:
     return [spec_param_id(spec.name, spec.axis, value) for spec, value in params]
 
 
diff --git a/benchmarks/snapshot.py b/benchmarks/snapshot.py
index 193c780e..163722fe 100644
--- a/benchmarks/snapshot.py
+++ b/benchmarks/snapshot.py
@@ -52,18 +52,6 @@ def parse_test_id(test_id: str) -> tuple[str, str, int | None, str]:
     return "other", "other", None, "other"
 
 
-def spec_param_id(name: str, axis: str, value: object) -> str:
-    """
-    The ``<name>-<axis>=<value>`` fragment that fills a test id's ``[...]``.
-
-    The single source of truth for the parametrize-id shape — pytest param ids
-    (:func:`benchmarks.registry.param_ids`), the memory grid's test ids, and
-    the solver-handoff ids all build on it, and :func:`parse_test_id` reads it
-    back. Keep it in lock-step with ``_SIZE_RE``.
-    """
-    return f"{name}-{axis}={value}"
-
-
 def synth_test_id(
     label: str,
     *,
@@ -84,6 +72,8 @@ def synth_test_id(
     — still fine for ``compare``). A partial spec is ambiguous and rejected.
     """
     if spec is not None and size is not None and phase is not None:
+        from benchmarks.registry import spec_param_id
+
         return f"bench::{phase}[{spec_param_id(spec, axis, size)}]"
     if spec is not None or size is not None or phase is not None:
         raise ValueError(
diff --git a/benchmarks/test_build.py b/benchmarks/test_build.py
index 5bb3430b..119def51 100644
--- a/benchmarks/test_build.py
+++ b/benchmarks/test_build.py
@@ -6,18 +6,17 @@
 
 import pytest
 
-from benchmarks.conftest import maybe_skip
-from benchmarks.registry import BUILD, ModelSpec, iter_params, param_ids
+from benchmarks.conftest import run_case
+from benchmarks.phases import PhaseCase, phase_cases
+from benchmarks.registry import BUILD
 
-_PARAMS = iter_params(BUILD)
+_CASES = list(phase_cases(BUILD))
 
 
-@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+@pytest.mark.parametrize("case", _CASES, ids=[c.id for c in _CASES])
 def test_build(
     benchmark: Callable[..., object],
-    spec: ModelSpec,
-    size: int,
+    case: PhaseCase,
     request: pytest.FixtureRequest,
 ) -> None:
-    maybe_skip(request, spec, size)
-    benchmark(spec.build, size)
+    run_case(benchmark, case, request)
diff --git a/benchmarks/test_matrices.py b/benchmarks/test_matrices.py
index f985aec3..d97b5230 100644
--- a/benchmarks/test_matrices.py
+++ b/benchmarks/test_matrices.py
@@ -6,20 +6,17 @@
 
 import pytest
 
-from benchmarks.conftest import maybe_skip
-from benchmarks.phases import touch_matrices
-from benchmarks.registry import MATRICES, ModelSpec, iter_params, param_ids
+from benchmarks.conftest import run_case
+from benchmarks.phases import PhaseCase, phase_cases
+from benchmarks.registry import MATRICES
 
-_PARAMS = iter_params(MATRICES)
+_CASES = list(phase_cases(MATRICES))
 
 
-@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+@pytest.mark.parametrize("case", _CASES, ids=[c.id for c in _CASES])
 def test_matrices(
     benchmark: Callable[..., object],
-    spec: ModelSpec,
-    size: int,
+    case: PhaseCase,
     request: pytest.FixtureRequest,
 ) -> None:
-    maybe_skip(request, spec, size)
-    m = spec.build(size)
-    benchmark(touch_matrices, m)
+    run_case(benchmark, case, request)
diff --git a/benchmarks/test_netcdf.py b/benchmarks/test_netcdf.py
index 072ba22e..ce48df7d 100644
--- a/benchmarks/test_netcdf.py
+++ b/benchmarks/test_netcdf.py
@@ -9,48 +9,30 @@
 from __future__ import annotations
 
 from collections.abc import Callable
-from pathlib import Path
 
 import pytest
 
-from benchmarks.conftest import maybe_skip
-from benchmarks.phases import read_netcdf, write_netcdf
-from benchmarks.registry import (
-    FROM_NETCDF,
-    TO_NETCDF,
-    ModelSpec,
-    iter_params,
-    param_ids,
-)
+from benchmarks.conftest import run_case
+from benchmarks.phases import PhaseCase, phase_cases
+from benchmarks.registry import FROM_NETCDF, TO_NETCDF
 
-_WRITE_PARAMS = iter_params(TO_NETCDF)
-_READ_PARAMS = iter_params(FROM_NETCDF)
+_WRITE_CASES = list(phase_cases(TO_NETCDF))
+_READ_CASES = list(phase_cases(FROM_NETCDF))
 
 
-@pytest.mark.parametrize("spec,size", _WRITE_PARAMS, ids=param_ids(_WRITE_PARAMS))
+@pytest.mark.parametrize("case", _WRITE_CASES, ids=[c.id for c in _WRITE_CASES])
 def test_to_netcdf(
     benchmark: Callable[..., object],
-    spec: ModelSpec,
-    size: int,
+    case: PhaseCase,
     request: pytest.FixtureRequest,
-    tmp_path: Path,
 ) -> None:
-    maybe_skip(request, spec, size)
-    m = spec.build(size)
-    out = tmp_path / "model.nc"
-    benchmark(write_netcdf, m, out)
+    run_case(benchmark, case, request)
 
 
-@pytest.mark.parametrize("spec,size", _READ_PARAMS, ids=param_ids(_READ_PARAMS))
+@pytest.mark.parametrize("case", _READ_CASES, ids=[c.id for c in _READ_CASES])
 def test_from_netcdf(
     benchmark: Callable[..., object],
-    spec: ModelSpec,
-    size: int,
+    case: PhaseCase,
     request: pytest.FixtureRequest,
-    tmp_path: Path,
 ) -> None:
-    maybe_skip(request, spec, size)
-    m = spec.build(size)
-    out = tmp_path / "model.nc"
-    write_netcdf(m, out)
-    benchmark(read_netcdf, out)
+    run_case(benchmark, case, request)
diff --git a/benchmarks/test_pipeline.py b/benchmarks/test_pipeline.py
new file mode 100644
index 00000000..77a3bc2b
--- /dev/null
+++ b/benchmarks/test_pipeline.py
@@ -0,0 +1,29 @@
+"""
+End-to-end pipeline benchmark: build → matrices → LP write in one region.
+
+Opt-in (deselected unless ``--pipeline``): it re-runs the per-phase work and,
+unlike the individual phase benchmarks, *includes the model build* — so it
+captures the end-to-end cost/peak a real build-then-export session hits, which
+can't be recovered by summing the marginal per-phase numbers. The memory side
+measures the same thing via ``... --metric memory --phase pipeline``.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+
+import pytest
+
+from benchmarks.conftest import run_case
+from benchmarks.phases import PIPELINE, PhaseCase, phase_cases
+
+_CASES = list(phase_cases(PIPELINE))
+
+
+@pytest.mark.parametrize("case", _CASES, ids=[c.id for c in _CASES])
+def test_pipeline(
+    benchmark: Callable[..., object],
+    case: PhaseCase,
+    request: pytest.FixtureRequest,
+) -> None:
+    run_case(benchmark, case, request)
diff --git a/benchmarks/test_to_lp.py b/benchmarks/test_to_lp.py
index de05e5b1..5adfe686 100644
--- a/benchmarks/test_to_lp.py
+++ b/benchmarks/test_to_lp.py
@@ -3,26 +3,20 @@
 from __future__ import annotations
 
 from collections.abc import Callable
-from pathlib import Path
 
 import pytest
 
-from benchmarks.conftest import maybe_skip
-from benchmarks.phases import write_lp
-from benchmarks.registry import TO_LP, ModelSpec, iter_params, param_ids
+from benchmarks.conftest import run_case
+from benchmarks.phases import PhaseCase, phase_cases
+from benchmarks.registry import TO_LP
 
-_PARAMS = iter_params(TO_LP)
+_CASES = list(phase_cases(TO_LP))
 
 
-@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+@pytest.mark.parametrize("case", _CASES, ids=[c.id for c in _CASES])
 def test_to_lp(
     benchmark: Callable[..., object],
-    spec: ModelSpec,
-    size: int,
+    case: PhaseCase,
     request: pytest.FixtureRequest,
-    tmp_path: Path,
 ) -> None:
-    maybe_skip(request, spec, size)
-    m = spec.build(size)
-    lp_file = tmp_path / "model.lp"
-    benchmark(write_lp, m, lp_file)
+    run_case(benchmark, case, request)
diff --git a/benchmarks/test_to_solver.py b/benchmarks/test_to_solver.py
index edc852e7..17da275a 100644
--- a/benchmarks/test_to_solver.py
+++ b/benchmarks/test_to_solver.py
@@ -18,40 +18,16 @@
 
 import pytest
 
-from benchmarks.conftest import maybe_skip
-from benchmarks.phases import SOLVER_HANDOFFS
-from benchmarks.registry import ModelSpec, iter_params
-from benchmarks.snapshot import spec_param_id
-from linopy.solvers import available_solvers
-
-
-def _make_params() -> list[object]:
-    out: list[object] = []
-    for solver_name, phase, wrapper in SOLVER_HANDOFFS:
-        for spec, size in iter_params(phase):
-            out.append(
-                pytest.param(
-                    solver_name,
-                    wrapper,
-                    spec,
-                    size,
-                    id=f"{solver_name}-{spec_param_id(spec.name, spec.axis, size)}",
-                )
-            )
-    return out
-
-
-@pytest.mark.parametrize("solver_name,wrapper,spec,size", _make_params())
+from benchmarks.conftest import run_case
+from benchmarks.phases import PhaseCase, phase_cases
+
+_CASES = list(phase_cases("to_solver"))
+
+
+@pytest.mark.parametrize("case", _CASES, ids=[c.id for c in _CASES])
 def test_to_solver(
     benchmark: Callable[..., object],
-    solver_name: str,
-    wrapper: Callable[..., object],
-    spec: ModelSpec,
-    size: int,
+    case: PhaseCase,
     request: pytest.FixtureRequest,
 ) -> None:
-    if solver_name not in available_solvers:
-        pytest.skip(f"{solver_name} not installed")
-    maybe_skip(request, spec, size)
-    model = spec.build(size)
-    benchmark(wrapper, model)
+    run_case(benchmark, case, request)