From 3c13b797c5f330a2740df758eb77b26f885fc5e1 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Sat, 6 Jun 2026 13:17:54 +0200
Subject: [PATCH 1/4] refactor(benchmarks): --quick as explicit per-spec subset
 + manual axis selection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the quick_threshold size cap with quick_subset — the first/middle/last
of each spec's sweep — so --quick runs three representative points instead of a
threshold side-effect. This fixes two latent bugs the threshold caused:

- models ran only their smallest size under --quick (most specs set
  quick_threshold=10), so the per-PR memory signal was dominated by sub-MiB
  benchmarks;
- patterns silently skipped severity=100 (quick_threshold=50), so the densest,
  peak-memory regime never ran in CI.

Both now follow one rule (first/mid/last); patterns get all of (0, 50, 100).
pypsa_scigrid opts out via quick_sizes=().

Add manual --size / --severity pytest options (repeatable) that override the
tier flags per axis, so CI can pin exact values.

Widen DEFAULT_SEVERITIES to (0, 25, 50, 75, 100) for finer default/--long/sweep
resolution; --quick still distills to (0, 50, 100), so CodSpeed isn't rebaselined.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli/introspect.py               |  4 +-
 benchmarks/conftest.py                     | 49 ++++++++++++++---
 benchmarks/memory.py                       |  2 +-
 benchmarks/models/basic.py                 |  1 -
 benchmarks/models/expression_arithmetic.py |  1 -
 benchmarks/models/knapsack.py              |  1 -
 benchmarks/models/masked.py                |  1 -
 benchmarks/models/milp.py                  |  1 -
 benchmarks/models/piecewise.py             |  1 -
 benchmarks/models/pypsa_scigrid.py         |  8 +--
 benchmarks/models/qp.py                    |  1 -
 benchmarks/models/sos.py                   |  1 -
 benchmarks/models/sparse_network.py        |  1 -
 benchmarks/models/storage.py               |  1 -
 benchmarks/plotting.py                     |  2 +-
 benchmarks/registry.py                     | 61 +++++++++++++++++-----
 16 files changed, 96 insertions(+), 40 deletions(-)

diff --git a/benchmarks/cli/introspect.py b/benchmarks/cli/introspect.py
index 08f1038b..41e5dc3d 100644
--- a/benchmarks/cli/introspect.py
+++ b/benchmarks/cli/introspect.py
@@ -93,7 +93,7 @@ def _row(label: str, value: object) -> None:
         _row("sizes:", spec.sizes)
         _row("features:", sorted(spec.features))
         _row("phases:", sorted(spec.phases))
-        _row("quick_threshold:", spec.quick_threshold)
+        _row("quick:", spec.quick_subset)
         _row("long_threshold:", spec.long_threshold)
         if spec.requires:
             _row("requires:", list(spec.requires))
@@ -105,7 +105,7 @@ def _row(label: str, value: object) -> None:
         _row("severities:", pattern.severities)
         _row("description:", pattern.description)
         _row("phases:", sorted(pattern.phases))
-        _row("quick_threshold:", pattern.quick_threshold)
+        _row("quick:", pattern.quick_subset)
         _row("long_threshold:", pattern.long_threshold)
         if pattern.requires:
             _row("requires:", list(pattern.requires))
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
index ae1027c9..dbe5393b 100644
--- a/benchmarks/conftest.py
+++ b/benchmarks/conftest.py
@@ -33,6 +33,28 @@ def pytest_addoption(parser: pytest.Parser) -> None:
             "Default runs skip them."
         ),
     )
+    parser.addoption(
+        "--size",
+        action="append",
+        type=int,
+        default=[],
+        metavar="N",
+        help=(
+            "Run only these model sizes (repeatable). Overrides --quick/--long "
+            "for models, leaving patterns on the prevailing tier."
+        ),
+    )
+    parser.addoption(
+        "--severity",
+        action="append",
+        type=int,
+        default=[],
+        metavar="S",
+        help=(
+            "Run only these pattern severities (repeatable). Overrides "
+            "--quick/--long for patterns, leaving models on the prevailing tier."
+        ),
+    )
 
 
 def pytest_collection_modifyitems(
@@ -57,25 +79,36 @@ def pytest_collection_modifyitems(
 
 def maybe_skip(request: pytest.FixtureRequest, spec: BenchSpec, size: int) -> None:
     """
-    Apply size-tier skips and ``spec.requires`` importorskips.
+    Apply size selection and ``spec.requires`` importorskips.
 
-    Tiers (most restrictive first):
+    Selection (most specific first):
 
-    - ``--quick``                 → skip ``size > quick_threshold``
-    - default (no flag)           → skip ``size > long_threshold``
-    - ``--long``                  → no size cap
+    - ``--size N`` / ``--severity S`` → run only the listed values for that
+      axis (models read ``--size``, patterns ``--severity``); overrides tiers.
+    - ``--quick``                     → only ``spec.quick_subset``
+    - default (no flag)               → skip ``size > long_threshold``
+    - ``--long``                      → no size cap
 
-    If both ``--quick`` and ``--long`` are passed, ``--quick`` wins (the more
-    restrictive mode is honoured).
+    A manual axis flag wins over ``--quick``/``--long``; ``--quick`` in turn
+    wins over ``--long`` (the more restrictive mode is honoured).
     """
     for mod in spec.requires:
         pytest.importorskip(mod)
 
+    # Manual axis selection (e.g. from CI): --size for models, --severity for
+    # patterns. Empty list ⇒ not requested, fall through to the tier flags.
+    flag = "--severity" if spec.axis == "severity" else "--size"
+    manual = request.config.getoption(flag)
+    if manual:
+        if size not in manual:
+            pytest.skip(f"{flag}: {spec.name} {spec.axis}={size} not selected")
+        return
+
     quick = request.config.getoption("--quick")
     long_ = request.config.getoption("--long")
 
     if quick:
-        if size > spec.quick_threshold:
+        if size not in spec.quick_subset:
             pytest.skip(f"--quick: skipping {spec.name} {spec.axis}={size}")
     elif not long_:
         if size > spec.long_threshold:
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index 7f5c69a0..ac77de0f 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -281,7 +281,7 @@ def run_phase(
                 break
         else:
             for value in spec.sweep:
-                if quick and value > spec.quick_threshold:
+                if quick and value not in spec.quick_subset:
                     continue
                 key = spec_param_id(spec.name, spec.axis, value)
                 if filter_expr and filter_expr not in key:
diff --git a/benchmarks/models/basic.py b/benchmarks/models/basic.py
index 6959e188..a41f75ae 100644
--- a/benchmarks/models/basic.py
+++ b/benchmarks/models/basic.py
@@ -25,7 +25,6 @@ def build_basic(n: int) -> linopy.Model:
         build=build_basic,
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
-        quick_threshold=10,
         long_threshold=500,
     )
 )
diff --git a/benchmarks/models/expression_arithmetic.py b/benchmarks/models/expression_arithmetic.py
index 80590951..0f687f05 100644
--- a/benchmarks/models/expression_arithmetic.py
+++ b/benchmarks/models/expression_arithmetic.py
@@ -37,7 +37,6 @@ def build_expression_arithmetic(n: int) -> linopy.Model:
         build=build_expression_arithmetic,
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
-        quick_threshold=10,
         long_threshold=500,
     )
 )
diff --git a/benchmarks/models/knapsack.py b/benchmarks/models/knapsack.py
index 7860f285..33dc1ca8 100644
--- a/benchmarks/models/knapsack.py
+++ b/benchmarks/models/knapsack.py
@@ -31,7 +31,6 @@ def build_knapsack(n: int) -> linopy.Model:
         sizes=SIZES,
         features=frozenset({BINARY}),
         phases=DEFAULT_PHASES,  # HiGHS handles binary; matrices handles MILP
-        quick_threshold=100,
         long_threshold=10_000,
     )
 )
diff --git a/benchmarks/models/masked.py b/benchmarks/models/masked.py
index fccac137..ec024d8b 100644
--- a/benchmarks/models/masked.py
+++ b/benchmarks/models/masked.py
@@ -85,7 +85,6 @@ def build_masked(n: int) -> linopy.Model:
         sizes=SIZES,
         features=frozenset({CONTINUOUS, MASKED}),
         phases=DEFAULT_PHASES,
-        quick_threshold=10,
         long_threshold=500,
     )
 )
diff --git a/benchmarks/models/milp.py b/benchmarks/models/milp.py
index e762f207..98fe9f99 100644
--- a/benchmarks/models/milp.py
+++ b/benchmarks/models/milp.py
@@ -74,7 +74,6 @@ def build_milp(n: int) -> linopy.Model:
         sizes=SIZES,
         features=frozenset({INTEGER, CONTINUOUS}),
         phases=DEFAULT_PHASES,
-        quick_threshold=10,
         long_threshold=100,
     )
 )
diff --git a/benchmarks/models/piecewise.py b/benchmarks/models/piecewise.py
index 77157ba1..069d135b 100644
--- a/benchmarks/models/piecewise.py
+++ b/benchmarks/models/piecewise.py
@@ -86,7 +86,6 @@ def build_piecewise(n_gens: int) -> linopy.Model:
             # reformulation (pure MILP with binaries), which every supported
             # solver handles.
             phases=DEFAULT_PHASES,
-            quick_threshold=10,
             long_threshold=1_000,
         )
     )
diff --git a/benchmarks/models/pypsa_scigrid.py b/benchmarks/models/pypsa_scigrid.py
index 656b41b6..30641897 100644
--- a/benchmarks/models/pypsa_scigrid.py
+++ b/benchmarks/models/pypsa_scigrid.py
@@ -32,10 +32,10 @@ def build_pypsa_scigrid(snapshots: int = 100) -> linopy.Model:
         build=build_pypsa_scigrid,
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
-        # quick_threshold=0 keeps pypsa_scigrid out of --quick entirely —
-        # PyPSA import + example loading dominates the smoke wall-clock
-        # otherwise. It still runs in default and --long modes.
-        quick_threshold=0,
+        # quick_sizes=() keeps pypsa_scigrid out of --quick entirely — PyPSA
+        # import + example loading dominates the smoke wall-clock otherwise.
+        # It still runs in default and --long modes.
+        quick_sizes=(),
         long_threshold=50,
         requires=("pypsa",),
     )
diff --git a/benchmarks/models/qp.py b/benchmarks/models/qp.py
index a040df45..62b2002f 100644
--- a/benchmarks/models/qp.py
+++ b/benchmarks/models/qp.py
@@ -60,7 +60,6 @@ def build_qp(n_assets: int) -> linopy.Model:
         sizes=SIZES,
         features=frozenset({CONTINUOUS, QUADRATIC}),
         phases=DEFAULT_PHASES,
-        quick_threshold=10,
         long_threshold=1_000,
     )
 )
diff --git a/benchmarks/models/sos.py b/benchmarks/models/sos.py
index 55beab41..163b8763 100644
--- a/benchmarks/models/sos.py
+++ b/benchmarks/models/sos.py
@@ -93,7 +93,6 @@ def build_sos(n_gens: int) -> linopy.Model:
                     TO_XPRESS,
                 }
             ),
-            quick_threshold=10,
             long_threshold=1_000,
         )
     )
diff --git a/benchmarks/models/sparse_network.py b/benchmarks/models/sparse_network.py
index 7ac71db1..e213a03b 100644
--- a/benchmarks/models/sparse_network.py
+++ b/benchmarks/models/sparse_network.py
@@ -57,7 +57,6 @@ def build_sparse_network(n_buses: int) -> linopy.Model:
         build=build_sparse_network,
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
-        quick_threshold=10,
         long_threshold=500,
     )
 )
diff --git a/benchmarks/models/storage.py b/benchmarks/models/storage.py
index 27239d97..8d76ec69 100644
--- a/benchmarks/models/storage.py
+++ b/benchmarks/models/storage.py
@@ -50,7 +50,6 @@ def build_storage(n_storage: int) -> linopy.Model:
         build=build_storage,
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
-        quick_threshold=10,
         long_threshold=500,
         description="storage SoC recursion via .shift() — bidiagonal intertemporal coupling",
     )
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index e7af212f..47350d0a 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -347,7 +347,7 @@ def plot_scatter(
         **extra,
     )
     fig.add_hline(
-        y=1.0, line_dash="dash", line_color="grey", annotation_text="no change"
+        y=1.0, line_dash="dash", line_color="grey",
     )
     fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color="DarkSlateGrey")))
     if facets is not None:
diff --git a/benchmarks/registry.py b/benchmarks/registry.py
index 5c3a7802..dba2c43a 100644
--- a/benchmarks/registry.py
+++ b/benchmarks/registry.py
@@ -8,7 +8,7 @@
 - ``sizes``                        canonical tuned sizes
 - ``features``                     variable / constraint kinds it uses
 - ``phases``                       applicable phases (to_lp, to_highspy, …)
-- ``quick_threshold``              max size under ``pytest --quick``
+- ``quick_sizes``                  ``--quick`` subset (default: first/mid/last)
 - ``requires``                     modules to ``pytest.importorskip``
 
 ::
@@ -84,18 +84,34 @@
 )
 
 
+def _quick_subset(values: tuple[int, ...]) -> tuple[int, ...]:
+    """
+    The curated ``--quick`` subset of a sweep: first, middle, last.
+
+    Three representative points — a cheap smoke size, the midpoint, and the
+    peak — deduped (so 1–2 value sweeps collapse cleanly). For the default
+    severity sweep ``(0, 25, 50, 75, 100)`` this is ``(0, 50, 100)``.
+    """
+    if not values:
+        return ()
+    picks = (values[0], values[len(values) // 2], values[-1])
+    return tuple(dict.fromkeys(picks))
+
+
 @dataclass(frozen=True, repr=False)
 class ModelSpec:
     """
     Declarative description of one benchmark model.
 
-    Three size tiers gate the cost of a default ``pytest benchmarks/`` run:
+    Three tiers gate the cost of a ``pytest benchmarks/`` run:
 
-    - ``size <= quick_threshold``: included under ``--quick`` (smoke / CI).
-    - ``size <= long_threshold``: included by default (medium-cost regression).
-    - ``size >  long_threshold``: only included under ``--long`` (full sweep).
+    - ``--quick``: only ``quick_subset`` — an explicit subset (defaults to the
+      first / middle / last of ``sizes``). The per-PR / CI smoke set.
+    - default: every size up to ``long_threshold`` (medium-cost regression).
+    - ``--long``: every size, no cap.
 
-    Without explicit values, both thresholds default to "no cap".
+    ``long_threshold`` defaults to "no cap"; set ``quick_sizes`` to override the
+    derived quick subset (``()`` opts the spec out of ``--quick`` entirely).
     """
 
     name: str
@@ -103,7 +119,7 @@ class ModelSpec:
     sizes: tuple[int, ...]
     features: frozenset[str] = frozenset({CONTINUOUS})
     phases: frozenset[str] = DEFAULT_PHASES
-    quick_threshold: int = 10**9
+    quick_sizes: tuple[int, ...] | None = None
     long_threshold: int = 10**9
     requires: tuple[str, ...] = ()
     description: str = ""
@@ -118,6 +134,14 @@ def axis(self) -> str:
         """Short x-axis label for the sweep dial: a model scales by size."""
         return "n"
 
+    @property
+    def quick_subset(self) -> tuple[int, ...]:
+        """
+        Sizes that run under ``--quick`` — the derived first/mid/last,
+        unless ``quick_sizes`` overrides it (``()`` opts out entirely).
+        """
+        return _quick_subset(self.sweep) if self.quick_sizes is None else self.quick_sizes
+
     def applies_to(self, phase: str) -> bool:
         return phase in self.phases
 
@@ -140,7 +164,7 @@ def _repr_html_(self) -> str:
             ("features", ", ".join(sorted(self.features))),
             ("sizes", ", ".join(str(s) for s in self.sizes)),
             ("phases", ", ".join(sorted(self.phases))),
-            ("quick_threshold", self.quick_threshold),
+            ("quick", ", ".join(str(s) for s in self.quick_subset) or "—"),
             ("long_threshold", self.long_threshold),
             ("requires", ", ".join(self.requires) or "—"),
         ]
@@ -223,7 +247,7 @@ def param_ids(params: list[tuple[BenchSpec, int]]) -> list[str]:
 
 # --- Patterns ---------------------------------------------------------------
 
-DEFAULT_SEVERITIES: tuple[int, ...] = (0, 50, 100)
+DEFAULT_SEVERITIES: tuple[int, ...] = (0, 25, 50, 75, 100)
 
 
 class BenchSpec(Protocol):
@@ -246,7 +270,7 @@ def phases(self) -> frozenset[str]: ...
     @property
     def requires(self) -> tuple[str, ...]: ...
     @property
-    def quick_threshold(self) -> int: ...
+    def quick_subset(self) -> tuple[int, ...]: ...
     @property
     def long_threshold(self) -> int: ...
     @property
@@ -273,9 +297,10 @@ class PatternSpec:
     A pattern builds a complete model, so it runs the same ``phases`` as a model
     by default — the build-vs-export contrast (does the dense-``_term`` bloat
     reach the matrix / LP file, or collapse?) is the point. The full severity
-    range runs by default; ``--quick`` keeps everything up to the midpoint
-    (``{0, 25, 50}``) so smoke exercises real pathology, not just the benign
-    endpoint, while skipping the heaviest builds.
+    range (``0, 25, 50, 75, 100``) runs by default; ``--quick`` keeps the
+    ``quick_subset`` (first/middle/last of ``severities`` — ``(0, 50, 100)``) so
+    smoke exercises the benign, midpoint *and* worst-case shapes, while the full
+    sweep keeps the finer resolution.
     """
 
     name: str
@@ -284,7 +309,7 @@ class PatternSpec:
     severities: tuple[int, ...] = DEFAULT_SEVERITIES
     phases: frozenset[str] = DEFAULT_PHASES
     requires: tuple[str, ...] = ()
-    quick_threshold: int = 50
+    quick_sizes: tuple[int, ...] | None = None
     long_threshold: int = 10**9
 
     @property
@@ -295,6 +320,14 @@ def sweep(self) -> tuple[int, ...]:
     def axis(self) -> str:
         return "severity"
 
+    @property
+    def quick_subset(self) -> tuple[int, ...]:
+        """
+        Severities that run under ``--quick`` — the derived first/mid/last,
+        unless ``quick_sizes`` overrides it (``()`` opts out entirely).
+        """
+        return _quick_subset(self.sweep) if self.quick_sizes is None else self.quick_sizes
+
     def applies_to(self, phase: str) -> bool:
         return phase in self.phases
 

From 631fce7dedaa0c41624fe5eab9cdd94e8fa06e19 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Sat, 6 Jun 2026 13:33:51 +0200
Subject: [PATCH 2/4] feat(benchmarks): sweep --metric {time,memory,both} +
 compare auto-detect
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make the metric (time vs memory) a flag on the workflow rather than a separate
command tree — the first half of unifying the time/memory CLI split.

- Add a Measure enum (time|memory|both) and `--metric` to `sweep`, dispatching
  to the timing or memory engine (or both, sequentially — memray overhead would
  skew wall-clock if run concurrently). `both` writes per-metric subdirs so the
  two linopy-<version>.json sets don't collide. Default snapshot dirs are now
  .benchmarks/{time,memory}/ (timing default moved from .benchmarks/sweep).
- `compare` auto-detects memory snapshots (peak_mib key) and diffs them with a
  peak-RSS table; timing still goes through pytest-benchmark. Mixing memory
  and timing snapshots in one call is an error.
  Adds memory.compare_snapshots (path-based; compare() now delegates to it).

The `memory` sub-app still works; Stage 2 retires it and adds `run --metric` +
--long/--size/--severity parity on the memory engine.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli/_base.py   | 15 +++++++
 benchmarks/cli/compare.py | 31 ++++++++++++-
 benchmarks/cli/sweep.py   | 93 +++++++++++++++++++++++++++++++--------
 benchmarks/memory.py      | 11 +++--
 4 files changed, 127 insertions(+), 23 deletions(-)

diff --git a/benchmarks/cli/_base.py b/benchmarks/cli/_base.py
index 61362a5d..bde1d5fc 100644
--- a/benchmarks/cli/_base.py
+++ b/benchmarks/cli/_base.py
@@ -11,12 +11,27 @@
 
 from __future__ import annotations
 
+from enum import StrEnum
 from typing import Literal
 
 import typer
 
 from benchmarks.snapshot import discover_snapshots
 
+
+class Measure(StrEnum):
+    """
+    What a measuring command records — orthogonal to the workflow.
+
+    ``time`` runs pytest-benchmark (wall clock); ``memory`` tracks peak RSS
+    via memray; ``both`` runs them sequentially (never concurrently — memray's
+    overhead would skew the wall-clock numbers).
+    """
+
+    time = "time"
+    memory = "memory"
+    both = "both"
+
 app = typer.Typer(
     help=(
         "Linopy internal benchmark suite — a thin layer over pytest plus "
diff --git a/benchmarks/cli/compare.py b/benchmarks/cli/compare.py
index 371f9183..dc6d6399 100644
--- a/benchmarks/cli/compare.py
+++ b/benchmarks/cli/compare.py
@@ -29,8 +29,9 @@ def compare(ctx: typer.Context) -> None:
     With no arguments (or missing paths), prints what snapshots exist
     under ``.benchmarks/`` so you can copy-paste the path you want.
 
-    For memory snapshots use ``memory compare`` instead — different format,
-    different tool.
+    Memory snapshots (``peak_mib`` key) are auto-detected and diffed with a
+    peak-RSS table; timing snapshots go through pytest-benchmark. The two
+    can't be mixed in one call.
 
     Implementation note: typer/click don't have a clean idiom for "list-typed
     positional + pass-through", so this command parses ``ctx.args`` by hand
@@ -62,6 +63,32 @@ def compare(ctx: typer.Context) -> None:
         _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}")
         raise typer.Exit(code=2)
 
+    # Auto-detect the metric from the snapshots (memory snapshots carry a
+    # ``peak_mib`` key; timing ones don't) and route accordingly — no
+    # ``memory compare`` needed.
+    import json
+
+    is_memory = ["peak_mib" in json.loads(p.read_text()) for p in snapshots]
+    if any(is_memory):
+        if not all(is_memory):
+            typer.secho(
+                "can't compare memory and timing snapshots together",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            raise typer.Exit(code=2)
+        if len(snapshots) != 2:
+            typer.secho(
+                "memory compare takes exactly 2 snapshots",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            raise typer.Exit(code=2)
+        from benchmarks.memory import compare_snapshots
+
+        compare_snapshots(snapshots[0], snapshots[1])
+        return
+
     # Override pytest-benchmark's wide default table: ``--group-by=fullname``
     # gives each test its own (baseline, candidate) mini-table and
     # ``--columns=min,iqr`` shows the noise-floor time plus spread. Applied
diff --git a/benchmarks/cli/sweep.py b/benchmarks/cli/sweep.py
index 970948cf..923553cd 100644
--- a/benchmarks/cli/sweep.py
+++ b/benchmarks/cli/sweep.py
@@ -10,10 +10,11 @@
 from benchmarks.cli._base import (
     _PHASE_TEST_FILE,
     _SMOKE_PYTEST_ARGS,
+    Measure,
     PhaseName,
     app,
 )
-from benchmarks.sweep import run_sweep
+from benchmarks.sweep import run_memory_sweep, run_sweep
 
 
 @app.command(
@@ -25,10 +26,35 @@ def sweep(
         list[str],
         typer.Argument(help="linopy versions, e.g. 0.4.0 0.5.0 (or any pip spec)."),
     ],
+    metric: Annotated[
+        Measure,
+        typer.Option(
+            "--metric",
+            help=(
+                "What to measure: ``time`` (pytest-benchmark wall clock), "
+                "``memory`` (peak RSS via memray), or ``both`` (sequential). "
+                "Default: time."
+            ),
+        ),
+    ] = Measure.time,
     output_dir: Annotated[
-        Path,
-        typer.Option("--output-dir", "-o", help="Where to save snapshot JSONs."),
-    ] = Path(".benchmarks/sweep"),
+        Path | None,
+        typer.Option(
+            "--output-dir",
+            "-o",
+            help=(
+                "Where to save snapshot JSONs. Default: ``.benchmarks/<metric>/`` "
+                "(``both`` writes ``time`` and ``memory`` subdirs)."
+            ),
+        ),
+    ] = None,
+    repeats: Annotated[
+        int,
+        typer.Option(
+            "--repeats",
+            help="Memory only: min-of-N peak per measurement (default 1).",
+        ),
+    ] = 1,
     long: Annotated[
         bool, typer.Option("--long", help="Include the slowest sizes.")
     ] = False,
@@ -120,17 +146,48 @@ def sweep(
     Wall-clock: roughly 1-2 minutes per version (venv + install +
     benchmarks). uv's wheel cache makes repeated runs much faster.
     """
-    test_target = _PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/"
-    run_sweep(
-        versions,
-        output_dir=output_dir,
-        test_target=test_target,
-        smoke_args=_SMOKE_PYTEST_ARGS,
-        long=long,
-        quick=quick,
-        rounds=rounds,
-        filter_expr=filter_expr,
-        smoke=smoke,
-        as_of=as_of,
-        extra_args=ctx.args,
-    )
+    # Timing-only knobs can't apply to a memory run.
+    if metric is not Measure.time and (smoke or rounds is not None):
+        typer.secho(
+            "--smoke / --rounds are timing-only (use --metric time)",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    def _timing(out: Path) -> None:
+        test_target = _PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/"
+        run_sweep(
+            versions,
+            output_dir=out,
+            test_target=test_target,
+            smoke_args=_SMOKE_PYTEST_ARGS,
+            long=long,
+            quick=quick,
+            rounds=rounds,
+            filter_expr=filter_expr,
+            smoke=smoke,
+            as_of=as_of,
+            extra_args=ctx.args,
+        )
+
+    def _memory(out: Path) -> None:
+        run_memory_sweep(
+            versions,
+            output_dir=out,
+            quick=quick,
+            phases=[phase] if phase is not None else None,
+            repeats=repeats,
+            as_of=as_of,
+        )
+
+    # ``both`` runs sequentially into per-metric subdirs so the two
+    # ``linopy-<version>.json`` snapshot sets never collide.
+    if metric is Measure.both:
+        base = output_dir
+        _timing(base / "time" if base else Path(".benchmarks/time"))
+        _memory(base / "memory" if base else Path(".benchmarks/memory"))
+    elif metric is Measure.memory:
+        _memory(output_dir or Path(".benchmarks/memory"))
+    else:
+        _timing(output_dir or Path(".benchmarks/time"))
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index ac77de0f..4d8285c2 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -385,16 +385,21 @@ def save(
 
 
 def compare(label_a: str, label_b: str) -> None:
-    """Diff two saved memory snapshots side-by-side."""
+    """Diff two saved memory snapshots (by label) side-by-side."""
     path_a = RESULTS_DIR / f"{label_a}.json"
     path_b = RESULTS_DIR / f"{label_b}.json"
     for p in (path_a, path_b):
         if not p.exists():
             print(f"Not found: {p}. Run 'save {p.stem}' first.", file=sys.stderr)
             sys.exit(1)
+    compare_snapshots(path_a, path_b)
 
-    data_a = json.loads(path_a.read_text())["peak_mib"]
-    data_b = json.loads(path_b.read_text())["peak_mib"]
+
+def compare_snapshots(path_a: Path, path_b: Path) -> None:
+    """Diff two memory snapshots (by path) side-by-side."""
+    label_a, label_b = Path(path_a).stem, Path(path_b).stem
+    data_a = json.loads(Path(path_a).read_text())["peak_mib"]
+    data_b = json.loads(Path(path_b).read_text())["peak_mib"]
 
     all_tests = sorted(set(data_a) | set(data_b))
 

From 60e3bfc96eaf5f37fc01b03d65f6ed8cd2da4491 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Sat, 6 Jun 2026 16:20:59 +0200
Subject: [PATCH 3/4] =?UTF-8?q?feat(benchmarks):=20retire=20memory=20sub-a?=
 =?UTF-8?q?pp=20=E2=80=94=20`run=20--metric`=20+=20unified=20gating?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Finish folding the time/memory split into a single command surface. Memory is
now a --metric flag everywhere, not a parallel `memory` sub-app.

- `run --metric {time,memory,both}` replaces `memory save`: prints results,
  saves only with --json (label = filename stem) — one rule for both metrics.
  Adds --quick/--size/--severity/--repeats so flags match across metrics.
- Delete the `memory` sub-app (cli/memory.py, memory_app); `memory sweep` →
  `sweep --metric memory`, `memory compare` → `compare` (auto-detect, Stage 1).
- Unify size/severity selection in registry.skip_reason(), shared by
  conftest.maybe_skip and memory.run_phase so the two engines can't drift.
  Threads --long/--size/--severity through the memory worker; the id-alignment
  test still passes.
- Split memory.save() into measure() (returns the dict) + save() (writes), so
  `run --metric memory` can measure-then-maybe-write.
- run_memory_sweep now invokes `run --metric memory --json <abs>` and writes
  straight to output_dir (no save-to-default-then-move dance); gains --long.
- Update the walkthrough (CI-executed, re-runs clean) and benchmarks/README.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/README.md       |   4 +-
 benchmarks/cli/__init__.py |   1 -
 benchmarks/cli/_base.py    |  12 +--
 benchmarks/cli/compare.py  |   8 +-
 benchmarks/cli/memory.py   | 169 -------------------------------------
 benchmarks/cli/run.py      | 158 ++++++++++++++++++++++++++--------
 benchmarks/cli/sweep.py    |   1 +
 benchmarks/conftest.py     |  32 +++----
 benchmarks/memory.py       |  80 +++++++++++++++---
 benchmarks/registry.py     |  34 ++++++++
 benchmarks/sweep.py        |  31 ++++---
 benchmarks/walkthrough.md  |  50 ++++++-----
 12 files changed, 288 insertions(+), 292 deletions(-)
 delete mode 100644 benchmarks/cli/memory.py

diff --git a/benchmarks/README.md b/benchmarks/README.md
index d264f682..566d5e30 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -14,8 +14,8 @@ This README only covers install and how to open the walkthrough.
 
 ## Models vs patterns
 
-Two kinds of benchmark spec, same harness (time + peak memory, same phases),
-distinguished by their sweep axis:
+Two kinds of benchmark spec sharing one harness and the same phases (time and/or
+peak memory, via `run`/`sweep --metric`), distinguished by their sweep axis:
 
 - **Models** (`models/`, `REGISTRY`) — whole `linopy.Model`s swept over
   `size` (axis `n`): "how does cost scale with the problem?"
diff --git a/benchmarks/cli/__init__.py b/benchmarks/cli/__init__.py
index 0d71afe0..1f7c4738 100644
--- a/benchmarks/cli/__init__.py
+++ b/benchmarks/cli/__init__.py
@@ -25,7 +25,6 @@
 from benchmarks.cli import sweep  # noqa: F401
 from benchmarks.cli import compare  # noqa: F401
 from benchmarks.cli import plot  # noqa: F401
-from benchmarks.cli import memory  # noqa: F401
 
 # isort: on
 
diff --git a/benchmarks/cli/_base.py b/benchmarks/cli/_base.py
index bde1d5fc..433b3f17 100644
--- a/benchmarks/cli/_base.py
+++ b/benchmarks/cli/_base.py
@@ -2,8 +2,9 @@
 Shared app object, types, and helpers for the benchmark CLI.
 
 The command groups (``introspect``, ``run``, ``sweep``, ``compare``,
-``plot``, ``memory``) all register onto the single ``app`` defined here, so
-the user-facing command surface stays flat (``python -m benchmarks run`` etc.).
+``plot``) all register onto the single ``app`` defined here, so the
+user-facing command surface stays flat (``python -m benchmarks run`` etc.).
+Time vs memory is a ``--metric`` flag on ``run``/``sweep``, not a sub-app.
 
 Note on colour: ``typer.secho`` strips colour automatically when stdout isn't
 a TTY, so piping any command into ``grep`` still yields plain text.
@@ -41,13 +42,6 @@ class Measure(StrEnum):
     rich_markup_mode="rich",
 )
 
-memory_app = typer.Typer(
-    help="Peak-RSS memory snapshots (pytest-memray under the hood).",
-    no_args_is_help=True,
-)
-app.add_typer(memory_app, name="memory")
-
-
 PhaseName = Literal[
     "build", "matrices", "to_lp", "to_netcdf", "from_netcdf", "to_solver"
 ]
diff --git a/benchmarks/cli/compare.py b/benchmarks/cli/compare.py
index dc6d6399..60b5b1b3 100644
--- a/benchmarks/cli/compare.py
+++ b/benchmarks/cli/compare.py
@@ -16,11 +16,11 @@
 )
 def compare(ctx: typer.Context) -> None:
     """
-    Compare timing snapshots side-by-side via ``pytest-benchmark compare``.
+    Compare two snapshots side-by-side — timing or memory, auto-detected.
 
-    Thin wrapper around the upstream tool so the whole suite stays under
-    one entry point. Pass the snapshot paths first, then any pytest-benchmark
-    flags::
+    Timing snapshots are passed to ``pytest-benchmark compare``; memory
+    snapshots (``peak_mib`` key) get a peak-RSS table. Pass the snapshot paths
+    first, then any pytest-benchmark flags (timing only)::
 
         python -m benchmarks compare a.json b.json
         python -m benchmarks compare a.json b.json --group-by=name
diff --git a/benchmarks/cli/memory.py b/benchmarks/cli/memory.py
deleted file mode 100644
index af46b63a..00000000
--- a/benchmarks/cli/memory.py
+++ /dev/null
@@ -1,169 +0,0 @@
-"""Memory subcommands: ``memory save`` / ``memory sweep`` / ``memory compare``."""
-
-from __future__ import annotations
-
-from pathlib import Path
-from typing import Annotated
-
-import typer
-
-from benchmarks.cli._base import memory_app
-from benchmarks.memory import compare as memory_compare
-from benchmarks.memory import save as memory_save
-from benchmarks.sweep import run_memory_sweep
-
-
-@memory_app.command("save")
-def memory_save_cmd(
-    label: Annotated[
-        str, typer.Argument(help="Label to attach to this snapshot, e.g. a git sha.")
-    ],
-    quick: Annotated[
-        bool, typer.Option("--quick", help="Use smaller problem sizes.")
-    ] = False,
-    phase: Annotated[
-        list[str] | None,
-        typer.Option(
-            "--phase",
-            help=(
-                "Restrict measurement to these phases. Pass multiple ``--phase`` "
-                "to select more than one. Default: all (build, matrices, to_lp,"
-                " to_netcdf, from_netcdf, to_solver)."
-            ),
-        ),
-    ] = None,
-    repeats: Annotated[
-        int,
-        typer.Option(
-            "--repeats",
-            help=(
-                "Re-run each measurement N times and keep the min peak. Default "
-                "1 (single shot). Memory peaks have ~1–3 %% wobble from GC "
-                "timing, lazy-import priming, and netcdf page-cache effects — "
-                "min-of-3 tightens that signal."
-            ),
-        ),
-    ] = 1,
-    filter_expr: Annotated[
-        str | None,
-        typer.Option(
-            "--filter",
-            "-k",
-            help=(
-                "Keep only specs whose name/id contains this — e.g. "
-                "``nodal_balance`` (one spec), ``severity`` (patterns), ``n=`` "
-                "(models)."
-            ),
-        ),
-    ] = None,
-) -> None:
-    """
-    Measure peak memory across the registry × phase grid via ``memray.Tracker``.
-
-    Each ``(phase, spec, size)`` runs under its own tracker so setup
-    allocations (model construction) are excluded from the peak — only the
-    phase work itself is counted. Phases run in separate subprocesses for
-    isolation.
-
-    Results land in ``.benchmarks/memory/<label>.json``, keyed by full
-    pytest-style test IDs so ``compare`` diffs cleanly across runs that
-    selected different subsets.
-    """
-    from benchmarks.memory import ALL_MEMORY_PHASES
-
-    if phase:
-        unknown = [p for p in phase if p not in ALL_MEMORY_PHASES]
-        if unknown:
-            typer.secho(
-                f"unknown phase(s): {unknown}; valid options: {list(ALL_MEMORY_PHASES)}",
-                fg=typer.colors.RED,
-                err=True,
-            )
-            raise typer.Exit(code=2)
-    memory_save(
-        label, quick=quick, phases=phase, repeats=repeats, filter_expr=filter_expr
-    )
-
-
-@memory_app.command("sweep")
-def memory_sweep_cmd(
-    versions: Annotated[
-        list[str],
-        typer.Argument(help="linopy versions, e.g. 0.4.0 0.5.0 (or any pip spec)."),
-    ],
-    output_dir: Annotated[
-        Path,
-        typer.Option(
-            "--output-dir",
-            "-o",
-            help="Where to save snapshot JSONs.",
-        ),
-    ] = Path(".benchmarks/memory"),
-    quick: Annotated[
-        bool,
-        typer.Option("--quick", help="Use only the smallest sizes (faster sweep)."),
-    ] = False,
-    phase: Annotated[
-        list[str] | None,
-        typer.Option(
-            "--phase",
-            help=(
-                "Restrict each version's run to these phases. Pass multiple "
-                "``--phase`` to select more than one."
-            ),
-        ),
-    ] = None,
-    repeats: Annotated[
-        int,
-        typer.Option(
-            "--repeats",
-            help="min-of-N peak per measurement (default 1).",
-        ),
-    ] = 1,
-    as_of: Annotated[
-        str | None,
-        typer.Option(
-            "--as-of",
-            help=(
-                "Freeze every dep's resolution to releases on or before this "
-                "date (``YYYY-MM-DD`` or ISO 8601). Same semantics as "
-                "``sweep --as-of`` — see that command's help."
-            ),
-        ),
-    ] = None,
-) -> None:
-    """
-    Sweep peak-memory measurements across several linopy versions.
-
-    Mirrors the timing :func:`sweep` but invokes ``memory save`` inside
-    each per-version uv venv. Each version's snapshot lands at
-    ``<output-dir>/linopy-<version>.json`` and is auto-detected by
-    ``plot`` (the ``peak_mib`` key distinguishes memory from timing).
-
-    Memory peaks are much more deterministic than wall time, so
-    ``--repeats 1`` (default) is usually plenty. Use ``--repeats 3``
-    if you need <5%% regression detection.
-    """
-    run_memory_sweep(
-        versions,
-        output_dir=output_dir,
-        quick=quick,
-        phases=phase,
-        repeats=repeats,
-        as_of=as_of,
-    )
-
-
-@memory_app.command("compare")
-def memory_compare_cmd(
-    label_a: Annotated[str, typer.Argument(help="Baseline label (typically master).")],
-    label_b: Annotated[str, typer.Argument(help="Candidate label (your branch).")],
-) -> None:
-    """
-    Compare two saved memory snapshots side-by-side.
-
-    Prints a per-test table of label_a vs label_b peak RSS and a percent
-    change. Tests present in only one snapshot are shown with ``—`` for
-    the missing column.
-    """
-    memory_compare(label_a, label_b)
diff --git a/benchmarks/cli/run.py b/benchmarks/cli/run.py
index aba39689..4e060436 100644
--- a/benchmarks/cli/run.py
+++ b/benchmarks/cli/run.py
@@ -14,6 +14,7 @@
 from benchmarks.cli._base import (
     _PHASE_TEST_FILE,
     _SMOKE_PYTEST_ARGS,
+    Measure,
     PhaseName,
     app,
 )
@@ -51,6 +52,21 @@ def smoke(ctx: typer.Context) -> None:
 )
 def run(
     ctx: typer.Context,
+    metric: Annotated[
+        Measure,
+        typer.Option(
+            "--metric",
+            help=(
+                "What to measure: ``time`` (pytest-benchmark wall clock), "
+                "``memory`` (peak RSS via memray), or ``both`` (sequential). "
+                "Default: time."
+            ),
+        ),
+    ] = Measure.time,
+    quick: Annotated[
+        bool,
+        typer.Option("--quick", help="Use each spec's quick subset of sizes."),
+    ] = False,
     long: Annotated[
         bool,
         typer.Option(
@@ -68,61 +84,135 @@ def run(
             "--filter",
             "-k",
             help=(
-                "pytest ``-k`` expression selecting specs by name/id — e.g. "
-                "``basic`` (one spec), ``severity`` (patterns), "
-                "``'build and basic'``."
+                "Select specs by name/id — a pytest ``-k`` expression for time, "
+                "a substring for memory. E.g. ``basic``, ``severity``."
             ),
         ),
     ] = None,
+    size: Annotated[
+        list[int] | None,
+        typer.Option("--size", help="Run only these model sizes (repeatable)."),
+    ] = None,
+    severity: Annotated[
+        list[int] | None,
+        typer.Option(
+            "--severity", help="Run only these pattern severities (repeatable)."
+        ),
+    ] = None,
     json_out: Annotated[
         Path | None,
-        typer.Option("--json", help="Save pytest-benchmark JSON to this path."),
+        typer.Option(
+            "--json",
+            help=(
+                "Save the snapshot to this path (pytest-benchmark JSON for time, "
+                "peak-RSS JSON for memory). Without it, results are only printed."
+            ),
+        ),
     ] = None,
     rounds: Annotated[
         int | None,
         typer.Option(
             "--rounds",
             help=(
-                "Force pytest-benchmark to run exactly N rounds per test "
-                "(passes ``--benchmark-min-rounds=N --benchmark-max-time=0``). "
-                "Default: pytest-benchmark auto-tunes per test (5–40+ rounds "
-                "depending on cost). Use a fixed N for uniform measurement "
-                "across versions in a sweep."
+                "Time only: force pytest-benchmark to run exactly N rounds per "
+                "test (``--benchmark-min-rounds=N --benchmark-max-time=0``). "
+                "Default: auto-tuned per test."
             ),
         ),
     ] = None,
+    repeats: Annotated[
+        int,
+        typer.Option(
+            "--repeats",
+            help="Memory only: min-of-N peak per measurement (default 1).",
+        ),
+    ] = 1,
 ) -> None:
     """
-    Default timing run. Records timings with pytest-benchmark.
+    Single-environment benchmark run — time, memory, or both.
 
-    Without ``--long``, sizes above each spec's ``long_threshold`` are
-    skipped — keeps the wall-clock around 45s instead of several minutes.
-    Add ``--long`` for the full sweep including the heaviest sizes
-    (knapsack at 1M, basic at 1600, pypsa_scigrid at >50).
+    ``--metric time`` (default) records wall-clock with pytest-benchmark;
+    ``--metric memory`` tracks peak RSS via memray; ``--metric both`` runs
+    them sequentially. Results print to the terminal; pass ``--json PATH``
+    to also save a snapshot (same rule for either metric; rejected with ``both``).
 
-    Any trailing arguments are forwarded to pytest verbatim, e.g.::
+    Without ``--quick``/``--long``, sizes above each spec's ``long_threshold``
+    are skipped — keeps the wall-clock manageable. ``--size``/``--severity``
+    pin exact values on either axis.
 
-        python -m benchmarks run --long -- --tb=short -x
+    Examples — trailing arguments after ``--`` are forwarded to pytest (time only)::
 
-    To skip timing entirely (e.g. just verifying everything runs at a
-    bigger size), use ``smoke`` instead, or pass ``--benchmark-disable``
-    as a trailing arg.
+        python -m benchmarks run --long -- --tb=short -x
+        python -m benchmarks run --metric memory --json mem.json -k basic
     """
-    args: list[str] = []
-    args.append(_PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/")
-    if long:
-        args.append("--long")
-    args.append("--benchmark-only")
-    if json_out is not None:
-        args.extend(["--benchmark-json", str(json_out)])
-    if rounds is not None:
-        args.extend([f"--benchmark-min-rounds={rounds}", "--benchmark-max-time=0"])
-
-    if filter_expr:
-        args.extend(["-k", filter_expr])
-
-    args.extend(ctx.args)
-    _run_pytest(args)
+    sizes = tuple(size or ())
+    severities = tuple(severity or ())
+
+    if metric is not Measure.time and rounds is not None:
+        typer.secho("--rounds is timing-only", fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=2)
+    if metric is Measure.both and json_out is not None:
+        typer.secho(
+            "--json can't be used with --metric both (formats would collide); "
+            "run each metric separately to save",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    def _timing() -> None:
+        args: list[str] = []
+        args.append(_PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/")
+        if quick:
+            args.append("--quick")
+        elif long:
+            args.append("--long")
+        for s in sizes:
+            args.extend(["--size", str(s)])
+        for s in severities:
+            args.extend(["--severity", str(s)])
+        args.append("--benchmark-only")
+        if json_out is not None:
+            args.extend(["--benchmark-json", str(json_out)])
+        if rounds is not None:
+            args.extend([f"--benchmark-min-rounds={rounds}", "--benchmark-max-time=0"])
+        if filter_expr:
+            args.extend(["-k", filter_expr])
+        args.extend(ctx.args)
+        _run_pytest(args)
+
+    def _memory() -> None:
+        from benchmarks import memory as mem
+        from benchmarks.snapshot import write_memory_snapshot
+
+        results = mem.measure(
+            quick=quick,
+            phases=[phase] if phase is not None else None,
+            repeats=repeats,
+            filter_expr=filter_expr,
+            long=long,
+            sizes=sizes,
+            severities=severities,
+        )
+        if not results:
+            typer.secho("no measurements produced", fg=typer.colors.RED, err=True)
+            raise typer.Exit(code=1)
+        if json_out is not None:
+            write_memory_snapshot(json_out, json_out.stem, results)
+            typer.secho(
+                f"saved {len(results)} measurements to {json_out}",
+                fg=typer.colors.GREEN,
+            )
+        else:
+            typer.secho(
+                f"{len(results)} measurements (pass --json to save)",
+                fg=typer.colors.GREEN,
+            )
+
+    if metric in (Measure.time, Measure.both):
+        _timing()
+    if metric in (Measure.memory, Measure.both):
+        _memory()
 
 
 @app.command()
diff --git a/benchmarks/cli/sweep.py b/benchmarks/cli/sweep.py
index 923553cd..a67eaf34 100644
--- a/benchmarks/cli/sweep.py
+++ b/benchmarks/cli/sweep.py
@@ -176,6 +176,7 @@ def _memory(out: Path) -> None:
             versions,
             output_dir=out,
             quick=quick,
+            long=long,
             phases=[phase] if phase is not None else None,
             repeats=repeats,
             as_of=as_of,
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
index dbe5393b..b78390d7 100644
--- a/benchmarks/conftest.py
+++ b/benchmarks/conftest.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-from benchmarks.registry import BenchSpec
+from benchmarks.registry import BenchSpec, skip_reason
 
 # Test modules the CodSpeed instruments measure (edit to change coverage).
 # build + the two export paths: to_lp (LP text) and to_solver (direct handoff,
@@ -95,23 +95,13 @@ def maybe_skip(request: pytest.FixtureRequest, spec: BenchSpec, size: int) -> No
     for mod in spec.requires:
         pytest.importorskip(mod)
 
-    # Manual axis selection (e.g. from CI): --size for models, --severity for
-    # patterns. Empty list ⇒ not requested, fall through to the tier flags.
-    flag = "--severity" if spec.axis == "severity" else "--size"
-    manual = request.config.getoption(flag)
-    if manual:
-        if size not in manual:
-            pytest.skip(f"{flag}: {spec.name} {spec.axis}={size} not selected")
-        return
-
-    quick = request.config.getoption("--quick")
-    long_ = request.config.getoption("--long")
-
-    if quick:
-        if size not in spec.quick_subset:
-            pytest.skip(f"--quick: skipping {spec.name} {spec.axis}={size}")
-    elif not long_:
-        if size > spec.long_threshold:
-            pytest.skip(
-                f"long sweep needs --long: skipping {spec.name} {spec.axis}={size}"
-            )
+    reason = skip_reason(
+        spec,
+        size,
+        quick=request.config.getoption("--quick"),
+        long=request.config.getoption("--long"),
+        sizes=tuple(request.config.getoption("--size")),
+        severities=tuple(request.config.getoption("--severity")),
+    )
+    if reason:
+        pytest.skip(reason)
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index 4d8285c2..0f03de53 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -251,20 +251,28 @@ def run_pipeline() -> None:
 
 
 def run_phase(
-    phase: str, quick: bool = False, repeats: int = 1, filter_expr: str | None = None
+    phase: str,
+    quick: bool = False,
+    repeats: int = 1,
+    filter_expr: str | None = None,
+    long: bool = False,
+    sizes: tuple[int, ...] = (),
+    severities: tuple[int, ...] = (),
 ) -> dict[str, float]:
     """
     Measure peak memory for every applicable ``(spec, size)`` under one phase.
 
     Returns a ``{test_id: peak_mib}`` mapping. Invoked once per phase as a
-    subprocess by :func:`save` for isolation. ``repeats`` is forwarded to
+    subprocess by :func:`measure` for isolation. ``repeats`` is forwarded to
     :func:`measure_peak` so callers can dial up signal-to-noise. ``filter_expr``
     keeps only specs whose ``<name>-<axis>=<value>`` key contains it — e.g.
     ``"nodal_balance"`` (one spec), ``"severity"`` (patterns), ``"n="`` (models).
+    Size selection (``quick`` / ``long`` / ``sizes`` / ``severities``) shares
+    :func:`benchmarks.registry.skip_reason` with pytest so the two never drift.
     """
     _require_memray()
 
-    from benchmarks.registry import all_specs
+    from benchmarks.registry import all_specs, skip_reason
 
     tag = _phase_tag(phase)
     results: dict[str, float] = {}
@@ -281,7 +289,14 @@ def run_phase(
                 break
         else:
             for value in spec.sweep:
-                if quick and value not in spec.quick_subset:
+                if skip_reason(
+                    spec,
+                    value,
+                    quick=quick,
+                    long=long,
+                    sizes=sizes,
+                    severities=severities,
+                ):
                     continue
                 key = spec_param_id(spec.name, spec.axis, value)
                 if filter_expr and filter_expr not in key:
@@ -310,21 +325,24 @@ def run_phase(
     return results
 
 
-def save(
-    label: str,
+def measure(
     quick: bool = False,
     phases: list[str] | None = None,
     repeats: int = 1,
     filter_expr: str | None = None,
-) -> Path:
+    long: bool = False,
+    sizes: tuple[int, ...] = (),
+    severities: tuple[int, ...] = (),
+) -> dict[str, float]:
     """
-    Run one subprocess per phase and merge the results into ``<label>.json``.
+    Run one subprocess per phase and return merged ``{test_id: peak_mib}``.
 
     Per-phase subprocesses keep allocations from one phase out of another's
     measurement; ``memray.Tracker`` only counts what's allocated inside its
     ``with`` block, but the subprocess boundary makes the isolation total.
     ``filter_expr`` restricts which specs are measured (substring of the
-    ``<name>-<axis>=<value>`` key).
+    ``<name>-<axis>=<value>`` key); ``quick``/``long``/``sizes``/``severities``
+    select sizes the same way pytest does.
     """
     _require_memray()
 
@@ -351,6 +369,12 @@ def save(
         ]
         if quick:
             cmd.append("--quick")
+        if long:
+            cmd.append("--long")
+        for s in sizes:
+            cmd.extend(["--size", str(s)])
+        for s in severities:
+            cmd.extend(["--severity", str(s)])
         if repeats > 1:
             cmd.extend(["--repeats", str(repeats)])
         if filter_expr:
@@ -374,13 +398,35 @@ def save(
         finally:
             Path(out_tmp).unlink(missing_ok=True)
 
-    if not all_results:
+    return all_results
+
+
+def save(
+    label: str,
+    quick: bool = False,
+    phases: list[str] | None = None,
+    repeats: int = 1,
+    filter_expr: str | None = None,
+    long: bool = False,
+    sizes: tuple[int, ...] = (),
+    severities: tuple[int, ...] = (),
+) -> Path:
+    """Measure peak memory and write a snapshot to ``RESULTS_DIR/<label>.json``."""
+    results = measure(
+        quick=quick,
+        phases=phases,
+        repeats=repeats,
+        filter_expr=filter_expr,
+        long=long,
+        sizes=sizes,
+        severities=severities,
+    )
+    if not results:
         print("No measurements produced.", file=sys.stderr)
         sys.exit(1)
-
     RESULTS_DIR.mkdir(parents=True, exist_ok=True)
-    out_path = write_memory_snapshot(RESULTS_DIR / f"{label}.json", label, all_results)
-    print(f"\nSaved {len(all_results)} measurements to {out_path}", file=sys.stderr)
+    out_path = write_memory_snapshot(RESULTS_DIR / f"{label}.json", label, results)
+    print(f"\nSaved {len(results)} measurements to {out_path}", file=sys.stderr)
     return out_path
 
 
@@ -429,6 +475,11 @@ def compare_snapshots(path_a: Path, path_b: Path) -> None:
     parser.add_argument("cmd", choices=["_worker"])
     parser.add_argument("phase")
     parser.add_argument("--quick", action="store_true")
+    parser.add_argument("--long", action="store_true")
+    parser.add_argument("--size", action="append", type=int, default=[], dest="sizes")
+    parser.add_argument(
+        "--severity", action="append", type=int, default=[], dest="severities"
+    )
     parser.add_argument(
         "--repeats",
         type=int,
@@ -453,5 +504,8 @@ def compare_snapshots(path_a: Path, path_b: Path) -> None:
             quick=args.quick,
             repeats=args.repeats,
             filter_expr=args.filter_expr,
+            long=args.long,
+            sizes=tuple(args.sizes),
+            severities=tuple(args.severities),
         )
         Path(args.out).write_text(json.dumps(out))
diff --git a/benchmarks/registry.py b/benchmarks/registry.py
index dba2c43a..31e74d2f 100644
--- a/benchmarks/registry.py
+++ b/benchmarks/registry.py
@@ -382,3 +382,37 @@ def get_pattern(name: str) -> PatternSpec:
 def all_specs() -> list[BenchSpec]:
     """Every spec in the suite — models then patterns."""
     return [*REGISTRY.values(), *PATTERNS.values()]
+
+
+def skip_reason(
+    spec: BenchSpec,
+    value: int,
+    *,
+    quick: bool = False,
+    long: bool = False,
+    sizes: tuple[int, ...] = (),
+    severities: tuple[int, ...] = (),
+) -> str | None:
+    """
+    Why ``(spec, value)`` is excluded under this selection, or ``None`` to run.
+
+    Single source of truth for size/severity selection, shared by pytest
+    (``conftest.maybe_skip``) and the memory engine (``memory.run_phase``) so
+    the two can't drift. Precedence, most specific first:
+
+    - a manual axis list (``sizes`` for models, ``severities`` for patterns)
+      → run only those values;
+    - ``--quick`` → only ``spec.quick_subset``;
+    - default → skip ``value > long_threshold``;
+    - ``--long`` → no cap.
+    """
+    manual = severities if spec.axis == "severity" else sizes
+    if manual:
+        return None if value in manual else f"{spec.axis}={value} not selected"
+    if quick:
+        if value not in spec.quick_subset:
+            return f"--quick: skipping {spec.name} {spec.axis}={value}"
+        return None
+    if not long and value > spec.long_threshold:
+        return f"long sweep needs --long: skipping {spec.name} {spec.axis}={value}"
+    return None
diff --git a/benchmarks/sweep.py b/benchmarks/sweep.py
index 7015604f..ef81d905 100644
--- a/benchmarks/sweep.py
+++ b/benchmarks/sweep.py
@@ -1,7 +1,7 @@
 """
 Cross-version sweep orchestration — build a fresh per-version uv venv,
 install the pinned benchmark infra plus a target ``linopy``, and run the
-suite (timing) or ``memory save`` (peak RSS) inside it.
+suite (timing) or ``run --metric memory`` (peak RSS) inside it.
 
 The heavy provisioning loop and the two sweep bodies live here so
 ``cli.py`` stays a thin layer of typer command shims. The CLI resolves
@@ -31,7 +31,7 @@ def _benchmarks_extra_pins() -> list[str]:
     """
     Return the pins from ``pyproject.toml``'s ``[benchmarks]`` extra.
 
-    Both ``sweep`` and ``memory sweep`` install these into each
+    Both ``sweep`` and ``sweep --metric memory`` install these into each
     per-version venv. Direct pins are kept in pyproject as the single
     source of truth — bump them there and both sweeps pick up the
     change. Transitive deps resolve fresh per venv; uv's deterministic
@@ -99,7 +99,7 @@ def _provision_venvs(
     """
     Yield one fresh per-version uv venv for each linopy version.
 
-    Used by both ``sweep`` and ``memory sweep`` so the venv plumbing
+    Used by both ``sweep`` and ``sweep --metric memory`` so the venv plumbing
     (uv venv → install ``[benchmarks]`` pins + the target linopy →
     set up an isolated import root) lives in one place. The caller
     supplies the tempdir prefix (so ``ps``/``lsof`` can distinguish
@@ -348,12 +348,13 @@ def run_memory_sweep(
     *,
     output_dir: Path,
     quick: bool = False,
+    long: bool = False,
     phases: list[str] | None = None,
     repeats: int = 1,
     as_of: str | None = None,
 ) -> None:
     """
-    Memory sweep: invoke ``memory save`` in each per-version venv.
+    Memory sweep: invoke ``run --metric memory`` in each per-version venv.
 
     Mirrors :func:`run_sweep` but tracks peak RSS. Each version's
     snapshot lands at ``<output_dir>/linopy-<version>.json``.
@@ -382,16 +383,24 @@ def run_memory_sweep(
         assert prov.python is not None and prov.import_dir is not None
 
         label = f"linopy-{_snapshot_label(prov.version)}"
+        # Write the snapshot straight to the user's output_dir — an absolute
+        # path, so the ``cwd=import_dir`` subprocess still lands it there.
+        # ``run --metric memory`` uses the --json filename stem as the label.
+        target = (output_dir / f"{label}.json").resolve()
         mem_cmd = [
             str(prov.python),
             "-m",
             "benchmarks",
+            "run",
+            "--metric",
             "memory",
-            "save",
-            label,
+            "--json",
+            str(target),
         ]
         if quick:
             mem_cmd.append("--quick")
+        elif long:
+            mem_cmd.append("--long")
         for ph in phases or []:
             mem_cmd.extend(["--phase", ph])
         if repeats > 1:
@@ -400,16 +409,6 @@ def run_memory_sweep(
         typer.secho(f"$ {' '.join(mem_cmd)}", fg=typer.colors.BRIGHT_BLACK)
         subprocess.run(mem_cmd, env=prov.env, cwd=str(prov.import_dir), check=False)
 
-        # ``memory save`` writes to ``.benchmarks/memory/<label>.json``
-        # relative to its cwd — here, the isolated import_dir. Move it
-        # under the user's chosen output_dir (resolves under repo_root
-        # by default).
-        default_path = prov.import_dir / ".benchmarks" / "memory" / f"{label}.json"
-        target = output_dir / f"{label}.json"
-        if default_path.exists() and default_path.resolve() != target.resolve():
-            target.parent.mkdir(parents=True, exist_ok=True)
-            default_path.replace(target)
-
         if target.exists():
             typer.secho(f"saved {target}", fg=typer.colors.GREEN)
         else:
diff --git a/benchmarks/walkthrough.md b/benchmarks/walkthrough.md
index 0822a26b..89abdd80 100644
--- a/benchmarks/walkthrough.md
+++ b/benchmarks/walkthrough.md
@@ -81,6 +81,8 @@ os.environ["FORCE_COLOR"] = "1"
 _tmp = Path(tempfile.mkdtemp(prefix="bench-walkthrough-"))
 baseline = _tmp / "baseline.json"
 candidate = _tmp / "candidate.json"
+baseline_mem = _tmp / "baseline_mem.json"
+candidate_mem = _tmp / "candidate_mem.json"
 scatter_html = _tmp / "scatter.html"
 compare_html = _tmp / "compare.html"
 
@@ -131,15 +133,16 @@ Patterns are tagged by the `severity` axis in their test id, so the usual tools
 target them by filtering on it:
 
 ```bash
-pytest benchmarks/ -k severity                        # all patterns, every phase
-pytest benchmarks/ -k nodal_balance                   # one pattern
-python -m benchmarks run --filter severity --quick    # patterns, timing
-python -m benchmarks memory save mylabel --filter severity   # patterns, memory
+pytest benchmarks/ -k severity                                  # all patterns, every phase
+pytest benchmarks/ -k nodal_balance                             # one pattern
+python -m benchmarks run --filter severity --quick              # patterns, timing
+python -m benchmarks run --metric memory --filter severity --quick  # patterns, memory
 ```
 
-(`--filter`/`-k` selects specs by name or id substring on both `run` and
-`memory save` — `nodal_balance` for one spec, `severity` for all patterns,
-`n=` for models. `list --kind {models,patterns}` browses them.)
+(`--filter`/`-k` selects specs by name or id substring on `run` for either
+metric — `nodal_balance` for one spec, `severity` for all patterns, `n=` for
+models. Or pin exact values with `--size` / `--severity`. `list --kind
+{models,patterns}` browses them.)
 
 ## Run a timing snapshot
 
@@ -234,32 +237,33 @@ to `python -m benchmarks plot` for the rendered version.)
 
 ## Memory snapshots
 
-`memory save <label>` runs benchmarks under `memray.Tracker` and
-writes peak allocations (MiB) per `(phase, spec, size)` to
-`.benchmarks/memory/<label>.json`. The model is built **outside** the
-tracked region so peak reflects only the phase work, not model
-construction.
+Memory is just a `--metric` on the same `run` command — `--metric memory`
+tracks peak allocations (MiB) per `(phase, spec, size)` via `memray.Tracker`
+instead of wall-clock, with the same flags. The model is built **outside** the
+tracked region so peak reflects only the phase work, not model construction.
+`--json` saves the snapshot (its filename stem becomes the label).
 
 ```{code-cell} ipython3
-!python -m benchmarks memory save baseline_mem --quick --phase build --filter basic
+!python -m benchmarks run --metric memory --quick --phase build --filter basic --json {baseline_mem}
 ```
 
 ```{code-cell} ipython3
-!python -m benchmarks memory save candidate_mem --quick --phase build --filter basic
+!python -m benchmarks run --metric memory --quick --phase build --filter basic --json {candidate_mem}
 ```
 
-`memory compare` prints a per-test table of the two labels with
-percent change — same shape as the timing `compare`, different
-metric. Tests present in only one snapshot show `—` for the missing
-column.
+`compare` auto-detects memory snapshots (the `peak_mib` key) and prints a
+per-test table with percent change — same shape as the timing `compare`,
+different metric. Tests present in only one snapshot show `—` for the missing
+column. (`plot` auto-detects them too.)
 
 ```{code-cell} ipython3
-!python -m benchmarks memory compare baseline_mem candidate_mem
+!python -m benchmarks compare {baseline_mem} {candidate_mem}
 ```
 
-For cross-version memory tracking (analogous to `sweep` for timing),
-use `memory sweep <v1> <v2> ...` — same per-version venv shape, peak
-RSS metric.
+For cross-version memory tracking, use `sweep --metric memory <v1> <v2> ...`
+— the same per-version venv shape as the timing sweep, peak-RSS metric.
+`--metric both` runs time then memory in one pass (sequentially — memray's
+overhead would skew wall-clock if they ran together).
 
 Those per-phase peaks are *marginal* — each tracker sees only its own phase, so
 the resident model is excluded. The end-to-end peak a build-then-export session
@@ -268,7 +272,7 @@ opt-in `pipeline` phase (build → matrices → lp_write in one tracker). It re-
 those phases, so it's not in the default set — request it standalone:
 
 ```bash
-python -m benchmarks memory save ceiling --phase pipeline
+python -m benchmarks run --metric memory --phase pipeline --json ceiling.json
 ```
 
 ## Benchmarking custom things — the `bench` API

From 60fa3723a66728e12926908c9b844ffb5a218a06 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Sat, 6 Jun 2026 16:46:10 +0200
Subject: [PATCH 4/4] fix(benchmarks): fail cleanly on malformed/unreadable
 snapshot files

compare crashed with a raw JSONDecodeError traceback (and plot printed a
cryptic "Expecting value:" with no filename) when handed an empty or malformed
snapshot. load_snapshot now raises a clear, file-named ValueError on unreadable
JSON or an unrecognized shape; compare routes its metric detection through
load_snapshot too, so both commands report e.g.

  /tmp/x.json: not a readable JSON snapshot (Expecting value: line 1 ...)

The mixed memory/timing guard (compare's any/all, plot's _check_same_unit) was
already correct and is unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli/compare.py | 18 ++++++++++++------
 benchmarks/snapshot.py    | 10 +++++++++-
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/benchmarks/cli/compare.py b/benchmarks/cli/compare.py
index 60b5b1b3..19244edb 100644
--- a/benchmarks/cli/compare.py
+++ b/benchmarks/cli/compare.py
@@ -63,12 +63,18 @@ def compare(ctx: typer.Context) -> None:
         _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}")
         raise typer.Exit(code=2)
 
-    # Auto-detect the metric from the snapshots (memory snapshots carry a
-    # ``peak_mib`` key; timing ones don't) and route accordingly — no
-    # ``memory compare`` needed.
-    import json
-
-    is_memory = ["peak_mib" in json.loads(p.read_text()) for p in snapshots]
+    # Auto-detect the metric from the snapshots (memory snapshots load as MiB,
+    # timing as s) and route accordingly — no ``memory compare`` needed.
+    # ``load_snapshot`` validates each file, so a malformed/unreadable one
+    # fails here with a clear, file-named message instead of a raw traceback.
+    from benchmarks.snapshot import load_snapshot
+
+    try:
+        units = [load_snapshot(p)[2] for p in snapshots]
+    except ValueError as exc:
+        typer.secho(str(exc), fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=2) from exc
+    is_memory = [u == "MiB" for u in units]
     if any(is_memory):
         if not all(is_memory):
             typer.secho(
diff --git a/benchmarks/snapshot.py b/benchmarks/snapshot.py
index 07b231e0..193c780e 100644
--- a/benchmarks/snapshot.py
+++ b/benchmarks/snapshot.py
@@ -134,9 +134,17 @@ def load_snapshot(
     - memory (``{"peak_mib": {id: float}}``) → ``value`` is the peak in
       **MiB**; ``metric`` is ignored.
     """
-    data = json.loads(path.read_text())
+    try:
+        data = json.loads(path.read_text())
+    except (OSError, json.JSONDecodeError) as exc:
+        raise ValueError(f"{path}: not a readable JSON snapshot ({exc})") from exc
     if "peak_mib" in data:
         return path.stem, dict(data["peak_mib"]), "MiB"
+    if "benchmarks" not in data:
+        raise ValueError(
+            f"{path}: unrecognized snapshot shape "
+            "(no 'peak_mib' memory key or 'benchmarks' timing key)"
+        )
     values = {bm["fullname"]: bm["stats"][metric] for bm in data["benchmarks"]}
     return path.stem, values, "s"