fluxopt · FBumann · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -14,8 +14,8 @@ This README only covers install and how to open the walkthrough.
 
 ## Models vs patterns
 
-Two kinds of benchmark spec, same harness (time + peak memory, same phases),
-distinguished by their sweep axis:
+Two kinds of benchmark spec share one harness (time *or* peak memory, picked
+with the `--metric` flag on `run`/`sweep`; same phases) and differ by sweep axis:
 
 - **Models** (`models/`, `REGISTRY`) — whole `linopy.Model`s swept over
   `size` (axis `n`): "how does cost scale with the problem?"

diff --git a/benchmarks/cli/__init__.py b/benchmarks/cli/__init__.py
@@ -25,7 +25,6 @@
 from benchmarks.cli import sweep  # noqa: F401
 from benchmarks.cli import compare  # noqa: F401
 from benchmarks.cli import plot  # noqa: F401
-from benchmarks.cli import memory  # noqa: F401
 
 # isort: on
 

diff --git a/benchmarks/cli/_base.py b/benchmarks/cli/_base.py
@@ -2,21 +2,37 @@
 Shared app object, types, and helpers for the benchmark CLI.
 
 The command groups (``introspect``, ``run``, ``sweep``, ``compare``,
-``plot``, ``memory``) all register onto the single ``app`` defined here, so
-the user-facing command surface stays flat (``python -m benchmarks run`` etc.).
+``plot``) all register onto the single ``app`` defined here, so the
+user-facing command surface stays flat (``python -m benchmarks run`` etc.).
+Time vs memory is a ``--metric`` flag on ``run``/``sweep``, not a sub-app.
 
 Note on colour: ``typer.secho`` strips colour automatically when stdout isn't
 a TTY, so piping any command into ``grep`` still yields plain text.
 """
 
 from __future__ import annotations
 
+from enum import StrEnum
 from typing import Literal
 
 import typer
 
 from benchmarks.snapshot import discover_snapshots
 
+
+class Measure(StrEnum):
+    """
+    Metric that a measuring command records; independent of the workflow.
+
+    ``time`` runs pytest-benchmark (wall clock); ``memory`` tracks peak RSS
+    via memray; ``both`` runs the two back to back, never concurrently,
+    because memray's overhead would skew the wall-clock numbers.
+    """
+
+    time = "time"
+    memory = "memory"
+    both = "both"
+
 app = typer.Typer(
     help=(
         "Linopy internal benchmark suite — a thin layer over pytest plus "
@@ -26,13 +42,6 @@
     rich_markup_mode="rich",
 )
 
-memory_app = typer.Typer(
-    help="Peak-RSS memory snapshots (pytest-memray under the hood).",
-    no_args_is_help=True,
-)
-app.add_typer(memory_app, name="memory")
-
-
 PhaseName = Literal[
     "build", "matrices", "to_lp", "to_netcdf", "from_netcdf", "to_solver"
 ]

diff --git a/benchmarks/cli/compare.py b/benchmarks/cli/compare.py
@@ -16,11 +16,11 @@
 )
 def compare(ctx: typer.Context) -> None:
     """
-    Compare timing snapshots side-by-side via ``pytest-benchmark compare``.
+    Compare two snapshots side-by-side — timing or memory, auto-detected.
 
-    Thin wrapper around the upstream tool so the whole suite stays under
-    one entry point. Pass the snapshot paths first, then any pytest-benchmark
-    flags::
+    Timing snapshots wrap ``pytest-benchmark compare``; memory snapshots
+    (``peak_mib`` key) get a peak-RSS table. Pass the snapshot paths first,
+    then any pytest-benchmark flags (timing only)::
 
         python -m benchmarks compare a.json b.json
         python -m benchmarks compare a.json b.json --group-by=name
@@ -29,8 +29,9 @@ def compare(ctx: typer.Context) -> None:
     With no arguments (or missing paths), prints what snapshots exist
     under ``.benchmarks/`` so you can copy-paste the path you want.
 
-    For memory snapshots use ``memory compare`` instead — different format,
-    different tool.
+    Memory and timing snapshots can't be mixed in one call, and a memory
+    comparison takes exactly two snapshots. Either mistake prints an error
+    and exits with code 2.
 
     Implementation note: typer/click don't have a clean idiom for "list-typed
     positional + pass-through", so this command parses ``ctx.args`` by hand
@@ -62,6 +63,38 @@ def compare(ctx: typer.Context) -> None:
         _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}")
         raise typer.Exit(code=2)
 
+    # Auto-detect the metric from the snapshots (memory snapshots load as MiB,
+    # timing as s) and route accordingly — no ``memory compare`` needed.
+    # ``load_snapshot`` validates each file, so a malformed/unreadable one
+    # fails here with a clear, file-named message instead of a raw traceback.
+    from benchmarks.snapshot import load_snapshot
+
+    try:
+        units = [load_snapshot(p)[2] for p in snapshots]
+    except ValueError as exc:
+        typer.secho(str(exc), fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=2) from exc
+    is_memory = [u == "MiB" for u in units]
+    if any(is_memory):
+        if not all(is_memory):
+            typer.secho(
+                "can't compare memory and timing snapshots together",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            raise typer.Exit(code=2)
+        if len(snapshots) != 2:
+            typer.secho(
+                "memory compare takes exactly 2 snapshots",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            raise typer.Exit(code=2)
+        from benchmarks.memory import compare_snapshots
+
+        compare_snapshots(snapshots[0], snapshots[1])
+        return
+
     # Override pytest-benchmark's wide default table: ``--group-by=fullname``
     # gives each test its own (baseline, candidate) mini-table and
     # ``--columns=min,iqr`` shows the noise-floor time plus spread. Applied

diff --git a/benchmarks/cli/introspect.py b/benchmarks/cli/introspect.py
@@ -93,7 +93,7 @@ def _row(label: str, value: object) -> None:
         _row("sizes:", spec.sizes)
         _row("features:", sorted(spec.features))
         _row("phases:", sorted(spec.phases))
-        _row("quick_threshold:", spec.quick_threshold)
+        _row("quick:", spec.quick_subset)
         _row("long_threshold:", spec.long_threshold)
         if spec.requires:
             _row("requires:", list(spec.requires))
@@ -105,7 +105,7 @@ def _row(label: str, value: object) -> None:
         _row("severities:", pattern.severities)
         _row("description:", pattern.description)
         _row("phases:", sorted(pattern.phases))
-        _row("quick_threshold:", pattern.quick_threshold)
+        _row("quick:", pattern.quick_subset)
         _row("long_threshold:", pattern.long_threshold)
         if pattern.requires:
             _row("requires:", list(pattern.requires))

diff --git a/benchmarks/cli/memory.py b/benchmarks/cli/memory.py