From 3c13b797c5f330a2740df758eb77b26f885fc5e1 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Sat, 6 Jun 2026 13:17:54 +0200 Subject: [PATCH 1/4] refactor(benchmarks): --quick as explicit per-spec subset + manual axis selection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the quick_threshold size cap with quick_subset — the first/middle/last of each spec's sweep — so --quick runs three representative points instead of a threshold side-effect. This fixes two latent bugs the threshold caused: - models ran only their smallest size under --quick (most quick_threshold=10), so the per-PR memory signal was dominated by sub-MiB benchmarks; - patterns silently skipped severity=100 (quick_threshold=50), so the densest, peak-memory regime never ran in CI. Both now follow one rule (first/mid/last); patterns get all of (0, 50, 100). pypsa_scigrid opts out via quick_sizes=(). Add manual --size / --severity pytest options (repeatable) that override the tier flags per axis, so CI can pin exact values. Widen DEFAULT_SEVERITIES to (0, 25, 50, 75, 100) for finer default/--long/sweep resolution; --quick still distills to (0, 50, 100), so CodSpeed isn't rebaselined. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/cli/introspect.py | 4 +- benchmarks/conftest.py | 49 ++++++++++++++--- benchmarks/memory.py | 2 +- benchmarks/models/basic.py | 1 - benchmarks/models/expression_arithmetic.py | 1 - benchmarks/models/knapsack.py | 1 - benchmarks/models/masked.py | 1 - benchmarks/models/milp.py | 1 - benchmarks/models/piecewise.py | 1 - benchmarks/models/pypsa_scigrid.py | 8 +-- benchmarks/models/qp.py | 1 - benchmarks/models/sos.py | 1 - benchmarks/models/sparse_network.py | 1 - benchmarks/models/storage.py | 1 - benchmarks/plotting.py | 2 +- benchmarks/registry.py | 61 +++++++++++++++++----- 16 files changed, 96 insertions(+), 40 deletions(-) diff --git a/benchmarks/cli/introspect.py b/benchmarks/cli/introspect.py index 08f1038b..41e5dc3d 100644 --- a/benchmarks/cli/introspect.py +++ b/benchmarks/cli/introspect.py @@ -93,7 +93,7 @@ def _row(label: str, value: object) -> None: _row("sizes:", spec.sizes) _row("features:", sorted(spec.features)) _row("phases:", sorted(spec.phases)) - _row("quick_threshold:", spec.quick_threshold) + _row("quick:", spec.quick_subset) _row("long_threshold:", spec.long_threshold) if spec.requires: _row("requires:", list(spec.requires)) @@ -105,7 +105,7 @@ def _row(label: str, value: object) -> None: _row("severities:", pattern.severities) _row("description:", pattern.description) _row("phases:", sorted(pattern.phases)) - _row("quick_threshold:", pattern.quick_threshold) + _row("quick:", pattern.quick_subset) _row("long_threshold:", pattern.long_threshold) if pattern.requires: _row("requires:", list(pattern.requires)) diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py index ae1027c9..dbe5393b 100644 --- a/benchmarks/conftest.py +++ b/benchmarks/conftest.py @@ -33,6 +33,28 @@ def pytest_addoption(parser: pytest.Parser) -> None: "Default runs skip them." ), ) + parser.addoption( + "--size", + action="append", + type=int, + default=[], + metavar="N", + help=( + "Run only these model sizes (repeatable). Overrides --quick/--long " + "for models, leaving patterns on the prevailing tier." + ), + ) + parser.addoption( + "--severity", + action="append", + type=int, + default=[], + metavar="S", + help=( + "Run only these pattern severities (repeatable). Overrides " + "--quick/--long for patterns, leaving models on the prevailing tier." + ), + ) def pytest_collection_modifyitems( @@ -57,25 +79,36 @@ def pytest_collection_modifyitems( def maybe_skip(request: pytest.FixtureRequest, spec: BenchSpec, size: int) -> None: """ - Apply size-tier skips and ``spec.requires`` importorskips. + Apply size selection and ``spec.requires`` importorskips. - Tiers (most restrictive first): + Selection (most specific first): - - ``--quick`` → skip ``size > quick_threshold`` - - default (no flag) → skip ``size > long_threshold`` - - ``--long`` → no size cap + - ``--size N`` / ``--severity S`` → run only the listed values for that + axis (models read ``--size``, patterns ``--severity``); overrides tiers. + - ``--quick`` → only ``spec.quick_subset`` + - default (no flag) → skip ``size > long_threshold`` + - ``--long`` → no size cap - If both ``--quick`` and ``--long`` are passed, ``--quick`` wins (the more - restrictive mode is honoured). + A manual axis flag wins over ``--quick``/``--long``; ``--quick`` in turn + wins over ``--long`` (the more restrictive mode is honoured). """ for mod in spec.requires: pytest.importorskip(mod) + # Manual axis selection (e.g. from CI): --size for models, --severity for + # patterns. Empty list ⇒ not requested, fall through to the tier flags. + flag = "--severity" if spec.axis == "severity" else "--size" + manual = request.config.getoption(flag) + if manual: + if size not in manual: + pytest.skip(f"{flag}: {spec.name} {spec.axis}={size} not selected") + return + quick = request.config.getoption("--quick") long_ = request.config.getoption("--long") if quick: - if size > spec.quick_threshold: + if size not in spec.quick_subset: pytest.skip(f"--quick: skipping {spec.name} {spec.axis}={size}") elif not long_: if size > spec.long_threshold: diff --git a/benchmarks/memory.py b/benchmarks/memory.py index 7f5c69a0..ac77de0f 100644 --- a/benchmarks/memory.py +++ b/benchmarks/memory.py @@ -281,7 +281,7 @@ def run_phase( break else: for value in spec.sweep: - if quick and value > spec.quick_threshold: + if quick and value not in spec.quick_subset: continue key = spec_param_id(spec.name, spec.axis, value) if filter_expr and filter_expr not in key: diff --git a/benchmarks/models/basic.py b/benchmarks/models/basic.py index 6959e188..a41f75ae 100644 --- a/benchmarks/models/basic.py +++ b/benchmarks/models/basic.py @@ -25,7 +25,6 @@ def build_basic(n: int) -> linopy.Model: build=build_basic, sizes=SIZES, features=frozenset({CONTINUOUS}), - quick_threshold=10, long_threshold=500, ) ) diff --git a/benchmarks/models/expression_arithmetic.py b/benchmarks/models/expression_arithmetic.py index 80590951..0f687f05 100644 --- a/benchmarks/models/expression_arithmetic.py +++ b/benchmarks/models/expression_arithmetic.py @@ -37,7 +37,6 @@ def build_expression_arithmetic(n: int) -> linopy.Model: build=build_expression_arithmetic, sizes=SIZES, features=frozenset({CONTINUOUS}), - quick_threshold=10, long_threshold=500, ) ) diff --git a/benchmarks/models/knapsack.py b/benchmarks/models/knapsack.py index 7860f285..33dc1ca8 100644 --- a/benchmarks/models/knapsack.py +++ b/benchmarks/models/knapsack.py @@ -31,7 +31,6 @@ def build_knapsack(n: int) -> linopy.Model: sizes=SIZES, features=frozenset({BINARY}), phases=DEFAULT_PHASES, # HiGHS handles binary; matrices handles MILP - quick_threshold=100, long_threshold=10_000, ) ) diff --git a/benchmarks/models/masked.py b/benchmarks/models/masked.py index fccac137..ec024d8b 100644 --- a/benchmarks/models/masked.py +++ b/benchmarks/models/masked.py @@ -85,7 +85,6 @@ def build_masked(n: int) -> linopy.Model: sizes=SIZES, features=frozenset({CONTINUOUS, MASKED}), phases=DEFAULT_PHASES, - quick_threshold=10, long_threshold=500, ) ) diff --git a/benchmarks/models/milp.py b/benchmarks/models/milp.py index e762f207..98fe9f99 100644 --- a/benchmarks/models/milp.py +++ b/benchmarks/models/milp.py @@ -74,7 +74,6 @@ def build_milp(n: int) -> linopy.Model: sizes=SIZES, features=frozenset({INTEGER, CONTINUOUS}), phases=DEFAULT_PHASES, - quick_threshold=10, long_threshold=100, ) ) diff --git a/benchmarks/models/piecewise.py b/benchmarks/models/piecewise.py index 77157ba1..069d135b 100644 --- a/benchmarks/models/piecewise.py +++ b/benchmarks/models/piecewise.py @@ -86,7 +86,6 @@ def build_piecewise(n_gens: int) -> linopy.Model: # reformulation (pure MILP with binaries), which every supported # solver handles. phases=DEFAULT_PHASES, - quick_threshold=10, long_threshold=1_000, ) ) diff --git a/benchmarks/models/pypsa_scigrid.py b/benchmarks/models/pypsa_scigrid.py index 656b41b6..30641897 100644 --- a/benchmarks/models/pypsa_scigrid.py +++ b/benchmarks/models/pypsa_scigrid.py @@ -32,10 +32,10 @@ def build_pypsa_scigrid(snapshots: int = 100) -> linopy.Model: build=build_pypsa_scigrid, sizes=SIZES, features=frozenset({CONTINUOUS}), - # quick_threshold=0 keeps pypsa_scigrid out of --quick entirely — - # PyPSA import + example loading dominates the smoke wall-clock - # otherwise. It still runs in default and --long modes. - quick_threshold=0, + # quick_sizes=() keeps pypsa_scigrid out of --quick entirely — PyPSA + # import + example loading dominates the smoke wall-clock otherwise. + # It still runs in default and --long modes. + quick_sizes=(), long_threshold=50, requires=("pypsa",), ) diff --git a/benchmarks/models/qp.py b/benchmarks/models/qp.py index a040df45..62b2002f 100644 --- a/benchmarks/models/qp.py +++ b/benchmarks/models/qp.py @@ -60,7 +60,6 @@ def build_qp(n_assets: int) -> linopy.Model: sizes=SIZES, features=frozenset({CONTINUOUS, QUADRATIC}), phases=DEFAULT_PHASES, - quick_threshold=10, long_threshold=1_000, ) ) diff --git a/benchmarks/models/sos.py b/benchmarks/models/sos.py index 55beab41..163b8763 100644 --- a/benchmarks/models/sos.py +++ b/benchmarks/models/sos.py @@ -93,7 +93,6 @@ def build_sos(n_gens: int) -> linopy.Model: TO_XPRESS, } ), - quick_threshold=10, long_threshold=1_000, ) ) diff --git a/benchmarks/models/sparse_network.py b/benchmarks/models/sparse_network.py index 7ac71db1..e213a03b 100644 --- a/benchmarks/models/sparse_network.py +++ b/benchmarks/models/sparse_network.py @@ -57,7 +57,6 @@ def build_sparse_network(n_buses: int) -> linopy.Model: build=build_sparse_network, sizes=SIZES, features=frozenset({CONTINUOUS}), - quick_threshold=10, long_threshold=500, ) ) diff --git a/benchmarks/models/storage.py b/benchmarks/models/storage.py index 27239d97..8d76ec69 100644 --- a/benchmarks/models/storage.py +++ b/benchmarks/models/storage.py @@ -50,7 +50,6 @@ def build_storage(n_storage: int) -> linopy.Model: build=build_storage, sizes=SIZES, features=frozenset({CONTINUOUS}), - quick_threshold=10, long_threshold=500, description="storage SoC recursion via .shift() — bidiagonal intertemporal coupling", ) diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py index e7af212f..47350d0a 100644 --- a/benchmarks/plotting.py +++ b/benchmarks/plotting.py @@ -347,7 +347,7 @@ def plot_scatter( **extra, ) fig.add_hline( - y=1.0, line_dash="dash", line_color="grey", annotation_text="no change" + y=1.0, line_dash="dash", line_color="grey", ) fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color="DarkSlateGrey"))) if facets is not None: diff --git a/benchmarks/registry.py b/benchmarks/registry.py index 5c3a7802..dba2c43a 100644 --- a/benchmarks/registry.py +++ b/benchmarks/registry.py @@ -8,7 +8,7 @@ - ``sizes`` canonical tuned sizes - ``features`` variable / constraint kinds it uses - ``phases`` applicable phases (to_lp, to_highspy, …) -- ``quick_threshold`` max size under ``pytest --quick`` +- ``quick_sizes`` ``--quick`` subset (default: first/mid/last) - ``requires`` modules to ``pytest.importorskip`` :: @@ -84,18 +84,34 @@ ) +def _quick_subset(values: tuple[int, ...]) -> tuple[int, ...]: + """ + The curated ``--quick`` subset of a sweep: first, middle, last. + + Three representative points — a cheap smoke size, the midpoint, and the + peak — deduped (so 1–2 value sweeps collapse cleanly). For the 3-value + severity sweep this is the whole ``(0, 50, 100)``. + """ + if not values: + return () + picks = (values[0], values[len(values) // 2], values[-1]) + return tuple(dict.fromkeys(picks)) + + @dataclass(frozen=True, repr=False) class ModelSpec: """ Declarative description of one benchmark model. - Three size tiers gate the cost of a default ``pytest benchmarks/`` run: + Three tiers gate the cost of a default ``pytest benchmarks/`` run: - - ``size <= quick_threshold``: included under ``--quick`` (smoke / CI). - - ``size <= long_threshold``: included by default (medium-cost regression). - - ``size > long_threshold``: only included under ``--long`` (full sweep). + - ``--quick``: only ``quick_subset`` — an explicit subset (defaults to the + first / middle / last of ``sizes``). The per-PR / CI smoke set. + - default: every size up to ``long_threshold`` (medium-cost regression). + - ``--long``: every size, no cap. - Without explicit values, both thresholds default to "no cap". + ``long_threshold`` defaults to "no cap"; set ``quick_sizes`` to override the + derived quick subset (``()`` opts the spec out of ``--quick`` entirely). """ name: str @@ -103,7 +119,7 @@ class ModelSpec: sizes: tuple[int, ...] features: frozenset[str] = frozenset({CONTINUOUS}) phases: frozenset[str] = DEFAULT_PHASES - quick_threshold: int = 10**9 + quick_sizes: tuple[int, ...] | None = None long_threshold: int = 10**9 requires: tuple[str, ...] = () description: str = "" @@ -118,6 +134,14 @@ def axis(self) -> str: """Short x-axis label for the sweep dial: a model scales by size.""" return "n" + @property + def quick_subset(self) -> tuple[int, ...]: + """ + Sizes that run under ``--quick`` — the derived first/mid/last, + unless ``quick_sizes`` overrides it (``()`` opts out entirely). + """ + return _quick_subset(self.sweep) if self.quick_sizes is None else self.quick_sizes + def applies_to(self, phase: str) -> bool: return phase in self.phases @@ -140,7 +164,7 @@ def _repr_html_(self) -> str: ("features", ", ".join(sorted(self.features))), ("sizes", ", ".join(str(s) for s in self.sizes)), ("phases", ", ".join(sorted(self.phases))), - ("quick_threshold", self.quick_threshold), + ("quick", ", ".join(str(s) for s in self.quick_subset) or "—"), ("long_threshold", self.long_threshold), ("requires", ", ".join(self.requires) or "—"), ] @@ -223,7 +247,7 @@ def param_ids(params: list[tuple[BenchSpec, int]]) -> list[str]: # --- Patterns --------------------------------------------------------------- -DEFAULT_SEVERITIES: tuple[int, ...] = (0, 50, 100) +DEFAULT_SEVERITIES: tuple[int, ...] = (0, 25, 50, 75, 100) class BenchSpec(Protocol): @@ -246,7 +270,7 @@ def phases(self) -> frozenset[str]: ... @property def requires(self) -> tuple[str, ...]: ... @property - def quick_threshold(self) -> int: ... + def quick_subset(self) -> tuple[int, ...]: ... @property def long_threshold(self) -> int: ... @property @@ -273,9 +297,10 @@ class PatternSpec: A pattern builds a complete model, so it runs the same ``phases`` as a model by default — the build-vs-export contrast (does the dense-``_term`` bloat reach the matrix / LP file, or collapse?) is the point. The full severity - range runs by default; ``--quick`` keeps everything up to the midpoint - (``{0, 25, 50}``) so smoke exercises real pathology, not just the benign - endpoint, while skipping the heaviest builds. + range (``0, 25, 50, 75, 100``) runs by default; ``--quick`` keeps the + ``quick_subset`` (first/middle/last of ``severities`` — ``(0, 50, 100)``) so + smoke exercises the benign, midpoint *and* worst-case shapes, while the full + sweep keeps the finer resolution. """ name: str @@ -284,7 +309,7 @@ class PatternSpec: severities: tuple[int, ...] = DEFAULT_SEVERITIES phases: frozenset[str] = DEFAULT_PHASES requires: tuple[str, ...] = () - quick_threshold: int = 50 + quick_sizes: tuple[int, ...] | None = None long_threshold: int = 10**9 @property @@ -295,6 +320,14 @@ def sweep(self) -> tuple[int, ...]: def axis(self) -> str: return "severity" + @property + def quick_subset(self) -> tuple[int, ...]: + """ + Severities that run under ``--quick`` — the derived first/mid/last, + unless ``quick_sizes`` overrides it (``()`` opts out entirely). + """ + return _quick_subset(self.sweep) if self.quick_sizes is None else self.quick_sizes + def applies_to(self, phase: str) -> bool: return phase in self.phases From 631fce7dedaa0c41624fe5eab9cdd94e8fa06e19 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Sat, 6 Jun 2026 13:33:51 +0200 Subject: [PATCH 2/4] feat(benchmarks): sweep --metric {time,memory,both} + compare auto-detect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the metric (time vs memory) a flag on the workflow rather than a separate command tree — the first half of unifying the time/memory CLI split. - Add a Measure enum (time|memory|both) and `--metric` to `sweep`, dispatching to the timing or memory engine (or both, sequentially — memray overhead would skew wall-clock if run concurrently). `both` writes per-metric subdirs so the two linopy-.json sets don't collide. Default snapshot dirs are now .benchmarks/{time,memory}/ (timing default moved from .benchmarks/sweep). - `compare` auto-detects memory snapshots (peak_mib key) and diffs them with a peak-RSS table; timing still goes through pytest-benchmark. Mixing errors out. Adds memory.compare_snapshots (path-based; compare() now delegates to it). The `memory` sub-app still works; Stage 2 retires it and adds `run --metric` + --long/--size/--severity parity on the memory engine. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/cli/_base.py | 15 +++++++ benchmarks/cli/compare.py | 31 ++++++++++++- benchmarks/cli/sweep.py | 93 +++++++++++++++++++++++++++++++-------- benchmarks/memory.py | 11 +++-- 4 files changed, 127 insertions(+), 23 deletions(-) diff --git a/benchmarks/cli/_base.py b/benchmarks/cli/_base.py index 61362a5d..bde1d5fc 100644 --- a/benchmarks/cli/_base.py +++ b/benchmarks/cli/_base.py @@ -11,12 +11,27 @@ from __future__ import annotations +from enum import StrEnum from typing import Literal import typer from benchmarks.snapshot import discover_snapshots + +class Measure(StrEnum): + """ + What a measuring command records — orthogonal to the workflow. + + ``time`` runs pytest-benchmark (wall clock); ``memory`` tracks peak RSS + via memray; ``both`` runs them sequentially (never concurrently — memray's + overhead would skew the wall-clock numbers). + """ + + time = "time" + memory = "memory" + both = "both" + app = typer.Typer( help=( "Linopy internal benchmark suite — a thin layer over pytest plus " diff --git a/benchmarks/cli/compare.py b/benchmarks/cli/compare.py index 371f9183..dc6d6399 100644 --- a/benchmarks/cli/compare.py +++ b/benchmarks/cli/compare.py @@ -29,8 +29,9 @@ def compare(ctx: typer.Context) -> None: With no arguments (or missing paths), prints what snapshots exist under ``.benchmarks/`` so you can copy-paste the path you want. - For memory snapshots use ``memory compare`` instead — different format, - different tool. + Memory snapshots (``peak_mib`` key) are auto-detected and diffed with a + peak-RSS table; timing snapshots go through pytest-benchmark. The two + can't be mixed in one call. Implementation note: typer/click don't have a clean idiom for "list-typed positional + pass-through", so this command parses ``ctx.args`` by hand @@ -62,6 +63,32 @@ def compare(ctx: typer.Context) -> None: _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}") raise typer.Exit(code=2) + # Auto-detect the metric from the snapshots (memory snapshots carry a + # ``peak_mib`` key; timing ones don't) and route accordingly — no + # ``memory compare`` needed. + import json + + is_memory = ["peak_mib" in json.loads(p.read_text()) for p in snapshots] + if any(is_memory): + if not all(is_memory): + typer.secho( + "can't compare memory and timing snapshots together", + fg=typer.colors.RED, + err=True, + ) + raise typer.Exit(code=2) + if len(snapshots) != 2: + typer.secho( + "memory compare takes exactly 2 snapshots", + fg=typer.colors.RED, + err=True, + ) + raise typer.Exit(code=2) + from benchmarks.memory import compare_snapshots + + compare_snapshots(snapshots[0], snapshots[1]) + return + # Override pytest-benchmark's wide default table: ``--group-by=fullname`` # gives each test its own (baseline, candidate) mini-table and # ``--columns=min,iqr`` shows the noise-floor time plus spread. Applied diff --git a/benchmarks/cli/sweep.py b/benchmarks/cli/sweep.py index 970948cf..923553cd 100644 --- a/benchmarks/cli/sweep.py +++ b/benchmarks/cli/sweep.py @@ -10,10 +10,11 @@ from benchmarks.cli._base import ( _PHASE_TEST_FILE, _SMOKE_PYTEST_ARGS, + Measure, PhaseName, app, ) -from benchmarks.sweep import run_sweep +from benchmarks.sweep import run_memory_sweep, run_sweep @app.command( @@ -25,10 +26,35 @@ def sweep( list[str], typer.Argument(help="linopy versions, e.g. 0.4.0 0.5.0 (or any pip spec)."), ], + metric: Annotated[ + Measure, + typer.Option( + "--metric", + help=( + "What to measure: ``time`` (pytest-benchmark wall clock), " + "``memory`` (peak RSS via memray), or ``both`` (sequential). " + "Default: time." + ), + ), + ] = Measure.time, output_dir: Annotated[ - Path, - typer.Option("--output-dir", "-o", help="Where to save snapshot JSONs."), - ] = Path(".benchmarks/sweep"), + Path | None, + typer.Option( + "--output-dir", + "-o", + help=( + "Where to save snapshot JSONs. Default: ``.benchmarks//`` " + "(``both`` writes ``time`` and ``memory`` subdirs)." + ), + ), + ] = None, + repeats: Annotated[ + int, + typer.Option( + "--repeats", + help="Memory only: min-of-N peak per measurement (default 1).", + ), + ] = 1, long: Annotated[ bool, typer.Option("--long", help="Include the slowest sizes.") ] = False, @@ -120,17 +146,48 @@ def sweep( Wall-clock: roughly 1-2 minutes per version (venv + install + benchmarks). uv's wheel cache makes repeated runs much faster. """ - test_target = _PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/" - run_sweep( - versions, - output_dir=output_dir, - test_target=test_target, - smoke_args=_SMOKE_PYTEST_ARGS, - long=long, - quick=quick, - rounds=rounds, - filter_expr=filter_expr, - smoke=smoke, - as_of=as_of, - extra_args=ctx.args, - ) + # Timing-only knobs can't apply to a memory run. + if metric is not Measure.time and (smoke or rounds is not None): + typer.secho( + "--smoke / --rounds are timing-only (use --metric time)", + fg=typer.colors.RED, + err=True, + ) + raise typer.Exit(code=2) + + def _timing(out: Path) -> None: + test_target = _PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/" + run_sweep( + versions, + output_dir=out, + test_target=test_target, + smoke_args=_SMOKE_PYTEST_ARGS, + long=long, + quick=quick, + rounds=rounds, + filter_expr=filter_expr, + smoke=smoke, + as_of=as_of, + extra_args=ctx.args, + ) + + def _memory(out: Path) -> None: + run_memory_sweep( + versions, + output_dir=out, + quick=quick, + phases=[phase] if phase is not None else None, + repeats=repeats, + as_of=as_of, + ) + + # ``both`` runs sequentially into per-metric subdirs so the two + # ``linopy-.json`` snapshot sets never collide. + if metric is Measure.both: + base = output_dir + _timing(base / "time" if base else Path(".benchmarks/time")) + _memory(base / "memory" if base else Path(".benchmarks/memory")) + elif metric is Measure.memory: + _memory(output_dir or Path(".benchmarks/memory")) + else: + _timing(output_dir or Path(".benchmarks/time")) diff --git a/benchmarks/memory.py b/benchmarks/memory.py index ac77de0f..4d8285c2 100644 --- a/benchmarks/memory.py +++ b/benchmarks/memory.py @@ -385,16 +385,21 @@ def save( def compare(label_a: str, label_b: str) -> None: - """Diff two saved memory snapshots side-by-side.""" + """Diff two saved memory snapshots (by label) side-by-side.""" path_a = RESULTS_DIR / f"{label_a}.json" path_b = RESULTS_DIR / f"{label_b}.json" for p in (path_a, path_b): if not p.exists(): print(f"Not found: {p}. Run 'save {p.stem}' first.", file=sys.stderr) sys.exit(1) + compare_snapshots(path_a, path_b) - data_a = json.loads(path_a.read_text())["peak_mib"] - data_b = json.loads(path_b.read_text())["peak_mib"] + +def compare_snapshots(path_a: Path, path_b: Path) -> None: + """Diff two memory snapshots (by path) side-by-side.""" + label_a, label_b = Path(path_a).stem, Path(path_b).stem + data_a = json.loads(Path(path_a).read_text())["peak_mib"] + data_b = json.loads(Path(path_b).read_text())["peak_mib"] all_tests = sorted(set(data_a) | set(data_b)) From 60e3bfc96eaf5f37fc01b03d65f6ed8cd2da4491 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Sat, 6 Jun 2026 16:20:59 +0200 Subject: [PATCH 3/4] =?UTF-8?q?feat(benchmarks):=20retire=20memory=20sub-a?= =?UTF-8?q?pp=20=E2=80=94=20`run=20--metric`=20+=20unified=20gating?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finish folding the time/memory split into a single command surface. Memory is now a --metric flag everywhere, not a parallel `memory` sub-app. - `run --metric {time,memory,both}` replaces `memory save`: prints results, saves only with --json (label = filename stem) — one rule for both metrics. Adds --quick/--size/--severity/--repeats so flags match across metrics. - Delete the `memory` sub-app (cli/memory.py, memory_app); `memory sweep` → `sweep --metric memory`, `memory compare` → `compare` (auto-detect, Stage 1). - Unify size/severity selection in registry.skip_reason(), shared by conftest.maybe_skip and memory.run_phase so the two engines can't drift. Threads --long/--size/--severity through the memory worker; the id-alignment test still passes. - Split memory.save() into measure() (returns the dict) + save() (writes), so `run --metric memory` can measure-then-maybe-write. - run_memory_sweep now invokes `run --metric memory --json ` and writes straight to output_dir (no save-to-default-then-move dance); gains --long. - Update the walkthrough (CI-executed, re-runs clean) and benchmarks/README. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/README.md | 4 +- benchmarks/cli/__init__.py | 1 - benchmarks/cli/_base.py | 12 +-- benchmarks/cli/compare.py | 8 +- benchmarks/cli/memory.py | 169 ------------------------------------- benchmarks/cli/run.py | 158 ++++++++++++++++++++++++++-------- benchmarks/cli/sweep.py | 1 + benchmarks/conftest.py | 32 +++---- benchmarks/memory.py | 80 +++++++++++++++--- benchmarks/registry.py | 34 ++++++++ benchmarks/sweep.py | 31 ++++--- benchmarks/walkthrough.md | 50 ++++++----- 12 files changed, 288 insertions(+), 292 deletions(-) delete mode 100644 benchmarks/cli/memory.py diff --git a/benchmarks/README.md b/benchmarks/README.md index d264f682..566d5e30 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -14,8 +14,8 @@ This README only covers install and how to open the walkthrough. ## Models vs patterns -Two kinds of benchmark spec, same harness (time + peak memory, same phases), -distinguished by their sweep axis: +Two kinds of benchmark spec, same harness (time *or* peak memory — a +`run`/`sweep` `--metric` flag, same phases), distinguished by their sweep axis: - **Models** (`models/`, `REGISTRY`) — whole `linopy.Model`s swept over `size` (axis `n`): "how does cost scale with the problem?" diff --git a/benchmarks/cli/__init__.py b/benchmarks/cli/__init__.py index 0d71afe0..1f7c4738 100644 --- a/benchmarks/cli/__init__.py +++ b/benchmarks/cli/__init__.py @@ -25,7 +25,6 @@ from benchmarks.cli import sweep # noqa: F401 from benchmarks.cli import compare # noqa: F401 from benchmarks.cli import plot # noqa: F401 -from benchmarks.cli import memory # noqa: F401 # isort: on diff --git a/benchmarks/cli/_base.py b/benchmarks/cli/_base.py index bde1d5fc..433b3f17 100644 --- a/benchmarks/cli/_base.py +++ b/benchmarks/cli/_base.py @@ -2,8 +2,9 @@ Shared app object, types, and helpers for the benchmark CLI. The command groups (``introspect``, ``run``, ``sweep``, ``compare``, -``plot``, ``memory``) all register onto the single ``app`` defined here, so -the user-facing command surface stays flat (``python -m benchmarks run`` etc.). +``plot``) all register onto the single ``app`` defined here, so the +user-facing command surface stays flat (``python -m benchmarks run`` etc.). +Time vs memory is a ``--metric`` flag on ``run``/``sweep``, not a sub-app. Note on colour: ``typer.secho`` strips colour automatically when stdout isn't a TTY, so piping any command into ``grep`` still yields plain text. @@ -41,13 +42,6 @@ class Measure(StrEnum): rich_markup_mode="rich", ) -memory_app = typer.Typer( - help="Peak-RSS memory snapshots (pytest-memray under the hood).", - no_args_is_help=True, -) -app.add_typer(memory_app, name="memory") - - PhaseName = Literal[ "build", "matrices", "to_lp", "to_netcdf", "from_netcdf", "to_solver" ] diff --git a/benchmarks/cli/compare.py b/benchmarks/cli/compare.py index dc6d6399..60b5b1b3 100644 --- a/benchmarks/cli/compare.py +++ b/benchmarks/cli/compare.py @@ -16,11 +16,11 @@ ) def compare(ctx: typer.Context) -> None: """ - Compare timing snapshots side-by-side via ``pytest-benchmark compare``. + Compare two snapshots side-by-side — timing or memory, auto-detected. - Thin wrapper around the upstream tool so the whole suite stays under - one entry point. Pass the snapshot paths first, then any pytest-benchmark - flags:: + Timing snapshots wrap ``pytest-benchmark compare``; memory snapshots + (``peak_mib`` key) get a peak-RSS table. Pass the snapshot paths first, + then any pytest-benchmark flags (timing only):: python -m benchmarks compare a.json b.json python -m benchmarks compare a.json b.json --group-by=name diff --git a/benchmarks/cli/memory.py b/benchmarks/cli/memory.py deleted file mode 100644 index af46b63a..00000000 --- a/benchmarks/cli/memory.py +++ /dev/null @@ -1,169 +0,0 @@ -"""Memory subcommands: ``memory save`` / ``memory sweep`` / ``memory compare``.""" - -from __future__ import annotations - -from pathlib import Path -from typing import Annotated - -import typer - -from benchmarks.cli._base import memory_app -from benchmarks.memory import compare as memory_compare -from benchmarks.memory import save as memory_save -from benchmarks.sweep import run_memory_sweep - - -@memory_app.command("save") -def memory_save_cmd( - label: Annotated[ - str, typer.Argument(help="Label to attach to this snapshot, e.g. a git sha.") - ], - quick: Annotated[ - bool, typer.Option("--quick", help="Use smaller problem sizes.") - ] = False, - phase: Annotated[ - list[str] | None, - typer.Option( - "--phase", - help=( - "Restrict measurement to these phases. Pass multiple ``--phase`` " - "to select more than one. Default: all (build, matrices, to_lp," - " to_netcdf, from_netcdf, to_solver)." - ), - ), - ] = None, - repeats: Annotated[ - int, - typer.Option( - "--repeats", - help=( - "Re-run each measurement N times and keep the min peak. Default " - "1 (single shot). Memory peaks have ~1–3 %% wobble from GC " - "timing, lazy-import priming, and netcdf page-cache effects — " - "min-of-3 tightens that signal." - ), - ), - ] = 1, - filter_expr: Annotated[ - str | None, - typer.Option( - "--filter", - "-k", - help=( - "Keep only specs whose name/id contains this — e.g. " - "``nodal_balance`` (one spec), ``severity`` (patterns), ``n=`` " - "(models)." - ), - ), - ] = None, -) -> None: - """ - Measure peak memory across the registry × phase grid via ``memray.Tracker``. - - Each ``(phase, spec, size)`` runs under its own tracker so setup - allocations (model construction) are excluded from the peak — only the - phase work itself is counted. Phases run in separate subprocesses for - isolation. - - Results land in ``.benchmarks/memory/