diff --git a/benchmarks/README.md b/benchmarks/README.md index d264f682..566d5e30 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -14,8 +14,8 @@ This README only covers install and how to open the walkthrough. ## Models vs patterns -Two kinds of benchmark spec, same harness (time + peak memory, same phases), -distinguished by their sweep axis: +Two kinds of benchmark spec, same harness (time *or* peak memory — a +`run`/`sweep` `--metric` flag, same phases), distinguished by their sweep axis: - **Models** (`models/`, `REGISTRY`) — whole `linopy.Model`s swept over `size` (axis `n`): "how does cost scale with the problem?" diff --git a/benchmarks/cli/__init__.py b/benchmarks/cli/__init__.py index 0d71afe0..1f7c4738 100644 --- a/benchmarks/cli/__init__.py +++ b/benchmarks/cli/__init__.py @@ -25,7 +25,6 @@ from benchmarks.cli import sweep # noqa: F401 from benchmarks.cli import compare # noqa: F401 from benchmarks.cli import plot # noqa: F401 -from benchmarks.cli import memory # noqa: F401 # isort: on diff --git a/benchmarks/cli/_base.py b/benchmarks/cli/_base.py index 61362a5d..433b3f17 100644 --- a/benchmarks/cli/_base.py +++ b/benchmarks/cli/_base.py @@ -2,8 +2,9 @@ Shared app object, types, and helpers for the benchmark CLI. The command groups (``introspect``, ``run``, ``sweep``, ``compare``, -``plot``, ``memory``) all register onto the single ``app`` defined here, so -the user-facing command surface stays flat (``python -m benchmarks run`` etc.). +``plot``) all register onto the single ``app`` defined here, so the +user-facing command surface stays flat (``python -m benchmarks run`` etc.). +Time vs memory is a ``--metric`` flag on ``run``/``sweep``, not a sub-app. Note on colour: ``typer.secho`` strips colour automatically when stdout isn't a TTY, so piping any command into ``grep`` still yields plain text. @@ -11,12 +12,27 @@ from __future__ import annotations +from enum import StrEnum from typing import Literal import typer from benchmarks.snapshot import discover_snapshots + +class Measure(StrEnum): + """ + What a measuring command records — orthogonal to the workflow. + + ``time`` runs pytest-benchmark (wall clock); ``memory`` tracks peak RSS + via memray; ``both`` runs them sequentially (never concurrently — memray's + overhead would skew the wall-clock numbers). + """ + + time = "time" + memory = "memory" + both = "both" + app = typer.Typer( help=( "Linopy internal benchmark suite — a thin layer over pytest plus " @@ -26,13 +42,6 @@ rich_markup_mode="rich", ) -memory_app = typer.Typer( - help="Peak-RSS memory snapshots (pytest-memray under the hood).", - no_args_is_help=True, -) -app.add_typer(memory_app, name="memory") - - PhaseName = Literal[ "build", "matrices", "to_lp", "to_netcdf", "from_netcdf", "to_solver" ] diff --git a/benchmarks/cli/compare.py b/benchmarks/cli/compare.py index 371f9183..19244edb 100644 --- a/benchmarks/cli/compare.py +++ b/benchmarks/cli/compare.py @@ -16,11 +16,11 @@ ) def compare(ctx: typer.Context) -> None: """ - Compare timing snapshots side-by-side via ``pytest-benchmark compare``. + Compare two snapshots side-by-side — timing or memory, auto-detected. - Thin wrapper around the upstream tool so the whole suite stays under - one entry point. Pass the snapshot paths first, then any pytest-benchmark - flags:: + Timing snapshots wrap ``pytest-benchmark compare``; memory snapshots + (``peak_mib`` key) get a peak-RSS table. Pass the snapshot paths first, + then any pytest-benchmark flags (timing only):: python -m benchmarks compare a.json b.json python -m benchmarks compare a.json b.json --group-by=name @@ -29,8 +29,9 @@ def compare(ctx: typer.Context) -> None: With no arguments (or missing paths), prints what snapshots exist under ``.benchmarks/`` so you can copy-paste the path you want. - For memory snapshots use ``memory compare`` instead — different format, - different tool. + Memory snapshots (``peak_mib`` key) are auto-detected and diffed with a + peak-RSS table; timing snapshots go through pytest-benchmark. The two + can't be mixed in one call. Implementation note: typer/click don't have a clean idiom for "list-typed positional + pass-through", so this command parses ``ctx.args`` by hand @@ -62,6 +63,38 @@ def compare(ctx: typer.Context) -> None: _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}") raise typer.Exit(code=2) + # Auto-detect the metric from the snapshots (memory snapshots load as MiB, + # timing as s) and route accordingly — no ``memory compare`` needed. + # ``load_snapshot`` validates each file, so a malformed/unreadable one + # fails here with a clear, file-named message instead of a raw traceback. + from benchmarks.snapshot import load_snapshot + + try: + units = [load_snapshot(p)[2] for p in snapshots] + except ValueError as exc: + typer.secho(str(exc), fg=typer.colors.RED, err=True) + raise typer.Exit(code=2) from exc + is_memory = [u == "MiB" for u in units] + if any(is_memory): + if not all(is_memory): + typer.secho( + "can't compare memory and timing snapshots together", + fg=typer.colors.RED, + err=True, + ) + raise typer.Exit(code=2) + if len(snapshots) != 2: + typer.secho( + "memory compare takes exactly 2 snapshots", + fg=typer.colors.RED, + err=True, + ) + raise typer.Exit(code=2) + from benchmarks.memory import compare_snapshots + + compare_snapshots(snapshots[0], snapshots[1]) + return + # Override pytest-benchmark's wide default table: ``--group-by=fullname`` # gives each test its own (baseline, candidate) mini-table and # ``--columns=min,iqr`` shows the noise-floor time plus spread. Applied diff --git a/benchmarks/cli/introspect.py b/benchmarks/cli/introspect.py index 08f1038b..41e5dc3d 100644 --- a/benchmarks/cli/introspect.py +++ b/benchmarks/cli/introspect.py @@ -93,7 +93,7 @@ def _row(label: str, value: object) -> None: _row("sizes:", spec.sizes) _row("features:", sorted(spec.features)) _row("phases:", sorted(spec.phases)) - _row("quick_threshold:", spec.quick_threshold) + _row("quick:", spec.quick_subset) _row("long_threshold:", spec.long_threshold) if spec.requires: _row("requires:", list(spec.requires)) @@ -105,7 +105,7 @@ def _row(label: str, value: object) -> None: _row("severities:", pattern.severities) _row("description:", pattern.description) _row("phases:", sorted(pattern.phases)) - _row("quick_threshold:", pattern.quick_threshold) + _row("quick:", pattern.quick_subset) _row("long_threshold:", pattern.long_threshold) if pattern.requires: _row("requires:", list(pattern.requires)) diff --git a/benchmarks/cli/memory.py b/benchmarks/cli/memory.py deleted file mode 100644 index af46b63a..00000000 --- a/benchmarks/cli/memory.py +++ /dev/null @@ -1,169 +0,0 @@ -"""Memory subcommands: ``memory save`` / ``memory sweep`` / ``memory compare``.""" - -from __future__ import annotations - -from pathlib import Path -from typing import Annotated - -import typer - -from benchmarks.cli._base import memory_app -from benchmarks.memory import compare as memory_compare -from benchmarks.memory import save as memory_save -from benchmarks.sweep import run_memory_sweep - - -@memory_app.command("save") -def memory_save_cmd( - label: Annotated[ - str, typer.Argument(help="Label to attach to this snapshot, e.g. a git sha.") - ], - quick: Annotated[ - bool, typer.Option("--quick", help="Use smaller problem sizes.") - ] = False, - phase: Annotated[ - list[str] | None, - typer.Option( - "--phase", - help=( - "Restrict measurement to these phases. Pass multiple ``--phase`` " - "to select more than one. Default: all (build, matrices, to_lp," - " to_netcdf, from_netcdf, to_solver)." - ), - ), - ] = None, - repeats: Annotated[ - int, - typer.Option( - "--repeats", - help=( - "Re-run each measurement N times and keep the min peak. Default " - "1 (single shot). Memory peaks have ~1–3 %% wobble from GC " - "timing, lazy-import priming, and netcdf page-cache effects — " - "min-of-3 tightens that signal." - ), - ), - ] = 1, - filter_expr: Annotated[ - str | None, - typer.Option( - "--filter", - "-k", - help=( - "Keep only specs whose name/id contains this — e.g. " - "``nodal_balance`` (one spec), ``severity`` (patterns), ``n=`` " - "(models)." - ), - ), - ] = None, -) -> None: - """ - Measure peak memory across the registry × phase grid via ``memray.Tracker``. - - Each ``(phase, spec, size)`` runs under its own tracker so setup - allocations (model construction) are excluded from the peak — only the - phase work itself is counted. Phases run in separate subprocesses for - isolation. - - Results land in ``.benchmarks/memory/