From c319c92a02bfbba144fb2bf2037bb39f659f8126 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 5 Jun 2026 23:16:09 +0200 Subject: [PATCH 1/7] feat(benchmarks): log2 sweep colouring + --clip clamp (shared symmetric p95) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sweep heatmap coloured by raw ratio on plotly's linear scale, so a 2x and its mirror 1/2x looked asymmetric. Colour by log2(ratio) instead — folds symmetric around 1x, with a fold-change colourbar (1/8x...8x). Add --clip to override the colour clamp (a fold-change >1 for sweep, an absolute delta for scatter) over a new shared _symmetric_clip(magnitudes, override) helper that defaults to the symmetric p95 of the data, reused by both views. numpy promoted to a module-level import. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/cli/plot.py | 24 +++++++++++- benchmarks/plotting.py | 85 ++++++++++++++++++++++++++++++------------ 2 files changed, 84 insertions(+), 25 deletions(-) diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py index c0cad047..6551aea6 100644 --- a/benchmarks/cli/plot.py +++ b/benchmarks/cli/plot.py @@ -61,6 +61,17 @@ def plot( ), ), ] = None, + clip: Annotated[ + float | None, + typer.Option( + "--clip", + help=( + "Override the symmetric p95 colour clamp. Sweep: a " + "fold-change (>1) — ``--clip 8`` shows ⅛×–8×, beyond saturates. " + "Scatter: an absolute Δ bound. compare/scaling ignore it." + ), + ), + ] = None, output: Annotated[ Path | None, typer.Option( @@ -129,6 +140,17 @@ def plot( "scaling view takes exactly 1 snapshot", fg=typer.colors.RED, err=True ) raise typer.Exit(code=2) + if clip is not None: + if clip <= 0: + typer.secho("--clip must be positive", fg=typer.colors.RED, err=True) + raise typer.Exit(code=2) + if chosen == "sweep" and clip <= 1: + typer.secho( + "sweep --clip is a fold-change > 1 (e.g. 8 for ⅛×–8×)", + fg=typer.colors.RED, + err=True, + ) + raise typer.Exit(code=2) # RENDERERS imports fine without plotly (lazy inside each), so check the dep. if importlib.util.find_spec("plotly") is None: @@ -147,7 +169,7 @@ def plot( output = Path(".benchmarks") / "plots" / f"{chosen}.html" try: - fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets) + fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets, clip) except ValueError as exc: typer.secho(str(exc), fg=typer.colors.RED, err=True) raise typer.Exit(code=1) from exc diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py index c31054b8..382e0fcb 100644 --- a/benchmarks/plotting.py +++ b/benchmarks/plotting.py @@ -20,6 +20,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Literal +import numpy as np + from benchmarks.snapshot import Metric, load_long_df if TYPE_CHECKING: @@ -43,6 +45,22 @@ def _diverging_kwargs(midpoint: float = 0.0) -> dict: } +def _symmetric_clip(magnitudes: np.ndarray, override: float | None, pct: float = 95.0) -> float: + """ + Symmetric colour bound for a diverging scale: ``override`` if given, else the + ``pct`` percentile of ``|magnitudes|`` — so a few outliers don't wash the rest + to the midpoint. Positive; callers use ``[-b, +b]``. + """ + if override is not None: + return float(override) + mags = np.abs(np.asarray(magnitudes, dtype=float)) + mags = mags[np.isfinite(mags)] + if mags.size == 0: + return 1.0 + bound = float(np.percentile(mags, pct)) + return bound if bound > 0 else (float(mags.max()) or 1e-9) + + def _axis_kwargs(unit: str) -> dict: """``update_xaxes`` kwargs for a given unit.""" if unit == "s": @@ -104,6 +122,7 @@ def plot_compare( metric: Metric = "min", sort: SortMode = "absolute", facets: FacetBy | None = None, + clip: float | None = None, # noqa: ARG001 (uniform signature, unused here) ) -> tuple[Figure, int]: """ Bar chart of per-test delta, in alphabetical test-id order. @@ -228,6 +247,7 @@ def plot_scatter( metric: Metric = "min", sort: SortMode = "absolute", # noqa: ARG001 (uniform signature, unused here) facets: FacetBy | None = None, + clip: float | None = None, ) -> tuple[Figure, int]: """ Baseline cost (log-x) vs candidate/baseline ratio (y) — the exploratory @@ -239,7 +259,6 @@ def plot_scatter( ``animation_frame``. A dashed line at ``ratio = 1`` marks no change; colour encodes absolute Δ. """ - import numpy as np import plotly.express as px if len(snapshots) < 2: @@ -280,10 +299,7 @@ def plot_scatter( # Clip the colour scale to the p95 absolute Δ so one huge regression doesn't # wash the rest to white; outliers saturate at the bound. - clip = float(np.percentile(df["delta_abs"].abs(), 95)) if len(df) > 0 else 0.0 - if clip == 0.0: - max_abs = float(df["delta_abs"].abs().max()) - clip = max_abs if max_abs > 0 else 1e-9 + color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), clip) animate = len(snapshots) >= 3 extra: dict = {} @@ -300,7 +316,7 @@ def plot_scatter( y="ratio", color="delta_abs", **_diverging_kwargs(), - range_color=[-clip, clip], + range_color=[-color_clip, color_clip], log_x=True, range_x=[x_lo * 0.5, x_hi * 2], range_y=y_range, @@ -344,8 +360,9 @@ def plot_sweep( metric: Metric = "min", sort: SortMode = "absolute", # noqa: ARG001 (uniform signature, unused here) facets: FacetBy | None = None, # noqa: ARG001 (uniform signature, unused here) + clip: float | None = None, ) -> tuple[Figure, int]: - """Heatmap of per-test ratio relative to the first snapshot.""" + """Heatmap of per-test fold-change (log2 ratio) vs the first snapshot.""" import plotly.express as px df_long, unit = load_long_df(snapshots, metric) @@ -353,38 +370,54 @@ def plot_sweep( versions = df_long["snapshot"].drop_duplicates().tolist() baseline_label = versions[0] - # Pivot to absolutes, drop tests missing the baseline, divide by it for ratios. abs_df = df_long.pivot(index="test_id", columns="snapshot", values="value").reindex( columns=versions ) abs_df = abs_df.dropna(subset=[baseline_label]) if abs_df.empty: raise ValueError(f"no overlap with baseline snapshot {baseline_label}") - df = abs_df.div(abs_df[baseline_label], axis=0) - abs_df.index.name = "test" - df.index.name = "test" - + ratio = abs_df.div(abs_df[baseline_label], axis=0) + # Colour by log2(ratio): plotly's colour scale is linear (no log mode), so raw + # ratio makes a 2x look twice as intense as its mirror 1/2x. log2 makes folds + # symmetric around 0; the bar is relabelled to fold-change. Range defaults to + # the symmetric p95 (override via --clip, a fold-change). + logr = np.log2(ratio.where(ratio > 0)) + abs_df.index.name = ratio.index.name = logr.index.name = "test" + + bound = _symmetric_clip(logr.values, float(np.log2(clip)) if clip else None) fig = px.imshow( - df, - **_diverging_kwargs(1.0), + logr, + color_continuous_scale=["green", "white", "red"], + color_continuous_midpoint=0.0, + zmin=-bound, + zmax=bound, aspect="auto", - title=f"{metric_label} ratio relative to baseline ({versions[0]})", - labels={"x": "version", "y": "test", "color": "ratio"}, - text_auto=".2f", + title=f"{metric_label} fold-change vs baseline ({versions[0]})", + labels={"x": "version", "y": "test", "color": "fold"}, + ) + # Fold-change ticks at integer log2 steps spanning the actual colour range. + hi = max(1, int(bound)) + ticks = list(range(-hi, hi + 1)) + fig.update_coloraxes( + colorbar=dict( + tickvals=ticks, + ticktext=[f"{2**t}×" if t >= 0 else f"1/{2**-t}×" for t in ticks], + title="fold", + ) ) - # Absolute values as customdata so hover shows both ratio and value. fig.update_traces( + text=ratio.round(2).values, + texttemplate="%{text}×", customdata=abs_df.values, hovertemplate=( - "test: %{y}
" - "version: %{x}
" - "ratio: %{z:.3f}
" + "test: %{y}
version: %{x}
" + "fold: %{text}×
" f"{metric_label}: %{{customdata:.4g}}{unit}" "" ), ) - fig.update_layout(height=max(500, len(df) * 22)) - return fig, len(df) + fig.update_layout(height=max(500, len(logr) * 22)) + return fig, len(logr) # Per sweep axis: (x label, log-scaled?). Size is multiplicative → log; severity @@ -400,6 +433,7 @@ def plot_scaling( metric: Metric = "min", sort: SortMode = "absolute", # noqa: ARG001 (uniform signature, unused here) facets: FacetBy | None = None, # noqa: ARG001 (uniform signature, unused here) + clip: float | None = None, # noqa: ARG001 (uniform signature, unused here) ) -> tuple[Figure, int]: """ Cost vs the sweep dial for parametrized tests, faceted by phase. @@ -448,7 +482,10 @@ def plot_scaling( RENDERERS: dict[ PlotView, - Callable[[list[Path], Metric, SortMode, FacetBy | None], tuple[Figure, int]], + Callable[ + [list[Path], Metric, SortMode, FacetBy | None, float | None], + tuple[Figure, int], + ], ] = { "compare": plot_compare, "scatter": plot_scatter, From d3f5bf41126b50f45d0b9ac8e28d980a23dcef6d Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 5 Jun 2026 23:30:05 +0200 Subject: [PATCH 2/7] feat(benchmarks): log-scale ratio axis in scatter; symmetric p95 colour in compare Same fix as the sweep view, applied across the others: - scatter: the ratio y-axis was linear, so a 2x and its mirror 1/2x read asymmetrically (they even centred it on 1.0 *linearly*). Make it log_y so folds are symmetric about 1.0; window symmetric in log space; drop non-positive ratios (a log axis can't show a 0). - compare: clamp the bar colour with the shared symmetric p95 (consistency; the bar *length* still shows the full delta). - scaling already log-scales size; left as is. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/plotting.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py index 382e0fcb..cece9a49 100644 --- a/benchmarks/plotting.py +++ b/benchmarks/plotting.py @@ -202,6 +202,7 @@ def plot_compare( facet_kwargs = {"facet_col": facets} facet_kwargs["facet_col_wrap"] = 2 if facets == "phase" else 3 + color_clip = _symmetric_clip(df[x_col].to_numpy(), None) fig = px.bar( df, x=x_col, @@ -209,6 +210,7 @@ def plot_compare( orientation="h", color=x_col, **_diverging_kwargs(), + range_color=[-color_clip, color_clip], title=title, labels={x_col: x_label, y_col: ""}, text_auto=text_fmt, @@ -287,15 +289,16 @@ def plot_scatter( df["ratio"] = df["candidate_time"] / df["baseline_time"] df["delta_abs"] = df["candidate_time"] - df["baseline_time"] df["delta_pct"] = df["delta_abs"] / df["baseline_time"] * 100.0 - df = df.rename(columns={"test_id": "test"}) + # log y-axis can't show a zero ratio (candidate value of 0) — drop those. + df = df[df["ratio"] > 0].rename(columns={"test_id": "test"}) # Fixed ranges so the animation doesn't jitter; pad to avoid edge clipping. x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max() - # y-range centred symmetrically on 1.0 so regressions and improvements read - # equally; the larger side sets the window width. + # ratio is multiplicative → log y-axis (set below) so 2x and 1/2x read + # symmetrically about 1.0; window symmetric in log10 space, larger fold wins. y_lo, y_hi = df["ratio"].min(), df["ratio"].max() - max_dist = max(abs(1.0 - y_lo), abs(y_hi - 1.0), 0.05) - pad_y = max(0.05, max_dist * 0.05) - y_range = [1.0 - max_dist - pad_y, 1.0 + max_dist + pad_y] + fold = max(y_hi, 1.0 / y_lo, 1.1) + bound = fold**1.08 # ~8% pad in log space; plotly log-converts range_y itself + y_range = [1.0 / bound, bound] # Clip the colour scale to the p95 absolute Δ so one huge regression doesn't # wash the rest to white; outliers saturate at the bound. @@ -318,6 +321,7 @@ def plot_scatter( **_diverging_kwargs(), range_color=[-color_clip, color_clip], log_x=True, + log_y=True, range_x=[x_lo * 0.5, x_hi * 2], range_y=y_range, hover_name="test", @@ -335,7 +339,7 @@ def plot_scatter( ), labels={ "baseline_time": f"baseline {metric_label} ({unit}, log scale)", - "ratio": f"{metric_label} ratio (candidate / baseline)", + "ratio": f"{metric_label} ratio (candidate / baseline, log scale)", "candidate_time": "candidate", "delta_abs": f"Δ ({unit}, p95-clipped)", }, From b100696d780ef44876f7341f300de7365165affe Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 5 Jun 2026 23:38:34 +0200 Subject: [PATCH 3/7] refactor(benchmarks): make --clip a uniform fold-change on the ratio dimension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Was inconsistent: a fold for sweep but an absolute Δ for scatter, and compare ignored it. Now --clip is always a fold-change (>1, default symmetric p95) that bounds the *ratio* dimension wherever a view has one: - sweep: the ratio colour (±log2) - scatter: the ratio y-axis ([1/clip, clip]) — moved off the colour, which reverts to the auto symmetric-p95 Δ clamp - compare / scaling: no ratio axis → ignored Validation is now uniform (fold-change > 1). Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/cli/plot.py | 25 +++++++++++-------------- benchmarks/plotting.py | 18 ++++++++++-------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py index 6551aea6..61b6cac6 100644 --- a/benchmarks/cli/plot.py +++ b/benchmarks/cli/plot.py @@ -66,9 +66,10 @@ def plot( typer.Option( "--clip", help=( - "Override the symmetric p95 colour clamp. Sweep: a " - "fold-change (>1) — ``--clip 8`` shows ⅛×–8×, beyond saturates. " - "Scatter: an absolute Δ bound. compare/scaling ignore it." + "Bound the ratio axis to a fold-change (>1); default is the " + "symmetric p95. Sweep clamps the colour (±log₂) — ``--clip 8`` " + "shows ⅛×–8×; scatter clamps the y-axis to ``[1/clip, clip]``. " + "compare/scaling have no ratio axis and ignore it." ), ), ] = None, @@ -140,17 +141,13 @@ def plot( "scaling view takes exactly 1 snapshot", fg=typer.colors.RED, err=True ) raise typer.Exit(code=2) - if clip is not None: - if clip <= 0: - typer.secho("--clip must be positive", fg=typer.colors.RED, err=True) - raise typer.Exit(code=2) - if chosen == "sweep" and clip <= 1: - typer.secho( - "sweep --clip is a fold-change > 1 (e.g. 8 for ⅛×–8×)", - fg=typer.colors.RED, - err=True, - ) - raise typer.Exit(code=2) + if clip is not None and clip <= 1: + typer.secho( + "--clip is a fold-change > 1 (e.g. 8 for ⅛×–8×)", + fg=typer.colors.RED, + err=True, + ) + raise typer.Exit(code=2) # RENDERERS imports fine without plotly (lazy inside each), so check the dep. if importlib.util.find_spec("plotly") is None: diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py index cece9a49..3439540d 100644 --- a/benchmarks/plotting.py +++ b/benchmarks/plotting.py @@ -293,16 +293,18 @@ def plot_scatter( df = df[df["ratio"] > 0].rename(columns={"test_id": "test"}) # Fixed ranges so the animation doesn't jitter; pad to avoid edge clipping. x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max() - # ratio is multiplicative → log y-axis (set below) so 2x and 1/2x read - # symmetrically about 1.0; window symmetric in log10 space, larger fold wins. - y_lo, y_hi = df["ratio"].min(), df["ratio"].max() - fold = max(y_hi, 1.0 / y_lo, 1.1) - bound = fold**1.08 # ~8% pad in log space; plotly log-converts range_y itself + # ratio is multiplicative → log y-axis; window symmetric about 1.0 in log + # space, defaulting to the symmetric p95 fold (override via --clip, a + # fold-change). plotly log-converts range_y itself, so give it ratio units. + log_bound = _symmetric_clip( + np.log2(df["ratio"].to_numpy()), np.log2(clip) if clip else None + ) + bound = max(float(2.0 ** (log_bound * 1.08)), 1.1) y_range = [1.0 / bound, bound] - # Clip the colour scale to the p95 absolute Δ so one huge regression doesn't - # wash the rest to white; outliers saturate at the bound. - color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), clip) + # Colour auto-clamps to the symmetric p95 absolute Δ (not --clip-tunable — + # that flag is folds, which here drive the y-axis above). + color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), None) animate = len(snapshots) >= 3 extra: dict = {} From 5f0fec234163b3d4f0e736a74f8b6471773411d5 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Fri, 5 Jun 2026 23:44:22 +0200 Subject: [PATCH 4/7] refactor(benchmarks): --clip clamps only the colour scale (log or linear per plot) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Colour is the one thing you can't adjust after the plot is drawn (axes zoom). So --clip targets the colour only, and its unit follows the plot's colour scale: - sweep (colour = log2 ratio): a fold-change (>1) - scatter / compare (colour = absolute Δ): a linear Δ bound - scaling: no diverging colour → ignored Default stays the symmetric p95. Axes are full-range and zoomable — scatter's y-axis no longer p95-clips (which hid outlier points). Validation is per-scale (fold > 1 for sweep; any positive for the linear ones). Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/cli/plot.py | 27 ++++++++++++++++----------- benchmarks/plotting.py | 22 ++++++++++------------ 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py index 61b6cac6..65736f10 100644 --- a/benchmarks/cli/plot.py +++ b/benchmarks/cli/plot.py @@ -66,10 +66,11 @@ def plot( typer.Option( "--clip", help=( - "Bound the ratio axis to a fold-change (>1); default is the " - "symmetric p95. Sweep clamps the colour (±log₂) — ``--clip 8`` " - "shows ⅛×–8×; scatter clamps the y-axis to ``[1/clip, clip]``. " - "compare/scaling have no ratio axis and ignore it." + "Clamp the *colour* scale (the one thing you can't zoom after " + "the plot is drawn); default is the symmetric p95. Unit follows " + "the plot's colour: a fold-change (>1) for fold-coloured sweep " + "(``--clip 8`` = ⅛×–8×), an absolute Δ for Δ-coloured " + "scatter/compare. scaling has no diverging colour and ignores it." ), ), ] = None, @@ -141,13 +142,17 @@ def plot( "scaling view takes exactly 1 snapshot", fg=typer.colors.RED, err=True ) raise typer.Exit(code=2) - if clip is not None and clip <= 1: - typer.secho( - "--clip is a fold-change > 1 (e.g. 8 for ⅛×–8×)", - fg=typer.colors.RED, - err=True, - ) - raise typer.Exit(code=2) + if clip is not None: + if clip <= 0: + typer.secho("--clip must be positive", fg=typer.colors.RED, err=True) + raise typer.Exit(code=2) + if chosen == "sweep" and clip <= 1: + typer.secho( + "sweep --clip is a fold-change > 1 (colour is log₂; e.g. 8 = ⅛×–8×)", + fg=typer.colors.RED, + err=True, + ) + raise typer.Exit(code=2) # RENDERERS imports fine without plotly (lazy inside each), so check the dep. if importlib.util.find_spec("plotly") is None: diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py index 3439540d..56a1767d 100644 --- a/benchmarks/plotting.py +++ b/benchmarks/plotting.py @@ -122,7 +122,7 @@ def plot_compare( metric: Metric = "min", sort: SortMode = "absolute", facets: FacetBy | None = None, - clip: float | None = None, # noqa: ARG001 (uniform signature, unused here) + clip: float | None = None, ) -> tuple[Figure, int]: """ Bar chart of per-test delta, in alphabetical test-id order. @@ -202,7 +202,7 @@ def plot_compare( facet_kwargs = {"facet_col": facets} facet_kwargs["facet_col_wrap"] = 2 if facets == "phase" else 3 - color_clip = _symmetric_clip(df[x_col].to_numpy(), None) + color_clip = _symmetric_clip(df[x_col].to_numpy(), clip) fig = px.bar( df, x=x_col, @@ -293,18 +293,16 @@ def plot_scatter( df = df[df["ratio"] > 0].rename(columns={"test_id": "test"}) # Fixed ranges so the animation doesn't jitter; pad to avoid edge clipping. x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max() - # ratio is multiplicative → log y-axis; window symmetric about 1.0 in log - # space, defaulting to the symmetric p95 fold (override via --clip, a - # fold-change). plotly log-converts range_y itself, so give it ratio units. - log_bound = _symmetric_clip( - np.log2(df["ratio"].to_numpy()), np.log2(clip) if clip else None - ) - bound = max(float(2.0 ** (log_bound * 1.08)), 1.1) + # ratio is multiplicative → log y-axis; show the *full* fold range (symmetric + # about 1.0) so every point is visible — zoom interactively to focus. + y_lo, y_hi = df["ratio"].min(), df["ratio"].max() + fold = max(y_hi, 1.0 / y_lo, 1.1) + bound = fold**1.05 y_range = [1.0 / bound, bound] - # Colour auto-clamps to the symmetric p95 absolute Δ (not --clip-tunable — - # that flag is folds, which here drive the y-axis above). - color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), None) + # --clip clamps the *colour* — the one thing you can't zoom after the plot is + # made. Here colour is the absolute Δ, so it's a linear bound. Default: p95. + color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), clip) animate = len(snapshots) >= 3 extra: dict = {} From b4f2a9d3f4e8909d5e8605523c2178963325589d Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Sat, 6 Jun 2026 00:01:15 +0200 Subject: [PATCH 5/7] fix(benchmarks): make --sort actually sort compare bars; add --order for inputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - compare: --sort was misleading — bars were hardcoded alphabetical (sort_values('test_id')) while --sort only switched the dimension. Now it sorts by the chosen Δ (delta_abs/delta_pct): biggest regressions on top, improvements at the bottom. The name/help are finally truthful. - plot --order {input,version}: default 'input' preserves the order you pass (the plot never re-sorts); 'version' sorts inputs by parsed linopy-, fixing a glob's string order (0.3.10 before 0.3.2) for release-history sweeps. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/cli/plot.py | 40 +++++++++++++++++++++++++++++++++++----- benchmarks/plotting.py | 5 +++-- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py index 65736f10..e9438108 100644 --- a/benchmarks/cli/plot.py +++ b/benchmarks/cli/plot.py @@ -4,7 +4,7 @@ import importlib.util from pathlib import Path -from typing import Annotated +from typing import Annotated, Literal import typer @@ -12,6 +12,21 @@ from benchmarks.plotting import FacetBy, Metric, PlotView, SortMode +def _snapshot_version_key(p: Path) -> tuple[int, object]: + """Sort key parsing ``linopy-`` from a filename; non-matches sort last.""" + import re + + from packaging.version import InvalidVersion, Version + + m = re.search(r"-(\d[\w.]*)\.json$", p.name) + if m: + try: + return (0, Version(m.group(1))) + except InvalidVersion: + pass + return (1, p.name) + + @app.command() def plot( snapshots: Annotated[ @@ -28,6 +43,18 @@ def plot( ) ), ] = None, + order: Annotated[ + Literal["input", "version"], + typer.Option( + "--order", + help=( + "Snapshot input order. ``input`` (default) keeps the order you " + "pass — the plot never re-sorts. ``version`` sorts inputs by the " + "parsed ``linopy-``, fixing a glob's string order (0.3.10 " + "before 0.3.2) for release-history sweeps." + ), + ), + ] = "input", metric: Annotated[ Metric, typer.Option( @@ -42,10 +69,10 @@ def plot( SortMode, typer.Option( help=( - "Compare-view sort and bar dimension. ``absolute`` (default) " - "uses ``b - a`` in seconds so the biggest actual-time impacts " - "float to the bottom — avoids over-weighting cheap " - "microsecond tests. ``relative`` uses percent change." + "Compare-view bar metric *and* sort. ``absolute`` (default) " + "ranks by ``b - a`` (actual-time/MiB impact, not over-weighting " + "tiny tests); ``relative`` by percent change. Bars are ordered by " + "it — biggest regressions on top, improvements at the bottom." ) ), ] = "absolute", @@ -118,6 +145,9 @@ def plot( _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}") raise typer.Exit(code=2) + if order == "version": + snapshots = sorted(snapshots, key=_snapshot_version_key) + chosen = view or ( "scaling" if len(snapshots) == 1 diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py index 56a1767d..e7af212f 100644 --- a/benchmarks/plotting.py +++ b/benchmarks/plotting.py @@ -125,7 +125,8 @@ def plot_compare( clip: float | None = None, ) -> tuple[Figure, int]: """ - Bar chart of per-test delta, in alphabetical test-id order. + Bar chart of per-test delta, sorted by the chosen ``--sort`` Δ + (biggest regressions on top, improvements at the bottom). ``sort`` picks the bar dimension: ``absolute`` (default) plots ``b - a`` in the native unit, ``relative`` plots percent change. Bars stay in id order @@ -166,8 +167,8 @@ def plot_compare( df["delta_abs"] = df[b_label] - df[a_label] df["delta_pct"] = (df["delta_abs"] / df[a_label]) * 100.0 - df = df.sort_values("test_id").reset_index(drop=True) x_col = "delta_abs" if sort == "absolute" else "delta_pct" + df = df.sort_values(x_col).reset_index(drop=True) if sort == "absolute": x_label = f"{metric_label} delta ({unit})" From e2f663dd96ec7b22b9c392b5e78c71e005078888 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Sat, 6 Jun 2026 00:02:56 +0200 Subject: [PATCH 6/7] feat(benchmarks): add plot --reverse to flip snapshot order Applied after --order, so e.g. --order version --reverse = newest-first (which also makes the newest snapshot the sweep baseline / compare 'a'). Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/cli/plot.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py index e9438108..d9b86061 100644 --- a/benchmarks/cli/plot.py +++ b/benchmarks/cli/plot.py @@ -55,6 +55,17 @@ def plot( ), ), ] = "input", + reverse: Annotated[ + bool, + typer.Option( + "--reverse/--no-reverse", + help=( + "Reverse the snapshot order (after --order). E.g. ``--order " + "version --reverse`` = newest-first; in compare/scatter it flips " + "which snapshot is the baseline." + ), + ), + ] = False, metric: Annotated[ Metric, typer.Option( @@ -147,6 +158,8 @@ def plot( if order == "version": snapshots = sorted(snapshots, key=_snapshot_version_key) + if reverse: + snapshots = snapshots[::-1] chosen = view or ( "scaling" From 3a78d7fdc784d51d31d3fd1674844c885c85ce91 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Sat, 6 Jun 2026 00:06:28 +0200 Subject: [PATCH 7/7] revert(benchmarks): drop plot --order/--reverse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit plot serves arbitrary snapshots (bench labels, baseline.json, …), so parsing linopy- from filenames is a leaky abstraction — --order version is meaningless for non-version snapshots. plot already preserves input order, so callers control the axis by the order they pass. The --sort fix (compare bars sort by Δ, not alphabetically) stays — that was a real bug. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/cli/plot.py | 45 +----------------------------------------- 1 file changed, 1 insertion(+), 44 deletions(-) diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py index d9b86061..d51a9957 100644 --- a/benchmarks/cli/plot.py +++ b/benchmarks/cli/plot.py @@ -4,7 +4,7 @@ import importlib.util from pathlib import Path -from typing import Annotated, Literal +from typing import Annotated import typer @@ -12,21 +12,6 @@ from benchmarks.plotting import FacetBy, Metric, PlotView, SortMode -def _snapshot_version_key(p: Path) -> tuple[int, object]: - """Sort key parsing ``linopy-`` from a filename; non-matches sort last.""" - import re - - from packaging.version import InvalidVersion, Version - - m = re.search(r"-(\d[\w.]*)\.json$", p.name) - if m: - try: - return (0, Version(m.group(1))) - except InvalidVersion: - pass - return (1, p.name) - - @app.command() def plot( snapshots: Annotated[ @@ -43,29 +28,6 @@ def plot( ) ), ] = None, - order: Annotated[ - Literal["input", "version"], - typer.Option( - "--order", - help=( - "Snapshot input order. ``input`` (default) keeps the order you " - "pass — the plot never re-sorts. ``version`` sorts inputs by the " - "parsed ``linopy-``, fixing a glob's string order (0.3.10 " - "before 0.3.2) for release-history sweeps." - ), - ), - ] = "input", - reverse: Annotated[ - bool, - typer.Option( - "--reverse/--no-reverse", - help=( - "Reverse the snapshot order (after --order). E.g. ``--order " - "version --reverse`` = newest-first; in compare/scatter it flips " - "which snapshot is the baseline." - ), - ), - ] = False, metric: Annotated[ Metric, typer.Option( @@ -156,11 +118,6 @@ def plot( _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}") raise typer.Exit(code=2) - if order == "version": - snapshots = sorted(snapshots, key=_snapshot_version_key) - if reverse: - snapshots = snapshots[::-1] - chosen = view or ( "scaling" if len(snapshots) == 1