From c319c92a02bfbba144fb2bf2037bb39f659f8126 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 5 Jun 2026 23:16:09 +0200
Subject: [PATCH 1/7] feat(benchmarks): log2 sweep colouring + --clip clamp
 (shared symmetric p95)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The sweep heatmap was coloured by raw ratio on plotly's linear scale, so a 2x
and its mirror 1/2x looked asymmetric. Colour by log2(ratio) instead — folds
are symmetric around 1x, with a fold-change colourbar (1/8x...8x).

Add --clip to override the colour clamp (a fold-change >1 for sweep, an absolute
delta for scatter) over a new shared _symmetric_clip(magnitudes, override)
helper that defaults to the symmetric p95 of the data, reused by both views.
numpy promoted to a module-level import.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli/plot.py | 24 +++++++++++-
 benchmarks/plotting.py | 85 ++++++++++++++++++++++++++++++------------
 2 files changed, 84 insertions(+), 25 deletions(-)

diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py
index c0cad047..6551aea6 100644
--- a/benchmarks/cli/plot.py
+++ b/benchmarks/cli/plot.py
@@ -61,6 +61,17 @@ def plot(
             ),
         ),
     ] = None,
+    clip: Annotated[
+        float | None,
+        typer.Option(
+            "--clip",
+            help=(
+                "Override the symmetric p95 colour clamp. Sweep: a "
+                "fold-change (>1) — ``--clip 8`` shows ⅛×–8×, beyond saturates. "
+                "Scatter: an absolute Δ bound. compare/scaling ignore it."
+            ),
+        ),
+    ] = None,
     output: Annotated[
         Path | None,
         typer.Option(
@@ -129,6 +140,17 @@ def plot(
             "scaling view takes exactly 1 snapshot", fg=typer.colors.RED, err=True
         )
         raise typer.Exit(code=2)
+    if clip is not None:
+        if clip <= 0:
+            typer.secho("--clip must be positive", fg=typer.colors.RED, err=True)
+            raise typer.Exit(code=2)
+        if chosen == "sweep" and clip <= 1:
+            typer.secho(
+                "sweep --clip is a fold-change > 1 (e.g. 8 for ⅛×–8×)",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            raise typer.Exit(code=2)
 
     # RENDERERS imports fine without plotly (lazy inside each), so check the dep.
     if importlib.util.find_spec("plotly") is None:
@@ -147,7 +169,7 @@ def plot(
         output = Path(".benchmarks") / "plots" / f"{chosen}.html"
 
     try:
-        fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets)
+        fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets, clip)
     except ValueError as exc:
         typer.secho(str(exc), fg=typer.colors.RED, err=True)
         raise typer.Exit(code=1) from exc
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index c31054b8..382e0fcb 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -20,6 +20,8 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Literal
 
+import numpy as np
+
 from benchmarks.snapshot import Metric, load_long_df
 
 if TYPE_CHECKING:
@@ -43,6 +45,22 @@ def _diverging_kwargs(midpoint: float = 0.0) -> dict:
     }
 
 
+def _symmetric_clip(magnitudes: np.ndarray, override: float | None, pct: float = 95.0) -> float:
+    """
+    Symmetric colour bound for a diverging scale: ``override`` if given, else the
+    ``pct`` percentile of ``|magnitudes|`` — so a few outliers don't wash the rest
+    to the midpoint. Positive; callers use ``[-b, +b]``.
+    """
+    if override is not None:
+        return float(override)
+    mags = np.abs(np.asarray(magnitudes, dtype=float))
+    mags = mags[np.isfinite(mags)]
+    if mags.size == 0:
+        return 1.0
+    bound = float(np.percentile(mags, pct))
+    return bound if bound > 0 else (float(mags.max()) or 1e-9)
+
+
 def _axis_kwargs(unit: str) -> dict:
     """``update_xaxes`` kwargs for a given unit."""
     if unit == "s":
@@ -104,6 +122,7 @@ def plot_compare(
     metric: Metric = "min",
     sort: SortMode = "absolute",
     facets: FacetBy | None = None,
+    clip: float | None = None,  # noqa: ARG001  (uniform signature, unused here)
 ) -> tuple[Figure, int]:
     """
     Bar chart of per-test delta, in alphabetical test-id order.
@@ -228,6 +247,7 @@ def plot_scatter(
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
     facets: FacetBy | None = None,
+    clip: float | None = None,
 ) -> tuple[Figure, int]:
     """
     Baseline cost (log-x) vs candidate/baseline ratio (y) — the exploratory
@@ -239,7 +259,6 @@ def plot_scatter(
     ``animation_frame``. A dashed line at ``ratio = 1`` marks no change; colour
     encodes absolute Δ.
     """
-    import numpy as np
     import plotly.express as px
 
     if len(snapshots) < 2:
@@ -280,10 +299,7 @@ def plot_scatter(
 
     # Clip the colour scale to the p95 absolute Δ so one huge regression doesn't
     # wash the rest to white; outliers saturate at the bound.
-    clip = float(np.percentile(df["delta_abs"].abs(), 95)) if len(df) > 0 else 0.0
-    if clip == 0.0:
-        max_abs = float(df["delta_abs"].abs().max())
-        clip = max_abs if max_abs > 0 else 1e-9
+    color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), clip)
 
     animate = len(snapshots) >= 3
     extra: dict = {}
@@ -300,7 +316,7 @@ def plot_scatter(
         y="ratio",
         color="delta_abs",
         **_diverging_kwargs(),
-        range_color=[-clip, clip],
+        range_color=[-color_clip, color_clip],
         log_x=True,
         range_x=[x_lo * 0.5, x_hi * 2],
         range_y=y_range,
@@ -344,8 +360,9 @@ def plot_sweep(
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
     facets: FacetBy | None = None,  # noqa: ARG001  (uniform signature, unused here)
+    clip: float | None = None,
 ) -> tuple[Figure, int]:
-    """Heatmap of per-test ratio relative to the first snapshot."""
+    """Heatmap of per-test fold-change (log2 ratio) vs the first snapshot."""
     import plotly.express as px
 
     df_long, unit = load_long_df(snapshots, metric)
@@ -353,38 +370,54 @@ def plot_sweep(
     versions = df_long["snapshot"].drop_duplicates().tolist()
     baseline_label = versions[0]
 
-    # Pivot to absolutes, drop tests missing the baseline, divide by it for ratios.
     abs_df = df_long.pivot(index="test_id", columns="snapshot", values="value").reindex(
         columns=versions
     )
     abs_df = abs_df.dropna(subset=[baseline_label])
     if abs_df.empty:
         raise ValueError(f"no overlap with baseline snapshot {baseline_label}")
-    df = abs_df.div(abs_df[baseline_label], axis=0)
-    abs_df.index.name = "test"
-    df.index.name = "test"
-
+    ratio = abs_df.div(abs_df[baseline_label], axis=0)
+    # Colour by log2(ratio): plotly's colour scale is linear (no log mode), so raw
+    # ratio makes a 2x look twice as intense as its mirror 1/2x. log2 makes folds
+    # symmetric around 0; the bar is relabelled to fold-change. Range defaults to
+    # the symmetric p95 (override via --clip, a fold-change).
+    logr = np.log2(ratio.where(ratio > 0))
+    abs_df.index.name = ratio.index.name = logr.index.name = "test"
+
+    bound = _symmetric_clip(logr.values, float(np.log2(clip)) if clip else None)
     fig = px.imshow(
-        df,
-        **_diverging_kwargs(1.0),
+        logr,
+        color_continuous_scale=["green", "white", "red"],
+        color_continuous_midpoint=0.0,
+        zmin=-bound,
+        zmax=bound,
         aspect="auto",
-        title=f"{metric_label} ratio relative to baseline ({versions[0]})",
-        labels={"x": "version", "y": "test", "color": "ratio"},
-        text_auto=".2f",
+        title=f"{metric_label} fold-change vs baseline ({versions[0]})",
+        labels={"x": "version", "y": "test", "color": "fold"},
+    )
+    # Fold-change ticks at integer log2 steps spanning the actual colour range.
+    hi = max(1, int(bound))
+    ticks = list(range(-hi, hi + 1))
+    fig.update_coloraxes(
+        colorbar=dict(
+            tickvals=ticks,
+            ticktext=[f"{2**t}×" if t >= 0 else f"1/{2**-t}×" for t in ticks],
+            title="fold",
+        )
     )
-    # Absolute values as customdata so hover shows both ratio and value.
     fig.update_traces(
+        text=ratio.round(2).values,
+        texttemplate="%{text}×",
         customdata=abs_df.values,
         hovertemplate=(
-            "test: %{y}<br>"
-            "version: %{x}<br>"
-            "ratio: %{z:.3f}<br>"
+            "test: %{y}<br>version: %{x}<br>"
+            "fold: %{text}×<br>"
             f"{metric_label}: %{{customdata:.4g}}{unit}"
             "<extra></extra>"
         ),
     )
-    fig.update_layout(height=max(500, len(df) * 22))
-    return fig, len(df)
+    fig.update_layout(height=max(500, len(logr) * 22))
+    return fig, len(logr)
 
 
 # Per sweep axis: (x label, log-scaled?). Size is multiplicative → log; severity
@@ -400,6 +433,7 @@ def plot_scaling(
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
     facets: FacetBy | None = None,  # noqa: ARG001  (uniform signature, unused here)
+    clip: float | None = None,  # noqa: ARG001  (uniform signature, unused here)
 ) -> tuple[Figure, int]:
     """
     Cost vs the sweep dial for parametrized tests, faceted by phase.
@@ -448,7 +482,10 @@ def plot_scaling(
 
 RENDERERS: dict[
     PlotView,
-    Callable[[list[Path], Metric, SortMode, FacetBy | None], tuple[Figure, int]],
+    Callable[
+        [list[Path], Metric, SortMode, FacetBy | None, float | None],
+        tuple[Figure, int],
+    ],
 ] = {
     "compare": plot_compare,
     "scatter": plot_scatter,

From d3f5bf41126b50f45d0b9ac8e28d980a23dcef6d Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 5 Jun 2026 23:30:05 +0200
Subject: [PATCH 2/7] feat(benchmarks): log-scale ratio axis in scatter;
 symmetric p95 colour in compare

Same fix as the sweep view, applied across the others:
- scatter: the ratio y-axis was linear, so a 2x and its mirror 1/2x read
  asymmetrically (the window was even centred on 1.0 *linearly*). Make it log_y
  so folds are symmetric about 1.0; make the window symmetric in log space; drop
  non-positive ratios (a log axis can't show a 0).
- compare: clamp the bar colour with the shared symmetric p95 (consistency; the
  bar *length* still shows the full delta).
- scaling already log-scales size; left as is.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/plotting.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 382e0fcb..cece9a49 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -202,6 +202,7 @@ def plot_compare(
         facet_kwargs = {"facet_col": facets}
         facet_kwargs["facet_col_wrap"] = 2 if facets == "phase" else 3
 
+    color_clip = _symmetric_clip(df[x_col].to_numpy(), None)
     fig = px.bar(
         df,
         x=x_col,
@@ -209,6 +210,7 @@ def plot_compare(
         orientation="h",
         color=x_col,
         **_diverging_kwargs(),
+        range_color=[-color_clip, color_clip],
         title=title,
         labels={x_col: x_label, y_col: ""},
         text_auto=text_fmt,
@@ -287,15 +289,16 @@ def plot_scatter(
     df["ratio"] = df["candidate_time"] / df["baseline_time"]
     df["delta_abs"] = df["candidate_time"] - df["baseline_time"]
     df["delta_pct"] = df["delta_abs"] / df["baseline_time"] * 100.0
-    df = df.rename(columns={"test_id": "test"})
+    # log y-axis can't show a zero ratio (candidate value of 0) — drop those.
+    df = df[df["ratio"] > 0].rename(columns={"test_id": "test"})
     # Fixed ranges so the animation doesn't jitter; pad to avoid edge clipping.
     x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max()
-    # y-range centred symmetrically on 1.0 so regressions and improvements read
-    # equally; the larger side sets the window width.
+    # ratio is multiplicative → log y-axis (set below) so 2x and 1/2x read
+    # symmetrically about 1.0; window symmetric in log10 space, larger fold wins.
     y_lo, y_hi = df["ratio"].min(), df["ratio"].max()
-    max_dist = max(abs(1.0 - y_lo), abs(y_hi - 1.0), 0.05)
-    pad_y = max(0.05, max_dist * 0.05)
-    y_range = [1.0 - max_dist - pad_y, 1.0 + max_dist + pad_y]
+    fold = max(y_hi, 1.0 / y_lo, 1.1)
+    bound = fold**1.08  # ~8% pad in log space; plotly log-converts range_y itself
+    y_range = [1.0 / bound, bound]
 
     # Clip the colour scale to the p95 absolute Δ so one huge regression doesn't
     # wash the rest to white; outliers saturate at the bound.
@@ -318,6 +321,7 @@ def plot_scatter(
         **_diverging_kwargs(),
         range_color=[-color_clip, color_clip],
         log_x=True,
+        log_y=True,
         range_x=[x_lo * 0.5, x_hi * 2],
         range_y=y_range,
         hover_name="test",
@@ -335,7 +339,7 @@ def plot_scatter(
         ),
         labels={
             "baseline_time": f"baseline {metric_label} ({unit}, log scale)",
-            "ratio": f"{metric_label} ratio  (candidate / baseline)",
+            "ratio": f"{metric_label} ratio (candidate / baseline, log scale)",
             "candidate_time": "candidate",
             "delta_abs": f"Δ ({unit}, p95-clipped)",
         },

From b100696d780ef44876f7341f300de7365165affe Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 5 Jun 2026 23:38:34 +0200
Subject: [PATCH 3/7] refactor(benchmarks): make --clip a uniform fold-change
 on the ratio dimension
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

--clip was inconsistent: a fold for sweep but an absolute Δ for scatter, and
compare ignored it. Now --clip is always a fold-change (>1, default symmetric
p95) that bounds the *ratio* dimension wherever a view has one:
- sweep: the ratio colour (±log2)
- scatter: the ratio y-axis ([1/clip, clip]) — moved off the colour, which
  reverts to the auto symmetric-p95 Δ clamp
- compare / scaling: no ratio axis → ignored

Validation is now uniform (fold-change > 1).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli/plot.py | 25 +++++++++++--------------
 benchmarks/plotting.py | 18 ++++++++++--------
 2 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py
index 6551aea6..61b6cac6 100644
--- a/benchmarks/cli/plot.py
+++ b/benchmarks/cli/plot.py
@@ -66,9 +66,10 @@ def plot(
         typer.Option(
             "--clip",
             help=(
-                "Override the symmetric p95 colour clamp. Sweep: a "
-                "fold-change (>1) — ``--clip 8`` shows ⅛×–8×, beyond saturates. "
-                "Scatter: an absolute Δ bound. compare/scaling ignore it."
+                "Bound the ratio axis to a fold-change (>1); default is the "
+                "symmetric p95. Sweep clamps the colour (±log₂) — ``--clip 8`` "
+                "shows ⅛×–8×; scatter clamps the y-axis to ``[1/clip, clip]``. "
+                "compare/scaling have no ratio axis and ignore it."
             ),
         ),
     ] = None,
@@ -140,17 +141,13 @@ def plot(
             "scaling view takes exactly 1 snapshot", fg=typer.colors.RED, err=True
         )
         raise typer.Exit(code=2)
-    if clip is not None:
-        if clip <= 0:
-            typer.secho("--clip must be positive", fg=typer.colors.RED, err=True)
-            raise typer.Exit(code=2)
-        if chosen == "sweep" and clip <= 1:
-            typer.secho(
-                "sweep --clip is a fold-change > 1 (e.g. 8 for ⅛×–8×)",
-                fg=typer.colors.RED,
-                err=True,
-            )
-            raise typer.Exit(code=2)
+    if clip is not None and clip <= 1:
+        typer.secho(
+            "--clip is a fold-change > 1 (e.g. 8 for ⅛×–8×)",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
 
     # RENDERERS imports fine without plotly (lazy inside each), so check the dep.
     if importlib.util.find_spec("plotly") is None:
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index cece9a49..3439540d 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -293,16 +293,18 @@ def plot_scatter(
     df = df[df["ratio"] > 0].rename(columns={"test_id": "test"})
     # Fixed ranges so the animation doesn't jitter; pad to avoid edge clipping.
     x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max()
-    # ratio is multiplicative → log y-axis (set below) so 2x and 1/2x read
-    # symmetrically about 1.0; window symmetric in log10 space, larger fold wins.
-    y_lo, y_hi = df["ratio"].min(), df["ratio"].max()
-    fold = max(y_hi, 1.0 / y_lo, 1.1)
-    bound = fold**1.08  # ~8% pad in log space; plotly log-converts range_y itself
+    # ratio is multiplicative → log y-axis; window symmetric about 1.0 in log
+    # space, defaulting to the symmetric p95 fold (override via --clip, a
+    # fold-change). plotly log-converts range_y itself, so give it ratio units.
+    log_bound = _symmetric_clip(
+        np.log2(df["ratio"].to_numpy()), np.log2(clip) if clip else None
+    )
+    bound = max(float(2.0 ** (log_bound * 1.08)), 1.1)
     y_range = [1.0 / bound, bound]
 
-    # Clip the colour scale to the p95 absolute Δ so one huge regression doesn't
-    # wash the rest to white; outliers saturate at the bound.
-    color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), clip)
+    # Colour auto-clamps to the symmetric p95 absolute Δ (not --clip-tunable —
+    # that flag is folds, which here drive the y-axis above).
+    color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), None)
 
     animate = len(snapshots) >= 3
     extra: dict = {}

From 5f0fec234163b3d4f0e736a74f8b6471773411d5 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 5 Jun 2026 23:44:22 +0200
Subject: [PATCH 4/7] refactor(benchmarks): --clip clamps only the colour scale
 (log or linear per plot)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Colour is the one thing you can't adjust after the plot is drawn (axes zoom).
So --clip targets the colour only, and its unit follows the plot's colour scale:
- sweep (colour = log2 ratio): a fold-change (>1)
- scatter / compare (colour = absolute Δ): a linear Δ bound
- scaling: no diverging colour → ignored
Default stays the symmetric p95. Axes are full-range and zoomable — scatter's
y-axis no longer p95-clips (which hid outlier points). Validation is per-scale
(fold > 1 for sweep; any positive for the linear ones).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli/plot.py | 27 ++++++++++++++++-----------
 benchmarks/plotting.py | 22 ++++++++++------------
 2 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py
index 61b6cac6..65736f10 100644
--- a/benchmarks/cli/plot.py
+++ b/benchmarks/cli/plot.py
@@ -66,10 +66,11 @@ def plot(
         typer.Option(
             "--clip",
             help=(
-                "Bound the ratio axis to a fold-change (>1); default is the "
-                "symmetric p95. Sweep clamps the colour (±log₂) — ``--clip 8`` "
-                "shows ⅛×–8×; scatter clamps the y-axis to ``[1/clip, clip]``. "
-                "compare/scaling have no ratio axis and ignore it."
+                "Clamp the *colour* scale (the one thing you can't zoom after "
+                "the plot is drawn); default is the symmetric p95. Unit follows "
+                "the plot's colour: a fold-change (>1) for fold-coloured sweep "
+                "(``--clip 8`` = ⅛×–8×), an absolute Δ for Δ-coloured "
+                "scatter/compare. scaling has no diverging colour and ignores it."
             ),
         ),
     ] = None,
@@ -141,13 +142,17 @@ def plot(
             "scaling view takes exactly 1 snapshot", fg=typer.colors.RED, err=True
         )
         raise typer.Exit(code=2)
-    if clip is not None and clip <= 1:
-        typer.secho(
-            "--clip is a fold-change > 1 (e.g. 8 for ⅛×–8×)",
-            fg=typer.colors.RED,
-            err=True,
-        )
-        raise typer.Exit(code=2)
+    if clip is not None:
+        if clip <= 0:
+            typer.secho("--clip must be positive", fg=typer.colors.RED, err=True)
+            raise typer.Exit(code=2)
+        if chosen == "sweep" and clip <= 1:
+            typer.secho(
+                "sweep --clip is a fold-change > 1 (colour is log₂; e.g. 8 = ⅛×–8×)",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            raise typer.Exit(code=2)
 
     # RENDERERS imports fine without plotly (lazy inside each), so check the dep.
     if importlib.util.find_spec("plotly") is None:
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 3439540d..56a1767d 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -122,7 +122,7 @@ def plot_compare(
     metric: Metric = "min",
     sort: SortMode = "absolute",
     facets: FacetBy | None = None,
-    clip: float | None = None,  # noqa: ARG001  (uniform signature, unused here)
+    clip: float | None = None,
 ) -> tuple[Figure, int]:
     """
     Bar chart of per-test delta, in alphabetical test-id order.
@@ -202,7 +202,7 @@ def plot_compare(
         facet_kwargs = {"facet_col": facets}
         facet_kwargs["facet_col_wrap"] = 2 if facets == "phase" else 3
 
-    color_clip = _symmetric_clip(df[x_col].to_numpy(), None)
+    color_clip = _symmetric_clip(df[x_col].to_numpy(), clip)
     fig = px.bar(
         df,
         x=x_col,
@@ -293,18 +293,16 @@ def plot_scatter(
     df = df[df["ratio"] > 0].rename(columns={"test_id": "test"})
     # Fixed ranges so the animation doesn't jitter; pad to avoid edge clipping.
     x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max()
-    # ratio is multiplicative → log y-axis; window symmetric about 1.0 in log
-    # space, defaulting to the symmetric p95 fold (override via --clip, a
-    # fold-change). plotly log-converts range_y itself, so give it ratio units.
-    log_bound = _symmetric_clip(
-        np.log2(df["ratio"].to_numpy()), np.log2(clip) if clip else None
-    )
-    bound = max(float(2.0 ** (log_bound * 1.08)), 1.1)
+    # ratio is multiplicative → log y-axis; show the *full* fold range (symmetric
+    # about 1.0) so every point is visible — zoom interactively to focus.
+    y_lo, y_hi = df["ratio"].min(), df["ratio"].max()
+    fold = max(y_hi, 1.0 / y_lo, 1.1)
+    bound = fold**1.05
     y_range = [1.0 / bound, bound]
 
-    # Colour auto-clamps to the symmetric p95 absolute Δ (not --clip-tunable —
-    # that flag is folds, which here drive the y-axis above).
-    color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), None)
+    # --clip clamps the *colour* — the one thing you can't zoom after the plot is
+    # made. Here colour is the absolute Δ, so it's a linear bound. Default: p95.
+    color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), clip)
 
     animate = len(snapshots) >= 3
     extra: dict = {}

From b4f2a9d3f4e8909d5e8605523c2178963325589d Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Sat, 6 Jun 2026 00:01:15 +0200
Subject: [PATCH 5/7] fix(benchmarks): make --sort actually sort compare bars;
 add --order for inputs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- compare: --sort was misleading — bars were hardcoded alphabetical
  (sort_values('test_id')) while --sort only switched the dimension. Now it
  sorts by the chosen Δ (delta_abs/delta_pct): biggest regressions on top,
  improvements at the bottom. The name/help are finally truthful.
- plot --order {input,version}: default 'input' preserves the order you pass
  (the plot never re-sorts); 'version' sorts inputs by parsed linopy-<ver>,
  fixing a glob's string order (0.3.10 before 0.3.2) for release-history sweeps.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli/plot.py | 40 +++++++++++++++++++++++++++++++++++-----
 benchmarks/plotting.py |  5 +++--
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py
index 65736f10..e9438108 100644
--- a/benchmarks/cli/plot.py
+++ b/benchmarks/cli/plot.py
@@ -4,7 +4,7 @@
 
 import importlib.util
 from pathlib import Path
-from typing import Annotated
+from typing import Annotated, Literal
 
 import typer
 
@@ -12,6 +12,21 @@
 from benchmarks.plotting import FacetBy, Metric, PlotView, SortMode
 
 
+def _snapshot_version_key(p: Path) -> tuple[int, object]:
+    """Sort key parsing ``linopy-<ver>`` from a filename; non-matches sort last."""
+    import re
+
+    from packaging.version import InvalidVersion, Version
+
+    m = re.search(r"-(\d[\w.]*)\.json$", p.name)
+    if m:
+        try:
+            return (0, Version(m.group(1)))
+        except InvalidVersion:
+            pass
+    return (1, p.name)
+
+
 @app.command()
 def plot(
     snapshots: Annotated[
@@ -28,6 +43,18 @@ def plot(
             )
         ),
     ] = None,
+    order: Annotated[
+        Literal["input", "version"],
+        typer.Option(
+            "--order",
+            help=(
+                "Snapshot input order. ``input`` (default) keeps the order you "
+                "pass — the plot never re-sorts. ``version`` sorts inputs by the "
+                "parsed ``linopy-<ver>``, fixing a glob's string order (0.3.10 "
+                "before 0.3.2) for release-history sweeps."
+            ),
+        ),
+    ] = "input",
     metric: Annotated[
         Metric,
         typer.Option(
@@ -42,10 +69,10 @@ def plot(
         SortMode,
         typer.Option(
             help=(
-                "Compare-view sort and bar dimension. ``absolute`` (default) "
-                "uses ``b - a`` in seconds so the biggest actual-time impacts "
-                "float to the bottom — avoids over-weighting cheap "
-                "microsecond tests. ``relative`` uses percent change."
+                "Compare-view bar metric *and* sort. ``absolute`` (default) "
+                "ranks by ``b - a`` (actual-time/MiB impact, not over-weighting "
+                "tiny tests); ``relative`` by percent change. Bars are ordered by "
+                "it — biggest regressions on top, improvements at the bottom."
             )
         ),
     ] = "absolute",
@@ -118,6 +145,9 @@ def plot(
         _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}")
         raise typer.Exit(code=2)
 
+    if order == "version":
+        snapshots = sorted(snapshots, key=_snapshot_version_key)
+
     chosen = view or (
         "scaling"
         if len(snapshots) == 1
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 56a1767d..e7af212f 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -125,7 +125,8 @@ def plot_compare(
     clip: float | None = None,
 ) -> tuple[Figure, int]:
     """
-    Bar chart of per-test delta, in alphabetical test-id order.
+    Bar chart of per-test delta, sorted by the chosen ``--sort`` Δ
+    (biggest regressions on top, improvements at the bottom).
 
     ``sort`` picks the bar dimension: ``absolute`` (default) plots ``b - a`` in
     the native unit, ``relative`` plots percent change. Bars stay in id order
@@ -166,8 +167,8 @@ def plot_compare(
 
     df["delta_abs"] = df[b_label] - df[a_label]
     df["delta_pct"] = (df["delta_abs"] / df[a_label]) * 100.0
-    df = df.sort_values("test_id").reset_index(drop=True)
     x_col = "delta_abs" if sort == "absolute" else "delta_pct"
+    df = df.sort_values(x_col).reset_index(drop=True)
 
     if sort == "absolute":
         x_label = f"{metric_label} delta ({unit})"

From e2f663dd96ec7b22b9c392b5e78c71e005078888 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Sat, 6 Jun 2026 00:02:56 +0200
Subject: [PATCH 6/7] feat(benchmarks): add plot --reverse to flip snapshot
 order

Applied after --order, so e.g. --order version --reverse = newest-first (which
also makes the newest snapshot the sweep baseline / compare 'a').

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli/plot.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py
index e9438108..d9b86061 100644
--- a/benchmarks/cli/plot.py
+++ b/benchmarks/cli/plot.py
@@ -55,6 +55,17 @@ def plot(
             ),
         ),
     ] = "input",
+    reverse: Annotated[
+        bool,
+        typer.Option(
+            "--reverse/--no-reverse",
+            help=(
+                "Reverse the snapshot order (after --order). E.g. ``--order "
+                "version --reverse`` = newest-first; in compare/scatter it flips "
+                "which snapshot is the baseline."
+            ),
+        ),
+    ] = False,
     metric: Annotated[
         Metric,
         typer.Option(
@@ -147,6 +158,8 @@ def plot(
 
     if order == "version":
         snapshots = sorted(snapshots, key=_snapshot_version_key)
+    if reverse:
+        snapshots = snapshots[::-1]
 
     chosen = view or (
         "scaling"

From 3a78d7fdc784d51d31d3fd1674844c885c85ce91 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Sat, 6 Jun 2026 00:06:28 +0200
Subject: [PATCH 7/7] revert(benchmarks): drop plot --order/--reverse
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

plot serves arbitrary snapshots (bench labels, baseline.json, …), so parsing
linopy-<ver> from filenames is a leaky abstraction — --order version is
meaningless for non-version snapshots. plot already preserves input order, so
callers control the axis by the order they pass. The --sort fix (compare bars
sort by Δ, not alphabetically) stays — that was a real bug.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli/plot.py | 45 +-----------------------------------------
 1 file changed, 1 insertion(+), 44 deletions(-)

diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py
index d9b86061..d51a9957 100644
--- a/benchmarks/cli/plot.py
+++ b/benchmarks/cli/plot.py
@@ -4,7 +4,7 @@
 
 import importlib.util
 from pathlib import Path
-from typing import Annotated, Literal
+from typing import Annotated
 
 import typer
 
@@ -12,21 +12,6 @@
 from benchmarks.plotting import FacetBy, Metric, PlotView, SortMode
 
 
-def _snapshot_version_key(p: Path) -> tuple[int, object]:
-    """Sort key parsing ``linopy-<ver>`` from a filename; non-matches sort last."""
-    import re
-
-    from packaging.version import InvalidVersion, Version
-
-    m = re.search(r"-(\d[\w.]*)\.json$", p.name)
-    if m:
-        try:
-            return (0, Version(m.group(1)))
-        except InvalidVersion:
-            pass
-    return (1, p.name)
-
-
 @app.command()
 def plot(
     snapshots: Annotated[
@@ -43,29 +28,6 @@ def plot(
             )
         ),
     ] = None,
-    order: Annotated[
-        Literal["input", "version"],
-        typer.Option(
-            "--order",
-            help=(
-                "Snapshot input order. ``input`` (default) keeps the order you "
-                "pass — the plot never re-sorts. ``version`` sorts inputs by the "
-                "parsed ``linopy-<ver>``, fixing a glob's string order (0.3.10 "
-                "before 0.3.2) for release-history sweeps."
-            ),
-        ),
-    ] = "input",
-    reverse: Annotated[
-        bool,
-        typer.Option(
-            "--reverse/--no-reverse",
-            help=(
-                "Reverse the snapshot order (after --order). E.g. ``--order "
-                "version --reverse`` = newest-first; in compare/scatter it flips "
-                "which snapshot is the baseline."
-            ),
-        ),
-    ] = False,
     metric: Annotated[
         Metric,
         typer.Option(
@@ -156,11 +118,6 @@ def plot(
         _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}")
         raise typer.Exit(code=2)
 
-    if order == "version":
-        snapshots = sorted(snapshots, key=_snapshot_version_key)
-    if reverse:
-        snapshots = snapshots[::-1]
-
     chosen = view or (
         "scaling"
         if len(snapshots) == 1