fluxopt · FBumann · Jun 6, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py
@@ -42,10 +42,10 @@ def plot(
         SortMode,
         typer.Option(
             help=(
-                "Compare-view sort and bar dimension. ``absolute`` (default) "
-                "uses ``b - a`` in seconds so the biggest actual-time impacts "
-                "float to the bottom — avoids over-weighting cheap "
-                "microsecond tests. ``relative`` uses percent change."
+                "Compare-view bar metric *and* sort. ``absolute`` (default) "
+                "ranks by ``b - a`` (actual-time/MiB impact, not over-weighting "
+                "tiny tests); ``relative`` by percent change. Bars are ordered by "
+                "it — biggest regressions on top, improvements at the bottom."
             )
         ),
     ] = "absolute",
@@ -61,6 +61,19 @@ def plot(
             ),
         ),
     ] = None,
+    clip: Annotated[
+        float | None,
+        typer.Option(
+            "--clip",
+            help=(
+                "Clamp the *colour* scale (the one thing you can't zoom after "
+                "the plot is drawn); default is the symmetric p95. Unit follows "
+                "the plot's colour: a fold-change (>1) for fold-coloured sweep "
+                "(``--clip 8`` = ⅛×–8×), an absolute Δ for Δ-coloured "
+                "scatter/compare. scaling has no diverging colour and ignores it."
+            ),
+        ),
+    ] = None,
     output: Annotated[
         Path | None,
         typer.Option(
@@ -129,6 +142,17 @@ def plot(
             "scaling view takes exactly 1 snapshot", fg=typer.colors.RED, err=True
         )
         raise typer.Exit(code=2)
+    if clip is not None:
+        if clip <= 0:
+            typer.secho("--clip must be positive", fg=typer.colors.RED, err=True)
+            raise typer.Exit(code=2)
+        if chosen == "sweep" and clip <= 1:
+            typer.secho(
+                "sweep --clip is a fold-change > 1 (colour is log₂; e.g. 8 = ⅛×–8×)",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            raise typer.Exit(code=2)
 
     # RENDERERS imports fine without plotly (lazy inside each), so check the dep.
     if importlib.util.find_spec("plotly") is None:
@@ -147,7 +171,7 @@ def plot(
         output = Path(".benchmarks") / "plots" / f"{chosen}.html"
 
     try:
-        fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets)
+        fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets, clip)
     except ValueError as exc:
         typer.secho(str(exc), fg=typer.colors.RED, err=True)
         raise typer.Exit(code=1) from exc

diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
@@ -20,6 +20,8 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Literal
 
+import numpy as np
+
 from benchmarks.snapshot import Metric, load_long_df
 
 if TYPE_CHECKING:
@@ -43,6 +45,22 @@ def _diverging_kwargs(midpoint: float = 0.0) -> dict:
     }
 
 
+def _symmetric_clip(magnitudes: np.ndarray, override: float | None, pct: float = 95.0) -> float:
+    """
+    Symmetric colour bound for a diverging scale: ``override`` if given, else the
+    ``pct`` percentile of ``|magnitudes|`` — so a few outliers don't wash the rest
+    to the midpoint. Positive; callers use ``[-b, +b]``.
+    """
+    if override is not None:
+        return float(override)
+    mags = np.abs(np.asarray(magnitudes, dtype=float))
+    mags = mags[np.isfinite(mags)]
+    if mags.size == 0:
+        return 1.0
+    bound = float(np.percentile(mags, pct))
+    return bound if bound > 0 else (float(mags.max()) or 1e-9)
+
+
 def _axis_kwargs(unit: str) -> dict:
     """``update_xaxes`` kwargs for a given unit."""
     if unit == "s":
@@ -104,9 +122,11 @@ def plot_compare(
     metric: Metric = "min",
     sort: SortMode = "absolute",
     facets: FacetBy | None = None,
+    clip: float | None = None,
 ) -> tuple[Figure, int]:
     """
-    Bar chart of per-test delta, in alphabetical test-id order.
+    Bar chart of per-test delta, sorted by the chosen ``--sort`` Δ
+    (biggest regressions on top, improvements at the bottom).
 
     ``sort`` picks the bar dimension: ``absolute`` (default) plots ``b - a`` in
     the native unit, ``relative`` plots percent change. Bars stay in id order
@@ -147,8 +167,8 @@ def plot_compare(
 
     df["delta_abs"] = df[b_label] - df[a_label]
     df["delta_pct"] = (df["delta_abs"] / df[a_label]) * 100.0
-    df = df.sort_values("test_id").reset_index(drop=True)
     x_col = "delta_abs" if sort == "absolute" else "delta_pct"
+    df = df.sort_values(x_col).reset_index(drop=True)
 
     if sort == "absolute":
         x_label = f"{metric_label} delta ({unit})"
@@ -183,13 +203,15 @@ def plot_compare(
         facet_kwargs = {"facet_col": facets}
         facet_kwargs["facet_col_wrap"] = 2 if facets == "phase" else 3
 
+    color_clip = _symmetric_clip(df[x_col].to_numpy(), clip)
     fig = px.bar(
         df,
         x=x_col,
         y=y_col,
         orientation="h",
         color=x_col,
         **_diverging_kwargs(),
+        range_color=[-color_clip, color_clip],
         title=title,
         labels={x_col: x_label, y_col: ""},
         text_auto=text_fmt,
@@ -228,6 +250,7 @@ def plot_scatter(
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
     facets: FacetBy | None = None,
+    clip: float | None = None,
 ) -> tuple[Figure, int]:
     """
     Baseline cost (log-x) vs candidate/baseline ratio (y) — the exploratory
@@ -239,7 +262,6 @@ def plot_scatter(
     ``animation_frame``. A dashed line at ``ratio = 1`` marks no change; colour
     encodes absolute Δ.
     """
-    import numpy as np
     import plotly.express as px
 
     if len(snapshots) < 2:
@@ -268,22 +290,20 @@ def plot_scatter(
     df["ratio"] = df["candidate_time"] / df["baseline_time"]
     df["delta_abs"] = df["candidate_time"] - df["baseline_time"]
     df["delta_pct"] = df["delta_abs"] / df["baseline_time"] * 100.0
-    df = df.rename(columns={"test_id": "test"})
+    # log y-axis can't show a zero ratio (candidate value of 0) — drop those.
+    df = df[df["ratio"] > 0].rename(columns={"test_id": "test"})
     # Fixed ranges so the animation doesn't jitter; pad to avoid edge clipping.
     x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max()
-    # y-range centred symmetrically on 1.0 so regressions and improvements read
-    # equally; the larger side sets the window width.
+    # ratio is multiplicative → log y-axis; show the *full* fold range (symmetric
+    # about 1.0) so every point is visible — zoom interactively to focus.
     y_lo, y_hi = df["ratio"].min(), df["ratio"].max()
-    max_dist = max(abs(1.0 - y_lo), abs(y_hi - 1.0), 0.05)
-    pad_y = max(0.05, max_dist * 0.05)
-    y_range = [1.0 - max_dist - pad_y, 1.0 + max_dist + pad_y]
+    fold = max(y_hi, 1.0 / y_lo, 1.1)
+    bound = fold**1.05
+    y_range = [1.0 / bound, bound]
 
-    # Clip the colour scale to the p95 absolute Δ so one huge regression doesn't
-    # wash the rest to white; outliers saturate at the bound.
-    clip = float(np.percentile(df["delta_abs"].abs(), 95)) if len(df) > 0 else 0.0
-    if clip == 0.0:
-        max_abs = float(df["delta_abs"].abs().max())
-        clip = max_abs if max_abs > 0 else 1e-9
+    # --clip clamps the *colour* — the one thing you can't zoom after the plot is
+    # made. Here colour is the absolute Δ, so it's a linear bound. Default: p95.
+    color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), clip)
 
     animate = len(snapshots) >= 3
     extra: dict = {}
@@ -300,8 +320,9 @@ def plot_scatter(
         y="ratio",
         color="delta_abs",
         **_diverging_kwargs(),
-        range_color=[-clip, clip],
+        range_color=[-color_clip, color_clip],
         log_x=True,
+        log_y=True,
         range_x=[x_lo * 0.5, x_hi * 2],
         range_y=y_range,
         hover_name="test",
@@ -319,7 +340,7 @@ def plot_scatter(
         ),
         labels={
             "baseline_time": f"baseline {metric_label} ({unit}, log scale)",
-            "ratio": f"{metric_label} ratio  (candidate / baseline)",
+            "ratio": f"{metric_label} ratio (candidate / baseline, log scale)",
             "candidate_time": "candidate",
             "delta_abs": f"Δ ({unit}, p95-clipped)",
         },
@@ -344,47 +365,64 @@ def plot_sweep(
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
     facets: FacetBy | None = None,  # noqa: ARG001  (uniform signature, unused here)
+    clip: float | None = None,
 ) -> tuple[Figure, int]:
-    """Heatmap of per-test ratio relative to the first snapshot."""
+    """Heatmap of per-test fold-change (log2 ratio) vs the first snapshot."""
     import plotly.express as px
 
     df_long, unit = load_long_df(snapshots, metric)
     metric_label = _metric_label(metric, unit)
     versions = df_long["snapshot"].drop_duplicates().tolist()
     baseline_label = versions[0]
 
-    # Pivot to absolutes, drop tests missing the baseline, divide by it for ratios.
     abs_df = df_long.pivot(index="test_id", columns="snapshot", values="value").reindex(
         columns=versions
     )
     abs_df = abs_df.dropna(subset=[baseline_label])
     if abs_df.empty:
         raise ValueError(f"no overlap with baseline snapshot {baseline_label}")
-    df = abs_df.div(abs_df[baseline_label], axis=0)
-    abs_df.index.name = "test"
-    df.index.name = "test"
-
+    ratio = abs_df.div(abs_df[baseline_label], axis=0)
+    # Colour by log2(ratio): plotly's colour scale is linear (no log mode), so raw
+    # ratio makes a 2x look twice as intense as its mirror 1/2x. log2 makes folds
+    # symmetric around 0; the bar is relabelled to fold-change. Range defaults to
+    # the symmetric p95 (override via --clip, a fold-change).
+    logr = np.log2(ratio.where(ratio > 0))
+    abs_df.index.name = ratio.index.name = logr.index.name = "test"
+
+    bound = _symmetric_clip(logr.values, float(np.log2(clip)) if clip else None)
     fig = px.imshow(
-        df,
-        **_diverging_kwargs(1.0),
+        logr,
+        color_continuous_scale=["green", "white", "red"],
+        color_continuous_midpoint=0.0,
+        zmin=-bound,
+        zmax=bound,
         aspect="auto",
-        title=f"{metric_label} ratio relative to baseline ({versions[0]})",
-        labels={"x": "version", "y": "test", "color": "ratio"},
-        text_auto=".2f",
+        title=f"{metric_label} fold-change vs baseline ({versions[0]})",
+        labels={"x": "version", "y": "test", "color": "fold"},
+    )
+    # Fold-change ticks at integer log2 steps spanning the actual colour range.
+    hi = max(1, int(bound))
+    ticks = list(range(-hi, hi + 1))
+    fig.update_coloraxes(
+        colorbar=dict(
+            tickvals=ticks,
+            ticktext=[f"{2**t}×" if t >= 0 else f"1/{2**-t}×" for t in ticks],
+            title="fold",
+        )
     )
-    # Absolute values as customdata so hover shows both ratio and value.
     fig.update_traces(
+        text=ratio.round(2).values,
+        texttemplate="%{text}×",
         customdata=abs_df.values,
         hovertemplate=(
-            "test: %{y}<br>"
-            "version: %{x}<br>"
-            "ratio: %{z:.3f}<br>"
+            "test: %{y}<br>version: %{x}<br>"
+            "fold: %{text}×<br>"
             f"{metric_label}: %{{customdata:.4g}}{unit}"
             "<extra></extra>"
         ),
     )
-    fig.update_layout(height=max(500, len(df) * 22))
-    return fig, len(df)
+    fig.update_layout(height=max(500, len(logr) * 22))
+    return fig, len(logr)
 
 
 # Per sweep axis: (x label, log-scaled?). Size is multiplicative → log; severity
@@ -400,6 +438,7 @@ def plot_scaling(
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
     facets: FacetBy | None = None,  # noqa: ARG001  (uniform signature, unused here)
+    clip: float | None = None,  # noqa: ARG001  (uniform signature, unused here)
 ) -> tuple[Figure, int]:
     """
     Cost vs the sweep dial for parametrized tests, faceted by phase.
@@ -448,7 +487,10 @@ def plot_scaling(
 
 RENDERERS: dict[
     PlotView,
-    Callable[[list[Path], Metric, SortMode, FacetBy | None], tuple[Figure, int]],
+    Callable[
+        [list[Path], Metric, SortMode, FacetBy | None, float | None],
+        tuple[Figure, int],
+    ],
 ] = {
     "compare": plot_compare,
     "scatter": plot_scatter,