diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py
index c0cad047..d51a9957 100644
--- a/benchmarks/cli/plot.py
+++ b/benchmarks/cli/plot.py
@@ -42,10 +42,10 @@ def plot(
SortMode,
typer.Option(
help=(
- "Compare-view sort and bar dimension. ``absolute`` (default) "
- "uses ``b - a`` in seconds so the biggest actual-time impacts "
- "float to the bottom — avoids over-weighting cheap "
- "microsecond tests. ``relative`` uses percent change."
+ "Compare-view bar metric *and* sort. ``absolute`` (default) "
+ "ranks by ``b - a`` (actual-time/MiB impact, not over-weighting "
+ "tiny tests); ``relative`` by percent change. Bars are ordered by "
+ "it — biggest regressions on top, improvements at the bottom."
)
),
] = "absolute",
@@ -61,6 +61,19 @@ def plot(
),
),
] = None,
+ clip: Annotated[
+ float | None,
+ typer.Option(
+ "--clip",
+ help=(
+ "Clamp the *colour* scale (the one thing you can't zoom after "
+ "the plot is drawn); default is the symmetric p95. Unit follows "
+ "the plot's colour: a fold-change (>1) for fold-coloured sweep "
+ "(``--clip 8`` = ⅛×–8×), an absolute Δ for Δ-coloured "
+ "scatter/compare. scaling has no diverging colour and ignores it."
+ ),
+ ),
+ ] = None,
output: Annotated[
Path | None,
typer.Option(
@@ -129,6 +142,17 @@ def plot(
"scaling view takes exactly 1 snapshot", fg=typer.colors.RED, err=True
)
raise typer.Exit(code=2)
+ if clip is not None:
+ if clip <= 0:
+ typer.secho("--clip must be positive", fg=typer.colors.RED, err=True)
+ raise typer.Exit(code=2)
+ if chosen == "sweep" and clip <= 1:
+ typer.secho(
+ "sweep --clip is a fold-change > 1 (colour is log₂; e.g. 8 = ⅛×–8×)",
+ fg=typer.colors.RED,
+ err=True,
+ )
+ raise typer.Exit(code=2)
# RENDERERS imports fine without plotly (lazy inside each), so check the dep.
if importlib.util.find_spec("plotly") is None:
@@ -147,7 +171,7 @@ def plot(
output = Path(".benchmarks") / "plots" / f"{chosen}.html"
try:
- fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets)
+ fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets, clip)
except ValueError as exc:
typer.secho(str(exc), fg=typer.colors.RED, err=True)
raise typer.Exit(code=1) from exc
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index c31054b8..e7af212f 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -20,6 +20,8 @@
from pathlib import Path
from typing import TYPE_CHECKING, Literal
+import numpy as np
+
from benchmarks.snapshot import Metric, load_long_df
if TYPE_CHECKING:
@@ -43,6 +45,22 @@ def _diverging_kwargs(midpoint: float = 0.0) -> dict:
}
+def _symmetric_clip(magnitudes: np.ndarray, override: float | None, pct: float = 95.0) -> float:
+ """
+ Symmetric colour bound ``b`` for a diverging scale; callers use ``[-b, +b]``. A given
+ ``override`` is returned as-is (unvalidated); otherwise ``b`` is the ``pct`` percentile of
+ the finite ``|magnitudes|`` — so a few outliers don't wash the rest to the midpoint.
+ """
+ if override is not None:
+ return float(override)
+ mags = np.abs(np.asarray(magnitudes, dtype=float))
+ mags = mags[np.isfinite(mags)]  # NaN/inf entries are dropped before the percentile
+ if mags.size == 0:
+ return 1.0  # nothing finite to scale by — fall back to a fixed bound
+ bound = float(np.percentile(mags, pct))
+ return bound if bound > 0 else (float(mags.max()) or 1e-9)  # zero percentile → max, else 1e-9
+
+
def _axis_kwargs(unit: str) -> dict:
"""``update_xaxes`` kwargs for a given unit."""
if unit == "s":
@@ -104,9 +122,11 @@ def plot_compare(
metric: Metric = "min",
sort: SortMode = "absolute",
facets: FacetBy | None = None,
+ clip: float | None = None,
) -> tuple[Figure, int]:
"""
- Bar chart of per-test delta, in alphabetical test-id order.
+ Bar chart of per-test delta, sorted by the chosen ``--sort`` Δ
+ (biggest regressions on top, improvements at the bottom).
``sort`` picks the bar dimension: ``absolute`` (default) plots ``b - a`` in
the native unit, ``relative`` plots percent change. Bars stay in id order
@@ -147,8 +167,8 @@ def plot_compare(
df["delta_abs"] = df[b_label] - df[a_label]
df["delta_pct"] = (df["delta_abs"] / df[a_label]) * 100.0
- df = df.sort_values("test_id").reset_index(drop=True)
x_col = "delta_abs" if sort == "absolute" else "delta_pct"
+ df = df.sort_values(x_col).reset_index(drop=True)
if sort == "absolute":
x_label = f"{metric_label} delta ({unit})"
@@ -183,6 +203,7 @@ def plot_compare(
facet_kwargs = {"facet_col": facets}
facet_kwargs["facet_col_wrap"] = 2 if facets == "phase" else 3
+ color_clip = _symmetric_clip(df[x_col].to_numpy(), clip)
fig = px.bar(
df,
x=x_col,
@@ -190,6 +211,7 @@ def plot_compare(
orientation="h",
color=x_col,
**_diverging_kwargs(),
+ range_color=[-color_clip, color_clip],
title=title,
labels={x_col: x_label, y_col: ""},
text_auto=text_fmt,
@@ -228,6 +250,7 @@ def plot_scatter(
metric: Metric = "min",
sort: SortMode = "absolute", # noqa: ARG001 (uniform signature, unused here)
facets: FacetBy | None = None,
+ clip: float | None = None,
) -> tuple[Figure, int]:
"""
Baseline cost (log-x) vs candidate/baseline ratio (y) — the exploratory
@@ -239,7 +262,6 @@ def plot_scatter(
``animation_frame``. A dashed line at ``ratio = 1`` marks no change; colour
encodes absolute Δ.
"""
- import numpy as np
import plotly.express as px
if len(snapshots) < 2:
@@ -268,22 +290,20 @@ def plot_scatter(
df["ratio"] = df["candidate_time"] / df["baseline_time"]
df["delta_abs"] = df["candidate_time"] - df["baseline_time"]
df["delta_pct"] = df["delta_abs"] / df["baseline_time"] * 100.0
- df = df.rename(columns={"test_id": "test"})
+ # log y-axis can't show a zero ratio (candidate value of 0) — drop those.
+ df = df[df["ratio"] > 0].rename(columns={"test_id": "test"})
# Fixed ranges so the animation doesn't jitter; pad to avoid edge clipping.
x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max()
- # y-range centred symmetrically on 1.0 so regressions and improvements read
- # equally; the larger side sets the window width.
+ # ratio is multiplicative → log y-axis; show the *full* fold range (symmetric
+ # about 1.0) so every point is visible — zoom interactively to focus.
y_lo, y_hi = df["ratio"].min(), df["ratio"].max()
- max_dist = max(abs(1.0 - y_lo), abs(y_hi - 1.0), 0.05)
- pad_y = max(0.05, max_dist * 0.05)
- y_range = [1.0 - max_dist - pad_y, 1.0 + max_dist + pad_y]
+ fold = max(y_hi, 1.0 / y_lo, 1.1)
+ bound = fold**1.05
+ y_range = [1.0 / bound, bound]
- # Clip the colour scale to the p95 absolute Δ so one huge regression doesn't
- # wash the rest to white; outliers saturate at the bound.
- clip = float(np.percentile(df["delta_abs"].abs(), 95)) if len(df) > 0 else 0.0
- if clip == 0.0:
- max_abs = float(df["delta_abs"].abs().max())
- clip = max_abs if max_abs > 0 else 1e-9
+ # --clip clamps the *colour* — the one thing you can't zoom after the plot is
+ # made. Here colour is the absolute Δ, so it's a linear bound. Default: p95.
+ color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), clip)
animate = len(snapshots) >= 3
extra: dict = {}
@@ -300,8 +320,9 @@ def plot_scatter(
y="ratio",
color="delta_abs",
**_diverging_kwargs(),
- range_color=[-clip, clip],
+ range_color=[-color_clip, color_clip],
log_x=True,
+ log_y=True,
range_x=[x_lo * 0.5, x_hi * 2],
range_y=y_range,
hover_name="test",
@@ -319,7 +340,7 @@ def plot_scatter(
),
labels={
"baseline_time": f"baseline {metric_label} ({unit}, log scale)",
- "ratio": f"{metric_label} ratio (candidate / baseline)",
+ "ratio": f"{metric_label} ratio (candidate / baseline, log scale)",
"candidate_time": "candidate",
"delta_abs": f"Δ ({unit}, p95-clipped)",
},
@@ -344,8 +365,9 @@ def plot_sweep(
metric: Metric = "min",
sort: SortMode = "absolute", # noqa: ARG001 (uniform signature, unused here)
facets: FacetBy | None = None, # noqa: ARG001 (uniform signature, unused here)
+ clip: float | None = None,
) -> tuple[Figure, int]:
- """Heatmap of per-test ratio relative to the first snapshot."""
+ """Heatmap of per-test fold-change (log2 ratio) vs the first snapshot."""
import plotly.express as px
df_long, unit = load_long_df(snapshots, metric)
@@ -353,38 +375,54 @@ def plot_sweep(
versions = df_long["snapshot"].drop_duplicates().tolist()
baseline_label = versions[0]
- # Pivot to absolutes, drop tests missing the baseline, divide by it for ratios.
abs_df = df_long.pivot(index="test_id", columns="snapshot", values="value").reindex(
columns=versions
)
abs_df = abs_df.dropna(subset=[baseline_label])
if abs_df.empty:
raise ValueError(f"no overlap with baseline snapshot {baseline_label}")
- df = abs_df.div(abs_df[baseline_label], axis=0)
- abs_df.index.name = "test"
- df.index.name = "test"
-
+ ratio = abs_df.div(abs_df[baseline_label], axis=0)
+ # Colour by log2(ratio): plotly's colour scale is linear (no log mode), so raw
+ # ratio makes a 2x look twice as intense as its mirror 1/2x. log2 makes folds
+ # symmetric around 0; the bar is relabelled to fold-change. Range defaults to
+ # the symmetric p95 (override via --clip, a fold-change).
+ logr = np.log2(ratio.where(ratio > 0))
+ abs_df.index.name = ratio.index.name = logr.index.name = "test"
+
+ bound = _symmetric_clip(logr.values, float(np.log2(clip)) if clip else None)
fig = px.imshow(
- df,
- **_diverging_kwargs(1.0),
+ logr,
+ color_continuous_scale=["green", "white", "red"],
+ color_continuous_midpoint=0.0,
+ zmin=-bound,
+ zmax=bound,
aspect="auto",
- title=f"{metric_label} ratio relative to baseline ({versions[0]})",
- labels={"x": "version", "y": "test", "color": "ratio"},
- text_auto=".2f",
+ title=f"{metric_label} fold-change vs baseline ({versions[0]})",
+ labels={"x": "version", "y": "test", "color": "fold"},
+ )
+ # Fold-change ticks at integer log2 steps spanning the actual colour range.
+ hi = max(1, int(bound))
+ ticks = list(range(-hi, hi + 1))
+ fig.update_coloraxes(
+ colorbar=dict(
+ tickvals=ticks,
+ ticktext=[f"{2**t}×" if t >= 0 else f"1/{2**-t}×" for t in ticks],
+ title="fold",
+ )
)
- # Absolute values as customdata so hover shows both ratio and value.
fig.update_traces(
+ text=ratio.round(2).values,
+ texttemplate="%{text}×",
customdata=abs_df.values,
hovertemplate=(
- "test: %{y}<br>"
- "version: %{x}<br>"
- "ratio: %{z:.3f}<br>"
+ "test: %{y}<br>version: %{x}<br>"
+ "fold: %{text}×<br>"
f"{metric_label}: %{{customdata:.4g}}{unit}"
"<extra></extra>"
),
)
- fig.update_layout(height=max(500, len(df) * 22))
- return fig, len(df)
+ fig.update_layout(height=max(500, len(logr) * 22))
+ return fig, len(logr)
# Per sweep axis: (x label, log-scaled?). Size is multiplicative → log; severity
@@ -400,6 +438,7 @@ def plot_scaling(
metric: Metric = "min",
sort: SortMode = "absolute", # noqa: ARG001 (uniform signature, unused here)
facets: FacetBy | None = None, # noqa: ARG001 (uniform signature, unused here)
+ clip: float | None = None, # noqa: ARG001 (uniform signature, unused here)
) -> tuple[Figure, int]:
"""
Cost vs the sweep dial for parametrized tests, faceted by phase.
@@ -448,7 +487,10 @@ def plot_scaling(
RENDERERS: dict[
PlotView,
- Callable[[list[Path], Metric, SortMode, FacetBy | None], tuple[Figure, int]],
+ Callable[
+ [list[Path], Metric, SortMode, FacetBy | None, float | None],
+ tuple[Figure, int],
+ ],
] = {
"compare": plot_compare,
"scatter": plot_scatter,