Skip to content
34 changes: 29 additions & 5 deletions benchmarks/cli/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ def plot(
SortMode,
typer.Option(
help=(
"Compare-view sort and bar dimension. ``absolute`` (default) "
"uses ``b - a`` in seconds so the biggest actual-time impacts "
"float to the bottom — avoids over-weighting cheap "
"microsecond tests. ``relative`` uses percent change."
"Compare-view bar metric *and* sort. ``absolute`` (default) "
"ranks by ``b - a`` (actual-time/MiB impact, not over-weighting "
"tiny tests); ``relative`` by percent change. Bars are ordered by "
"it — biggest regressions on top, improvements at the bottom."
)
),
] = "absolute",
Expand All @@ -61,6 +61,19 @@ def plot(
),
),
] = None,
clip: Annotated[
float | None,
typer.Option(
"--clip",
help=(
"Clamp the *colour* scale (the one thing you can't zoom after "
"the plot is drawn); default is the symmetric p95. Unit follows "
"the plot's colour: a fold-change (>1) for fold-coloured sweep "
"(``--clip 8`` = ⅛×–8×), an absolute Δ for Δ-coloured "
"scatter/compare. scaling has no diverging colour and ignores it."
),
),
] = None,
output: Annotated[
Path | None,
typer.Option(
Expand Down Expand Up @@ -129,6 +142,17 @@ def plot(
"scaling view takes exactly 1 snapshot", fg=typer.colors.RED, err=True
)
raise typer.Exit(code=2)
if clip is not None:
if clip <= 0:
typer.secho("--clip must be positive", fg=typer.colors.RED, err=True)
raise typer.Exit(code=2)
if chosen == "sweep" and clip <= 1:
typer.secho(
"sweep --clip is a fold-change > 1 (colour is log₂; e.g. 8 = ⅛×–8×)",
fg=typer.colors.RED,
err=True,
)
raise typer.Exit(code=2)

# RENDERERS imports fine without plotly (lazy inside each), so check the dep.
if importlib.util.find_spec("plotly") is None:
Expand All @@ -147,7 +171,7 @@ def plot(
output = Path(".benchmarks") / "plots" / f"{chosen}.html"

try:
fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets)
fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets, clip)
except ValueError as exc:
typer.secho(str(exc), fg=typer.colors.RED, err=True)
raise typer.Exit(code=1) from exc
Expand Down
112 changes: 77 additions & 35 deletions benchmarks/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from pathlib import Path
from typing import TYPE_CHECKING, Literal

import numpy as np

from benchmarks.snapshot import Metric, load_long_df

if TYPE_CHECKING:
Expand All @@ -43,6 +45,22 @@ def _diverging_kwargs(midpoint: float = 0.0) -> dict:
}


def _symmetric_clip(magnitudes: np.ndarray, override: float | None, pct: float = 95.0) -> float:
"""
Symmetric colour bound for a diverging scale: ``override`` if given, else the
``pct`` percentile of ``|magnitudes|`` — so a few outliers don't wash the rest
to the midpoint. Positive; callers use ``[-b, +b]``.
"""
if override is not None:
return float(override)
mags = np.abs(np.asarray(magnitudes, dtype=float))
mags = mags[np.isfinite(mags)]
if mags.size == 0:
return 1.0
bound = float(np.percentile(mags, pct))
return bound if bound > 0 else (float(mags.max()) or 1e-9)


def _axis_kwargs(unit: str) -> dict:
"""``update_xaxes`` kwargs for a given unit."""
if unit == "s":
Expand Down Expand Up @@ -104,9 +122,11 @@ def plot_compare(
metric: Metric = "min",
sort: SortMode = "absolute",
facets: FacetBy | None = None,
clip: float | None = None,
) -> tuple[Figure, int]:
"""
Bar chart of per-test delta, in alphabetical test-id order.
Bar chart of per-test delta, sorted by the chosen ``--sort`` Δ
(biggest regressions on top, improvements at the bottom).

``sort`` picks the bar dimension: ``absolute`` (default) plots ``b - a`` in
the native unit, ``relative`` plots percent change. Bars stay in id order
Expand Down Expand Up @@ -147,8 +167,8 @@ def plot_compare(

df["delta_abs"] = df[b_label] - df[a_label]
df["delta_pct"] = (df["delta_abs"] / df[a_label]) * 100.0
df = df.sort_values("test_id").reset_index(drop=True)
x_col = "delta_abs" if sort == "absolute" else "delta_pct"
df = df.sort_values(x_col).reset_index(drop=True)

if sort == "absolute":
x_label = f"{metric_label} delta ({unit})"
Expand Down Expand Up @@ -183,13 +203,15 @@ def plot_compare(
facet_kwargs = {"facet_col": facets}
facet_kwargs["facet_col_wrap"] = 2 if facets == "phase" else 3

color_clip = _symmetric_clip(df[x_col].to_numpy(), clip)
fig = px.bar(
df,
x=x_col,
y=y_col,
orientation="h",
color=x_col,
**_diverging_kwargs(),
range_color=[-color_clip, color_clip],
title=title,
labels={x_col: x_label, y_col: ""},
text_auto=text_fmt,
Expand Down Expand Up @@ -228,6 +250,7 @@ def plot_scatter(
metric: Metric = "min",
sort: SortMode = "absolute", # noqa: ARG001 (uniform signature, unused here)
facets: FacetBy | None = None,
clip: float | None = None,
) -> tuple[Figure, int]:
"""
Baseline cost (log-x) vs candidate/baseline ratio (y) — the exploratory
Expand All @@ -239,7 +262,6 @@ def plot_scatter(
``animation_frame``. A dashed line at ``ratio = 1`` marks no change; colour
encodes absolute Δ.
"""
import numpy as np
import plotly.express as px

if len(snapshots) < 2:
Expand Down Expand Up @@ -268,22 +290,20 @@ def plot_scatter(
df["ratio"] = df["candidate_time"] / df["baseline_time"]
df["delta_abs"] = df["candidate_time"] - df["baseline_time"]
df["delta_pct"] = df["delta_abs"] / df["baseline_time"] * 100.0
df = df.rename(columns={"test_id": "test"})
# log y-axis can't show a zero ratio (candidate value of 0) — drop those.
df = df[df["ratio"] > 0].rename(columns={"test_id": "test"})
# Fixed ranges so the animation doesn't jitter; pad to avoid edge clipping.
x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max()
# y-range centred symmetrically on 1.0 so regressions and improvements read
# equally; the larger side sets the window width.
# ratio is multiplicative → log y-axis; show the *full* fold range (symmetric
# about 1.0) so every point is visible — zoom interactively to focus.
y_lo, y_hi = df["ratio"].min(), df["ratio"].max()
max_dist = max(abs(1.0 - y_lo), abs(y_hi - 1.0), 0.05)
pad_y = max(0.05, max_dist * 0.05)
y_range = [1.0 - max_dist - pad_y, 1.0 + max_dist + pad_y]
fold = max(y_hi, 1.0 / y_lo, 1.1)
bound = fold**1.05
y_range = [1.0 / bound, bound]

# Clip the colour scale to the p95 absolute Δ so one huge regression doesn't
# wash the rest to white; outliers saturate at the bound.
clip = float(np.percentile(df["delta_abs"].abs(), 95)) if len(df) > 0 else 0.0
if clip == 0.0:
max_abs = float(df["delta_abs"].abs().max())
clip = max_abs if max_abs > 0 else 1e-9
# --clip clamps the *colour* — the one thing you can't zoom after the plot is
# made. Here colour is the absolute Δ, so it's a linear bound. Default: p95.
color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), clip)

animate = len(snapshots) >= 3
extra: dict = {}
Expand All @@ -300,8 +320,9 @@ def plot_scatter(
y="ratio",
color="delta_abs",
**_diverging_kwargs(),
range_color=[-clip, clip],
range_color=[-color_clip, color_clip],
log_x=True,
log_y=True,
range_x=[x_lo * 0.5, x_hi * 2],
range_y=y_range,
hover_name="test",
Expand All @@ -319,7 +340,7 @@ def plot_scatter(
),
labels={
"baseline_time": f"baseline {metric_label} ({unit}, log scale)",
"ratio": f"{metric_label} ratio (candidate / baseline)",
"ratio": f"{metric_label} ratio (candidate / baseline, log scale)",
"candidate_time": "candidate",
"delta_abs": f"Δ ({unit}, p95-clipped)",
},
Expand All @@ -344,47 +365,64 @@ def plot_sweep(
metric: Metric = "min",
sort: SortMode = "absolute", # noqa: ARG001 (uniform signature, unused here)
facets: FacetBy | None = None, # noqa: ARG001 (uniform signature, unused here)
clip: float | None = None,
) -> tuple[Figure, int]:
"""Heatmap of per-test ratio relative to the first snapshot."""
"""Heatmap of per-test fold-change (log2 ratio) vs the first snapshot."""
import plotly.express as px

df_long, unit = load_long_df(snapshots, metric)
metric_label = _metric_label(metric, unit)
versions = df_long["snapshot"].drop_duplicates().tolist()
baseline_label = versions[0]

# Pivot to absolutes, drop tests missing the baseline, divide by it for ratios.
abs_df = df_long.pivot(index="test_id", columns="snapshot", values="value").reindex(
columns=versions
)
abs_df = abs_df.dropna(subset=[baseline_label])
if abs_df.empty:
raise ValueError(f"no overlap with baseline snapshot {baseline_label}")
df = abs_df.div(abs_df[baseline_label], axis=0)
abs_df.index.name = "test"
df.index.name = "test"

ratio = abs_df.div(abs_df[baseline_label], axis=0)
# Colour by log2(ratio): plotly's colour scale is linear (no log mode), so raw
# ratio makes a 2x look twice as intense as its mirror 1/2x. log2 makes folds
# symmetric around 0; the bar is relabelled to fold-change. Range defaults to
# the symmetric p95 (override via --clip, a fold-change).
logr = np.log2(ratio.where(ratio > 0))
abs_df.index.name = ratio.index.name = logr.index.name = "test"

bound = _symmetric_clip(logr.values, float(np.log2(clip)) if clip else None)
fig = px.imshow(
df,
**_diverging_kwargs(1.0),
logr,
color_continuous_scale=["green", "white", "red"],
color_continuous_midpoint=0.0,
zmin=-bound,
zmax=bound,
aspect="auto",
title=f"{metric_label} ratio relative to baseline ({versions[0]})",
labels={"x": "version", "y": "test", "color": "ratio"},
text_auto=".2f",
title=f"{metric_label} fold-change vs baseline ({versions[0]})",
labels={"x": "version", "y": "test", "color": "fold"},
)
# Fold-change ticks at integer log2 steps spanning the actual colour range.
hi = max(1, int(bound))
ticks = list(range(-hi, hi + 1))
fig.update_coloraxes(
colorbar=dict(
tickvals=ticks,
ticktext=[f"{2**t}×" if t >= 0 else f"1/{2**-t}×" for t in ticks],
title="fold",
)
)
# Absolute values as customdata so hover shows both ratio and value.
fig.update_traces(
text=ratio.round(2).values,
texttemplate="%{text}×",
customdata=abs_df.values,
hovertemplate=(
"test: %{y}<br>"
"version: %{x}<br>"
"ratio: %{z:.3f}<br>"
"test: %{y}<br>version: %{x}<br>"
"fold: %{text}×<br>"
f"{metric_label}: %{{customdata:.4g}}{unit}"
"<extra></extra>"
),
)
fig.update_layout(height=max(500, len(df) * 22))
return fig, len(df)
fig.update_layout(height=max(500, len(logr) * 22))
return fig, len(logr)


# Per sweep axis: (x label, log-scaled?). Size is multiplicative → log; severity
Expand All @@ -400,6 +438,7 @@ def plot_scaling(
metric: Metric = "min",
sort: SortMode = "absolute", # noqa: ARG001 (uniform signature, unused here)
facets: FacetBy | None = None, # noqa: ARG001 (uniform signature, unused here)
clip: float | None = None, # noqa: ARG001 (uniform signature, unused here)
) -> tuple[Figure, int]:
"""
Cost vs the sweep dial for parametrized tests, faceted by phase.
Expand Down Expand Up @@ -448,7 +487,10 @@ def plot_scaling(

RENDERERS: dict[
PlotView,
Callable[[list[Path], Metric, SortMode, FacetBy | None], tuple[Figure, int]],
Callable[
[list[Path], Metric, SortMode, FacetBy | None, float | None],
tuple[Figure, int],
],
] = {
"compare": plot_compare,
"scatter": plot_scatter,
Expand Down
Loading