diff --git a/benchmarks/cli/plot.py b/benchmarks/cli/plot.py index c0cad047..d51a9957 100644 --- a/benchmarks/cli/plot.py +++ b/benchmarks/cli/plot.py @@ -42,10 +42,10 @@ def plot( SortMode, typer.Option( help=( - "Compare-view sort and bar dimension. ``absolute`` (default) " - "uses ``b - a`` in seconds so the biggest actual-time impacts " - "float to the bottom — avoids over-weighting cheap " - "microsecond tests. ``relative`` uses percent change." + "Compare-view bar metric *and* sort. ``absolute`` (default) " + "ranks by ``b - a`` (actual-time/MiB impact, not over-weighting " + "tiny tests); ``relative`` by percent change. Bars are ordered by " + "it — biggest regressions on top, improvements at the bottom." ) ), ] = "absolute", @@ -61,6 +61,19 @@ def plot( ), ), ] = None, + clip: Annotated[ + float | None, + typer.Option( + "--clip", + help=( + "Clamp the *colour* scale (the one thing you can't zoom after " + "the plot is drawn); default is the symmetric p95. Unit follows " + "the plot's colour: a fold-change (>1) for fold-coloured sweep " + "(``--clip 8`` = ⅛×–8×), an absolute Δ for Δ-coloured " + "scatter/compare. scaling has no diverging colour and ignores it." + ), + ), + ] = None, output: Annotated[ Path | None, typer.Option( @@ -129,6 +142,17 @@ def plot( "scaling view takes exactly 1 snapshot", fg=typer.colors.RED, err=True ) raise typer.Exit(code=2) + if clip is not None: + if clip <= 0: + typer.secho("--clip must be positive", fg=typer.colors.RED, err=True) + raise typer.Exit(code=2) + if chosen == "sweep" and clip <= 1: + typer.secho( + "sweep --clip is a fold-change > 1 (colour is log₂; e.g. 8 = ⅛×–8×)", + fg=typer.colors.RED, + err=True, + ) + raise typer.Exit(code=2) # RENDERERS imports fine without plotly (lazy inside each), so check the dep. if importlib.util.find_spec("plotly") is None: @@ -147,7 +171,7 @@ def plot( output = Path(".benchmarks") / "plots" / f"{chosen}.html" try: - fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets) + fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets, clip) except ValueError as exc: typer.secho(str(exc), fg=typer.colors.RED, err=True) raise typer.Exit(code=1) from exc diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py index c31054b8..e7af212f 100644 --- a/benchmarks/plotting.py +++ b/benchmarks/plotting.py @@ -20,6 +20,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Literal +import numpy as np + from benchmarks.snapshot import Metric, load_long_df if TYPE_CHECKING: @@ -43,6 +45,22 @@ def _diverging_kwargs(midpoint: float = 0.0) -> dict: } +def _symmetric_clip(magnitudes: np.ndarray, override: float | None, pct: float = 95.0) -> float: + """ + Symmetric colour bound for a diverging scale: ``override`` if given, else the + ``pct`` percentile of ``|magnitudes|`` — so a few outliers don't wash the rest + to the midpoint. Positive; callers use ``[-b, +b]``. + """ + if override is not None: + return float(override) + mags = np.abs(np.asarray(magnitudes, dtype=float)) + mags = mags[np.isfinite(mags)] + if mags.size == 0: + return 1.0 + bound = float(np.percentile(mags, pct)) + return bound if bound > 0 else (float(mags.max()) or 1e-9) + + def _axis_kwargs(unit: str) -> dict: """``update_xaxes`` kwargs for a given unit.""" if unit == "s": @@ -104,9 +122,11 @@ def plot_compare( metric: Metric = "min", sort: SortMode = "absolute", facets: FacetBy | None = None, + clip: float | None = None, ) -> tuple[Figure, int]: """ - Bar chart of per-test delta, in alphabetical test-id order. + Bar chart of per-test delta, sorted by the chosen ``--sort`` Δ + (biggest regressions on top, improvements at the bottom). ``sort`` picks the bar dimension: ``absolute`` (default) plots ``b - a`` in the native unit, ``relative`` plots percent change. Bars stay in id order @@ -147,8 +167,8 @@ def plot_compare( df["delta_abs"] = df[b_label] - df[a_label] df["delta_pct"] = (df["delta_abs"] / df[a_label]) * 100.0 - df = df.sort_values("test_id").reset_index(drop=True) x_col = "delta_abs" if sort == "absolute" else "delta_pct" + df = df.sort_values(x_col).reset_index(drop=True) if sort == "absolute": x_label = f"{metric_label} delta ({unit})" @@ -183,6 +203,7 @@ def plot_compare( facet_kwargs = {"facet_col": facets} facet_kwargs["facet_col_wrap"] = 2 if facets == "phase" else 3 + color_clip = _symmetric_clip(df[x_col].to_numpy(), clip) fig = px.bar( df, x=x_col, @@ -190,6 +211,7 @@ def plot_compare( orientation="h", color=x_col, **_diverging_kwargs(), + range_color=[-color_clip, color_clip], title=title, labels={x_col: x_label, y_col: ""}, text_auto=text_fmt, @@ -228,6 +250,7 @@ def plot_scatter( metric: Metric = "min", sort: SortMode = "absolute", # noqa: ARG001 (uniform signature, unused here) facets: FacetBy | None = None, + clip: float | None = None, ) -> tuple[Figure, int]: """ Baseline cost (log-x) vs candidate/baseline ratio (y) — the exploratory @@ -239,7 +262,6 @@ def plot_scatter( ``animation_frame``. A dashed line at ``ratio = 1`` marks no change; colour encodes absolute Δ. """ - import numpy as np import plotly.express as px if len(snapshots) < 2: @@ -268,22 +290,20 @@ def plot_scatter( df["ratio"] = df["candidate_time"] / df["baseline_time"] df["delta_abs"] = df["candidate_time"] - df["baseline_time"] df["delta_pct"] = df["delta_abs"] / df["baseline_time"] * 100.0 - df = df.rename(columns={"test_id": "test"}) + # log y-axis can't show a zero ratio (candidate value of 0) — drop those. + df = df[df["ratio"] > 0].rename(columns={"test_id": "test"}) # Fixed ranges so the animation doesn't jitter; pad to avoid edge clipping. x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max() - # y-range centred symmetrically on 1.0 so regressions and improvements read - # equally; the larger side sets the window width. + # ratio is multiplicative → log y-axis; show the *full* fold range (symmetric + # about 1.0) so every point is visible — zoom interactively to focus. y_lo, y_hi = df["ratio"].min(), df["ratio"].max() - max_dist = max(abs(1.0 - y_lo), abs(y_hi - 1.0), 0.05) - pad_y = max(0.05, max_dist * 0.05) - y_range = [1.0 - max_dist - pad_y, 1.0 + max_dist + pad_y] + fold = max(y_hi, 1.0 / y_lo, 1.1) + bound = fold**1.05 + y_range = [1.0 / bound, bound] - # Clip the colour scale to the p95 absolute Δ so one huge regression doesn't - # wash the rest to white; outliers saturate at the bound. - clip = float(np.percentile(df["delta_abs"].abs(), 95)) if len(df) > 0 else 0.0 - if clip == 0.0: - max_abs = float(df["delta_abs"].abs().max()) - clip = max_abs if max_abs > 0 else 1e-9 + # --clip clamps the *colour* — the one thing you can't zoom after the plot is + # made. Here colour is the absolute Δ, so it's a linear bound. Default: p95. + color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), clip) animate = len(snapshots) >= 3 extra: dict = {} @@ -300,8 +320,9 @@ def plot_scatter( y="ratio", color="delta_abs", **_diverging_kwargs(), - range_color=[-clip, clip], + range_color=[-color_clip, color_clip], log_x=True, + log_y=True, range_x=[x_lo * 0.5, x_hi * 2], range_y=y_range, hover_name="test", @@ -319,7 +340,7 @@ def plot_scatter( ), labels={ "baseline_time": f"baseline {metric_label} ({unit}, log scale)", - "ratio": f"{metric_label} ratio (candidate / baseline)", + "ratio": f"{metric_label} ratio (candidate / baseline, log scale)", "candidate_time": "candidate", "delta_abs": f"Δ ({unit}, p95-clipped)", }, @@ -344,8 +365,9 @@ def plot_sweep( metric: Metric = "min", sort: SortMode = "absolute", # noqa: ARG001 (uniform signature, unused here) facets: FacetBy | None = None, # noqa: ARG001 (uniform signature, unused here) + clip: float | None = None, ) -> tuple[Figure, int]: - """Heatmap of per-test ratio relative to the first snapshot.""" + """Heatmap of per-test fold-change (log2 ratio) vs the first snapshot.""" import plotly.express as px df_long, unit = load_long_df(snapshots, metric) @@ -353,38 +375,54 @@ def plot_sweep( versions = df_long["snapshot"].drop_duplicates().tolist() baseline_label = versions[0] - # Pivot to absolutes, drop tests missing the baseline, divide by it for ratios. abs_df = df_long.pivot(index="test_id", columns="snapshot", values="value").reindex( columns=versions ) abs_df = abs_df.dropna(subset=[baseline_label]) if abs_df.empty: raise ValueError(f"no overlap with baseline snapshot {baseline_label}") - df = abs_df.div(abs_df[baseline_label], axis=0) - abs_df.index.name = "test" - df.index.name = "test" - + ratio = abs_df.div(abs_df[baseline_label], axis=0) + # Colour by log2(ratio): plotly's colour scale is linear (no log mode), so raw + # ratio makes a 2x look twice as intense as its mirror 1/2x. log2 makes folds + # symmetric around 0; the bar is relabelled to fold-change. Range defaults to + # the symmetric p95 (override via --clip, a fold-change). + logr = np.log2(ratio.where(ratio > 0)) + abs_df.index.name = ratio.index.name = logr.index.name = "test" + + bound = _symmetric_clip(logr.values, float(np.log2(clip)) if clip else None) fig = px.imshow( - df, - **_diverging_kwargs(1.0), + logr, + color_continuous_scale=["green", "white", "red"], + color_continuous_midpoint=0.0, + zmin=-bound, + zmax=bound, aspect="auto", - title=f"{metric_label} ratio relative to baseline ({versions[0]})", - labels={"x": "version", "y": "test", "color": "ratio"}, - text_auto=".2f", + title=f"{metric_label} fold-change vs baseline ({versions[0]})", + labels={"x": "version", "y": "test", "color": "fold"}, + ) + # Fold-change ticks at integer log2 steps spanning the actual colour range. + hi = max(1, int(bound)) + ticks = list(range(-hi, hi + 1)) + fig.update_coloraxes( + colorbar=dict( + tickvals=ticks, + ticktext=[f"{2**t}×" if t >= 0 else f"1/{2**-t}×" for t in ticks], + title="fold", + ) ) - # Absolute values as customdata so hover shows both ratio and value. fig.update_traces( + text=ratio.round(2).values, + texttemplate="%{text}×", customdata=abs_df.values, hovertemplate=( - "test: %{y}
" - "version: %{x}
" - "ratio: %{z:.3f}
" + "test: %{y}
version: %{x}
" + "fold: %{text}×
" f"{metric_label}: %{{customdata:.4g}}{unit}" "" ), ) - fig.update_layout(height=max(500, len(df) * 22)) - return fig, len(df) + fig.update_layout(height=max(500, len(logr) * 22)) + return fig, len(logr) # Per sweep axis: (x label, log-scaled?). Size is multiplicative → log; severity @@ -400,6 +438,7 @@ def plot_scaling( metric: Metric = "min", sort: SortMode = "absolute", # noqa: ARG001 (uniform signature, unused here) facets: FacetBy | None = None, # noqa: ARG001 (uniform signature, unused here) + clip: float | None = None, # noqa: ARG001 (uniform signature, unused here) ) -> tuple[Figure, int]: """ Cost vs the sweep dial for parametrized tests, faceted by phase. @@ -448,7 +487,10 @@ def plot_scaling( RENDERERS: dict[ PlotView, - Callable[[list[Path], Metric, SortMode, FacetBy | None], tuple[Figure, int]], + Callable[ + [list[Path], Metric, SortMode, FacetBy | None, float | None], + tuple[Figure, int], + ], ] = { "compare": plot_compare, "scatter": plot_scatter,