From 3401571a8ac1b70cca60cfc5b4c4066953b66d8e Mon Sep 17 00:00:00 2001 From: Hong Ge Date: Mon, 18 May 2026 15:34:44 +0100 Subject: [PATCH 1/2] Improve benchmark report and harness - Move per-row interpretation notes out of the GitHub workflow comment body into `benchmarks/README.md`, leaving the comment focused on numbers; add the closing `` sentinel. - Mark rows whose `t(logdensity)` is below ~100 ns with `*` so noisy ratios are flagged in place, and add a short footnote explaining what `*` means. - Parenthesize default `Type` parameter syntax in benchmark models (`(::Type{T})=Vector{Float64}`) for parser compatibility. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/Benchmarking.yml | 15 +++++++++------ benchmarks/README.md | 25 +++++++++++++++++++++++-- benchmarks/benchmarks.jl | 15 ++++++++++----- 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/.github/workflows/Benchmarking.yml b/.github/workflows/Benchmarking.yml index a2830262a..5088d5b98 100644 --- a/.github/workflows/Benchmarking.yml +++ b/.github/workflows/Benchmarking.yml @@ -3,8 +3,6 @@ name: Benchmarking on: pull_request: -# Needed so `peter-evans/create-or-update-comment` can post on PRs from forks -# (default GITHUB_TOKEN is read-only for fork pull_requests). permissions: pull-requests: write @@ -103,11 +101,17 @@ jobs: echo "" echo "## Benchmarks @ ${head_sha}" echo "" + echo "### Performance" + echo "" + echo "Performance Ratio:" + echo "Ratio of time to compute gradient and time to compute log-density." + echo "Warning: results are very approximate! See [benchmark notes](https://github.com/TuringLang/DynamicPPL.jl/tree/main/benchmarks#interpreting-results) for more context." + echo "" cat head/results.md echo "" + echo "Rows marked \`*\` have \`t(logdensity)\` below about 100 ns; their ratios can be dominated by timer floor, fixed overhead, and run-to-run variation. For those rows, raw \`t(grad)\` is more meaningful than \`t(grad)/t(logdensity)\`." 
+ echo "" if [[ "$main_status" == "success" ]]; then - echo "Each row times one of DynamicPPL's reference models on this PR's head. \`Dim\` is the parameter count; \`Linked\` is \`true\` when parameters have been mapped to unconstrained space. \`t(logdensity)\` is the wall-clock time for one log-density evaluation. The AD (automatic differentiation) backend columns express gradient time as a multiple of \`t(logdensity)\` — a value of \`10\` means computing the gradient takes 10× as long as the log-density. Lower is better throughout; \`err\` means the backend errored on that model. Compare against \`main\` below to spot regressions." - echo "" echo "<details><summary>Main @ ${main_sha}</summary>" echo "" cat main/results.md @@ -115,8 +119,6 @@ jobs: echo "</details>" echo "" else - echo "Each row times one of DynamicPPL's reference models on this PR's head. \`Dim\` is the parameter count; \`Linked\` is \`true\` when parameters have been mapped to unconstrained space. \`t(logdensity)\` is the wall-clock time for one log-density evaluation. The AD (automatic differentiation) backend columns express gradient time as a multiple of \`t(logdensity)\` — a value of \`10\` means computing the gradient takes 10× as long as the log-density. Lower is better throughout; \`err\` means the backend errored on that model." - echo "" echo "Main benchmark job did not succeed (\`${main_status}\`) — see workflow logs." echo "" fi @@ -125,6 +127,7 @@ jobs: cat head/version_info.txt echo "" echo "" + echo "" } > body.md - name: Find existing benchmark comment diff --git a/benchmarks/README.md b/benchmarks/README.md index 67a2cca43..5c266a2d4 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,3 +1,5 @@ +# Benchmarks + Run from the repository root: ```sh @@ -6,5 +8,24 @@ julia --project=benchmarks benchmarks/benchmarks.jl ``` The `Benchmarking` CI workflow runs this on each PR and posts the table as a -comment. There is no base-vs-head comparison: judge regressions by comparing -against the most recent main-branch run in the comment history. +comment. + +## Interpreting results + +Each row times one of DynamicPPL's reference models. `Dim` is the parameter +count. `Linked` is `true` when parameters have been mapped to unconstrained +space. `t(logdensity)` is the wall-clock time for one log-density evaluation. + +The AD backend columns are performance ratios: each value is the gradient time +divided by `t(logdensity)`. For example, a value of `10` means computing the +gradient takes 10 times as long as evaluating the log-density. Lower is better. +`err` means the backend errored on that model. + +If `t(logdensity)` is below about 100 ns, ratios are often dominated by timer +floor and fixed overhead.
For those rows, raw `t(grad)` is more meaningful than +`t(grad)/t(logdensity)`. These microbenchmarks can also vary noticeably across +runs. + +The CI comment shows the PR head table first and, when available, includes a +collapsed `main` table for comparison. Treat the numbers as approximate and use +the `main` table to spot likely regressions. diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl index cab363811..4c260cec1 100644 --- a/benchmarks/benchmarks.jl +++ b/benchmarks/benchmarks.jl @@ -35,7 +35,7 @@ end Covers many DynamicPPL features: scalar/vector/multivariate variables, `~`, `.~`, loops, allocated vectors, and observations as both arguments and literals. """ -@model function smorgasbord(x, y, ::Type{TV}=Vector{Float64}) where {TV} +@model function smorgasbord(x, y, (::Type{TV})=Vector{Float64}) where {TV} @assert length(x) == length(y) m ~ truncated(Normal(); lower=0) means ~ product_distribution(fill(Exponential(m), length(x))) @@ -50,7 +50,7 @@ Covers many DynamicPPL features: scalar/vector/multivariate variables, `~`, end "`num_dims` univariate normals via a loop. Condition on `o` after instantiation." -@model function loop_univariate(num_dims, ::Type{TV}=Vector{Float64}) where {TV} +@model function loop_univariate(num_dims, (::Type{TV})=Vector{Float64}) where {TV} a = TV(undef, num_dims) o = TV(undef, num_dims) for i in 1:num_dims @@ -64,7 +64,7 @@ end end "As `loop_univariate`, but using `product_distribution` instead of loops." -@model function multivariate(num_dims, ::Type{TV}=Vector{Float64}) where {TV} +@model function multivariate(num_dims, (::Type{TV})=Vector{Float64}) where {TV} a = TV(undef, num_dims) o = TV(undef, num_dims) a ~ product_distribution(fill(Normal(0, 1), num_dims)) @@ -86,7 +86,7 @@ end end "Variables whose support varies under linking, or otherwise nontrivial bijectors." 
-@model function dynamic(::Type{T}=Vector{Float64}) where {T} +@model function dynamic((::Type{T})=Vector{Float64}) where {T} eta ~ truncated(Normal(); lower=0.0, upper=0.1) mat1 ~ LKJCholesky(4, eta) mat2 ~ InverseWishart(3.2, cholesky([1.0 0.5; 0.5 1.0])) @@ -193,6 +193,11 @@ format_ratio(::Missing) = "err" format_dim(d::Integer) = string(d) format_dim(::Missing) = "err" +const TINY_PRIMAL_THRESHOLD_SECONDS = 100e-9 + +is_tiny_primal(t::Float64) = t < TINY_PRIMAL_THRESHOLD_SECONDS +is_tiny_primal(::Missing) = false + # Pivot so each (Model, Dim, Linked) row spans all backends. A long-form table # (one row per (model, linked, backend)) reads as four near-duplicate rows # differing only in the backend column; pivoting puts the backends side-by-side @@ -235,7 +240,7 @@ function print_results(results) rows = map(pivoted) do g ratios = [format_ratio(g.ratios[b.key]) for b in backend_info] ( - name=g.name, + name=is_tiny_primal(g.primal) ? "$(g.name)*" : g.name, dim=format_dim(g.dim), linked=string(g.islinked), primal=format_time(g.primal), From 51a8357b5d68bd73b6ec07a508a68a48cfa4f4ac Mon Sep 17 00:00:00 2001 From: Hong Ge Date: Mon, 18 May 2026 15:40:34 +0100 Subject: [PATCH 2/2] Fresh-buffer setup for run_ad benchmarks Use `setup = deepcopy($params)` so each Chairmarks sample starts from a fresh input buffer instead of reusing the same vector across calls. Matches Mooncake's bench harness. Setup runs before the timed window, so the copy itself is excluded from measurements. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/test_utils/ad.jl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/test_utils/ad.jl b/src/test_utils/ad.jl index 349c61e4b..606b7ecee 100644 --- a/src/test_utils/ad.jl +++ b/src/test_utils/ad.jl @@ -381,11 +381,14 @@ function run_ad( # (tens of ns on Linux/macOS) instead of reading as zero. 
Pattern # borrowed from Mooncake's bench harness: # https://github.com/chalk-lab/Mooncake.jl/blob/main/bench/run_benchmarks.jl + # Per-sample `setup` deep-copies `params` so each sample starts from a + # fresh input buffer, matching Mooncake's bench harness. (Setup runs + # before the timed window, so the copy is excluded from measurements.) logdensity(ldf, params) # Warm-up GC.gc(true) primal_benchmark = @be( - _, - logdensity($ldf, $params), + deepcopy($params), + logdensity($ldf, _), _ -> GC.gc(false), seconds = benchmark_seconds, ) @@ -397,8 +400,8 @@ function run_ad( logdensity_and_gradient(ldf, params) # Warm-up GC.gc(true) grad_benchmark = @be( - _, - logdensity_and_gradient($ldf, $params), + deepcopy($params), + logdensity_and_gradient($ldf, _), _ -> GC.gc(false), seconds = benchmark_seconds, )