From a946b33009ef6d413e42941ce101a431b41bda6c Mon Sep 17 00:00:00 2001 From: Hong Ge Date: Tue, 5 May 2026 11:32:44 +0100 Subject: [PATCH 1/7] Cherry-pick benchmarks updates from adproblems-interface Bring DifferentiationInterface into the benchmarks env and adopt the flatter markdown layout (no
wrapper, no "Gist:" prefix). Released AbstractPPL/Bijectors are used instead of the fork-branch sources from the source branch. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/Project.toml | 6 ++++++ benchmarks/benchmarks.jl | 8 +++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml index 1440ba40e..fa5052254 100644 --- a/benchmarks/Project.toml +++ b/benchmarks/Project.toml @@ -1,6 +1,9 @@ [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" +AbstractPPL = "7a57a42e-76ec-4ea3-a279-07e840d6d9cf" +Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" Chairmarks = "0ca39b1e-fe0b-4e98-acfc-b1656634c4de" +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" @@ -18,7 +21,10 @@ DynamicPPL = {path = ".."} [compat] ADTypes = "1.14.0" +AbstractPPL = "0.14" +Bijectors = "0.15.17" Chairmarks = "1.3.1" +DifferentiationInterface = "0.7" Distributions = "0.25.117" DynamicPPL = "0.41" Enzyme = "0.13" diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl index c3b47f201..935941a0f 100644 --- a/benchmarks/benchmarks.jl +++ b/benchmarks/benchmarks.jl @@ -9,6 +9,7 @@ using Distributions: Normal, product_distribution, truncated +using DifferentiationInterface: DifferentiationInterface using DynamicPPL: DynamicPPL, @model, to_submodel, VarInfo, LinkAll, UnlinkAll using DynamicPPL.TestUtils.AD: run_ad, NoTest using Enzyme: Enzyme @@ -283,21 +284,18 @@ function run(; markdown::Bool=false) if markdown gist = filter(r -> r.name == GIST_MODEL, results) if !isempty(gist) - println("### Gist: ", GIST_MODEL) + println("### ", GIST_MODEL) println() println("```") print_results(gist) println("```") println() end - println("
") - println("Full table (", length(results), " rows)") + println("### Full table (", length(results), " rows)") println() println("```") print_results(results) println("```") - println() - println("
") else print_results(results) end From df6de192dd0761d80946591bbacefcd6a7c710c3 Mon Sep 17 00:00:00 2001 From: Hong Ge Date: Tue, 5 May 2026 11:35:09 +0100 Subject: [PATCH 2/7] Cherry-pick Benchmarking.yml from adproblems-interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pairs with the prior commit's benchmarks.jl markdown changes — the new workflow benches PR head and main side-by-side and wraps main's table in
on the CI side. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/Benchmarking.yml | 143 ++++++++++++++++++++++------- 1 file changed, 109 insertions(+), 34 deletions(-) diff --git a/.github/workflows/Benchmarking.yml b/.github/workflows/Benchmarking.yml index 4e5433c96..ae515de5f 100644 --- a/.github/workflows/Benchmarking.yml +++ b/.github/workflows/Benchmarking.yml @@ -3,12 +3,26 @@ name: Benchmarking on: pull_request: +# Needed so `peter-evans/create-or-update-comment` can post on PRs from forks +# (default GITHUB_TOKEN is read-only for fork pull_requests). +permissions: + pull-requests: write + +# Cancel in-flight runs on the same PR when a new commit arrives. Benchmark +# jobs are slow (~10min each), so back-to-back force-pushes would otherwise +# spawn parallel runs that race to post the comment. +concurrency: + group: ${{ github.workflow }}-${{ github.ref || github.run_id }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} + jobs: - benchmark: - # Pinned (rather than `ubuntu-latest`) so that successive runs land on the - # same VM family. GitHub silently rotates `latest`, which changes the noise - # floor between runs and makes timings hard to compare across PRs. + benchmark-pr: + # OS pinned (rather than `ubuntu-latest`) so that successive runs land on + # the same VM family — GitHub silently rotates `latest` and the noise + # floor changes between runs. Julia version pinned for the same reason: + # comparing timings under different compiler versions is meaningless. runs-on: ubuntu-22.04 + timeout-minutes: 60 steps: - uses: actions/checkout@v6 with: @@ -22,23 +36,100 @@ jobs: working-directory: ./benchmarks run: | julia --project=. -e 'using Pkg; Pkg.instantiate()' + julia -e 'using InteractiveUtils; versioninfo()' > version_info.txt + # `tee` so the table also appears in the workflow log at-a-glance. + julia --project=. 
benchmarks.jl markdown | tee results.md + + - uses: actions/upload-artifact@v4 + with: + name: benchmark-pr + path: | + benchmarks/results.md + benchmarks/version_info.txt + + benchmark-main: + # Tracks main's moving HEAD — the displayed main SHA may shift between + # successive re-runs of the same PR if main advances in the interim. + runs-on: ubuntu-22.04 + timeout-minutes: 60 + outputs: + sha: ${{ steps.mainsha.outputs.sha }} + steps: + - uses: actions/checkout@v6 + with: + ref: main + - id: mainsha + run: echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT" + - uses: julia-actions/setup-julia@v3 + with: + version: '1.11' + - uses: julia-actions/cache@v3 + + - name: Run benchmarks + working-directory: ./benchmarks + run: | + julia --project=. -e 'using Pkg; Pkg.instantiate()' + julia --project=. benchmarks.jl markdown | tee results.md - version_info=$(julia -e 'using InteractiveUtils; versioninfo()') + - uses: actions/upload-artifact@v4 + with: + name: benchmark-main + path: benchmarks/results.md - # Capture the markdown-mode benchmark output. The `tee` keeps it in - # the workflow log too, so a failure during comment posting does not - # lose the numbers. - results_file=$(mktemp) - julia --project=. benchmarks.jl markdown | tee "$results_file" + post-comment: + needs: [benchmark-pr, benchmark-main] + # Post the comment as long as the PR-head bench succeeded. If the main + # bench failed (e.g. transitionally, before this PR's bench changes are on + # main), the comment still goes up with a note in place of main's numbers. 
+ if: ${{ !cancelled() && needs.benchmark-pr.result == 'success' }} + runs-on: ubuntu-latest + steps: + - uses: actions/download-artifact@v4 + with: + name: benchmark-pr + path: head + - uses: actions/download-artifact@v4 + if: needs.benchmark-main.result == 'success' + with: + name: benchmark-main + path: main + - name: Build comment body + run: | + head_sha='${{ github.event.pull_request.head.sha }}' + main_sha='${{ needs.benchmark-main.outputs.sha }}' + main_status='${{ needs.benchmark-main.result }}' { - echo "VERSION_INFO<> "$GITHUB_ENV" + echo "## Benchmark Report" + echo "" + echo "**PR head:** \`${head_sha}\` " + if [[ "$main_status" == "success" ]]; then + echo "**Main:** \`${main_sha}\` (foldout below)" + else + echo "**Main:** benchmark job did not succeed (\`${main_status}\`) — see workflow logs" + fi + echo "" + echo "\`t(logdensity)\`: wall-clock time per log-density evaluation." + echo "\`t(grad)/t(logdensity)\`: AD overhead ratio, lower is better." + echo "" + cat head/results.md + echo "" + if [[ "$main_status" == "success" ]]; then + echo "
" + echo "Main branch results" + echo "" + cat main/results.md + echo "" + echo "
" + echo "" + fi + echo "
" + echo "Computer Information" + echo "
"
+            cat head/version_info.txt
+            echo "
" + echo "
" + } > body.md - name: Find existing benchmark comment uses: peter-evans/find-comment@v4 @@ -52,22 +143,6 @@ jobs: uses: peter-evans/create-or-update-comment@v5 with: issue-number: ${{ github.event.pull_request.number }} - body: | - ## Benchmark Report - - - this PR's head: `${{ github.event.pull_request.head.sha }}` - - Absolute log-density times and grad/log-density ratios are - reported. To judge whether a PR helps or hurts, compare against - the latest comment on a recent main-branch PR run. - - ### Computer Information - ``` - ${{ env.VERSION_INFO }} - ``` - ### Benchmark Results - - ${{ env.BENCHMARK_OUTPUT }} - + body-path: body.md comment-id: ${{ steps.find_comment.outputs.comment-id }} edit-mode: replace From ab6ac7cb371e22223e550e682410d6e8b9049bf8 Mon Sep 17 00:00:00 2001 From: Hong Ge Date: Tue, 5 May 2026 18:27:49 +0100 Subject: [PATCH 3/7] Restructure benchmark report table formatting Replace the PrettyTables benchmark report with a manual text formatter modeled on posteriordb-bench: top/bottom `=` rules, centered `eval` and `gradient` banners, dashed subgroup underlines, and a stub of Model/dim/linked columns. Keep the current pivoted data shape, with a shared `primal` column and backend ratio columns labelled FwdDiff, RvsDiff, Mooncake, and Enzyme. While there, simplify the renderer by formatting rows once up front and using a single backend key/label table as the source of truth. Update the PR comment caption to explain that `primal` is shared `t(logdensity)` and the backend columns are `t(grad)/t(logdensity)`. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/Benchmarking.yml | 3 +- benchmarks/benchmarks.jl | 115 +++++++++++++++++++++++------ 2 files changed, 95 insertions(+), 23 deletions(-) diff --git a/.github/workflows/Benchmarking.yml b/.github/workflows/Benchmarking.yml index ae515de5f..6866ea098 100644 --- a/.github/workflows/Benchmarking.yml +++ b/.github/workflows/Benchmarking.yml @@ -109,8 +109,7 @@ jobs: echo "**Main:** benchmark job did not succeed (\`${main_status}\`) — see workflow logs" fi echo "" - echo "\`t(logdensity)\`: wall-clock time per log-density evaluation." - echo "\`t(grad)/t(logdensity)\`: AD overhead ratio, lower is better." + echo "_`primal` is shared \`t(logdensity)\`; AD-backend columns are \`t(grad)/t(logdensity)\` (lower is better)._" echo "" cat head/results.md echo "" diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl index 935941a0f..a9561002d 100644 --- a/benchmarks/benchmarks.jl +++ b/benchmarks/benchmarks.jl @@ -16,7 +16,6 @@ using Enzyme: Enzyme using ForwardDiff: ForwardDiff using LinearAlgebra: cholesky using Mooncake: Mooncake -using PrettyTables: pretty_table using Printf: @sprintf using ReverseDiff: ReverseDiff using StableRNGs: StableRNG @@ -179,11 +178,6 @@ end # Reporting # -# https://github.com/chalk-lab/Mooncake.jl/blob/main/bench/run_benchmarks.jl -const COLNAMES = [ - "Model", "Dim", "AD Backend", "Linked", "t(logdensity)", "t(grad)/t(logdensity)" -] - fix_sig_fig(t) = string(round(t; sigdigits=3)) function format_time(t::Float64) t < 1e-6 && return fix_sig_fig(t * 1e9) * " ns" @@ -199,26 +193,105 @@ format_ratio(::Missing) = "err" format_dim(d::Integer) = string(d) format_dim(::Missing) = "err" +# Pivot so each (Model, Dim, Linked) row spans all backends. A long-form table +# (one row per (model, linked, backend)) reads as four near-duplicate rows +# differing only in the backend column; pivoting puts the backends side-by-side +# where the ratios are actually compared. 
`t(logdensity)` does not depend on +# the AD backend (it is the primal model evaluation), so the four primal +# samples per group are noise around a common value — take the minimum, which +# is the most stable estimate (see `run_ad`'s benchmark docstring). +function pivot(results, backends) + keys_in_order = Tuple{String,Bool}[] + seen = Set{Tuple{String,Bool}}() + for r in results + k = (r.name, r.islinked) + if !(k in seen) + push!(seen, k) + push!(keys_in_order, k) + end + end + return map(keys_in_order) do (name, islinked) + rows = filter(r -> r.name == name && r.islinked == islinked, results) + primals = collect(skipmissing(r.t_logd for r in rows)) + primal = isempty(primals) ? missing : minimum(primals) + ratios = Dict{String,Union{Float64,Missing}}(string(b) => missing for b in backends) + for r in rows + ratios[r.adbackend] = r.ratio + end + (; name, dim=first(rows).dim, islinked, primal, ratios) + end +end + function print_results(results) isempty(results) && return println("No benchmark results obtained.") - rows = map(results) do r + pivoted = pivot(results, BACKENDS) + backend_info = [ + (key="forwarddiff", label="FwdDiff"), + (key="reversediff", label="RvsDiff"), + (key="mooncake", label="Mooncake"), + (key="enzyme", label="Enzyme"), + ] + + rows = map(pivoted) do g + ratios = [format_ratio(g.ratios[b.key]) for b in backend_info] ( - r.name, - format_dim(r.dim), - r.adbackend, - r.islinked, - format_time(r.t_logd), - format_ratio(r.ratio), + name = g.name, + dim = format_dim(g.dim), + linked = string(g.islinked), + primal = format_time(g.primal), + ratios, ) end - matrix = hcat(Iterators.map(collect, zip(rows...))...) 
- return pretty_table( - matrix; - column_labels=COLNAMES, - backend=:text, - fit_table_in_display_horizontally=false, - fit_table_in_display_vertically=false, - ) + + name_w = max(length("Model"), maximum(textwidth(r.name) for r in rows)) + 1 + dim_w = max(length("dim"), maximum(textwidth(r.dim) for r in rows)) + 2 + linked_w = max(length("linked"), maximum(textwidth(r.linked) for r in rows)) + 2 + primal_w = max(length("primal"), maximum(textwidth(r.primal) for r in rows)) + 2 + ratio_ws = [ + max(length(b.label), maximum(textwidth(r.ratios[i]) for r in rows)) + 2 for + (i, b) in enumerate(backend_info) + ] + + gap = " " + gap_w = textwidth(gap) + stub_w = name_w + dim_w + linked_w + 2 * gap_w + eval_w = primal_w + grad_w = sum(ratio_ws) + gap_w * (length(ratio_ws) - 1) + total_w = stub_w + gap_w + eval_w + gap_w + grad_w + + center(s, w) = lpad(rpad(s, div(w + textwidth(s), 2)), w) + println(repeat("=", total_w)) + println(rpad("", stub_w) * gap * center("eval", eval_w) * gap * center("gradient", grad_w)) + println(rpad("", stub_w) * gap * repeat("-", eval_w) * gap * repeat("-", grad_w)) + + header = + rpad("Model", name_w) * + gap * + lpad("dim", dim_w) * + gap * + lpad("linked", linked_w) * + gap * + lpad("primal", primal_w) * + gap * + join((lpad(b.label, w) for (b, w) in zip(backend_info, ratio_ws)), gap) + println(header) + println(repeat("-", total_w)) + + for r in rows + row = + rpad(r.name, name_w) * + gap * + lpad(r.dim, dim_w) * + gap * + lpad(r.linked, linked_w) * + gap * + lpad(r.primal, primal_w) * + gap * + join((lpad(x, w) for (x, w) in zip(r.ratios, ratio_ws)), gap) + println(row) + end + println(repeat("=", total_w)) + return nothing end # From 70cf3158f984b4149366492fef5732cb803cee6b Mon Sep 17 00:00:00 2001 From: Hong Ge Date: Tue, 5 May 2026 18:33:05 +0100 Subject: [PATCH 4/7] format --- benchmarks/benchmarks.jl | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl 
index a9561002d..e6f0f99fb 100644 --- a/benchmarks/benchmarks.jl +++ b/benchmarks/benchmarks.jl @@ -235,10 +235,10 @@ function print_results(results) rows = map(pivoted) do g ratios = [format_ratio(g.ratios[b.key]) for b in backend_info] ( - name = g.name, - dim = format_dim(g.dim), - linked = string(g.islinked), - primal = format_time(g.primal), + name=g.name, + dim=format_dim(g.dim), + linked=string(g.islinked), + primal=format_time(g.primal), ratios, ) end @@ -261,7 +261,9 @@ function print_results(results) center(s, w) = lpad(rpad(s, div(w + textwidth(s), 2)), w) println(repeat("=", total_w)) - println(rpad("", stub_w) * gap * center("eval", eval_w) * gap * center("gradient", grad_w)) + println( + rpad("", stub_w) * gap * center("eval", eval_w) * gap * center("gradient", grad_w) + ) println(rpad("", stub_w) * gap * repeat("-", eval_w) * gap * repeat("-", grad_w)) header = From fc7df54d8d9d7e7b699ad1e8f9e5f860e173102a Mon Sep 17 00:00:00 2001 From: Hong Ge Date: Tue, 5 May 2026 21:01:35 +0100 Subject: [PATCH 5/7] Clarify benchmark PR comment for general audience Restructure the comment so the table comes first, followed by a single paragraph explaining what each column means and how to read the AD backend ratios. Update the surrounding workflow text: - "## Benchmark Report" + separate PR head/Main lines collapsed into a single "## Benchmarks @ " heading. - Foldout summaries shortened to "Main @ " and "Environment". - Comparison hint ("compare against `main`") only appears when the baseline foldout is actually available. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/Benchmarking.yml | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/.github/workflows/Benchmarking.yml b/.github/workflows/Benchmarking.yml index 6866ea098..918c225b7 100644 --- a/.github/workflows/Benchmarking.yml +++ b/.github/workflows/Benchmarking.yml @@ -100,30 +100,26 @@ jobs: main_sha='${{ needs.benchmark-main.outputs.sha }}' main_status='${{ needs.benchmark-main.result }}' { - echo "## Benchmark Report" - echo "" - echo "**PR head:** \`${head_sha}\` " - if [[ "$main_status" == "success" ]]; then - echo "**Main:** \`${main_sha}\` (foldout below)" - else - echo "**Main:** benchmark job did not succeed (\`${main_status}\`) — see workflow logs" - fi - echo "" - echo "_`primal` is shared \`t(logdensity)\`; AD-backend columns are \`t(grad)/t(logdensity)\` (lower is better)._" + echo "## Benchmarks @ ${head_sha}" echo "" cat head/results.md echo "" if [[ "$main_status" == "success" ]]; then - echo "
" - echo "Main branch results" + echo "Each row times one of DynamicPPL's reference models on this PR's head. \`Dim\` is the parameter count; \`Linked\` is \`true\` when parameters have been mapped to unconstrained space. \`t(logdensity)\` is the wall-clock time for one log-density evaluation. The AD (automatic differentiation) backend columns express gradient time as a multiple of \`t(logdensity)\` — a value of \`10\` means computing the gradient takes 10× as long as the log-density. Lower is better throughout; \`err\` means the backend errored on that model. Compare against \`main\` below to spot regressions." + echo "" + echo "
<details><summary>Main @ ${main_sha}</summary>" echo "" cat main/results.md echo "" echo "</details>
" echo "" + else + echo "Each row times one of DynamicPPL's reference models on this PR's head. \`Dim\` is the parameter count; \`Linked\` is \`true\` when parameters have been mapped to unconstrained space. \`t(logdensity)\` is the wall-clock time for one log-density evaluation. The AD (automatic differentiation) backend columns express gradient time as a multiple of \`t(logdensity)\` — a value of \`10\` means computing the gradient takes 10× as long as the log-density. Lower is better throughout; \`err\` means the backend errored on that model." + echo "" + echo "_Main benchmark job did not succeed (\`${main_status}\`) — see workflow logs._" + echo "" fi - echo "
" - echo "Computer Information" + echo "
<details><summary>Environment</summary>" echo "<pre>
"
             cat head/version_info.txt
             echo "</pre>
" From 1971c91bc67fd455f5407e3935611ce3ceaac842 Mon Sep 17 00:00:00 2001 From: Hong Ge Date: Tue, 5 May 2026 21:07:24 +0100 Subject: [PATCH 6/7] Use plain text for benchmark main-job failure note Co-Authored-By: Claude Opus 4.7 --- .github/workflows/Benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/Benchmarking.yml b/.github/workflows/Benchmarking.yml index 918c225b7..8de69b6c1 100644 --- a/.github/workflows/Benchmarking.yml +++ b/.github/workflows/Benchmarking.yml @@ -116,7 +116,7 @@ jobs: else echo "Each row times one of DynamicPPL's reference models on this PR's head. \`Dim\` is the parameter count; \`Linked\` is \`true\` when parameters have been mapped to unconstrained space. \`t(logdensity)\` is the wall-clock time for one log-density evaluation. The AD (automatic differentiation) backend columns express gradient time as a multiple of \`t(logdensity)\` — a value of \`10\` means computing the gradient takes 10× as long as the log-density. Lower is better throughout; \`err\` means the backend errored on that model." echo "" - echo "_Main benchmark job did not succeed (\`${main_status}\`) — see workflow logs._" + echo "Main benchmark job did not succeed (\`${main_status}\`) — see workflow logs." echo "" fi echo "
Environment" From 8f4888568c78328b7c878a441db84bfc4cadbed6 Mon Sep 17 00:00:00 2001 From: Hong Ge Date: Tue, 5 May 2026 21:40:52 +0100 Subject: [PATCH 7/7] Print only the full benchmark table in markdown mode Co-Authored-By: Claude Opus 4.7 --- benchmarks/benchmarks.jl | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl index e6f0f99fb..cab363811 100644 --- a/benchmarks/benchmarks.jl +++ b/benchmarks/benchmarks.jl @@ -331,12 +331,6 @@ function build_combinations(rng) return combos end -# Representative model whose 8 rows are surfaced as the at-a-glance "gist" -# in markdown mode. `Smorgasbord` covers the broadest set of DPPL features -# (scalar/vector/multivariate variables, `~`, `.~`, loops, observations as -# both arguments and literals), so it is the most informative single row band. -const GIST_MODEL = "Smorgasbord" - function run(; markdown::Bool=false) combinations = build_combinations(StableRNG(23)) total = length(combinations) @@ -357,17 +351,6 @@ function run(; markdown::Bool=false) push!(results, (; name, dim, adbackend=string(adbackend), islinked, t_logd, ratio)) end if markdown - gist = filter(r -> r.name == GIST_MODEL, results) - if !isempty(gist) - println("### ", GIST_MODEL) - println() - println("```") - print_results(gist) - println("```") - println() - end - println("### Full table (", length(results), " rows)") - println() println("```") print_results(results) println("```")