TuringLang · yebai · May 5, 2026 · May 5, 2026 · May 5, 2026 · May 5, 2026
diff --git a/.github/workflows/Benchmarking.yml b/.github/workflows/Benchmarking.yml
@@ -3,12 +3,26 @@ name: Benchmarking
 on:
   pull_request:
 
+# Grants `peter-evans/create-or-update-comment` write access on same-repo PRs.
+# Fork PRs still get a read-only GITHUB_TOKEN; `permissions` cannot elevate it.
+permissions:
+  pull-requests: write
+
+# Cancel in-flight runs on the same PR when a new commit arrives. Benchmark
+# jobs are slow (~10min each), so back-to-back force-pushes would otherwise
+# spawn parallel runs that race to post the comment.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+
 jobs:
-  benchmark:
-    # Pinned (rather than `ubuntu-latest`) so that successive runs land on the
-    # same VM family. GitHub silently rotates `latest`, which changes the noise
-    # floor between runs and makes timings hard to compare across PRs.
+  benchmark-pr:
+    # OS pinned (rather than `ubuntu-latest`) so that successive runs land on
+    # the same VM family — GitHub silently rotates `latest` and the noise
+    # floor changes between runs. Julia version pinned for the same reason:
+    # comparing timings under different compiler versions is meaningless.
     runs-on: ubuntu-22.04
+    timeout-minutes: 60
     steps:
       - uses: actions/checkout@v6
         with:
@@ -22,23 +36,95 @@ jobs:
         working-directory: ./benchmarks
         run: |
           julia --project=. -e 'using Pkg; Pkg.instantiate()'
+          julia -e 'using InteractiveUtils; versioninfo()' > version_info.txt
+          set -o pipefail  # `tee` mirrors the table into the log; don't let it mask a julia failure
+          julia --project=. benchmarks.jl markdown | tee results.md
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-pr
+          path: |
+            benchmarks/results.md
+            benchmarks/version_info.txt
+
+  benchmark-main:
+    # Tracks main's moving HEAD — the displayed main SHA may shift between
+    # successive re-runs of the same PR if main advances in the interim.
+    runs-on: ubuntu-22.04
+    timeout-minutes: 60
+    outputs:
+      sha: ${{ steps.mainsha.outputs.sha }}
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          ref: main
+      - id: mainsha
+        run: echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
+      - uses: julia-actions/setup-julia@v3
+        with:
+          version: '1.11'
+      - uses: julia-actions/cache@v3
+      - name: Run benchmarks
+        working-directory: ./benchmarks
+        run: |
+          julia --project=. -e 'using Pkg; Pkg.instantiate()'
+          set -o pipefail  # otherwise `tee` masks a julia failure and the job "succeeds"
+          julia --project=. benchmarks.jl markdown | tee results.md
 
-          version_info=$(julia -e 'using InteractiveUtils; versioninfo()')
+      - uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-main
+          path: benchmarks/results.md
 
-          # Capture the markdown-mode benchmark output. The `tee` keeps it in
-          # the workflow log too, so a failure during comment posting does not
-          # lose the numbers.
-          results_file=$(mktemp)
-          julia --project=. benchmarks.jl markdown | tee "$results_file"
+  post-comment:
+    needs: [benchmark-pr, benchmark-main]
+    # Post the comment as long as the PR-head bench succeeded. If the main
+    # bench failed (e.g. transitionally, before this PR's bench changes are on
+    # main), the comment still goes up with a note in place of main's numbers.
+    if: ${{ !cancelled() && needs.benchmark-pr.result == 'success' }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          name: benchmark-pr
+          path: head
+      - uses: actions/download-artifact@v4
+        if: needs.benchmark-main.result == 'success'
+        with:
+          name: benchmark-main
+          path: main
 
+      - name: Build comment body
+        run: |
+          head_sha='${{ github.event.pull_request.head.sha }}'
+          main_sha='${{ needs.benchmark-main.outputs.sha }}'
+          main_status='${{ needs.benchmark-main.result }}'
           {
-            echo "VERSION_INFO<<DPPL_BENCH_EOF"
-            echo "$version_info"
-            echo "DPPL_BENCH_EOF"
-            echo "BENCHMARK_OUTPUT<<DPPL_BENCH_EOF"
-            cat "$results_file"
-            echo "DPPL_BENCH_EOF"
-          } >> "$GITHUB_ENV"
+            echo "## Benchmarks @ ${head_sha}"
+            echo ""
+            cat head/results.md
+            echo ""
+            legend="Each row times one of DynamicPPL's reference models on this PR's head. \`dim\` is the parameter count; \`linked\` is \`true\` when parameters have been mapped to unconstrained space. \`primal\` (under \`eval\`) is the wall-clock time for one log-density evaluation. The \`gradient\` columns, one per AD (automatic differentiation) backend, express gradient time as a multiple of \`primal\` — a value of \`10\` means computing the gradient takes 10× as long as the log-density. Lower is better throughout; \`err\` means the backend errored on that model."
+            if [[ "$main_status" == "success" ]]; then
+              echo "${legend} Compare against \`main\` below to spot regressions."
+              echo ""
+              echo "<details><summary>Main @ ${main_sha}</summary>"
+              echo ""
+              cat main/results.md
+              echo ""
+              echo "</details>"
+            else
+              echo "${legend}"
+              echo ""
+              echo "Main benchmark job did not succeed (\`${main_status}\`) — see workflow logs."
+            fi
+            echo ""  # blank line before the Environment block, common to both branches
+            echo "<details><summary>Environment</summary>"
+            echo "<pre>"
+            cat head/version_info.txt
+            echo "</pre>"
+            echo "</details>"
+          } > body.md
 
       - name: Find existing benchmark comment
         uses: peter-evans/find-comment@v4
@@ -52,22 +138,6 @@ jobs:
         uses: peter-evans/create-or-update-comment@v5
         with:
           issue-number: ${{ github.event.pull_request.number }}
-          body: |
-            ## Benchmark Report
-
-            - this PR's head: `${{ github.event.pull_request.head.sha }}`
-
-            Absolute log-density times and grad/log-density ratios are
-            reported. To judge whether a PR helps or hurts, compare against
-            the latest comment on a recent main-branch PR run.
-
-            ### Computer Information
-            ```
-            ${{ env.VERSION_INFO }}
-            ```
-            ### Benchmark Results
-
-            ${{ env.BENCHMARK_OUTPUT }}
-
+          body-path: body.md
           comment-id: ${{ steps.find_comment.outputs.comment-id }}
           edit-mode: replace
diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml
@@ -1,6 +1,9 @@
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
+AbstractPPL = "7a57a42e-76ec-4ea3-a279-07e840d6d9cf"
+Bijectors = "76274a88-744f-5084-9051-94815aaf08c4"
 Chairmarks = "0ca39b1e-fe0b-4e98-acfc-b1656634c4de"
+DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
@@ -18,7 +21,10 @@ DynamicPPL = {path = ".."}
 
 [compat]
 ADTypes = "1.14.0"
+AbstractPPL = "0.14"
+Bijectors = "0.15.17"
 Chairmarks = "1.3.1"
+DifferentiationInterface = "0.7"
 Distributions = "0.25.117"
 DynamicPPL = "0.41"
 Enzyme = "0.13"

diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl
@@ -9,13 +9,13 @@ using Distributions:
     Normal,
     product_distribution,
     truncated
+using DifferentiationInterface: DifferentiationInterface
 using DynamicPPL: DynamicPPL, @model, to_submodel, VarInfo, LinkAll, UnlinkAll
 using DynamicPPL.TestUtils.AD: run_ad, NoTest
 using Enzyme: Enzyme
 using ForwardDiff: ForwardDiff
 using LinearAlgebra: cholesky
 using Mooncake: Mooncake
-using PrettyTables: pretty_table
 using Printf: @sprintf
 using ReverseDiff: ReverseDiff
 using StableRNGs: StableRNG
@@ -178,11 +178,6 @@ end
 #  Reporting
 #
 
-# https://github.com/chalk-lab/Mooncake.jl/blob/main/bench/run_benchmarks.jl
-const COLNAMES = [
-    "Model", "Dim", "AD Backend", "Linked", "t(logdensity)", "t(grad)/t(logdensity)"
-]
-
 fix_sig_fig(t) = string(round(t; sigdigits=3))
 function format_time(t::Float64)
     t < 1e-6 && return fix_sig_fig(t * 1e9) * " ns"
@@ -198,26 +193,107 @@ format_ratio(::Missing) = "err"
 format_dim(d::Integer) = string(d)
 format_dim(::Missing) = "err"
 
+# Pivot so each (Model, Dim, Linked) row spans all backends. A long-form table
+# (one row per (model, linked, backend)) reads as four near-duplicate rows
+# differing only in the backend column; pivoting puts the backends side-by-side
+# where the ratios are actually compared. `t(logdensity)` does not depend on
+# the AD backend (it is the primal model evaluation), so the four primal
+# samples per group are noise around a common value — take the minimum, which
+# is the most stable estimate (see `run_ad`'s benchmark docstring).
+#
+# Returns one NamedTuple `(; name, dim, islinked, primal, ratios)` per group,
+# in order of first appearance in `results`; `ratios` maps backend name =>
+# ratio and is pre-filled with `missing` for every entry of `backends`.
+function pivot(results, backends)
+    keys_in_order = unique((r.name, r.islinked) for r in results)
+    return map(keys_in_order) do (name, islinked)
+        rows = filter(r -> r.name == name && r.islinked == islinked, results)
+        primals = collect(skipmissing(r.t_logd for r in rows))
+        primal = isempty(primals) ? missing : minimum(primals)
+        # `dim` is shared by the whole group: take the first non-missing value so
+        # one failed backend does not turn the row's dimension into `err`.
+        dims = collect(skipmissing(r.dim for r in rows))
+        dim = isempty(dims) ? missing : first(dims)
+        ratios = Dict{String,Union{Float64,Missing}}(string(b) => missing for b in backends)
+        for r in rows
+            ratios[r.adbackend] = r.ratio
+        end
+        (; name, dim, islinked, primal, ratios)
+    end
+end
+
 function print_results(results)
     isempty(results) && return println("No benchmark results obtained.")
-    rows = map(results) do r
+    pivoted = pivot(results, BACKENDS)
+    backend_info = [
+        (key="forwarddiff", label="FwdDiff"),
+        (key="reversediff", label="RvsDiff"),
+        (key="mooncake", label="Mooncake"),
+        (key="enzyme", label="Enzyme"),
+    ]
+
+    rows = map(pivoted) do g
+        ratios = [format_ratio(g.ratios[b.key]) for b in backend_info]
         (
-            r.name,
-            format_dim(r.dim),
-            r.adbackend,
-            r.islinked,
-            format_time(r.t_logd),
-            format_ratio(r.ratio),
+            name=g.name,
+            dim=format_dim(g.dim),
+            linked=string(g.islinked),
+            primal=format_time(g.primal),
+            ratios,
         )
     end
-    matrix = hcat(Iterators.map(collect, zip(rows...))...)
-    return pretty_table(
-        matrix;
-        column_labels=COLNAMES,
-        backend=:text,
-        fit_table_in_display_horizontally=false,
-        fit_table_in_display_vertically=false,
+
+    name_w = max(length("Model"), maximum(textwidth(r.name) for r in rows)) + 1
+    dim_w = max(length("dim"), maximum(textwidth(r.dim) for r in rows)) + 2
+    linked_w = max(length("linked"), maximum(textwidth(r.linked) for r in rows)) + 2
+    primal_w = max(length("primal"), maximum(textwidth(r.primal) for r in rows)) + 2
+    ratio_ws = [
+        max(length(b.label), maximum(textwidth(r.ratios[i]) for r in rows)) + 2 for
+        (i, b) in enumerate(backend_info)
+    ]
+
+    gap = "  "
+    gap_w = textwidth(gap)
+    stub_w = name_w + dim_w + linked_w + 2 * gap_w
+    eval_w = primal_w
+    grad_w = sum(ratio_ws) + gap_w * (length(ratio_ws) - 1)
+    total_w = stub_w + gap_w + eval_w + gap_w + grad_w
+
+    center(s, w) = lpad(rpad(s, div(w + textwidth(s), 2)), w)
+    println(repeat("=", total_w))
+    println(
+        rpad("", stub_w) * gap * center("eval", eval_w) * gap * center("gradient", grad_w)
     )
+    println(rpad("", stub_w) * gap * repeat("-", eval_w) * gap * repeat("-", grad_w))
+
+    header =
+        rpad("Model", name_w) *
+        gap *
+        lpad("dim", dim_w) *
+        gap *
+        lpad("linked", linked_w) *
+        gap *
+        lpad("primal", primal_w) *
+        gap *
+        join((lpad(b.label, w) for (b, w) in zip(backend_info, ratio_ws)), gap)
+    println(header)
+    println(repeat("-", total_w))
+
+    for r in rows
+        row =
+            rpad(r.name, name_w) *
+            gap *
+            lpad(r.dim, dim_w) *
+            gap *
+            lpad(r.linked, linked_w) *
+            gap *
+            lpad(r.primal, primal_w) *
+            gap *
+            join((lpad(x, w) for (x, w) in zip(r.ratios, ratio_ws)), gap)
+        println(row)
+    end
+    println(repeat("=", total_w))
+    return nothing
 end
 
 #
@@ -255,12 +331,6 @@ function build_combinations(rng)
     return combos
 end
 
-# Representative model whose 8 rows are surfaced as the at-a-glance "gist"
-# in markdown mode. `Smorgasbord` covers the broadest set of DPPL features
-# (scalar/vector/multivariate variables, `~`, `.~`, loops, observations as
-# both arguments and literals), so it is the most informative single row band.
-const GIST_MODEL = "Smorgasbord"
-
 function run(; markdown::Bool=false)
     combinations = build_combinations(StableRNG(23))
     total = length(combinations)
@@ -281,23 +351,9 @@ function run(; markdown::Bool=false)
         push!(results, (; name, dim, adbackend=string(adbackend), islinked, t_logd, ratio))
     end
     if markdown
-        gist = filter(r -> r.name == GIST_MODEL, results)
-        if !isempty(gist)
-            println("### Gist: ", GIST_MODEL)
-            println()
-            println("```")
-            print_results(gist)
-            println("```")
-            println()
-        end
-        println("<details>")
-        println("<summary>Full table (", length(results), " rows)</summary>")
-        println()
         println("```")
         print_results(results)
         println("```")
-        println()
-        println("</details>")
     else
         print_results(results)
     end