TuringLang · yebai · May 19, 2026 · May 18, 2026 · May 18, 2026 · May 18, 2026
diff --git a/.github/workflows/Benchmarking.yml b/.github/workflows/Benchmarking.yml
@@ -3,8 +3,6 @@ name: Benchmarking
 on:
   pull_request:
 
-# Needed so `peter-evans/create-or-update-comment` can post on PRs from forks
-# (default GITHUB_TOKEN is read-only for fork pull_requests).
 permissions:
   pull-requests: write
 
@@ -103,20 +101,24 @@ jobs:
             echo "<!-- benchmark-report -->"
             echo "## Benchmarks @ ${head_sha}"
             echo ""
+            echo "### Performance"
+            echo ""
+            echo "Performance Ratio:"
+            echo "Ratio of time to compute gradient and time to compute log-density."
+            echo "Warning: results are very approximate! See [benchmark notes](https://github.com/TuringLang/DynamicPPL.jl/tree/main/benchmarks#interpreting-results) for more context."
+            echo ""
             cat head/results.md
             echo ""
+            echo "Rows marked \`*\` have \`t(logdensity)\` below about 100 ns; their ratios can be dominated by timer floor, fixed overhead, and run-to-run variation. For those rows, raw \`t(grad)\` is more meaningful than \`t(grad)/t(logdensity)\`."
+            echo ""
             if [[ "$main_status" == "success" ]]; then
-              echo "Each row times one of DynamicPPL's reference models on this PR's head. \`Dim\` is the parameter count; \`Linked\` is \`true\` when parameters have been mapped to unconstrained space. \`t(logdensity)\` is the wall-clock time for one log-density evaluation. The AD (automatic differentiation) backend columns express gradient time as a multiple of \`t(logdensity)\` — a value of \`10\` means computing the gradient takes 10× as long as the log-density. Lower is better throughout; \`err\` means the backend errored on that model. Compare against \`main\` below to spot regressions."
-              echo ""
               echo "<details><summary>Main @ ${main_sha}</summary>"
               echo ""
               cat main/results.md
               echo ""
               echo "</details>"
               echo ""
             else
-              echo "Each row times one of DynamicPPL's reference models on this PR's head. \`Dim\` is the parameter count; \`Linked\` is \`true\` when parameters have been mapped to unconstrained space. \`t(logdensity)\` is the wall-clock time for one log-density evaluation. The AD (automatic differentiation) backend columns express gradient time as a multiple of \`t(logdensity)\` — a value of \`10\` means computing the gradient takes 10× as long as the log-density. Lower is better throughout; \`err\` means the backend errored on that model."
-              echo ""
               echo "Main benchmark job did not succeed (\`${main_status}\`) — see workflow logs."
               echo ""
             fi
@@ -125,6 +127,7 @@ jobs:
             cat head/version_info.txt
             echo "</pre>"
             echo "</details>"
+            echo "<!-- benchmark-report -->"
           } > body.md
 
       - name: Find existing benchmark comment

diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -1,3 +1,5 @@
+# Benchmarks
+
 Run from the repository root:
 
 ```sh
@@ -6,5 +8,24 @@ julia --project=benchmarks benchmarks/benchmarks.jl
 ```
 
 The `Benchmarking` CI workflow runs this on each PR and posts the table as a
-comment. There is no base-vs-head comparison: judge regressions by comparing
-against the most recent main-branch run in the comment history.
+comment.
+
+## Interpreting results
+
+Each row times one of DynamicPPL's reference models. `Dim` is the parameter
+count. `Linked` is `true` when parameters have been mapped to unconstrained
+space. `t(logdensity)` is the wall-clock time for one log-density evaluation.
+
+The AD backend columns are performance ratios: each value is the gradient time
+divided by `t(logdensity)`. For example, a value of `10` means computing the
+gradient takes 10 times as long as evaluating the log-density. Lower is better.
+`err` means the backend errored on that model.
+
+If `t(logdensity)` is below about 100 ns, the model name is marked with `*`:
+such ratios are often dominated by the timer floor and fixed overhead, so raw
+`t(grad)` is more meaningful than `t(grad)/t(logdensity)`. These
+microbenchmarks can also vary noticeably across runs.
+
+The CI comment shows the PR head table first and, when available, includes a
+collapsed `main` table for comparison. Treat the numbers as approximate and use
+the `main` table to spot likely regressions.
diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl
@@ -35,7 +35,7 @@ end
 Covers many DynamicPPL features: scalar/vector/multivariate variables, `~`,
 `.~`, loops, allocated vectors, and observations as both arguments and literals.
 """
-@model function smorgasbord(x, y, ::Type{TV}=Vector{Float64}) where {TV}
+@model function smorgasbord(x, y, (::Type{TV})=Vector{Float64}) where {TV}
     @assert length(x) == length(y)
     m ~ truncated(Normal(); lower=0)
     means ~ product_distribution(fill(Exponential(m), length(x)))
@@ -50,7 +50,7 @@ Covers many DynamicPPL features: scalar/vector/multivariate variables, `~`,
 end
 
 "`num_dims` univariate normals via a loop. Condition on `o` after instantiation."
-@model function loop_univariate(num_dims, ::Type{TV}=Vector{Float64}) where {TV}
+@model function loop_univariate(num_dims, (::Type{TV})=Vector{Float64}) where {TV}
     a = TV(undef, num_dims)
     o = TV(undef, num_dims)
     for i in 1:num_dims
@@ -64,7 +64,7 @@ end
 end
 
 "As `loop_univariate`, but using `product_distribution` instead of loops."
-@model function multivariate(num_dims, ::Type{TV}=Vector{Float64}) where {TV}
+@model function multivariate(num_dims, (::Type{TV})=Vector{Float64}) where {TV}
     a = TV(undef, num_dims)
     o = TV(undef, num_dims)
     a ~ product_distribution(fill(Normal(0, 1), num_dims))
@@ -86,7 +86,7 @@ end
 end
 
 "Variables whose support varies under linking, or otherwise nontrivial bijectors."
-@model function dynamic(::Type{T}=Vector{Float64}) where {T}
+@model function dynamic((::Type{T})=Vector{Float64}) where {T}
     eta ~ truncated(Normal(); lower=0.0, upper=0.1)
     mat1 ~ LKJCholesky(4, eta)
     mat2 ~ InverseWishart(3.2, cholesky([1.0 0.5; 0.5 1.0]))
@@ -193,6 +193,11 @@ format_ratio(::Missing) = "err"
 format_dim(d::Integer) = string(d)
 format_dim(::Missing) = "err"
 
+const TINY_PRIMAL_THRESHOLD_SECONDS = 100e-9  # 100 ns, expressed in seconds
+# Rows whose primal time is below the threshold get a `*` suffix on the model name.
+is_tiny_primal(t::Real) = t < TINY_PRIMAL_THRESHOLD_SECONDS
+is_tiny_primal(::Missing) = false
+
 # Pivot so each (Model, Dim, Linked) row spans all backends. A long-form table
 # (one row per (model, linked, backend)) reads as four near-duplicate rows
 # differing only in the backend column; pivoting puts the backends side-by-side
@@ -235,7 +240,7 @@ function print_results(results)
     rows = map(pivoted) do g
         ratios = [format_ratio(g.ratios[b.key]) for b in backend_info]
         (
-            name=g.name,
+            name=is_tiny_primal(g.primal) ? "$(g.name)*" : g.name,
             dim=format_dim(g.dim),
             linked=string(g.islinked),
             primal=format_time(g.primal),

diff --git a/src/test_utils/ad.jl b/src/test_utils/ad.jl
@@ -381,11 +381,14 @@ function run_ad(
         # (tens of ns on Linux/macOS) instead of reading as zero. Pattern
         # borrowed from Mooncake's bench harness:
         # https://github.com/chalk-lab/Mooncake.jl/blob/main/bench/run_benchmarks.jl
+        # Per-sample `setup` deep-copies `params` so each sample starts from a
+        # fresh input buffer, matching Mooncake's bench harness. (Setup runs
+        # before the timed window, so the copy is excluded from measurements.)
         logdensity(ldf, params)  # Warm-up
         GC.gc(true)
         primal_benchmark = @be(
-            _,
-            logdensity($ldf, $params),
+            deepcopy($params),
+            logdensity($ldf, _),
             _ -> GC.gc(false),
             seconds = benchmark_seconds,
         )
@@ -397,8 +400,8 @@ function run_ad(
         logdensity_and_gradient(ldf, params)  # Warm-up
         GC.gc(true)
         grad_benchmark = @be(
-            _,
-            logdensity_and_gradient($ldf, $params),
+            deepcopy($params),
+            logdensity_and_gradient($ldf, _),
             _ -> GC.gc(false),
             seconds = benchmark_seconds,
         )