Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions .github/workflows/Benchmarking.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ name: Benchmarking
on:
pull_request:

# Needed so `peter-evans/create-or-update-comment` can post on PRs from forks
# (default GITHUB_TOKEN is read-only for fork pull_requests).
permissions:
pull-requests: write

Expand Down Expand Up @@ -103,20 +101,24 @@ jobs:
echo "<!-- benchmark-report -->"
echo "## Benchmarks @ ${head_sha}"
echo ""
echo "### Performance"
echo ""
echo "Performance Ratio:"
echo "Ratio of time to compute gradient and time to compute log-density."
echo "Warning: results are very approximate! See [benchmark notes](https://github.com/TuringLang/DynamicPPL.jl/tree/main/benchmarks#interpreting-results) for more context."
echo ""
cat head/results.md
echo ""
echo "Rows marked \`*\` have \`t(logdensity)\` below about 100 ns; their ratios can be dominated by timer floor, fixed overhead, and run-to-run variation. For those rows, raw \`t(grad)\` is more meaningful than \`t(grad)/t(logdensity)\`."
echo ""
if [[ "$main_status" == "success" ]]; then
echo "Each row times one of DynamicPPL's reference models on this PR's head. \`Dim\` is the parameter count; \`Linked\` is \`true\` when parameters have been mapped to unconstrained space. \`t(logdensity)\` is the wall-clock time for one log-density evaluation. The AD (automatic differentiation) backend columns express gradient time as a multiple of \`t(logdensity)\` — a value of \`10\` means computing the gradient takes 10× as long as the log-density. Lower is better throughout; \`err\` means the backend errored on that model. Compare against \`main\` below to spot regressions."
echo ""
echo "<details><summary>Main @ ${main_sha}</summary>"
echo ""
cat main/results.md
echo ""
echo "</details>"
echo ""
else
echo "Each row times one of DynamicPPL's reference models on this PR's head. \`Dim\` is the parameter count; \`Linked\` is \`true\` when parameters have been mapped to unconstrained space. \`t(logdensity)\` is the wall-clock time for one log-density evaluation. The AD (automatic differentiation) backend columns express gradient time as a multiple of \`t(logdensity)\` — a value of \`10\` means computing the gradient takes 10× as long as the log-density. Lower is better throughout; \`err\` means the backend errored on that model."
echo ""
echo "Main benchmark job did not succeed (\`${main_status}\`) — see workflow logs."
echo ""
fi
Expand All @@ -125,6 +127,7 @@ jobs:
cat head/version_info.txt
echo "</pre>"
echo "</details>"
echo "<!-- benchmark-report -->"
} > body.md

- name: Find existing benchmark comment
Expand Down
25 changes: 23 additions & 2 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Benchmarks

Run from the repository root:

```sh
Expand All @@ -6,5 +8,24 @@ julia --project=benchmarks benchmarks/benchmarks.jl
```

The `Benchmarking` CI workflow runs this on each PR and posts the table as a
comment. There is no base-vs-head comparison: judge regressions by comparing
against the most recent main-branch run in the comment history.
comment.

## Interpreting results

Each row times one of DynamicPPL's reference models. `Dim` is the parameter
count. `Linked` is `true` when parameters have been mapped to unconstrained
space. `t(logdensity)` is the wall-clock time for one log-density evaluation.

The AD backend columns are performance ratios: each value is the gradient time
divided by `t(logdensity)`. For example, a value of `10` means computing the
gradient takes 10 times as long as evaluating the log-density. Lower is better.
`err` means the backend errored on that model.

If `t(logdensity)` is below about 100 ns, ratios are often dominated by the
timer's resolution floor and fixed overhead. For those rows, raw `t(grad)` is more meaningful than
`t(grad)/t(logdensity)`. These microbenchmarks can also vary noticeably across
runs.

The CI comment shows the PR head table first and, when available, includes a
collapsed `main` table for comparison. Treat the numbers as approximate and use
the `main` table to spot likely regressions.
15 changes: 10 additions & 5 deletions benchmarks/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ end
Covers many DynamicPPL features: scalar/vector/multivariate variables, `~`,
`.~`, loops, allocated vectors, and observations as both arguments and literals.
"""
@model function smorgasbord(x, y, ::Type{TV}=Vector{Float64}) where {TV}
@model function smorgasbord(x, y, (::Type{TV})=Vector{Float64}) where {TV}
@assert length(x) == length(y)
m ~ truncated(Normal(); lower=0)
means ~ product_distribution(fill(Exponential(m), length(x)))
Expand All @@ -50,7 +50,7 @@ Covers many DynamicPPL features: scalar/vector/multivariate variables, `~`,
end

"`num_dims` univariate normals via a loop. Condition on `o` after instantiation."
@model function loop_univariate(num_dims, ::Type{TV}=Vector{Float64}) where {TV}
@model function loop_univariate(num_dims, (::Type{TV})=Vector{Float64}) where {TV}
a = TV(undef, num_dims)
o = TV(undef, num_dims)
for i in 1:num_dims
Expand All @@ -64,7 +64,7 @@ end
end

"As `loop_univariate`, but using `product_distribution` instead of loops."
@model function multivariate(num_dims, ::Type{TV}=Vector{Float64}) where {TV}
@model function multivariate(num_dims, (::Type{TV})=Vector{Float64}) where {TV}
a = TV(undef, num_dims)
o = TV(undef, num_dims)
a ~ product_distribution(fill(Normal(0, 1), num_dims))
Expand All @@ -86,7 +86,7 @@ end
end

"Variables whose support varies under linking, or otherwise nontrivial bijectors."
@model function dynamic(::Type{T}=Vector{Float64}) where {T}
@model function dynamic((::Type{T})=Vector{Float64}) where {T}
eta ~ truncated(Normal(); lower=0.0, upper=0.1)
mat1 ~ LKJCholesky(4, eta)
mat2 ~ InverseWishart(3.2, cholesky([1.0 0.5; 0.5 1.0]))
Expand Down Expand Up @@ -193,6 +193,11 @@ format_ratio(::Missing) = "err"
"""
    format_dim(d)

Render a dimension for display: the decimal string of an integer `d`, or the
placeholder `"err"` when the dimension is `missing`.
"""
function format_dim(d::Integer)
    return string(d)
end

function format_dim(::Missing)
    return "err"
end

# Timings (in seconds) strictly below this threshold are treated as "tiny".
const TINY_PRIMAL_THRESHOLD_SECONDS = 100e-9

"""
    is_tiny_primal(t)

Return `true` when the timing `t` (in seconds) is strictly below
`TINY_PRIMAL_THRESHOLD_SECONDS`. A `missing` timing is never tiny.

Any `Real` is accepted (not only `Float64`), so integer or lower-precision
timings are classified rather than raising a `MethodError`.
"""
is_tiny_primal(t::Real) = t < TINY_PRIMAL_THRESHOLD_SECONDS
is_tiny_primal(::Missing) = false

# Pivot so each (Model, Dim, Linked) row spans all backends. A long-form table
# (one row per (model, linked, backend)) reads as four near-duplicate rows
# differing only in the backend column; pivoting puts the backends side-by-side
Expand Down Expand Up @@ -235,7 +240,7 @@ function print_results(results)
rows = map(pivoted) do g
ratios = [format_ratio(g.ratios[b.key]) for b in backend_info]
(
name=g.name,
name=is_tiny_primal(g.primal) ? "$(g.name)*" : g.name,
dim=format_dim(g.dim),
linked=string(g.islinked),
primal=format_time(g.primal),
Expand Down
11 changes: 7 additions & 4 deletions src/test_utils/ad.jl
Original file line number Diff line number Diff line change
Expand Up @@ -381,11 +381,14 @@ function run_ad(
# (tens of ns on Linux/macOS) instead of reading as zero. Pattern
# borrowed from Mooncake's bench harness:
# https://github.com/chalk-lab/Mooncake.jl/blob/main/bench/run_benchmarks.jl
# Per-sample `setup` deep-copies `params` so each sample starts from a
# fresh input buffer, matching Mooncake's bench harness. (Setup runs
# before the timed window, so the copy is excluded from measurements.)
logdensity(ldf, params) # Warm-up
GC.gc(true)
primal_benchmark = @be(
_,
logdensity($ldf, $params),
deepcopy($params),
logdensity($ldf, _),
_ -> GC.gc(false),
seconds = benchmark_seconds,
Comment thread
yebai marked this conversation as resolved.
)
Expand All @@ -397,8 +400,8 @@ function run_ad(
logdensity_and_gradient(ldf, params) # Warm-up
GC.gc(true)
grad_benchmark = @be(
_,
logdensity_and_gradient($ldf, $params),
deepcopy($params),
logdensity_and_gradient($ldf, _),
_ -> GC.gc(false),
seconds = benchmark_seconds,
)
Expand Down
Loading