Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 104 additions & 34 deletions .github/workflows/Benchmarking.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,26 @@ name: Benchmarking
on:
pull_request:

# Needed so `peter-evans/create-or-update-comment` can post on PRs from forks
# (default GITHUB_TOKEN is read-only for fork pull_requests).
permissions:
pull-requests: write

# Cancel in-flight runs on the same PR when a new commit arrives. Benchmark
# jobs are slow (~10min each), so back-to-back force-pushes would otherwise
# spawn parallel runs that race to post the comment.
concurrency:
group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}

jobs:
benchmark:
# Pinned (rather than `ubuntu-latest`) so that successive runs land on the
# same VM family. GitHub silently rotates `latest`, which changes the noise
# floor between runs and makes timings hard to compare across PRs.
benchmark-pr:
# OS pinned (rather than `ubuntu-latest`) so that successive runs land on
# the same VM family — GitHub silently rotates `latest` and the noise
# floor changes between runs. Julia version pinned for the same reason:
# comparing timings under different compiler versions is meaningless.
runs-on: ubuntu-22.04
timeout-minutes: 60
steps:
- uses: actions/checkout@v6
with:
Expand All @@ -22,23 +36,95 @@ jobs:
working-directory: ./benchmarks
run: |
julia --project=. -e 'using Pkg; Pkg.instantiate()'
julia -e 'using InteractiveUtils; versioninfo()' > version_info.txt
# `tee` so the table also appears in the workflow log at-a-glance.
julia --project=. benchmarks.jl markdown | tee results.md

- uses: actions/upload-artifact@v4
with:
name: benchmark-pr
path: |
benchmarks/results.md
benchmarks/version_info.txt

benchmark-main:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to partly revert the previous PR, but now the comparison ratios have been removed? I'm quite confused about this churn.

# Tracks main's moving HEAD — the displayed main SHA may shift between
# successive re-runs of the same PR if main advances in the interim.
runs-on: ubuntu-22.04
timeout-minutes: 60
outputs:
sha: ${{ steps.mainsha.outputs.sha }}
steps:
- uses: actions/checkout@v6
with:
ref: main
- id: mainsha
run: echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
- uses: julia-actions/setup-julia@v3
with:
version: '1.11'
- uses: julia-actions/cache@v3

- name: Run benchmarks
working-directory: ./benchmarks
run: |
julia --project=. -e 'using Pkg; Pkg.instantiate()'
julia --project=. benchmarks.jl markdown | tee results.md

version_info=$(julia -e 'using InteractiveUtils; versioninfo()')
- uses: actions/upload-artifact@v4
with:
name: benchmark-main
path: benchmarks/results.md

# Capture the markdown-mode benchmark output. The `tee` keeps it in
# the workflow log too, so a failure during comment posting does not
# lose the numbers.
results_file=$(mktemp)
julia --project=. benchmarks.jl markdown | tee "$results_file"
post-comment:
needs: [benchmark-pr, benchmark-main]
# Post the comment as long as the PR-head bench succeeded. If the main
# bench failed (e.g. transitionally, before this PR's bench changes are on
# main), the comment still goes up with a note in place of main's numbers.
if: ${{ !cancelled() && needs.benchmark-pr.result == 'success' }}
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v4
with:
name: benchmark-pr
path: head
- uses: actions/download-artifact@v4
if: needs.benchmark-main.result == 'success'
with:
name: benchmark-main
path: main

- name: Build comment body
run: |
head_sha='${{ github.event.pull_request.head.sha }}'
main_sha='${{ needs.benchmark-main.outputs.sha }}'
main_status='${{ needs.benchmark-main.result }}'
{
echo "VERSION_INFO<<DPPL_BENCH_EOF"
echo "$version_info"
echo "DPPL_BENCH_EOF"
echo "BENCHMARK_OUTPUT<<DPPL_BENCH_EOF"
cat "$results_file"
echo "DPPL_BENCH_EOF"
} >> "$GITHUB_ENV"
echo "## Benchmarks @ ${head_sha}"
echo ""
cat head/results.md
echo ""
if [[ "$main_status" == "success" ]]; then
echo "Each row times one of DynamicPPL's reference models on this PR's head. \`Dim\` is the parameter count; \`Linked\` is \`true\` when parameters have been mapped to unconstrained space. \`t(logdensity)\` is the wall-clock time for one log-density evaluation. The AD (automatic differentiation) backend columns express gradient time as a multiple of \`t(logdensity)\` — a value of \`10\` means computing the gradient takes 10× as long as the log-density. Lower is better throughout; \`err\` means the backend errored on that model. Compare against \`main\` below to spot regressions."
echo ""
echo "<details><summary>Main @ ${main_sha}</summary>"
echo ""
cat main/results.md
echo ""
echo "</details>"
echo ""
else
echo "Each row times one of DynamicPPL's reference models on this PR's head. \`Dim\` is the parameter count; \`Linked\` is \`true\` when parameters have been mapped to unconstrained space. \`t(logdensity)\` is the wall-clock time for one log-density evaluation. The AD (automatic differentiation) backend columns express gradient time as a multiple of \`t(logdensity)\` — a value of \`10\` means computing the gradient takes 10× as long as the log-density. Lower is better throughout; \`err\` means the backend errored on that model."
echo ""
echo "Main benchmark job did not succeed (\`${main_status}\`) — see workflow logs."
echo ""
fi
echo "<details><summary>Environment</summary>"
echo "<pre>"
cat head/version_info.txt
echo "</pre>"
echo "</details>"
} > body.md

- name: Find existing benchmark comment
uses: peter-evans/find-comment@v4
Expand All @@ -52,22 +138,6 @@ jobs:
uses: peter-evans/create-or-update-comment@v5
with:
issue-number: ${{ github.event.pull_request.number }}
body: |
## Benchmark Report

- this PR's head: `${{ github.event.pull_request.head.sha }}`

Absolute log-density times and grad/log-density ratios are
reported. To judge whether a PR helps or hurts, compare against
the latest comment on a recent main-branch PR run.

### Computer Information
```
${{ env.VERSION_INFO }}
```
### Benchmark Results

${{ env.BENCHMARK_OUTPUT }}

body-path: body.md
comment-id: ${{ steps.find_comment.outputs.comment-id }}
edit-mode: replace
6 changes: 6 additions & 0 deletions benchmarks/Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
[deps]
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
AbstractPPL = "7a57a42e-76ec-4ea3-a279-07e840d6d9cf"
Bijectors = "76274a88-744f-5084-9051-94815aaf08c4"
Chairmarks = "0ca39b1e-fe0b-4e98-acfc-b1656634c4de"
DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
Expand All @@ -18,7 +21,10 @@ DynamicPPL = {path = ".."}

[compat]
ADTypes = "1.14.0"
AbstractPPL = "0.14"
Bijectors = "0.15.17"
Chairmarks = "1.3.1"
DifferentiationInterface = "0.7"
Distributions = "0.25.117"
DynamicPPL = "0.41"
Enzyme = "0.13"
Expand Down
136 changes: 96 additions & 40 deletions benchmarks/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ using Distributions:
Normal,
product_distribution,
truncated
using DifferentiationInterface: DifferentiationInterface
using DynamicPPL: DynamicPPL, @model, to_submodel, VarInfo, LinkAll, UnlinkAll
using DynamicPPL.TestUtils.AD: run_ad, NoTest
using Enzyme: Enzyme
using ForwardDiff: ForwardDiff
using LinearAlgebra: cholesky
using Mooncake: Mooncake
using PrettyTables: pretty_table
using Printf: @sprintf
using ReverseDiff: ReverseDiff
using StableRNGs: StableRNG
Expand Down Expand Up @@ -178,11 +178,6 @@ end
# Reporting
#

# https://github.com/chalk-lab/Mooncake.jl/blob/main/bench/run_benchmarks.jl
const COLNAMES = [
"Model", "Dim", "AD Backend", "Linked", "t(logdensity)", "t(grad)/t(logdensity)"
]

fix_sig_fig(t) = string(round(t; sigdigits=3))
function format_time(t::Float64)
t < 1e-6 && return fix_sig_fig(t * 1e9) * " ns"
Expand All @@ -198,26 +193,107 @@ format_ratio(::Missing) = "err"
format_dim(d::Integer) = string(d)
format_dim(::Missing) = "err"

# Pivot so each (Model, Dim, Linked) row spans all backends. A long-form table
# (one row per (model, linked, backend)) reads as four near-duplicate rows
# differing only in the backend column; pivoting puts the backends side-by-side
# where the ratios are actually compared. `t(logdensity)` does not depend on
# the AD backend (it is the primal model evaluation), so the four primal
# samples per group are noise around a common value — take the minimum, which
# is the most stable estimate (see `run_ad`'s benchmark docstring).
function pivot(results, backends)
keys_in_order = Tuple{String,Bool}[]
seen = Set{Tuple{String,Bool}}()
for r in results
k = (r.name, r.islinked)
if !(k in seen)
push!(seen, k)
push!(keys_in_order, k)
end
end
return map(keys_in_order) do (name, islinked)
rows = filter(r -> r.name == name && r.islinked == islinked, results)
primals = collect(skipmissing(r.t_logd for r in rows))
primal = isempty(primals) ? missing : minimum(primals)
ratios = Dict{String,Union{Float64,Missing}}(string(b) => missing for b in backends)
for r in rows
ratios[r.adbackend] = r.ratio
end
(; name, dim=first(rows).dim, islinked, primal, ratios)
end
end

function print_results(results)
isempty(results) && return println("No benchmark results obtained.")
rows = map(results) do r
pivoted = pivot(results, BACKENDS)
backend_info = [
(key="forwarddiff", label="FwdDiff"),
(key="reversediff", label="RvsDiff"),
(key="mooncake", label="Mooncake"),
(key="enzyme", label="Enzyme"),
]

rows = map(pivoted) do g
ratios = [format_ratio(g.ratios[b.key]) for b in backend_info]
(
r.name,
format_dim(r.dim),
r.adbackend,
r.islinked,
format_time(r.t_logd),
format_ratio(r.ratio),
name=g.name,
dim=format_dim(g.dim),
linked=string(g.islinked),
primal=format_time(g.primal),
ratios,
)
end
matrix = hcat(Iterators.map(collect, zip(rows...))...)
return pretty_table(
matrix;
column_labels=COLNAMES,
backend=:text,
fit_table_in_display_horizontally=false,
fit_table_in_display_vertically=false,

name_w = max(length("Model"), maximum(textwidth(r.name) for r in rows)) + 1
dim_w = max(length("dim"), maximum(textwidth(r.dim) for r in rows)) + 2
linked_w = max(length("linked"), maximum(textwidth(r.linked) for r in rows)) + 2
primal_w = max(length("primal"), maximum(textwidth(r.primal) for r in rows)) + 2
ratio_ws = [
max(length(b.label), maximum(textwidth(r.ratios[i]) for r in rows)) + 2 for
(i, b) in enumerate(backend_info)
]

gap = " "
gap_w = textwidth(gap)
stub_w = name_w + dim_w + linked_w + 2 * gap_w
eval_w = primal_w
grad_w = sum(ratio_ws) + gap_w * (length(ratio_ws) - 1)
total_w = stub_w + gap_w + eval_w + gap_w + grad_w

center(s, w) = lpad(rpad(s, div(w + textwidth(s), 2)), w)
println(repeat("=", total_w))
println(
rpad("", stub_w) * gap * center("eval", eval_w) * gap * center("gradient", grad_w)
)
println(rpad("", stub_w) * gap * repeat("-", eval_w) * gap * repeat("-", grad_w))

header =
rpad("Model", name_w) *
gap *
lpad("dim", dim_w) *
gap *
lpad("linked", linked_w) *
gap *
lpad("primal", primal_w) *
gap *
join((lpad(b.label, w) for (b, w) in zip(backend_info, ratio_ws)), gap)
println(header)
println(repeat("-", total_w))

for r in rows
row =
rpad(r.name, name_w) *
gap *
lpad(r.dim, dim_w) *
gap *
lpad(r.linked, linked_w) *
gap *
lpad(r.primal, primal_w) *
gap *
join((lpad(x, w) for (x, w) in zip(r.ratios, ratio_ws)), gap)
println(row)
end
println(repeat("=", total_w))
return nothing
end

#
Expand Down Expand Up @@ -255,12 +331,6 @@ function build_combinations(rng)
return combos
end

# Representative model whose 8 rows are surfaced as the at-a-glance "gist"
# in markdown mode. `Smorgasbord` covers the broadest set of DPPL features
# (scalar/vector/multivariate variables, `~`, `.~`, loops, observations as
# both arguments and literals), so it is the most informative single row band.
const GIST_MODEL = "Smorgasbord"

function run(; markdown::Bool=false)
combinations = build_combinations(StableRNG(23))
total = length(combinations)
Expand All @@ -281,23 +351,9 @@ function run(; markdown::Bool=false)
push!(results, (; name, dim, adbackend=string(adbackend), islinked, t_logd, ratio))
end
if markdown
gist = filter(r -> r.name == GIST_MODEL, results)
if !isempty(gist)
println("### Gist: ", GIST_MODEL)
println()
println("```")
print_results(gist)
println("```")
println()
end
println("<details>")
println("<summary>Full table (", length(results), " rows)</summary>")
println()
println("```")
print_results(results)
println("```")
println()
println("</details>")
else
print_results(results)
end
Expand Down
Loading