diff --git a/.github/workflows/Benchmarking.yml b/.github/workflows/Benchmarking.yml index 48bd72875..4e5433c96 100644 --- a/.github/workflows/Benchmarking.yml +++ b/.github/workflows/Benchmarking.yml @@ -4,36 +4,11 @@ on: pull_request: jobs: - benchmark-base: - runs-on: ubuntu-latest - outputs: - results: ${{ steps.benchmark.outputs.results }} - sha: ${{ steps.benchmark.outputs.sha }} - steps: - - uses: actions/checkout@v6 - with: - ref: ${{ github.base_ref }} - - uses: julia-actions/setup-julia@v3 - with: - version: '1.11' - - uses: julia-actions/cache@v3 - - - name: Run benchmarks - id: benchmark - working-directory: ./benchmarks - run: | - # github output can't handle more than 1 line, hence the tail - julia --project=. -e 'using Pkg; Pkg.instantiate()' - results=$(julia --project=. benchmarks.jl json | tail -n 1 || true) - echo $results - echo "results=$results" >> "$GITHUB_OUTPUT" - echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT" - - benchmark-head: - runs-on: ubuntu-latest - outputs: - results: ${{ steps.benchmark.outputs.results }} - sha: ${{ steps.benchmark.outputs.sha }} + benchmark: + # Pinned (rather than `ubuntu-latest`) so that successive runs land on the + # same VM family. GitHub silently rotates `latest`, which changes the noise + # floor between runs and makes timings hard to compare across PRs. + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v6 with: @@ -44,57 +19,26 @@ jobs: - uses: julia-actions/cache@v3 - name: Run benchmarks - id: benchmark working-directory: ./benchmarks run: | - # github output can't handle more than 1 line, hence the tail julia --project=. -e 'using Pkg; Pkg.instantiate()' - results=$(julia --project=. benchmarks.jl json | tail -n 1 || true) - echo $results - echo "results=$results" >> "$GITHUB_OUTPUT" - echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT" - - combine-results: - runs-on: ubuntu-latest - needs: [benchmark-base, benchmark-head] - steps: - - uses: actions/checkout@v6 - with: - ref: ${{ github.event.pull_request.head.sha }} - - uses: julia-actions/setup-julia@v3 - with: - version: '1.11' - - uses: julia-actions/cache@v3 - - name: Combine benchmark results - working-directory: ./benchmarks - run: | version_info=$(julia -e 'using InteractiveUtils; versioninfo()') - echo "$version_info" - echo "VERSION_INFO<> $GITHUB_ENV - echo "$version_info" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - # save outputs of previous jobs to json file - echo "Base results" - echo "--------------------------------------------------------" - echo '${{needs.benchmark-base.outputs.results}}' - echo '${{needs.benchmark-base.outputs.results}}' > base.json - echo "Head results" - echo "--------------------------------------------------------" - echo '${{needs.benchmark-head.outputs.results}}' - echo '${{needs.benchmark-head.outputs.results}}' > head.json - - # combine them and save the output as an env var for later steps - julia --project=. -e 'using Pkg; Pkg.instantiate()' - results=$(julia --project=. benchmarks.jl combine head.json base.json) - echo "Combined results" - echo "--------------------------------------------------------" - echo "$results" - - echo "BENCHMARK_OUTPUT<> $GITHUB_ENV - echo "$results" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV + # Capture the markdown-mode benchmark output. The `tee` keeps it in + # the workflow log too, so a failure during comment posting does not + # lose the numbers. + results_file=$(mktemp) + julia --project=. benchmarks.jl markdown | tee "$results_file" + + { + echo "VERSION_INFO<> "$GITHUB_ENV" - name: Find existing benchmark comment uses: peter-evans/find-comment@v4 @@ -102,6 +46,7 @@ jobs: with: issue-number: ${{ github.event.pull_request.number }} comment-author: github-actions[bot] + body-includes: Benchmark Report - name: Create or update benchmark comment uses: peter-evans/create-or-update-comment@v5 @@ -110,8 +55,11 @@ jobs: body: | ## Benchmark Report - - this PR's head: `${{ needs.benchmark-head.outputs.sha }}` - - base branch: `${{ needs.benchmark-base.outputs.sha }}` + - this PR's head: `${{ github.event.pull_request.head.sha }}` + + Absolute log-density times and grad/log-density ratios are + reported. To judge whether a PR helps or hurts, compare against + the latest comment on a recent main-branch PR run. ### Computer Information ``` diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml index 6adb27efa..1440ba40e 100644 --- a/benchmarks/Project.toml +++ b/benchmarks/Project.toml @@ -1,7 +1,3 @@ -name = "DynamicPPLBenchmarks" -uuid = "d94a1522-c11e-44a7-981a-42bf5dc1a001" -version = "0.1.0" - [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" Chairmarks = "0ca39b1e-fe0b-4e98-acfc-b1656634c4de" @@ -9,16 +5,16 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" -JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogDensityProblems = "6fdf6af0-433a-55f7-b3ed-c6c6e0b8df7c" Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" [sources] -DynamicPPL = {path = "../"} +DynamicPPL = {path = ".."} [compat] ADTypes = "1.14.0" @@ -27,7 +23,6 @@ Distributions = "0.25.117" DynamicPPL = "0.41" Enzyme = "0.13" ForwardDiff = "1" -JSON = "1.3.0" LogDensityProblems = "2.1.2" Mooncake = "0.4, 0.5" PrettyTables = "3" diff --git a/benchmarks/README.md b/benchmarks/README.md index ad70b7c03..67a2cca43 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,5 +1,10 @@ -To run the benchmarks locally, run this from the root directory of the repository: +Run from the repository root: ```sh +julia --project=benchmarks -e 'using Pkg; Pkg.instantiate()' julia --project=benchmarks benchmarks/benchmarks.jl ``` + +The `Benchmarking` CI workflow runs this on each PR and posts the table as a +comment. There is no base-vs-head comparison: judge regressions by comparing +against the most recent main-branch run in the comment history. diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl index 5be32fdef..c3b47f201 100644 --- a/benchmarks/benchmarks.jl +++ b/benchmarks/benchmarks.jl @@ -1,253 +1,315 @@ -using Pkg - -using Chairmarks: @be, median -using DynamicPPLBenchmarks: Models, benchmark, model_dimension -using JSON: JSON -using PrettyTables: pretty_table, fmt__printf, EmptyCells, MultiColumn, TextTableFormat +using ADTypes: ADTypes +using Distributions: + Categorical, + Dirichlet, + Exponential, + Gamma, + InverseWishart, + LKJCholesky, + Normal, + product_distribution, + truncated +using DynamicPPL: DynamicPPL, @model, to_submodel, VarInfo, LinkAll, UnlinkAll +using DynamicPPL.TestUtils.AD: run_ad, NoTest +using Enzyme: Enzyme +using ForwardDiff: ForwardDiff +using LinearAlgebra: cholesky +using Mooncake: Mooncake +using PrettyTables: pretty_table using Printf: @sprintf +using ReverseDiff: ReverseDiff using StableRNGs: StableRNG -rng = StableRNG(23) - -colnames = ["Model", "Dim", "AD Backend", "Linked", "t(eval)/t(ref)", "t(grad)/t(eval)"] -function print_results(results_table; to_json=false) - if to_json - # Print to the given file as JSON - results_array = [ - Dict(colnames[i] => results_table[j][i] for i in eachindex(colnames)) for - j in eachindex(results_table) - ] - # do not use pretty=true, as GitHub Actions expects no linebreaks - JSON.json(stdout, results_array) - println() - else - # Pretty-print to terminal - table_matrix = hcat(Iterators.map(collect, zip(results_table...))...) - return pretty_table( - table_matrix; - column_labels=colnames, - backend=:text, - formatters=[fmt__printf("%.1f", [6, 7])], - fit_table_in_display_horizontally=false, - fit_table_in_display_vertically=false, - ) +# +# Models +# + +"One scalar assumption, one scalar observation." +@model function simple_assume_observe(obs) + x ~ Normal() + obs ~ Normal(x, 1) + return (; x=x) +end + +""" +Covers many DynamicPPL features: scalar/vector/multivariate variables, `~`, +`.~`, loops, allocated vectors, and observations as both arguments and literals. +""" +@model function smorgasbord(x, y, ::Type{TV}=Vector{Float64}) where {TV} + @assert length(x) == length(y) + m ~ truncated(Normal(); lower=0) + means ~ product_distribution(fill(Exponential(m), length(x))) + stds = TV(undef, length(x)) + stds .~ Gamma(1, 1) + for i in 1:length(x) + x[i] ~ Normal(means[i], stds[i]) end + y ~ product_distribution(map((mean, std) -> Normal(mean, std), means, stds)) + 0.0 ~ Normal(sum(y), 1) + return (; m=m, means=means, stds=stds) end -function run(; to_json=false) - # Create DynamicPPL.Model instances to run benchmarks on. - smorgasbord_instance = Models.smorgasbord(randn(rng, 100), randn(rng, 100)) - loop_univariate1k, multivariate1k = begin - data_1k = randn(rng, 1_000) - loop = Models.loop_univariate(length(data_1k)) | (; o=data_1k) - multi = Models.multivariate(length(data_1k)) | (; o=data_1k) - loop, multi +"`num_dims` univariate normals via a loop. Condition on `o` after instantiation." +@model function loop_univariate(num_dims, ::Type{TV}=Vector{Float64}) where {TV} + a = TV(undef, num_dims) + o = TV(undef, num_dims) + for i in 1:num_dims + a[i] ~ Normal(0, 1) end - loop_univariate10k, multivariate10k = begin - data_10k = randn(rng, 10_000) - loop = Models.loop_univariate(length(data_10k)) | (; o=data_10k) - multi = Models.multivariate(length(data_10k)) | (; o=data_10k) - loop, multi + m = sum(a) + for i in 1:num_dims + o[i] ~ Normal(m, 1) end - lda_instance = begin - w = [1, 2, 3, 2, 1, 1] - d = [1, 1, 1, 2, 2, 2] - Models.lda(2, d, w) + return (; a=a) +end + +"As `loop_univariate`, but using `product_distribution` instead of loops." +@model function multivariate(num_dims, ::Type{TV}=Vector{Float64}) where {TV} + a = TV(undef, num_dims) + o = TV(undef, num_dims) + a ~ product_distribution(fill(Normal(0, 1), num_dims)) + m = sum(a) + o ~ product_distribution(fill(Normal(m, 1), num_dims)) + return (; a=a) +end + +@model function _sub() + x ~ Normal() + return x +end + +"As `simple_assume_observe`, but with the assumed RV inside a submodel." +@model function parent(obs) + x ~ to_submodel(_sub()) + obs ~ Normal(x, 1) + return (; x=x) +end + +"Variables whose support varies under linking, or otherwise nontrivial bijectors." +@model function dynamic(::Type{T}=Vector{Float64}) where {T} + eta ~ truncated(Normal(); lower=0.0, upper=0.1) + mat1 ~ LKJCholesky(4, eta) + mat2 ~ InverseWishart(3.2, cholesky([1.0 0.5; 0.5 1.0])) + return (; eta=eta, mat1=mat1, mat2=mat2) +end + +"Linear Discriminant Analysis." +@model function lda(K, d, w) + V = length(unique(w)) + D = length(unique(d)) + N = length(d) + @assert length(w) == N + + ϕ = Vector{Vector{Real}}(undef, K) + for i in 1:K + ϕ[i] ~ Dirichlet(ones(V) / V) end - # Specify the combinations to test: - # (Model Name, model instance, AD backend, linked) - chosen_combinations = [ - ( - "Simple assume observe", - Models.simple_assume_observe(randn(rng)), - :forwarddiff, - false, - ), - ("Smorgasbord", smorgasbord_instance, :forwarddiff, false), - ("Smorgasbord", smorgasbord_instance, :forwarddiff, true), - ("Smorgasbord", smorgasbord_instance, :reversediff, true), - ("Smorgasbord", smorgasbord_instance, :mooncake, true), - ("Smorgasbord", smorgasbord_instance, :enzyme, true), - ("Loop univariate 1k", loop_univariate1k, :mooncake, true), - ("Multivariate 1k", multivariate1k, :mooncake, true), - ("Loop univariate 10k", loop_univariate10k, :mooncake, true), - ("Multivariate 10k", multivariate10k, :mooncake, true), - ("Dynamic", Models.dynamic(), :mooncake, true), - ("Submodel", Models.parent(randn(rng)), :mooncake, true), - ("LDA", lda_instance, :reversediff, true), - ] + θ = Vector{Vector{Real}}(undef, D) + for i in 1:D + θ[i] ~ Dirichlet(ones(K) / K) + end - # Time running a model-like function that does not use DynamicPPL, as a reference point. - # Eval timings will be relative to this. - reference_time = begin - obs = randn(rng) - median(@be Models.simple_assume_observe_non_model(obs)).time + z = zeros(Int, N) + for i in 1:N + z[i] ~ Categorical(θ[d[i]]) + w[i] ~ Categorical(ϕ[d[i]]) end - @info "Reference evaluation time: $(reference_time) seconds" - - results_table = Tuple{ - String,Int,String,Bool,Union{Float64,Missing},Union{Float64,Missing} - }[] - - for (model_name, model, adbackend, islinked) in chosen_combinations - @info "Running benchmark for $model_name, $adbackend, $islinked" - relative_eval_time, relative_ad_eval_time = try - results = benchmark(model, adbackend, islinked) - @info " t(eval) = $(results.primal_time)" - @info " t(grad) = $(results.grad_time)" - (results.primal_time / reference_time), - (results.grad_time / results.primal_time) - catch e - @info "benchmark errored: $e" - missing, missing - end - push!( - results_table, - ( - model_name, - model_dimension(model, islinked), - string(adbackend), - islinked, - relative_eval_time, - relative_ad_eval_time, + return (; ϕ=ϕ, θ=θ, z=z) +end + +# +# Benchmark harness +# + +# Copied from TuringBenchmarking.jl. +const SYMBOL_TO_BACKEND = Dict( + :forwarddiff => ADTypes.AutoForwardDiff(), + :reversediff => ADTypes.AutoReverseDiff(; compile=false), + :reversediff_compiled => ADTypes.AutoReverseDiff(; compile=true), + :mooncake => ADTypes.AutoMooncake(; config=nothing), + :enzyme => ADTypes.AutoEnzyme(; + mode=Enzyme.set_runtime_activity(Enzyme.Reverse), + function_annotation=Enzyme.Const, + ), +) + +transform_strategy(islinked) = islinked ? LinkAll() : UnlinkAll() + +"Dimension of `model`, accounting for linking. Used as a fallback when `benchmark` errors." +function model_dimension(model, islinked) + return try + vi = last( + DynamicPPL.init!!( + StableRNG(23), + model, + VarInfo(), + DynamicPPL.InitFromPrior(), + transform_strategy(islinked), ), ) - print_results(results_table; to_json=to_json) + length(vi[:]) + catch + missing end - print_results(results_table; to_json=to_json) - return nothing end -struct TestCase - model_name::String - dim::Integer - ad_backend::String - linked::Bool - TestCase(d::Dict{String,Any}) = new((d[c] for c in colnames[1:4])...) +""" + benchmark(model, adbackend, islinked; seconds=2) + +Time log-density and gradient evaluation for `model` with the given AD backend. +`seconds` is Chairmarks' per-measurement budget (doubled from its default to +tighten the median estimate). +""" +function benchmark(model, adbackend::Symbol, islinked::Bool; seconds::Real=2) + return run_ad( + model, + SYMBOL_TO_BACKEND[adbackend]; + rng=StableRNG(23), + transform_strategy=transform_strategy(islinked), + benchmark=true, + benchmark_seconds=seconds, + test=NoTest(), + verbose=false, + ) end -function combine(head_filename::String, base_filename::String) - head_results = try - JSON.parsefile(head_filename, Vector{Dict{String,Any}}) - catch - Dict{String,Any}[] - end - @info "Loaded $(length(head_results)) results from $head_filename" - base_results = try - JSON.parsefile(base_filename, Vector{Dict{String,Any}}) - catch - Dict{String,Any}[] + +# +# Reporting +# + +# https://github.com/chalk-lab/Mooncake.jl/blob/main/bench/run_benchmarks.jl +const COLNAMES = [ + "Model", "Dim", "AD Backend", "Linked", "t(logdensity)", "t(grad)/t(logdensity)" +] + +fix_sig_fig(t) = string(round(t; sigdigits=3)) +function format_time(t::Float64) + t < 1e-6 && return fix_sig_fig(t * 1e9) * " ns" + t < 1e-3 && return fix_sig_fig(t * 1e6) * " μs" + t < 1 && return fix_sig_fig(t * 1e3) * " ms" + return fix_sig_fig(t) * " s" +end +format_time(::Missing) = "err" + +format_ratio(x::Float64) = @sprintf("%.2f", x) +format_ratio(::Missing) = "err" + +format_dim(d::Integer) = string(d) +format_dim(::Missing) = "err" + +function print_results(results) + isempty(results) && return println("No benchmark results obtained.") + rows = map(results) do r + ( + r.name, + format_dim(r.dim), + r.adbackend, + r.islinked, + format_time(r.t_logd), + format_ratio(r.ratio), + ) end - @info "Loaded $(length(base_results)) results from $base_filename" - # Identify unique combinations of (Model, Dim, AD Backend, Linked) - head_testcases = Dict( - TestCase(d) => (d[colnames[5]], d[colnames[6]]) for d in head_results + matrix = hcat(Iterators.map(collect, zip(rows...))...) + return pretty_table( + matrix; + column_labels=COLNAMES, + backend=:text, + fit_table_in_display_horizontally=false, + fit_table_in_display_vertically=false, ) - base_testcases = Dict( - TestCase(d) => (d[colnames[5]], d[colnames[6]]) for d in base_results - ) - all_testcases = union(Set(keys(head_testcases)), Set(keys(base_testcases))) - @info "$(length(all_testcases)) unique test cases found" - sorted_testcases = sort( - collect(all_testcases); by=(c -> (c.model_name, c.linked, c.ad_backend)) - ) - results_table = Tuple{ - String, - Int, - String, - Bool, - String, - String, - String, - String, - String, - String, - String, - String, - String, - }[] - sublabels = ["base", "this PR", "speedup"] - results_colnames = [ - [ - EmptyCells(4), - MultiColumn(3, "t(eval) / t(ref)"), - MultiColumn(3, "t(grad) / t(eval)"), - MultiColumn(3, "t(grad) / t(ref)"), - ], - [colnames[1:4]..., sublabels..., sublabels..., sublabels...], +end + +# +# Main +# + +# Backends compared on every model. `:reversediff_compiled` is excluded because +# compiled tapes are input-dependent and silently produce wrong gradients on +# models with parameter-dependent control flow (see CLAUDE.md). +const BACKENDS = (:forwarddiff, :reversediff, :mooncake, :enzyme) + +function build_combinations(rng) + smorg = smorgasbord(randn(rng, 100), randn(rng, 100)) + models = Tuple{String,DynamicPPL.Model}[ + ("Simple assume observe", simple_assume_observe(randn(rng))), ("Smorgasbord", smorg) ] - sprint_float(x::Float64) = @sprintf("%.2f", x) - sprint_float(m::Missing) = "err" - for c in sorted_testcases - head_eval, head_grad = get(head_testcases, c, (missing, missing)) - base_eval, base_grad = get(base_testcases, c, (missing, missing)) - # If the benchmark errored, it will return `missing` in the `run()` function above. - # The issue with this is that JSON serialisation converts it to `null`, and then - # when reading back from JSON, it becomes `nothing` instead of `missing`! - head_eval = head_eval === nothing ? missing : head_eval - head_grad = head_grad === nothing ? missing : head_grad - base_eval = base_eval === nothing ? missing : base_eval - base_grad = base_grad === nothing ? missing : base_grad - # Finally that lets us do this division safely - speedup_eval = base_eval / head_eval - speedup_grad = base_grad / head_grad - # As well as this multiplication, which is t(grad) / t(ref) - head_grad_vs_ref = head_grad * head_eval - base_grad_vs_ref = base_grad * base_eval - speedup_grad_vs_ref = base_grad_vs_ref / head_grad_vs_ref - push!( - results_table, - ( - c.model_name, - c.dim, - c.ad_backend, - c.linked, - sprint_float(base_eval), - sprint_float(head_eval), - sprint_float(speedup_eval), - sprint_float(base_grad), - sprint_float(head_grad), - sprint_float(speedup_grad), - sprint_float(base_grad_vs_ref), - sprint_float(head_grad_vs_ref), - sprint_float(speedup_grad_vs_ref), - ), - ) + for n in (1_000, 10_000) + data = randn(rng, n) + push!(models, ("Loop univariate $(n ÷ 1_000)k", loop_univariate(n) | (; o=data))) + push!(models, ("Multivariate $(n ÷ 1_000)k", multivariate(n) | (; o=data))) end - # Pretty-print to terminal - if isempty(results_table) - println("No benchmark results obtained.") - else - table_matrix = hcat(Iterators.map(collect, zip(results_table...))...) + push!(models, ("Dynamic", dynamic())) + push!(models, ("Submodel", parent(randn(rng)))) + push!(models, ("LDA", lda(2, [1, 1, 1, 2, 2, 2], [1, 2, 3, 2, 1, 1]))) + + # Order: model → linked → backend, so each model's eight rows are adjacent + # and inspecting one model side-by-side across backends/links is trivial. + combos = Tuple{String,DynamicPPL.Model,Symbol,Bool}[] + for (name, model) in models, islinked in (false, true), backend in BACKENDS + # LDA's discrete Categorical RVs make `linked = false` ill-defined for + # gradient-based AD (every backend errors), so the row is omitted. + name == "LDA" && !islinked && continue + push!(combos, (name, model, backend, islinked)) + end + return combos +end + +# Representative model whose 8 rows are surfaced as the at-a-glance "gist" +# in markdown mode. `Smorgasbord` covers the broadest set of DPPL features +# (scalar/vector/multivariate variables, `~`, `.~`, loops, observations as +# both arguments and literals), so it is the most informative single row band. +const GIST_MODEL = "Smorgasbord" + +function run(; markdown::Bool=false) + combinations = build_combinations(StableRNG(23)) + total = length(combinations) + results = [] + for (i, (name, model, adbackend, islinked)) in enumerate(combinations) + # Mooncake-style header: index/total, then model + config, then backend. + @info "$i / $total", name, (; linked=islinked) + @info adbackend + dim, t_logd, ratio = try + r = benchmark(model, adbackend, islinked) + @info " t(logdensity) = $(format_time(r.primal_time))" + @info " t(grad) = $(format_time(r.grad_time))" + (length(r.params), r.primal_time, r.grad_time / r.primal_time) + catch e + @info " errored: $(sprint(showerror, e))" + (model_dimension(model, islinked), missing, missing) + end + push!(results, (; name, dim, adbackend=string(adbackend), islinked, t_logd, ratio)) + end + if markdown + gist = filter(r -> r.name == GIST_MODEL, results) + if !isempty(gist) + println("### Gist: ", GIST_MODEL) + println() + println("```") + print_results(gist) + println("```") + println() + end + println("
") + println("Full table (", length(results), " rows)") + println() println("```") - pretty_table( - table_matrix; - column_labels=results_colnames, - backend=:text, - fit_table_in_display_horizontally=false, - fit_table_in_display_vertically=false, - table_format=TextTableFormat(; - horizontal_line_at_merged_column_labels=true, - horizontal_lines_at_data_rows=collect(3:3:length(results_table)), - ), - ) + print_results(results) println("```") + println() + println("
") + else + print_results(results) end + return nothing end -# The command-line arguments are used on CI purposes. -# Run with `julia --project=. benchmarks.jl json` to run benchmarks and output JSON to -# stdout -# Run with `julia --project=. benchmarks.jl combine head.json base.json` to combine two JSON -# files -if length(ARGS) == 3 && ARGS[1] == "combine" - combine(ARGS[2], ARGS[3]) -elseif ARGS == ["json"] - run(; to_json=true) -elseif ARGS == [] - # When running locally just omit the argument and it will just benchmark and print to - # terminal. - run() -else - error("invalid arguments: $(ARGS)") +if abspath(PROGRAM_FILE) == @__FILE__ + if ARGS == ["markdown"] + run(; markdown=true) + elseif ARGS == [] + run() + else + error("invalid arguments: $(ARGS)") + end end diff --git a/benchmarks/src/DynamicPPLBenchmarks.jl b/benchmarks/src/DynamicPPLBenchmarks.jl deleted file mode 100644 index f4cc1511e..000000000 --- a/benchmarks/src/DynamicPPLBenchmarks.jl +++ /dev/null @@ -1,77 +0,0 @@ -module DynamicPPLBenchmarks - -using DynamicPPL: VarInfo, VarName, LinkAll, UnlinkAll -using DynamicPPL: DynamicPPL -using DynamicPPL.TestUtils.AD: run_ad, NoTest -using ADTypes: ADTypes -using LogDensityProblems: LogDensityProblems - -using ForwardDiff: ForwardDiff -using ReverseDiff: ReverseDiff -using Mooncake: Mooncake -using Enzyme: Enzyme -using StableRNGs: StableRNG - -include("./Models.jl") -using .Models: Models -export Models, benchmark, model_dimension - -""" - model_dimension(model, islinked) - -Return the dimension of `model`, accounting for linking, if any. -""" -function model_dimension(model, islinked) - tfm_strategy = islinked ? DynamicPPL.LinkAll() : DynamicPPL.UnlinkAll() - vi = last( - DynamicPPL.init!!( - StableRNG(23), model, VarInfo(), DynamicPPL.InitFromPrior(), tfm_strategy - ), - ) - return length(vi[:]) -end - -# Utility functions for representing AD backends using symbols. -# Copied from TuringBenchmarking.jl. -const SYMBOL_TO_BACKEND = Dict( - :forwarddiff => ADTypes.AutoForwardDiff(), - :reversediff => ADTypes.AutoReverseDiff(; compile=false), - :reversediff_compiled => ADTypes.AutoReverseDiff(; compile=true), - :mooncake => ADTypes.AutoMooncake(; config=nothing), - :enzyme => ADTypes.AutoEnzyme(; - mode=Enzyme.set_runtime_activity(Enzyme.Reverse), - function_annotation=Enzyme.Const, - ), -) - -to_backend(x) = error("Unknown backend: $x") -to_backend(x::ADTypes.AbstractADType) = x -function to_backend(x::Union{AbstractString,Symbol}) - k = Symbol(lowercase(string(x))) - haskey(SYMBOL_TO_BACKEND, k) || error("Unknown backend: $x") - return SYMBOL_TO_BACKEND[k] -end - -""" - benchmark(model, adbackend::Symbol, islinked::Bool) - -Benchmark evaluation and gradient calculation for `model` using the selected AD backend. - -The AD backend should be specified as a Symbol (e.g. `:forwarddiff`, `:reversediff`, `:zygote`). - -`islinked` determines whether to link the VarInfo for evaluation. -""" -function benchmark(model, adbackend::Symbol, islinked::Bool) - transform_strategy = islinked ? LinkAll() : UnlinkAll() - return run_ad( - model, - to_backend(adbackend); - rng=StableRNG(23), - transform_strategy=transform_strategy, - benchmark=true, - test=NoTest(), - verbose=false, - ) -end - -end diff --git a/benchmarks/src/Models.jl b/benchmarks/src/Models.jl deleted file mode 100644 index 76d4b2e93..000000000 --- a/benchmarks/src/Models.jl +++ /dev/null @@ -1,156 +0,0 @@ -""" -Models for benchmarking Turing.jl. - -Each model returns a NamedTuple of all the random variables in the model that are not -observed. -""" -module Models - -using Distributions: - Categorical, - Dirichlet, - Exponential, - Gamma, - LKJCholesky, - InverseWishart, - Normal, - logpdf, - product_distribution, - truncated -using DynamicPPL: DynamicPPL, @model, to_submodel -using LinearAlgebra: cholesky - -export simple_assume_observe_non_model, - simple_assume_observe, smorgasbord, loop_univariate, multivariate, parent, dynamic, lda - -# This one is like simple_assume_observe, but explicitly does not use DynamicPPL. -# Other runtimes are normalised by this one's runtime. -function simple_assume_observe_non_model(obs) - x = rand(Normal()) - logp = logpdf(Normal(), x) - logp += logpdf(Normal(x, 1), obs) - return (; logp=logp, x=x) -end - -""" -A simple model that does one scalar assumption and one scalar observation. -""" -@model function simple_assume_observe(obs) - x ~ Normal() - obs ~ Normal(x, 1) - return (; x=x) -end - -""" -A short model that tries to cover many DynamicPPL features. - -Includes scalar, vector univariate, and multivariate variables; ~, .~, and loops; allocating -a variable vector; observations passed as arguments, and as literals. -""" -@model function smorgasbord(x, y, ::Type{TV}=Vector{Float64}) where {TV} - @assert length(x) == length(y) - m ~ truncated(Normal(); lower=0) - means ~ product_distribution(fill(Exponential(m), length(x))) - stds = TV(undef, length(x)) - stds .~ Gamma(1, 1) - for i in 1:length(x) - x[i] ~ Normal(means[i], stds[i]) - end - y ~ product_distribution(map((mean, std) -> Normal(mean, std), means, stds)) - 0.0 ~ Normal(sum(y), 1) - return (; m=m, means=means, stds=stds) -end - -""" -A model that loops over two vectors of univariate normals of length `num_dims`. - -The second variable, `o`, is meant to be conditioned on after model instantiation. - -See `multivariate` for a version that uses `product_distribution` rather than loops. -""" -@model function loop_univariate(num_dims, ::Type{TV}=Vector{Float64}) where {TV} - a = TV(undef, num_dims) - o = TV(undef, num_dims) - for i in 1:num_dims - a[i] ~ Normal(0, 1) - end - m = sum(a) - for i in 1:num_dims - o[i] ~ Normal(m, 1) - end - return (; a=a) -end - -""" -A model with two multivariate normal distributed variables of dimension `num_dims`. - -The second variable, `o`, is meant to be conditioned on after model instantiation. - -See `loop_univariate` for a version that uses loops rather than `product_distribution`. -""" -@model function multivariate(num_dims, ::Type{TV}=Vector{Float64}) where {TV} - a = TV(undef, num_dims) - o = TV(undef, num_dims) - a ~ product_distribution(fill(Normal(0, 1), num_dims)) - m = sum(a) - o ~ product_distribution(fill(Normal(m, 1), num_dims)) - return (; a=a) -end - -""" -A submodel for `parent`. Not exported. -""" -@model function sub() - x ~ Normal() - return x -end - -""" -Like simple_assume_observe, but with a submodel for the assumed random variable. -""" -@model function parent(obs) - x ~ to_submodel(sub()) - obs ~ Normal(x, 1) - return (; x=x) -end - -""" -A model with random variables that have changing support under linking, or otherwise -complicated bijectors. -""" -@model function dynamic(::Type{T}=Vector{Float64}) where {T} - eta ~ truncated(Normal(); lower=0.0, upper=0.1) - mat1 ~ LKJCholesky(4, eta) - mat2 ~ InverseWishart(3.2, cholesky([1.0 0.5; 0.5 1.0])) - return (; eta=eta, mat1=mat1, mat2=mat2) -end - -""" -A simple Linear Discriminant Analysis model. -""" -@model function lda(K, d, w) - V = length(unique(w)) - D = length(unique(d)) - N = length(d) - @assert length(w) == N - - ϕ = Vector{Vector{Real}}(undef, K) - for i in 1:K - ϕ[i] ~ Dirichlet(ones(V) / V) - end - - θ = Vector{Vector{Real}}(undef, D) - for i in 1:D - θ[i] ~ Dirichlet(ones(K) / K) - end - - z = zeros(Int, N) - - for i in 1:N - z[i] ~ Categorical(θ[d[i]]) - w[i] ~ Categorical(ϕ[d[i]]) - end - return (; ϕ=ϕ, θ=θ, z=z) -end - -end diff --git a/src/test_utils/ad.jl b/src/test_utils/ad.jl index 40336d4fe..349c61e4b 100644 --- a/src/test_utils/ad.jl +++ b/src/test_utils/ad.jl @@ -293,7 +293,10 @@ Everything else is optional, and can be categorised into several groups: When enabled, the time taken to evaluate logp as well as its gradient is measured using Chairmarks.jl, and the `ADResult` object returned will contain `grad_time` and `primal_time` fields with the median times (in - seconds). + seconds). The `benchmark_seconds` keyword (default `1`) sets the time + budget passed to Chairmarks for each of the two measurements; raising it + collects more samples and yields a tighter median estimate at the cost + of a longer run. 1. _Whether to output extra logging information._ @@ -314,6 +317,7 @@ function run_ad( adtype::AbstractADType; test::Union{AbstractADCorrectnessTestSetting,Bool}=WithBackend(), benchmark::Bool=false, + benchmark_seconds::Real=1, atol::AbstractFloat=100 * eps(), rtol::AbstractFloat=sqrt(eps()), getlogdensity::Function=getlogjoint_internal, @@ -370,15 +374,34 @@ function run_ad( # Benchmark grad_time, primal_time = if benchmark + # Per-sample incremental GC keeps accumulated garbage from triggering a + # full collection mid-sample, which would inflate that sample several- + # fold. Auto-tuned `evals` (not pinned to 1) batches enough calls per + # sample that fast log-densities clear `time_ns`'s real precision floor + # (tens of ns on Linux/macOS) instead of reading as zero. Pattern + # borrowed from Mooncake's bench harness: + # https://github.com/chalk-lab/Mooncake.jl/blob/main/bench/run_benchmarks.jl logdensity(ldf, params) # Warm-up - primal_benchmark = @be logdensity($ldf, $params) + GC.gc(true) + primal_benchmark = @be( + _, + logdensity($ldf, $params), + _ -> GC.gc(false), + seconds = benchmark_seconds, + ) if verbose print(" evaluation : ") show(stdout, MIME("text/plain"), median(primal_benchmark)) println() end logdensity_and_gradient(ldf, params) # Warm-up - grad_benchmark = @be logdensity_and_gradient($ldf, $params) + GC.gc(true) + grad_benchmark = @be( + _, + logdensity_and_gradient($ldf, $params), + _ -> GC.gc(false), + seconds = benchmark_seconds, + ) if verbose print(" gradient : ") show(stdout, MIME("text/plain"), median(grad_benchmark))