Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 109 additions & 0 deletions .github/workflows/Benchmarks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
name: Benchmarks

on:
pull_request:
branches:
- main
paths:
- ".github/workflows/Benchmarks.yml"
- "Project.toml"
- "src/**"
- "ext/**"
- "benchmarks/**"
types: [opened, synchronize, reopened, ready_for_review]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}

permissions:
contents: read

jobs:
benchmark:
name: PR benchmark comparison
runs-on: ubuntu-latest
timeout-minutes: 60

env:
PMCMC_BENCH_SECONDS: "0.5"
PMCMC_BENCH_SAMPLES: "8"
PMCMC_BENCH_WARN_RATIO: "1.25"
PMCMC_BENCH_FAIL_RATIO: "1.75"
JULIA_NUM_PRECOMPILE_TASKS: "1"

steps:
- name: Checkout base
uses: actions/checkout@v4
with:
repository: ${{ github.event.pull_request.base.repo.full_name }}
ref: ${{ github.event.pull_request.base.sha }}
path: base

- name: Checkout PR head
uses: actions/checkout@v4
with:
repository: ${{ github.event.pull_request.head.repo.full_name }}
ref: ${{ github.event.pull_request.head.sha }}
path: head

- uses: julia-actions/setup-julia@v2
with:
version: "1"
arch: x64

- name: Use Julia cache
uses: julia-actions/cache@v3

- name: Instantiate base package in benchmark environment
working-directory: head/benchmarks/ParallelMCMCBenchmarks
run: |
julia --project=. -e 'using Pkg; Pkg.develop(PackageSpec(path=joinpath(ENV["GITHUB_WORKSPACE"], "base"))); Pkg.instantiate()'

- name: Run base benchmarks
run: |
julia --project="$GITHUB_WORKSPACE/head/benchmarks/ParallelMCMCBenchmarks" \
"$GITHUB_WORKSPACE/head/benchmarks/ParallelMCMCBenchmarks/scripts/pr_benchmarks.jl" \
--seconds "$PMCMC_BENCH_SECONDS" \
--samples "$PMCMC_BENCH_SAMPLES" \
--output "$GITHUB_WORKSPACE/base-benchmarks.toml" \
--markdown "$GITHUB_WORKSPACE/base-benchmarks.md"
Comment thread
rsenne marked this conversation as resolved.

- name: Instantiate PR package in benchmark environment
working-directory: head/benchmarks/ParallelMCMCBenchmarks
run: |
julia --project=. -e 'using Pkg; Pkg.develop(PackageSpec(path=joinpath(ENV["GITHUB_WORKSPACE"], "head"))); Pkg.instantiate()'

- name: Run PR benchmarks
run: |
julia --project="$GITHUB_WORKSPACE/head/benchmarks/ParallelMCMCBenchmarks" \
"$GITHUB_WORKSPACE/head/benchmarks/ParallelMCMCBenchmarks/scripts/pr_benchmarks.jl" \
--seconds "$PMCMC_BENCH_SECONDS" \
--samples "$PMCMC_BENCH_SAMPLES" \
--output "$GITHUB_WORKSPACE/head-benchmarks.toml" \
--markdown "$GITHUB_WORKSPACE/head-benchmarks.md"

- name: Compare benchmarks
run: |
set +e
julia "$GITHUB_WORKSPACE/head/benchmarks/ParallelMCMCBenchmarks/scripts/compare_pr_benchmarks.jl" \
--base "$GITHUB_WORKSPACE/base-benchmarks.toml" \
--head "$GITHUB_WORKSPACE/head-benchmarks.toml" \
--warn-ratio "$PMCMC_BENCH_WARN_RATIO" \
--fail-ratio "$PMCMC_BENCH_FAIL_RATIO" \
--markdown "$GITHUB_WORKSPACE/benchmark-comparison.md"
status=$?
cat "$GITHUB_WORKSPACE/benchmark-comparison.md" >> "$GITHUB_STEP_SUMMARY"
exit "$status"

- name: Upload benchmark results
uses: actions/upload-artifact@v4
if: always()
with:
name: pr-benchmark-results
path: |
base-benchmarks.toml
base-benchmarks.md
head-benchmarks.toml
head-benchmarks.md
benchmark-comparison.md
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,19 @@
.benchmarkci
Manifest.toml
benchmark/*.json
benchmarks/ParallelMCMCBenchmarks/*-benchmarks.toml
benchmarks/ParallelMCMCBenchmarks/*-benchmarks.md
benchmarks/ParallelMCMCBenchmarks/benchmark-comparison.md
coverage
docs/build/
env
node_modules
LocalPreferences.toml

# ignore any AI files a contributor may have
.codex
.claude
CLAUDE.md
AGENTS.md
CODEX.md
.gemini
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ CUDA = "5.11.0"
DifferentiationInterface = "0.7.13"
DynamicPPL = "0.40"
Enzyme = "0.13.131"
LinearAlgebra = "1.12.0"
LinearAlgebra = "1"
LogDensityProblems = "2"
LogDensityProblemsAD = "1"
MCMCChains = "7.7.0"
Random = "1.11.0"
Statistics = "1.11.1"
Random = "1"
Statistics = "1"
julia = "1.10"

[deps]
Expand Down
60 changes: 58 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,22 +1,78 @@
# ParallelMCMC

<p align="center">
<img src="docs/src/assets/logo.png" alt="ParallelMCMC logo" width="220">
</p>

[![Stable Documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://rsenne.github.io/ParallelMCMC.jl/stable)
[![Development documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://rsenne.github.io/ParallelMCMC.jl/dev)
[![Test workflow status](https://github.com/rsenne/ParallelMCMC.jl/actions/workflows/Test.yml/badge.svg?branch=main)](https://github.com/rsenne/ParallelMCMC.jl/actions/workflows/Test.yml?query=branch%3Amain)
[![Coverage](https://codecov.io/gh/rsenne/ParallelMCMC.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/rsenne/ParallelMCMC.jl)
[![Docs workflow Status](https://github.com/rsenne/ParallelMCMC.jl/actions/workflows/Docs.yml/badge.svg?branch=main)](https://github.com/rsenne/ParallelMCMC.jl/actions/workflows/Docs.yml?query=branch%3Amain)
[![DOI](https://zenodo.org/badge/DOI/FIXME)](https://doi.org/FIXME)
[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](CODE_OF_CONDUCT.md)
[![All Contributors](https://img.shields.io/github/all-contributors/rsenne/ParallelMCMC.jl?labelColor=5e1ec7&color=c0ffee&style=flat-square)](#contributors)
[![BestieTemplate](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/JuliaBesties/BestieTemplate.jl/main/docs/src/assets/badge.json)](https://github.com/JuliaBesties/BestieTemplate.jl)

<p align="center">
<img src="docs/src/assets/julia_deer_posterior.gif" alt="DEER trajectory estimates improving on a Julia-logo-shaped posterior" width="620">
</p>

<p align="center">
<em>DEER iterates on a synthetic Julia-logo-shaped posterior: orange trajectory estimates move toward the taped MALA path over repeated trajectory solves.</em>
</p>

## What this package does

**ParallelMCMC.jl** implements *parallel-across-the-sequence* MCMC in Julia: instead of generating samples one at a time, an entire trajectory of $T$ correlated steps is solved *simultaneously*. On multi-core CPUs and GPUs this lets wall-clock time grow sublinearly with chain length, whereas conventional sequential MCMC scales linearly.

The flagship algorithm is **DEER** (Lim et al. 2024; Gonzalez et al. 2024), which reformulates a chain of $T$ MALA steps as a fixed-point problem and solves it with Newton iterations. Each iteration linearizes the per-step transition around the current trajectory guess and resolves the resulting linear recursion in $O(\log T)$ parallel depth via an associative prefix scan. With shared input randomness, DEER converges to the exact sequential MALA trace up to a numerical tolerance — typically in tens of iterations even for chains of tens of thousands of samples.

The approach and its scaling tricks (stochastic Hutchinson Jacobian estimators, damping, sliding windows) are described in:

> Zoltowski, D. M., Wu, S., Gonzalez, X., Kozachkov, L., & Linderman, S. W. (2025).
> **Parallelizing MCMC Across the Sequence Length.** *NeurIPS 2025.*
> [arXiv:2508.18413](https://arxiv.org/abs/2508.18413)

### Samplers

| Sampler | Role |
|---|---|
| [`ParallelMALASampler`](src/interface.jl) | **Primary** — parallel-across-sequence MALA via DEER; $O(\log T)$ parallel depth per Newton iteration |
| [`MALASampler`](src/interface.jl) | Baseline — sequential MALA with a fixed step size |
| [`AdaptiveMALASampler`](src/interface.jl) | Baseline — sequential MALA with dual-averaging step-size adaptation |

All samplers implement the [AbstractMCMC](https://github.com/TuringLang/AbstractMCMC.jl) interface and return [`MCMCChains.Chains`](https://github.com/TuringLang/MCMCChains.jl) objects, so they slot into existing Turing.jl / AbstractMCMC workflows.


### Quick start

Install the package from GitHub with:

```julia-repl
pkg> add https://github.com/rsenne/ParallelMCMC.jl
```

```julia
using ParallelMCMC, MCMCChains

logp(x) = -0.5 * sum(abs2, x) # 2-D standard normal
grad_logp(x) = -x

model = DensityModel(logp, grad_logp, 2; param_names=[:x1, :x2])
sampler = ParallelMALASampler(0.1; T=64, jacobian=:stoch_diag)

chain = sample(model, sampler, 500; chain_type=MCMCChains.Chains)
```

See the [Getting Started guide](docs/src/10-getting-started.md) for worked examples, GPU usage, Turing.jl integration, and step-size tuning.

## How to Cite

If you use ParallelMCMC.jl in your work, please cite using the reference given in [CITATION.cff](https://github.com/rsenne/ParallelMCMC.jl/blob/main/CITATION.cff).

## Contributing

If you want to make contributions of any kind, please first take a look at our [contributing guide directly on GitHub](docs/src/90-contributing.md) or the [contributing page on the website](https://rsenne.github.io/ParallelMCMC.jl/dev/90-contributing/).
If you want to contribute, start with the [contributing guide on GitHub](docs/src/90-contributing.md) or the [documentation site](https://rsenne.github.io/ParallelMCMC.jl/dev/90-contributing/).

---

Expand Down
1 change: 1 addition & 0 deletions benchmarks/ParallelMCMCBenchmarks/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ ParallelMCMC = "1a970f40-4406-51c9-a967-cb3143c111e8"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"

[extras]
CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
Expand Down
41 changes: 41 additions & 0 deletions benchmarks/ParallelMCMCBenchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# ParallelMCMC Benchmarks

This package holds reproducible benchmark workloads for `ParallelMCMC.jl`.

## PR Regression Suite

The pull request workflow runs a small CPU-only suite against both the PR head
and the PR base commit, then compares median runtimes. It covers:

- one allocation-light MALA transition,
- the diagonal affine scan used by quasi-DEER,
- a full DEER block solve on a taped Gaussian MALA trajectory,
- the public `ParallelMALASampler` path on a small Bayesian logistic regression.

Run the same suite locally from the repository root with:

```bash
julia --project=benchmarks/ParallelMCMCBenchmarks \
benchmarks/ParallelMCMCBenchmarks/scripts/pr_benchmarks.jl \
--output pr-benchmarks.toml \
--markdown pr-benchmarks.md
```

To compare two result files:

```bash
julia benchmarks/ParallelMCMCBenchmarks/scripts/compare_pr_benchmarks.jl \
--base base-benchmarks.toml \
--head head-benchmarks.toml \
--markdown benchmark-comparison.md
```

CI marks a benchmark as `watch` above a 1.25x median-time ratio and fails above
1.75x. The thresholds can be adjusted with `PMCMC_BENCH_WARN_RATIO` and
`PMCMC_BENCH_FAIL_RATIO`.

## Manual GPU Sweeps

The other scripts in `scripts/` (everything except the PR regression suite
described above) are still intended for deeper manual throughput and GPU
investigations. They are not part of the PR gate because standard
GitHub-hosted runners do not provide CUDA hardware.
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ const BayesLogReg = ParallelMCMCBenchmarks.BayesLogReg

# Return the integer values configured through the `PMCMC_T_VALS` environment
# variable, read as a comma-separated list (e.g. "128, 256,512"). Whitespace
# around each entry is stripped before parsing. When the variable is unset or
# contains only whitespace, a built-in default list is returned instead.
# Presumably these are the trajectory lengths `T` swept by this benchmark —
# TODO confirm against the caller.
function _parse_t_vals()
# A missing variable is treated the same as an empty one.
raw = get(ENV, "PMCMC_T_VALS", "")
# NOTE(review): the next two guards test the same condition, so only the first
# can ever return; the second default list is unreachable as written. This looks
# like the before/after pair of a diff hunk — confirm which default is intended
# and drop the other line.
isempty(strip(raw)) && return [128, 256, 512, 1024, 2048]
isempty(strip(raw)) && return [512, 1024, 2048, 4096, 8192]
# `parse` throws on any entry that is not a valid integer, including the empty
# entry produced by a trailing or doubled comma.
return parse.(Int, strip.(split(raw, ",")))
end

Expand Down Expand Up @@ -109,8 +109,9 @@ if _cuda_ok
y_gpu = CUDA.CuVector(y_f32)

logp_gpu, gradlogp_gpu, hvp_gpu = BayesLogReg.make_problem_with_hvp(X_gpu, y_gpu)
logp_gpu_batch, gradlogp_gpu_batch, hvp_gpu_batch =
BayesLogReg.make_problem_batched_with_hvp(X_gpu, y_gpu)
logp_gpu_batch, gradlogp_gpu_batch, hvp_gpu_batch = BayesLogReg.make_problem_batched_with_hvp(
X_gpu, y_gpu
)

model_gpu = DensityModel(
logp_gpu,
Expand Down
23 changes: 11 additions & 12 deletions benchmarks/ParallelMCMCBenchmarks/scripts/bench_mala_bayes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@ using AbstractMCMC: sample
using ParallelMCMC
using ParallelMCMCBenchmarks
const BayesLinReg = ParallelMCMCBenchmarks.BayesLinReg
const MALARunner = ParallelMCMCBenchmarks.MALARunner
const MALARunner = ParallelMCMCBenchmarks.MALARunner

# Problem setup

rng = MersenneTwister(20251231)
n, p = 200, 16
X = randn(rng, n, p)
X = randn(rng, n, p)
β_true = randn(rng, p)
σ = 1.0
y = X * β_true .+ σ .* randn(rng, n)
σ = 1.0
y = X * β_true .+ σ .* randn(rng, n)

logpost, gradlogpost, μ_post, _ = BayesLinReg.make_problem(X, y; σ=σ, τ=10.0)
model = DensityModel(logpost, gradlogpost, p)
Expand All @@ -46,12 +46,7 @@ x_warm, ϵ_tuned = MALARunner.tune_stepsize_mala(
mala_sampler = AdaptiveMALASampler(ϵ_tuned; n_warmup=500)

deer_sampler = ParallelMALASampler(
ϵ_tuned;
T=64,
maxiter=200,
tol_abs=1e-6,
tol_rel=1e-5,
damping=0.5,
ϵ_tuned; T=64, maxiter=200, tol_abs=1e-6, tol_rel=1e-5, damping=0.5
)

# Benchmark helper
Expand Down Expand Up @@ -79,7 +74,9 @@ println("AdaptiveMALASampler (n_warmup=500, Float64)")
println("Model: Bayesian linear regression n=$n p=$p")
println("=" ^ 60, "\n")
for (n_samples, reps, label) in configs
results[("MALA", label)] = run_bench(model, mala_sampler, n_samples; reps, label, sampler_name="MALA")
results[("MALA", label)] = run_bench(
model, mala_sampler, n_samples; reps, label, sampler_name="MALA"
)
end

# ParallelMALA (DEER)
Expand All @@ -89,7 +86,9 @@ println("ParallelMALASampler (T=64, AutoEnzyme, Float64)")
println("Model: Bayesian linear regression n=$n p=$p")
println("=" ^ 60, "\n")
for (n_samples, reps, label) in configs
results[("DEER", label)] = run_bench(model, deer_sampler, n_samples; reps, label, sampler_name="DEER")
results[("DEER", label)] = run_bench(
model, deer_sampler, n_samples; reps, label, sampler_name="DEER"
)
end

# Summary table
Expand Down
Loading
Loading