rsenne · rsenne · May 2, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 29, 2026
diff --git a/.github/workflows/Benchmarks.yml b/.github/workflows/Benchmarks.yml
@@ -0,0 +1,130 @@
+name: Benchmarks
+
+# Times the benchmark suite on a pull request's base and head commits and
+# compares the two result files.
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - ".github/workflows/Benchmarks.yml"
+      - "Project.toml"
+      - "src/**"
+      - "ext/**"
+      - "benchmarks/**"
+    types: [opened, synchronize, reopened, ready_for_review]
+
+# Runs are grouped per workflow and ref; on pull-request refs a newer run
+# cancels the one still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+
+# Restrict the workflow token to read-only access to repository contents.
+permissions:
+  contents: read
+
+jobs:
+  benchmark:
+    name: PR benchmark comparison
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+
+    env:
+      # Forwarded to pr_benchmarks.jl as --seconds / --samples.
+      PMCMC_BENCH_SECONDS: "0.5"
+      PMCMC_BENCH_SAMPLES: "8"
+      # Forwarded to compare_pr_benchmarks.jl as --warn-ratio / --fail-ratio.
+      PMCMC_BENCH_WARN_RATIO: "1.25"
+      PMCMC_BENCH_FAIL_RATIO: "1.75"
+      JULIA_NUM_PRECOMPILE_TASKS: "1"
+
+    steps:
+      # Later steps run code from the pull request, so neither checkout keeps
+      # the workflow token in its local git config.
+      - name: Checkout base
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ github.event.pull_request.base.repo.full_name }}
+          ref: ${{ github.event.pull_request.base.sha }}
+          path: base
+          persist-credentials: false
+
+      - name: Checkout PR head
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          ref: ${{ github.event.pull_request.head.sha }}
+          path: head
+          persist-credentials: false
+
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: "1"
+          arch: x64
+
+      - name: Use Julia cache
+        uses: julia-actions/cache@v3
+
+      # Both runs use the benchmark environment and scripts from the PR head;
+      # only the developed package path switches between base and head.
+      - name: Instantiate base package in benchmark environment
+        working-directory: head/benchmarks/ParallelMCMCBenchmarks
+        run: |
+          julia --project=. -e 'using Pkg; Pkg.develop(PackageSpec(path=joinpath(ENV["GITHUB_WORKSPACE"], "base"))); Pkg.instantiate()'
+
+      - name: Run base benchmarks
+        run: |
+          julia --project="$GITHUB_WORKSPACE/head/benchmarks/ParallelMCMCBenchmarks" \
+            "$GITHUB_WORKSPACE/head/benchmarks/ParallelMCMCBenchmarks/scripts/pr_benchmarks.jl" \
+            --seconds "$PMCMC_BENCH_SECONDS" \
+            --samples "$PMCMC_BENCH_SAMPLES" \
+            --output "$GITHUB_WORKSPACE/base-benchmarks.toml" \
+            --markdown "$GITHUB_WORKSPACE/base-benchmarks.md"
+
+      - name: Instantiate PR package in benchmark environment
+        working-directory: head/benchmarks/ParallelMCMCBenchmarks
+        run: |
+          julia --project=. -e 'using Pkg; Pkg.develop(PackageSpec(path=joinpath(ENV["GITHUB_WORKSPACE"], "head"))); Pkg.instantiate()'
+
+      - name: Run PR benchmarks
+        run: |
+          julia --project="$GITHUB_WORKSPACE/head/benchmarks/ParallelMCMCBenchmarks" \
+            "$GITHUB_WORKSPACE/head/benchmarks/ParallelMCMCBenchmarks/scripts/pr_benchmarks.jl" \
+            --seconds "$PMCMC_BENCH_SECONDS" \
+            --samples "$PMCMC_BENCH_SAMPLES" \
+            --output "$GITHUB_WORKSPACE/head-benchmarks.toml" \
+            --markdown "$GITHUB_WORKSPACE/head-benchmarks.md"
+
+      # `set +e` keeps the step alive after a failing comparison so the report
+      # still reaches the job summary; the script's status is re-raised at the end.
+      - name: Compare benchmarks
+        run: |
+          report="$GITHUB_WORKSPACE/benchmark-comparison.md"
+          set +e
+          julia "$GITHUB_WORKSPACE/head/benchmarks/ParallelMCMCBenchmarks/scripts/compare_pr_benchmarks.jl" \
+            --base "$GITHUB_WORKSPACE/base-benchmarks.toml" \
+            --head "$GITHUB_WORKSPACE/head-benchmarks.toml" \
+            --warn-ratio "$PMCMC_BENCH_WARN_RATIO" \
+            --fail-ratio "$PMCMC_BENCH_FAIL_RATIO" \
+            --markdown "$report"
+          status=$?
+          if [ -f "$report" ]; then
+            cat "$report" >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "Benchmark comparison produced no report (exit status $status)." >> "$GITHUB_STEP_SUMMARY"
+          fi
+          exit "$status"
+
+      # Runs even when an earlier step failed so partial results stay available.
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: pr-benchmark-results
+          path: |
+            base-benchmarks.toml
+            base-benchmarks.md
+            head-benchmarks.toml
+            head-benchmarks.md
+            benchmark-comparison.md
diff --git a/.gitignore b/.gitignore
@@ -6,8 +6,23 @@
 .benchmarkci
 Manifest.toml
 benchmark/*.json
+benchmarks/ParallelMCMCBenchmarks/*-benchmarks.toml
+benchmarks/ParallelMCMCBenchmarks/*-benchmarks.md
+benchmarks/ParallelMCMCBenchmarks/benchmark-comparison.md
+# benchmark results written to the repository root when the suite is run from there
+/*-benchmarks.toml
+/*-benchmarks.md
+/benchmark-comparison.md
 coverage
 docs/build/
 env
 node_modules
 LocalPreferences.toml
+
+# ignore any AI files a contributor may have
+.codex
+.claude
+CLAUDE.md
+AGENTS.md
+CODEX.md
+.gemini
diff --git a/Project.toml b/Project.toml
@@ -10,12 +10,12 @@ CUDA = "5.11.0"
 DifferentiationInterface = "0.7.13"
 DynamicPPL = "0.40"
 Enzyme = "0.13.131"
-LinearAlgebra = "1.12.0"
+LinearAlgebra = "1"
 LogDensityProblems = "2"
 LogDensityProblemsAD = "1"
 MCMCChains = "7.7.0"
-Random = "1.11.0"
-Statistics = "1.11.1"
+Random = "1"
+Statistics = "1"
 julia = "1.10"
 
 [deps]

diff --git a/README.md b/README.md
@@ -1,22 +1,78 @@
 # ParallelMCMC
 
+<p align="center">
+  <img src="docs/src/assets/logo.png" alt="ParallelMCMC logo" width="220">
+</p>
+
 [![Stable Documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://rsenne.github.io/ParallelMCMC.jl/stable)
 [![Development documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://rsenne.github.io/ParallelMCMC.jl/dev)
 [![Test workflow status](https://github.com/rsenne/ParallelMCMC.jl/actions/workflows/Test.yml/badge.svg?branch=main)](https://github.com/rsenne/ParallelMCMC.jl/actions/workflows/Test.yml?query=branch%3Amain)
 [![Coverage](https://codecov.io/gh/rsenne/ParallelMCMC.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/rsenne/ParallelMCMC.jl)
 [![Docs workflow Status](https://github.com/rsenne/ParallelMCMC.jl/actions/workflows/Docs.yml/badge.svg?branch=main)](https://github.com/rsenne/ParallelMCMC.jl/actions/workflows/Docs.yml?query=branch%3Amain)
-[![DOI](https://zenodo.org/badge/DOI/FIXME)](https://doi.org/FIXME)
 [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](CODE_OF_CONDUCT.md)
 [![All Contributors](https://img.shields.io/github/all-contributors/rsenne/ParallelMCMC.jl?labelColor=5e1ec7&color=c0ffee&style=flat-square)](#contributors)
 [![BestieTemplate](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/JuliaBesties/BestieTemplate.jl/main/docs/src/assets/badge.json)](https://github.com/JuliaBesties/BestieTemplate.jl)
 
+<p align="center">
+  <img src="docs/src/assets/julia_deer_posterior.gif" alt="DEER trajectory estimates improving on a Julia-logo-shaped posterior" width="620">
+</p>
+
+<p align="center">
+  <em>DEER iterates on a synthetic Julia-logo-shaped posterior: orange trajectory estimates move toward the taped MALA path over repeated trajectory solves.</em>
+</p>
+
+## What this package does
+
+**ParallelMCMC.jl** implements *parallel-across-the-sequence* MCMC in Julia: instead of generating samples one at a time, an entire trajectory of $T$ correlated steps is solved *simultaneously*. This makes total wall-clock time sublinear in chain length on multi-core CPUs and GPUs, whereas conventional sequential MCMC scales linearly.
+
+The flagship algorithm is **DEER** (Lim et al. 2024; Gonzalez et al. 2024), which reformulates a chain of $T$ MALA steps as a fixed-point problem and solves it with Newton iterations. Each iteration linearizes the per-step transition around the current trajectory guess and resolves the resulting linear recursion in $O(\log T)$ parallel depth via an associative prefix scan. With shared input randomness, DEER converges to the exact sequential MALA trace up to a numerical tolerance — typically in tens of iterations even for chains of tens of thousands of samples.
+
+The approach and its scaling tricks (stochastic Hutchinson Jacobian estimators, damping, sliding windows) are described in:
+
+> Zoltowski, D. M., Wu, S., Gonzalez, X., Kozachkov, L., & Linderman, S. W. (2025).
+> **Parallelizing MCMC Across the Sequence Length.** *NeurIPS 2025.*
+> [arXiv:2508.18413](https://arxiv.org/abs/2508.18413)
+
+### Samplers
+
+| Sampler | Role |
+|---|---|
+| [`ParallelMALASampler`](src/interface.jl) | **Primary** — parallel-across-sequence MALA via DEER; $O(\log T)$ parallel depth per Newton iteration |
+| [`MALASampler`](src/interface.jl) | Baseline — sequential MALA with a fixed step size |
+| [`AdaptiveMALASampler`](src/interface.jl) | Baseline — sequential MALA with dual-averaging step-size adaptation |
+
+All samplers implement the [AbstractMCMC](https://github.com/TuringLang/AbstractMCMC.jl) interface and return [`MCMCChains.Chains`](https://github.com/TuringLang/MCMCChains.jl) objects, so they slot into existing Turing.jl / AbstractMCMC workflows.
+
+
+### Quick start
+
+Install the package from GitHub with:
+
+```julia-repl
+pkg> add https://github.com/rsenne/ParallelMCMC.jl
+```
+
+```julia
+using ParallelMCMC, MCMCChains
+
+logp(x)      = -0.5 * sum(abs2, x)            # 2-D standard normal
+grad_logp(x) = -x
+
+model   = DensityModel(logp, grad_logp, 2; param_names=[:x1, :x2])
+sampler = ParallelMALASampler(0.1; T=64, jacobian=:stoch_diag)
+
+chain = sample(model, sampler, 500; chain_type=MCMCChains.Chains)
+```
+
+See the [Getting Started guide](docs/src/10-getting-started.md) for worked examples, GPU usage, Turing.jl integration, and step-size tuning.
+
 ## How to Cite
 
 If you use ParallelMCMC.jl in your work, please cite using the reference given in [CITATION.cff](https://github.com/rsenne/ParallelMCMC.jl/blob/main/CITATION.cff).
 
 ## Contributing
 
-If you want to make contributions of any kind, please first that a look into our [contributing guide directly on GitHub](docs/src/90-contributing.md) or the [contributing page on the website](https://rsenne.github.io/ParallelMCMC.jl/dev/90-contributing/)
+If you want to contribute, start with the [contributing guide on GitHub](docs/src/90-contributing.md) or the [documentation site](https://rsenne.github.io/ParallelMCMC.jl/dev/90-contributing/).
 
 ---
 

diff --git a/benchmarks/ParallelMCMCBenchmarks/Project.toml b/benchmarks/ParallelMCMCBenchmarks/Project.toml
@@ -12,6 +12,7 @@ ParallelMCMC = "1a970f40-4406-51c9-a967-cb3143c111e8"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
 
 [extras]
 CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"

diff --git a/benchmarks/ParallelMCMCBenchmarks/README.md b/benchmarks/ParallelMCMCBenchmarks/README.md
@@ -0,0 +1,41 @@
+# ParallelMCMC Benchmarks
+
+This package holds reproducible benchmark workloads for `ParallelMCMC.jl`.
+
+## PR Regression Suite
+
+The pull request workflow runs a small CPU-only suite against both the PR head
+and the PR base commit, then compares median runtimes. It covers:
+
+- one allocation-light MALA transition,
+- the diagonal affine scan used by quasi-DEER,
+- a full DEER block solve on a taped Gaussian MALA trajectory,
+- the public `ParallelMALASampler` path on a small Bayesian logistic regression.
+
+Run the same suite locally from the repository root with:
+
+```bash
+julia --project=benchmarks/ParallelMCMCBenchmarks \
+  benchmarks/ParallelMCMCBenchmarks/scripts/pr_benchmarks.jl \
+  --output pr-benchmarks.toml \
+  --markdown pr-benchmarks.md
+```
+
+To compare two result files:
+
+```bash
+julia benchmarks/ParallelMCMCBenchmarks/scripts/compare_pr_benchmarks.jl \
+  --base base-benchmarks.toml \
+  --head head-benchmarks.toml \
+  --markdown benchmark-comparison.md
+```
+
+CI marks a benchmark as `watch` above a 1.25x median-time ratio and fails above
+1.75x. The workflow sets these thresholds through `PMCMC_BENCH_WARN_RATIO` and
+`PMCMC_BENCH_FAIL_RATIO`, passed to the script as `--warn-ratio`/`--fail-ratio`.
+
+## Manual GPU Sweeps
+
+The other scripts in `scripts/` (for example `bench_deer_logreg.jl`) are still
+intended for deeper manual throughput and GPU investigations. They are not part
+of the PR gate because standard GitHub-hosted runners do not provide CUDA hardware.
diff --git a/benchmarks/ParallelMCMCBenchmarks/scripts/bench_deer_logreg.jl b/benchmarks/ParallelMCMCBenchmarks/scripts/bench_deer_logreg.jl
@@ -29,7 +29,7 @@ const BayesLogReg = ParallelMCMCBenchmarks.BayesLogReg
 
 function _parse_t_vals()
     raw = get(ENV, "PMCMC_T_VALS", "")
-    isempty(strip(raw)) && return [128, 256, 512, 1024, 2048]
+    isempty(strip(raw)) && return [512, 1024, 2048, 4096, 8192]
     return parse.(Int, strip.(split(raw, ",")))
 end
 
@@ -109,8 +109,9 @@ if _cuda_ok
     y_gpu = CUDA.CuVector(y_f32)
 
     logp_gpu, gradlogp_gpu, hvp_gpu = BayesLogReg.make_problem_with_hvp(X_gpu, y_gpu)
-    logp_gpu_batch, gradlogp_gpu_batch, hvp_gpu_batch =
-        BayesLogReg.make_problem_batched_with_hvp(X_gpu, y_gpu)
+    logp_gpu_batch, gradlogp_gpu_batch, hvp_gpu_batch = BayesLogReg.make_problem_batched_with_hvp(
+        X_gpu, y_gpu
+    )
 
     model_gpu = DensityModel(
         logp_gpu,

diff --git a/benchmarks/ParallelMCMCBenchmarks/scripts/bench_mala_bayes.jl b/benchmarks/ParallelMCMCBenchmarks/scripts/bench_mala_bayes.jl
@@ -19,16 +19,16 @@ using AbstractMCMC: sample
 using ParallelMCMC
 using ParallelMCMCBenchmarks
 const BayesLinReg = ParallelMCMCBenchmarks.BayesLinReg
-const MALARunner  = ParallelMCMCBenchmarks.MALARunner
+const MALARunner = ParallelMCMCBenchmarks.MALARunner
 
 # Problem setup
 
 rng = MersenneTwister(20251231)
 n, p = 200, 16
-X      = randn(rng, n, p)
+X = randn(rng, n, p)
 β_true = randn(rng, p)
-σ      = 1.0
-y      = X * β_true .+ σ .* randn(rng, n)
+σ = 1.0
+y = X * β_true .+ σ .* randn(rng, n)
 
 logpost, gradlogpost, μ_post, _ = BayesLinReg.make_problem(X, y; σ=σ, τ=10.0)
 model = DensityModel(logpost, gradlogpost, p)
@@ -46,12 +46,7 @@ x_warm, ϵ_tuned = MALARunner.tune_stepsize_mala(
 mala_sampler = AdaptiveMALASampler(ϵ_tuned; n_warmup=500)
 
 deer_sampler = ParallelMALASampler(
-    ϵ_tuned;
-    T=64,
-    maxiter=200,
-    tol_abs=1e-6,
-    tol_rel=1e-5,
-    damping=0.5,
+    ϵ_tuned; T=64, maxiter=200, tol_abs=1e-6, tol_rel=1e-5, damping=0.5
 )
 
 # Benchmark helper
@@ -79,7 +74,9 @@ println("AdaptiveMALASampler  (n_warmup=500, Float64)")
 println("Model: Bayesian linear regression  n=$n  p=$p")
 println("=" ^ 60, "\n")
 for (n_samples, reps, label) in configs
-    results[("MALA", label)] = run_bench(model, mala_sampler, n_samples; reps, label, sampler_name="MALA")
+    results[("MALA", label)] = run_bench(
+        model, mala_sampler, n_samples; reps, label, sampler_name="MALA"
+    )
 end
 
 # ParallelMALA (DEER)
@@ -89,7 +86,9 @@ println("ParallelMALASampler  (T=64, AutoEnzyme, Float64)")
 println("Model: Bayesian linear regression  n=$n  p=$p")
 println("=" ^ 60, "\n")
 for (n_samples, reps, label) in configs
-    results[("DEER", label)] = run_bench(model, deer_sampler, n_samples; reps, label, sampler_name="DEER")
+    results[("DEER", label)] = run_bench(
+        model, deer_sampler, n_samples; reps, label, sampler_name="DEER"
+    )
 end
 
 # Summary table