TuringLang · yebai · May 4, 2026 · Apr 30, 2026 · May 4, 2026 · May 4, 2026
diff --git a/.github/workflows/Benchmarking.yml b/.github/workflows/Benchmarking.yml
@@ -4,36 +4,11 @@ on:
   pull_request:
 
 jobs:
-  benchmark-base:
-    runs-on: ubuntu-latest
-    outputs:
-      results: ${{ steps.benchmark.outputs.results }}
-      sha: ${{ steps.benchmark.outputs.sha }}
-    steps:
-      - uses: actions/checkout@v6
-        with:
-          ref: ${{ github.base_ref }}
-      - uses: julia-actions/setup-julia@v3
-        with:
-          version: '1.11'
-      - uses: julia-actions/cache@v3
-
-      - name: Run benchmarks
-        id: benchmark
-        working-directory: ./benchmarks
-        run: |
-          # github output can't handle more than 1 line, hence the tail
-          julia --project=. -e 'using Pkg; Pkg.instantiate()'
-          results=$(julia --project=. benchmarks.jl json | tail -n 1 || true)
-          echo $results
-          echo "results=$results" >> "$GITHUB_OUTPUT"
-          echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
-
-  benchmark-head:
-    runs-on: ubuntu-latest
-    outputs:
-      results: ${{ steps.benchmark.outputs.results }}
-      sha: ${{ steps.benchmark.outputs.sha }}
+  benchmark:
+    # Pinned (rather than `ubuntu-latest`) so that successive runs use the same
+    # runner image. GitHub periodically repoints `latest` to a newer image,
+    # which shifts timings and makes them hard to compare across PRs.
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v6
         with:
@@ -44,64 +19,34 @@ jobs:
       - uses: julia-actions/cache@v3
 
       - name: Run benchmarks
-        id: benchmark
         working-directory: ./benchmarks
         run: |
-          # github output can't handle more than 1 line, hence the tail
           julia --project=. -e 'using Pkg; Pkg.instantiate()'
-          results=$(julia --project=. benchmarks.jl json | tail -n 1 || true)
-          echo $results
-          echo "results=$results" >> "$GITHUB_OUTPUT"
-          echo "sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
-
-  combine-results:
-    runs-on: ubuntu-latest
-    needs: [benchmark-base, benchmark-head]
-    steps:
-      - uses: actions/checkout@v6
-        with:
-          ref: ${{ github.event.pull_request.head.sha }}
-      - uses: julia-actions/setup-julia@v3
-        with:
-          version: '1.11'
-      - uses: julia-actions/cache@v3
 
-      - name: Combine benchmark results
-        working-directory: ./benchmarks
-        run: |
           version_info=$(julia -e 'using InteractiveUtils; versioninfo()')
-          echo "$version_info"
-          echo "VERSION_INFO<<EOF" >> $GITHUB_ENV
-          echo "$version_info" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
 
-          # save outputs of previous jobs to json file
-          echo "Base results"
-          echo "--------------------------------------------------------"
-          echo '${{needs.benchmark-base.outputs.results}}'
-          echo '${{needs.benchmark-base.outputs.results}}' > base.json
-          echo "Head results"
-          echo "--------------------------------------------------------"
-          echo '${{needs.benchmark-head.outputs.results}}'
-          echo '${{needs.benchmark-head.outputs.results}}' > head.json
-
-          # combine them and save the output as an env var for later steps
-          julia --project=. -e 'using Pkg; Pkg.instantiate()'
-          results=$(julia --project=. benchmarks.jl combine head.json base.json)
-          echo "Combined results"
-          echo "--------------------------------------------------------"
-          echo "$results"
-
-          echo "BENCHMARK_OUTPUT<<EOF" >> $GITHUB_ENV
-          echo "$results" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
+          # Capture the markdown report; `tee` mirrors it into the workflow log
+          # so a failed comment post does not lose the numbers. `pipefail` makes
+          # a julia failure fail the step instead of being masked by `tee`.
+          results_file=$(mktemp)
+          set -o pipefail; julia --project=. benchmarks.jl markdown | tee "$results_file"
+
+          {
+            echo "VERSION_INFO<<DPPL_BENCH_EOF"
+            echo "$version_info"
+            echo "DPPL_BENCH_EOF"
+            echo "BENCHMARK_OUTPUT<<DPPL_BENCH_EOF"
+            cat "$results_file"
+            echo "DPPL_BENCH_EOF"
+          } >> "$GITHUB_ENV"
 
       - name: Find existing benchmark comment
         uses: peter-evans/find-comment@v4
         id: find_comment
         with:
           issue-number: ${{ github.event.pull_request.number }}
           comment-author: github-actions[bot]
+          body-includes: Benchmark Report
 
       - name: Create or update benchmark comment
         uses: peter-evans/create-or-update-comment@v5
@@ -110,8 +55,11 @@ jobs:
           body: |
             ## Benchmark Report
 
-            - this PR's head: `${{ needs.benchmark-head.outputs.sha }}`
-            - base branch: `${{ needs.benchmark-base.outputs.sha }}`
+            - this PR's head: `${{ github.event.pull_request.head.sha }}`
+
+            Absolute log-density times and grad/log-density ratios are
+            reported. To judge whether a PR helps or hurts, compare against
+            the latest comment on a recent main-branch PR run.
 
             ### Computer Information
             ```

diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml
@@ -1,24 +1,20 @@
-name = "DynamicPPLBenchmarks"
-uuid = "d94a1522-c11e-44a7-981a-42bf5dc1a001"
-version = "0.1.0"
-
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
 Chairmarks = "0ca39b1e-fe0b-4e98-acfc-b1656634c4de"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
-JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LogDensityProblems = "6fdf6af0-433a-55f7-b3ed-c6c6e0b8df7c"
 Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 
 [sources]
-DynamicPPL = {path = "../"}
+DynamicPPL = {path = ".."}
 
 [compat]
 ADTypes = "1.14.0"
@@ -27,7 +23,6 @@ Distributions = "0.25.117"
 DynamicPPL = "0.41"
 Enzyme = "0.13"
 ForwardDiff = "1"
-JSON = "1.3.0"
 LogDensityProblems = "2.1.2"
 Mooncake = "0.4, 0.5"
 PrettyTables = "3"

diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -1,5 +1,10 @@
-To run the benchmarks locally, run this from the root directory of the repository:
+Run from the repository root:
 
 ```sh
+julia --project=benchmarks -e 'using Pkg; Pkg.instantiate()'
 julia --project=benchmarks benchmarks/benchmarks.jl
 ```
+
+The `Benchmarking` CI workflow runs this on each PR and posts the table as a
+comment. There is no base-vs-head comparison: to judge regressions, compare
+against the benchmark comment on a recent PR that targets the main branch.