DataDog · jbachorik · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
@@ -31,6 +31,7 @@ stages:
   - integration-test
   - reliability
   - benchmarks
+  - post-benchmarks
   - fuzz
   - notify
 

@@ -2,13 +2,11 @@ variables:
   PREPARE_IMAGE: registry.ddbuild.io/images/benchmarking-platform-tools-ubuntu:latest
   DD_OCTO_STS_IMAGE: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1
 
-.benchmark_job:
-  extends: .deploy-sa
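+# Bridge job: triggers the BP (benchmarking-platform) pipeline and, via
+# `strategy: depend`, blocks until that pipeline completes.
+# Bridge jobs cannot appear in other jobs' needs: — downstream jobs rely on
+# stage ordering instead (post-benchmarks stage runs after benchmarks stage).
+# Those jobs must therefore not declare `needs:` at all: any `needs:` list,
+# even an empty one, makes a job ignore stage ordering.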
+benchmarks-trigger:
   stage: benchmarks
-  timeout: 6h
-  variables:
-    ITERATIONS: "${BENCHMARK_ITERATIONS:-1}"
-    MODES: "${BENCHMARK_MODES:-cpu,wall,alloc,memleak}"
   needs:
     - job: get-versions
       artifacts: true
@@ -24,64 +22,27 @@ variables:
     - if: '$CI_PIPELINE_SOURCE == "web"'
       when: manual
       allow_failure: true
-    # Run automatically and non-blocking on any other source (push/trigger/api/
-    # etc.) — mirrors the integration-test rules. The before_script CANCELLED
-    # gate skips branches with no open PR.
+    # Run automatically and non-blocking on any other source (push/trigger/api/etc.)
     - when: on_success
       allow_failure: true
-  script: |
-    # setup the env
-    export ARTIFACTS_DIR="$(pwd)/reports" && (mkdir "${ARTIFACTS_DIR}" || :)
-    export CANDIDATE_VERSION=${CURRENT_VERSION}
-    export BASELINE_VERSION=${PREVIOUS_VERSION}
-    export PLATFORM_DIR=".benchmarks/platform"
-
-    # check for missing candidate version
-    if [ -z "${CANDIDATE_VERSION}" ]; then echo "Missing candidate version. Skipping."; exit 0; fi
-
-    # fetch the common platform scripts
-    git -c url."https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/".insteadOf="https://github.com/DataDog/" \
-      clone --branch dd-trace-go https://github.com/DataDog/benchmarking-platform ${PLATFORM_DIR}
-
-    # apply the specific step scripts
-    cp -r .gitlab/benchmarks/steps/* ${PLATFORM_DIR}/steps/
-    chmod a+x ${PLATFORM_DIR}/steps/*
-
-    # check for mode validity
-    ${PLATFORM_DIR}/steps/check_modes.sh
-    if [ "$(cat .job_status)" == "SKIP" ]; then exit 0; fi
-
-    # run benchmarks
-    ${PLATFORM_DIR}/steps/capture-hardware-software-info.sh
-    ${PLATFORM_DIR}/steps/run-benchmarks.sh
-    ${PLATFORM_DIR}/steps/analyze-results.sh
-    ${PLATFORM_DIR}/steps/upload-results-to-s3.sh
-  parallel:
-    matrix:
-      - RUN_MODE: ["cpu", "wall", "alloc", "memleak", "cpu,wall", "memleak,alloc", "cpu,wall,alloc,memleak"]
-  artifacts:
-    when: always
-    name: "reports"
-    paths:
-      - reports/
-    expire_in: 3 months
-
-benchmarks-candidate-amd64:
-  extends: .benchmark_job
-  tags: ["arch:amd64"]
-  image: $BENCHMARK_IMAGE_AMD64
-
-benchmarks-candidate-aarch64:
-  extends: .benchmark_job
-  tags: ["arch:arm64"]
-  image: $BENCHMARK_IMAGE_ARM64
   variables:
-    KUBERNETES_MEMORY_REQUEST: 200Gi
-    KUBERNETES_MEMORY_LIMIT: 200Gi
+    CANDIDATE_VERSION: "${CURRENT_VERSION}"
+    BASELINE_VERSION: "${PREVIOUS_VERSION}"
+    BENCHMARK_ITERATIONS: "${BENCHMARK_ITERATIONS:-5}"  # NOTE(review): ':-' defaults are shell syntax; no shell runs in a trigger job
+    BENCHMARK_MODES: "${BENCHMARK_MODES:-cpu,wall,alloc,memleak}"  # NOTE(review): confirm the downstream pipeline really receives these defaults
+    DDPROF_COMMIT_SHA: "${CI_COMMIT_SHA}"
+    DDPROF_COMMIT_BRANCH: "${CI_COMMIT_BRANCH}"
+    UPSTREAM_PROJECT_NAME: "java-profiler"
+    UPSTREAM_BRANCH: "${CI_PIPELINE_ID}"  # NOTE(review): carries the pipeline ID, not a branch name — confirm intended
+    UPSTREAM_PIPELINE_ID: "${CI_PIPELINE_ID}"
+  trigger:
+    project: DataDog/apm-reliability/benchmarking-platform
+    branch: java-profiler
+    strategy: depend  # makes this job wait for the downstream pipeline instead of succeeding on creation
 
 post-benchmarks-pr-comment:
   extends: .retry-config
-  stage: benchmarks
+  stage: post-benchmarks
   tags: ["arch:arm64"]
   image: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1
   id_tokens:
@@ -90,47 +51,45 @@ post-benchmarks-pr-comment:
-  needs:
-    - job: prepare:start
-      artifacts: true
-    - job: benchmarks-candidate-amd64
-      artifacts: true
-    - job: benchmarks-candidate-aarch64
-      artifacts: true
+  # `dependencies:` (not `needs:`) fetches prepare:start's artifacts while still
+  # honouring stage ordering. With `needs:` this job would start as soon as
+  # prepare:start finished, i.e. before the benchmarks stage had run.
+  dependencies:
+    - prepare:start
   rules:
     - if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null'
       when: never
     - if: '$CI_PIPELINE_SOURCE == "schedule"'
       when: never
     - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
       when: never
-    # Always run when the candidate jobs ran, regardless of source, so results
-    # are posted back to the PR.
+    # Always run after the benchmarks stage so results are posted back to the PR.
     - when: always
-  timeout: 5m
+  timeout: 10m
   script:
+    - mkdir -p reports
+    - .gitlab/benchmarks/download-s3-reports.sh reports
     - .gitlab/benchmarks/post-pr-comment.sh reports
   allow_failure: true
 
 publish-benchmark-gh-pages:
-  stage: benchmarks
+  stage: post-benchmarks
   tags: ["arch:arm64"]
   image: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1
   id_tokens:
     DDOCTOSTS_ID_TOKEN:
       aud: dd-octo-sts
-  needs:
-    - job: benchmarks-candidate-amd64
-      artifacts: true
-    - job: benchmarks-candidate-aarch64
-      artifacts: true
+  # No upstream artifacts are needed (the script fetches the reports itself),
+  # but this must be `dependencies: []`, not `needs: []`: an empty `needs:`
+  # starts the job as soon as the pipeline is created, i.e. before the
+  # benchmarks stage has run, so there would be nothing to download.
+  dependencies: []
+  # Serialize concurrent GH Pages pushes. publish-gh-pages.sh uses
+  # 'git push --force'; two concurrent pushes race and the slower one
+  # silently discards the faster one's history update.
+  resource_group: gh-pages-publish
   rules:
     - if: '$CI_PIPELINE_SOURCE == "schedule"'
       when: never
     - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH || $CI_COMMIT_BRANCH == "main"'
       when: always
-  timeout: 10m
+  timeout: 15m
   script:
-    - ./.gitlab/benchmarks/publish-gh-pages.sh
+    - mkdir -p reports
+    - .gitlab/benchmarks/download-s3-reports.sh reports
+    - ./.gitlab/benchmarks/publish-gh-pages.sh reports
   allow_failure: true
 
 include:
   - local: .gitlab/common.yml
-  - local: .gitlab/benchmarks/images.yml
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+# Downloads result JSONs from the BP downstream pipeline via the GitLab CI API.
+#
+# Requires only curl and python3 (stdlib) — no aws CLI, pip, or boto3 needed.
+# BP jobs already store artifacts in GitLab; this fetches them directly from
+# the downstream pipeline triggered by benchmarks-trigger.
+set -uo pipefail   # intentionally no -e: we handle errors explicitly
+
+DEST="${1:-reports}"
+mkdir -p "${DEST}"
+
+TMPDIR_LOCAL=$(mktemp -d)
+trap 'rm -rf "${TMPDIR_LOCAL}"' EXIT
+
+# ── helper: curl with explicit HTTP status checking ──────────────────────────
+# Usage: api_get <url> <output_file>
+# Writes the response body to <output_file>. Returns 0 on 2xx; otherwise prints
+# the HTTP status plus the API's error text (when the body has one) and returns 1.
+api_get() {
+  local url="$1" out="$2"
+  local http_code detail
+  http_code=$(curl -s -o "${out}" -w "%{http_code}" \
+    --header "JOB-TOKEN: ${CI_JOB_TOKEN}" "${url}")
+  if [[ "${http_code}" != 2* ]]; then
+    # curl runs with -s, so this line is the only diagnostic the caller gets.
+    # Pull "message" (or "error") out of a JSON object body; anything else —
+    # empty file, non-JSON, JSON list — falls through to the fixed text.
+    detail=$(python3 -c 'import json,sys; d=json.load(open(sys.argv[1])); print(d.get("message") or d.get("error") or "?")' "${out}" 2>/dev/null) \
+      || detail="no JSON error body"
+    echo "  API ${url##*/}: HTTP ${http_code} — ${detail}"
+    return 1
+  fi
+  return 0
+}
+
+# ── 1. find the benchmarks-trigger bridge ────────────────────────────────────
+BRIDGES_FILE="${TMPDIR_LOCAL}/bridges.json"
+echo "Querying bridges for pipeline ${CI_PIPELINE_ID}…"
+if ! api_get \
+  "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/bridges" \
+  "${BRIDGES_FILE}"; then
+  echo "Cannot read pipeline bridges (job token may lack Reporter access) — skipping download"
+  exit 0
+fi
+
+read -r BP_PROJECT_ID DOWNSTREAM_PIPELINE_ID < <(python3 - "${BRIDGES_FILE}" <<'PYEOF'
+import json, sys
+with open(sys.argv[1]) as f:
+    bridges = json.load(f)
+for b in bridges:
+    if b.get("name") == "benchmarks-trigger":
+        dp = b.get("downstream_pipeline") or {}
+        if dp.get("id") and dp.get("project_id"):
+            print(dp["project_id"], dp["id"])
+            sys.exit(0)
+print("", "")
+PYEOF
+)
+
+if [ -z "${DOWNSTREAM_PIPELINE_ID:-}" ]; then
+  echo "benchmarks-trigger bridge not found or did not run — skipping download"
+  exit 0
+fi
+echo "BP downstream pipeline: project=${BP_PROJECT_ID}  pipeline=${DOWNSTREAM_PIPELINE_ID}"
+
+# ── 2. list jobs in the downstream pipeline ──────────────────────────────────
+JOBS_FILE="${TMPDIR_LOCAL}/jobs.json"
+echo "Listing BP pipeline jobs…"
+if ! api_get \
+  "${CI_API_V4_URL}/projects/${BP_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?per_page=100" \
+  "${JOBS_FILE}"; then
+  echo "Cannot list BP pipeline jobs — skipping download"
+  exit 0
+fi
+
+# Space-separated IDs of every job in the listing. The file path is passed as
+# an argument (as in step 1) rather than spliced into the Python source, so a
+# quote or backslash in the temp path cannot break or alter the program.
+JOB_IDS=$(python3 -c '
+import json, sys
+with open(sys.argv[1]) as f:
+    print(" ".join(str(j["id"]) for j in json.load(f)))
+' "${JOBS_FILE}")
+
+# ── 3. download result_*.json from each job's artifact zip ───────────────────
+# DOWNLOADED counts jobs whose archive yielded at least one result_*.json.
+DOWNLOADED=0
+for JOB_ID in ${JOB_IDS}; do
+  ART_ZIP="${TMPDIR_LOCAL}/art_${JOB_ID}.zip"
+  # --location: the artifacts endpoint can answer with a redirect to the
+  # storage backend; without it the redirect status is treated as a failure.
+  ART_STATUS=$(curl -s --location -o "${ART_ZIP}" -w "%{http_code}" \
+    --header "JOB-TOKEN: ${CI_JOB_TOKEN}" \
+    "${CI_API_V4_URL}/projects/${BP_PROJECT_ID}/jobs/${JOB_ID}/artifacts" 2>/dev/null)
+  if [[ "${ART_STATUS}" == 2* ]]; then
+    # -q: quiet, -o: overwrite without prompting (a duplicate file name must
+    # not stall or skip extraction), -j: junk paths (strip artifacts/ prefix)
+    if unzip -q -o -j "${ART_ZIP}" "artifacts/result_*.json" -d "${DEST}/" 2>/dev/null; then
+      DOWNLOADED=$((DOWNLOADED + 1))
+    fi
+  fi
+done
+
+RESULT_COUNT=$(find "${DEST}" -name "result_*.json" | wc -l)
+echo "result_*.json files: ${RESULT_COUNT} (from ${DOWNLOADED} BP job(s))"
+
+if [ "${RESULT_COUNT}" -eq 0 ]; then
+  echo "WARNING: no result JSONs found — BP jobs may not have run or produced artifacts yet"
+  exit 1
+fi
@@ -1,8 +1,9 @@
 #!/usr/bin/env bash
-# Post aggregated benchmark comparison results as a single PR comment.
+# Post aggregated benchmark results as a single PR comment.
 #
-# Expects all per-cell comparison-baseline-vs-candidate_*.md reports to be
-# present under REPORTS_DIR (default: reports/).
+# Handles two report types found under REPORTS_DIR (default: reports/):
+#   - comparison-baseline-vs-candidate_*.md  (perf comparison benchmarks)
+#   - result_${BENCHMARK}_${JDK}_${LIBRARY}.json  (reliability benchmarks)
 #
 # Required env:
 #   DDPROF_COMMIT_BRANCH  – branch name used to locate the open PR
@@ -14,7 +15,7 @@ set -euo pipefail
 REPORTS_DIR="${1:-reports}"
 HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
-# Aggregate all per-cell reports into a single comment body
+# --- Perf comparison reports (markdown, one per benchmark cell) ---
 SECTIONS=""
 for md in "${REPORTS_DIR}"/comparison-baseline-vs-candidate_*.md; do
   [ -f "${md}" ] || continue
@@ -28,6 +29,80 @@ $(cat "${md}")
 "
 done
 
+# --- Reliability reports (JSON, one per benchmark×JDK×library) ---
+RELIABILITY_TABLE=$(python3 - "${REPORTS_DIR}" <<'PYEOF'
+import json, sys, glob, collections
+
+reports_dir = sys.argv[1]
+results = collections.defaultdict(dict)
+
+# Group every result_*.json by (benchmark, jdk), keyed by library within a group.
+for path in sorted(glob.glob(f"{reports_dir}/result_*.json")):
+    try:
+        with open(path) as f:
+            r = json.load(f)
+        key = (r["benchmark"], r["jdk"])
+        results[key][r["library"]] = r
+    except Exception as exc:
+        # Best effort: drop this report but say so. stderr goes to the job log;
+        # stdout is captured as the comment body and must stay clean.
+        print(f"skipping unreadable report {path}: {exc}", file=sys.stderr)
+        continue
+
+if not results:
+    sys.exit(0)
+
+def fmt_avg(r):
+    icon = "💥" if r.get("crashed") else "✅"
+    avg = r.get("avg_ms", 0)
+    cnt = r.get("run_count", 0)
+    return f"{icon} {avg} ms ({cnt} iters)"
+
+def fmt_delta(latest, dev):
+    """Relative change of dev's avg_ms versus latest's, as a signed percentage.
+
+    Returns "—" when either average is missing or zero. Changes beyond ±2% get
+    a coloured marker; the sign is produced by the '+' format flag alone, so
+    the marker must not carry its own "+" (that rendered as "++5.0%").
+    """
+    la, da = latest.get("avg_ms", 0), dev.get("avg_ms", 0)
+    if not la or not da:
+        return "—"
+    pct = (da - la) / la * 100
+    # positive = dev is slower (potential regression)
+    arrow = "🔴 " if pct > 2 else ("🟢 " if pct < -2 else "")
+    return f"{arrow}{pct:+.1f}%"
+
+def fmt_uploads(r):
+    return str(r.get("upload_count", 0))
+
+def fmt_issues(r):
+    e = r.get("error_count", 0)
+    w = r.get("warn_count", 0)
+    if e == 0 and w == 0:
+        return "—"
+    parts = []
+    if e: parts.append(f"E:{e}")
+    if w: parts.append(f"W:{w}")
+    return "⚠️ " + " ".join(parts)
+
+lines = [
+    "### Reliability Benchmarks",
+    "",
+    "| Benchmark | JDK | Latest | Dev | Δ% (dev vs latest) | Uploads L/D | Issues L/D |",
+    "|-----------|-----|--------|-----|-------------------|-------------|------------|",
+]
+# One table row per (benchmark, JDK); a library with no result file shows "—".
+for (bench, jdk) in sorted(results.keys()):
+    libs = results[(bench, jdk)]
+    latest = libs.get("latest", {})
+    dev    = libs.get("dev", {})
+
+    col_latest  = fmt_avg(latest)  if latest else "—"
+    col_dev     = fmt_avg(dev)     if dev    else "—"
+    col_delta   = fmt_delta(latest, dev) if (latest and dev) else "—"
+    # Missing result gets "—" here as well: "0" would read as "ran and
+    # uploaded nothing", which is a different situation.
+    col_uploads = f"{fmt_uploads(latest) if latest else '—'} / {fmt_uploads(dev) if dev else '—'}"
+    col_issues  = f"{fmt_issues(latest)} / {fmt_issues(dev)}"
+
+    lines.append(f"| {bench} | {jdk} | {col_latest} | {col_dev} | {col_delta} | {col_uploads} | {col_issues} |")
+
+print("\n".join(lines))
+PYEOF
+) || true
+
+[ -n "${RELIABILITY_TABLE}" ] && SECTIONS="${SECTIONS}
+${RELIABILITY_TABLE}
+"
+
 if [ -z "${SECTIONS}" ]; then
   echo "No benchmark reports found under ${REPORTS_DIR} — skipping comment"
   exit 0