From 3fc25e82dc08f8dab5b2636a6f81702c0c2765c8 Mon Sep 17 00:00:00 2001
From: Jaroslav Bachorik <jaroslav.bachorik@datadoghq.com>
Date: Thu, 18 Jun 2026 17:14:57 +0200
Subject: [PATCH 1/8] fix(ci): migrate benchmarks to benchmarking-platform
 trigger

---
 .gitlab-ci.yml                            |   1 +
 .gitlab/benchmarks/.gitlab-ci.yml         | 107 +++++++---------------
 .gitlab/benchmarks/download-s3-reports.sh |  28 ++++++
 3 files changed, 62 insertions(+), 74 deletions(-)
 create mode 100755 .gitlab/benchmarks/download-s3-reports.sh

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 83ce0def5..b0c73be46 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -31,6 +31,7 @@ stages:
   - integration-test
   - reliability
   - benchmarks
+  - post-benchmarks
   - fuzz
   - notify
 
diff --git a/.gitlab/benchmarks/.gitlab-ci.yml b/.gitlab/benchmarks/.gitlab-ci.yml
index 960383af6..e322fb131 100644
--- a/.gitlab/benchmarks/.gitlab-ci.yml
+++ b/.gitlab/benchmarks/.gitlab-ci.yml
@@ -2,13 +2,11 @@ variables:
   PREPARE_IMAGE: registry.ddbuild.io/images/benchmarking-platform-tools-ubuntu:latest
   DD_OCTO_STS_IMAGE: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1
 
-.benchmark_job:
-  extends: .deploy-sa
+# Bridge job: triggers the BP pipeline; with `strategy: depend` it stays
+# running until the downstream pipeline finishes and mirrors its status.
+# Consumers of the BP results live in the post-benchmarks stage.
+benchmarks-trigger:
   stage: benchmarks
-  timeout: 6h
-  variables:
-    ITERATIONS: "${BENCHMARK_ITERATIONS:-1}"
-    MODES: "${BENCHMARK_MODES:-cpu,wall,alloc,memleak}"
   needs:
     - job: get-versions
       artifacts: true
@@ -24,64 +22,27 @@ variables:
     - if: '$CI_PIPELINE_SOURCE == "web"'
       when: manual
       allow_failure: true
-    # Run automatically and non-blocking on any other source (push/trigger/api/
-    # etc.) — mirrors the integration-test rules. The before_script CANCELLED
-    # gate skips branches with no open PR.
+    # Run automatically and non-blocking on any other source (push/trigger/api/etc.)
     - when: on_success
       allow_failure: true
-  script: |
-    # setup the env
-    export ARTIFACTS_DIR="$(pwd)/reports" && (mkdir "${ARTIFACTS_DIR}" || :)
-    export CANDIDATE_VERSION=${CURRENT_VERSION}
-    export BASELINE_VERSION=${PREVIOUS_VERSION}
-    export PLATFORM_DIR=".benchmarks/platform"
-
-    # check for missing candidate version
-    if [ -z "${CANDIDATE_VERSION}" ]; then echo "Missing candidate version. Skipping."; exit 0; fi
-
-    # fetch the common platform scripts
-    git -c url."https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/".insteadOf="https://github.com/DataDog/" \
-      clone --branch dd-trace-go https://github.com/DataDog/benchmarking-platform ${PLATFORM_DIR}
-
-    # apply the specific step scripts
-    cp -r .gitlab/benchmarks/steps/* ${PLATFORM_DIR}/steps/
-    chmod a+x ${PLATFORM_DIR}/steps/*
-
-    # check for mode validity
-    ${PLATFORM_DIR}/steps/check_modes.sh
-    if [ "$(cat .job_status)" == "SKIP" ]; then exit 0; fi
-
-    # run benchmarks
-    ${PLATFORM_DIR}/steps/capture-hardware-software-info.sh
-    ${PLATFORM_DIR}/steps/run-benchmarks.sh
-    ${PLATFORM_DIR}/steps/analyze-results.sh
-    ${PLATFORM_DIR}/steps/upload-results-to-s3.sh
-  parallel:
-    matrix:
-      - RUN_MODE: ["cpu", "wall", "alloc", "memleak", "cpu,wall", "memleak,alloc", "cpu,wall,alloc,memleak"]
-  artifacts:
-    when: always
-    name: "reports"
-    paths:
-      - reports/
-    expire_in: 3 months
-
-benchmarks-candidate-amd64:
-  extends: .benchmark_job
-  tags: ["arch:amd64"]
-  image: $BENCHMARK_IMAGE_AMD64
-
-benchmarks-candidate-aarch64:
-  extends: .benchmark_job
-  tags: ["arch:arm64"]
-  image: $BENCHMARK_IMAGE_ARM64
   variables:
-    KUBERNETES_MEMORY_REQUEST: 200Gi
-    KUBERNETES_MEMORY_LIMIT: 200Gi
+    CANDIDATE_VERSION: "${CURRENT_VERSION}"
+    BASELINE_VERSION: "${PREVIOUS_VERSION}"
+    BENCHMARK_ITERATIONS: "${BENCHMARK_ITERATIONS:-5}"
+    BENCHMARK_MODES: "${BENCHMARK_MODES:-cpu,wall,alloc,memleak}"
+    DDPROF_COMMIT_SHA: "${CI_COMMIT_SHA}"
+    DDPROF_COMMIT_BRANCH: "${CI_COMMIT_BRANCH}"
+    UPSTREAM_PROJECT_NAME: "java-profiler"
+    UPSTREAM_BRANCH: "${CI_PIPELINE_ID}"
+    UPSTREAM_PIPELINE_ID: "${CI_PIPELINE_ID}"
+  trigger:
+    project: DataDog/apm-reliability/benchmarking-platform
+    branch: java-profiler
+    strategy: depend
 
 post-benchmarks-pr-comment:
   extends: .retry-config
-  stage: benchmarks
+  stage: post-benchmarks
   tags: ["arch:arm64"]
   image: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1
   id_tokens:
@@ -90,10 +51,6 @@ post-benchmarks-pr-comment:
   needs:
     - job: prepare:start
       artifacts: true
-    - job: benchmarks-candidate-amd64
-      artifacts: true
-    - job: benchmarks-candidate-aarch64
-      artifacts: true
   rules:
     - if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null'
       when: never
@@ -101,36 +58,38 @@ post-benchmarks-pr-comment:
       when: never
     - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
       when: never
-    # Always run when the candidate jobs ran, regardless of source, so results
-    # are posted back to the PR.
+    # Always run after the benchmarks stage so results are posted back to the PR.
     - when: always
-  timeout: 5m
+  timeout: 10m
   script:
+    - mkdir -p reports
+    - .gitlab/benchmarks/download-s3-reports.sh reports
     - .gitlab/benchmarks/post-pr-comment.sh reports
   allow_failure: true
 
 publish-benchmark-gh-pages:
-  stage: benchmarks
+  stage: post-benchmarks
   tags: ["arch:arm64"]
   image: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1
   id_tokens:
     DDOCTOSTS_ID_TOKEN:
       aud: dd-octo-sts
-  needs:
-    - job: benchmarks-candidate-amd64
-      artifacts: true
-    - job: benchmarks-candidate-aarch64
-      artifacts: true
+  needs: []
+  # Serialize concurrent GH Pages pushes. publish-gh-pages.sh uses
+  # 'git push --force'; two concurrent pushes race and the slower one
+  # silently discards the faster one's history update.
+  resource_group: gh-pages-publish
   rules:
     - if: '$CI_PIPELINE_SOURCE == "schedule"'
       when: never
     - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH || $CI_COMMIT_BRANCH == "main"'
       when: always
-  timeout: 10m
+  timeout: 15m
   script:
-    - ./.gitlab/benchmarks/publish-gh-pages.sh
+    - mkdir -p reports
+    - .gitlab/benchmarks/download-s3-reports.sh reports
+    - ./.gitlab/benchmarks/publish-gh-pages.sh reports
   allow_failure: true
 
 include:
   - local: .gitlab/common.yml
-  - local: .gitlab/benchmarks/images.yml
diff --git a/.gitlab/benchmarks/download-s3-reports.sh b/.gitlab/benchmarks/download-s3-reports.sh
new file mode 100755
index 000000000..51862cdcc
--- /dev/null
+++ b/.gitlab/benchmarks/download-s3-reports.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Downloads benchmark reports uploaded by the BP pipeline for this pipeline run.
+#
+# The BP upload key is:
+#   s3://relenv-benchmarking-data/java-profiler/${CI_PIPELINE_ID}/${BP_JOB_ID}/
+# We pass UPSTREAM_PIPELINE_ID=${CI_PIPELINE_ID} and UPSTREAM_BRANCH=${CI_PIPELINE_ID}
+# to BP; each BP job uploads to its own CI_JOB_ID leaf under that prefix, so
+# syncing the whole prefix captures all arch+mode results.
+set -euo pipefail
+
+DEST="${1:-reports}"
+S3_BUCKET="relenv-benchmarking-data"
+S3_PREFIX="java-profiler/${CI_PIPELINE_ID}"
+
+mkdir -p "${DEST}"
+aws s3 sync "s3://${S3_BUCKET}/${S3_PREFIX}/" "${DEST}/" \
+  --exclude "*" \
+  --include "comparison-baseline-vs-candidate_*.md" \
+  --include "comparison-baseline-vs-candidate_*.html" \
+  --include "*.json"
+
+echo "Downloaded from s3://${S3_BUCKET}/${S3_PREFIX}/ → ${DEST}/"
+FILE_COUNT=$(find "${DEST}" -type f | wc -l)
+echo "Files downloaded: ${FILE_COUNT}"
+if [ "${FILE_COUNT}" -eq 0 ]; then
+  echo "ERROR: no benchmark reports found — BP pipeline may not have uploaded yet" >&2
+  exit 1
+fi

From 2be4d5ab978e273499f06eb70feee319b535773f Mon Sep 17 00:00:00 2001
From: Jaroslav Bachorik <jaroslav.bachorik@datadoghq.com>
Date: Thu, 18 Jun 2026 21:50:34 +0200
Subject: [PATCH 2/8] feat(ci): add reliability table to benchmark PR comment

---
 .gitlab/benchmarks/post-pr-comment.sh | 56 +++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/.gitlab/benchmarks/post-pr-comment.sh b/.gitlab/benchmarks/post-pr-comment.sh
index de410dff9..f06c413ef 100755
--- a/.gitlab/benchmarks/post-pr-comment.sh
+++ b/.gitlab/benchmarks/post-pr-comment.sh
@@ -1,8 +1,9 @@
 #!/usr/bin/env bash
-# Post aggregated benchmark comparison results as a single PR comment.
+# Post aggregated benchmark results as a single PR comment.
 #
-# Expects all per-cell comparison-baseline-vs-candidate_*.md reports to be
-# present under REPORTS_DIR (default: reports/).
+# Handles two report types found under REPORTS_DIR (default: reports/):
+#   - comparison-baseline-vs-candidate_*.md  (perf comparison benchmarks)
+#   - result_${BENCHMARK}_${JDK}_${LIBRARY}.json  (reliability benchmarks)
 #
 # Required env:
 #   DDPROF_COMMIT_BRANCH  – branch name used to locate the open PR
@@ -14,7 +15,7 @@ set -euo pipefail
 REPORTS_DIR="${1:-reports}"
 HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
-# Aggregate all per-cell reports into a single comment body
+# --- Perf comparison reports (markdown, one per benchmark cell) ---
 SECTIONS=""
 for md in "${REPORTS_DIR}"/comparison-baseline-vs-candidate_*.md; do
   [ -f "${md}" ] || continue
@@ -28,6 +29,53 @@ $(cat "${md}")
 "
 done
 
+# --- Reliability reports (JSON, one per benchmark×JDK×library) ---
+RELIABILITY_TABLE=$(python3 - "${REPORTS_DIR}" <<'PYEOF'
+import json, sys, glob, collections
+
+reports_dir = sys.argv[1]
+results = collections.defaultdict(dict)
+
+for path in sorted(glob.glob(f"{reports_dir}/result_*.json")):
+    try:
+        with open(path) as f:
+            r = json.load(f)
+        key = (r["benchmark"], r["jdk"])
+        results[key][r["library"]] = r
+    except Exception:
+        continue
+
+if not results:
+    sys.exit(0)
+
+lines = [
+    "### Reliability Benchmarks",
+    "",
+    "| Benchmark | JDK | Latest | Dev |",
+    "|-----------|-----|--------|-----|",
+]
+for (bench, jdk) in sorted(results.keys()):
+    libs = results[(bench, jdk)]
+
+    def fmt(lib):
+        if lib not in libs:
+            return "—"
+        r = libs[lib]
+        icon = "💥" if r.get("crashed") else "✅"
+        avg = r.get("avg_ms", 0)
+        cnt = r.get("run_count", 0)
+        return f"{icon} {avg} ms ({cnt} iters)"
+
+    lines.append(f"| {bench} | {jdk} | {fmt('latest')} | {fmt('dev')} |")
+
+print("\n".join(lines))
+PYEOF
+) || true
+
+[ -n "${RELIABILITY_TABLE}" ] && SECTIONS="${SECTIONS}
+${RELIABILITY_TABLE}
+"
+
 if [ -z "${SECTIONS}" ]; then
   echo "No benchmark reports found under ${REPORTS_DIR} — skipping comment"
   exit 0

From ba307c6d9400808ce51ddc855f5afe45a2ed5317 Mon Sep 17 00:00:00 2001
From: Jaroslav Bachorik <jaroslav.bachorik@datadoghq.com>
Date: Thu, 18 Jun 2026 22:04:54 +0200
Subject: [PATCH 3/8] feat(ci): add delta%, upload count, issues to reliability
 table

---
 .gitlab/benchmarks/post-pr-comment.sh | 49 +++++++++++++++++++++------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/.gitlab/benchmarks/post-pr-comment.sh b/.gitlab/benchmarks/post-pr-comment.sh
index f06c413ef..40f46a3e2 100755
--- a/.gitlab/benchmarks/post-pr-comment.sh
+++ b/.gitlab/benchmarks/post-pr-comment.sh
@@ -48,25 +48,52 @@ for path in sorted(glob.glob(f"{reports_dir}/result_*.json")):
 if not results:
     sys.exit(0)
 
+def fmt_avg(r):
+    icon = "💥" if r.get("crashed") else "✅"
+    avg = r.get("avg_ms", 0)
+    cnt = r.get("run_count", 0)
+    return f"{icon} {avg} ms ({cnt} iters)"
+
+def fmt_delta(latest, dev):
+    la, da = latest.get("avg_ms", 0), dev.get("avg_ms", 0)
+    if not la or not da:
+        return "—"
+    pct = (da - la) / la * 100
+    # positive = dev is slower; ":+" below already emits the sign, so no "+" here
+    arrow = "🔴 " if pct > 2 else ("🟢 " if pct < -2 else "")
+    return f"{arrow}{pct:+.1f}%"
+
+def fmt_uploads(r):
+    return str(r.get("upload_count", 0)) if r else "—"  # no result for this library
+
+def fmt_issues(r):
+    e = r.get("error_count", 0)
+    w = r.get("warn_count", 0)
+    if e == 0 and w == 0:
+        return "—"
+    parts = []
+    if e: parts.append(f"E:{e}")
+    if w: parts.append(f"W:{w}")
+    return "⚠️ " + " ".join(parts)
+
 lines = [
     "### Reliability Benchmarks",
     "",
-    "| Benchmark | JDK | Latest | Dev |",
-    "|-----------|-----|--------|-----|",
+    "| Benchmark | JDK | Latest | Dev | Δ% (dev vs latest) | Uploads L/D | Issues L/D |",
+    "|-----------|-----|--------|-----|-------------------|-------------|------------|",
 ]
 for (bench, jdk) in sorted(results.keys()):
     libs = results[(bench, jdk)]
+    latest = libs.get("latest", {})
+    dev    = libs.get("dev", {})
 
-    def fmt(lib):
-        if lib not in libs:
-            return "—"
-        r = libs[lib]
-        icon = "💥" if r.get("crashed") else "✅"
-        avg = r.get("avg_ms", 0)
-        cnt = r.get("run_count", 0)
-        return f"{icon} {avg} ms ({cnt} iters)"
+    col_latest  = fmt_avg(latest)  if latest else "—"
+    col_dev     = fmt_avg(dev)     if dev    else "—"
+    col_delta   = fmt_delta(latest, dev) if (latest and dev) else "—"
+    col_uploads = f"{fmt_uploads(latest)} / {fmt_uploads(dev)}"
+    col_issues  = f"{fmt_issues(latest)} / {fmt_issues(dev)}"
 
-    lines.append(f"| {bench} | {jdk} | {fmt('latest')} | {fmt('dev')} |")
+    lines.append(f"| {bench} | {jdk} | {col_latest} | {col_dev} | {col_delta} | {col_uploads} | {col_issues} |")
 
 print("\n".join(lines))
 PYEOF

From e527a2f8a29222c822aac1fa0312ae65e4d4c513 Mon Sep 17 00:00:00 2001
From: Jaroslav Bachorik <jaroslav.bachorik@datadoghq.com>
Date: Thu, 18 Jun 2026 22:22:17 +0200
Subject: [PATCH 4/8] fix(ci): use boto3 for S3 download; add
 benchmarking-platform SA to post-benchmark jobs

---
 .gitlab/benchmarks/.gitlab-ci.yml         |  4 ++
 .gitlab/benchmarks/download-s3-reports.sh | 48 +++++++++++++++++------
 2 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/.gitlab/benchmarks/.gitlab-ci.yml b/.gitlab/benchmarks/.gitlab-ci.yml
index e322fb131..18cd8c693 100644
--- a/.gitlab/benchmarks/.gitlab-ci.yml
+++ b/.gitlab/benchmarks/.gitlab-ci.yml
@@ -48,6 +48,8 @@ post-benchmarks-pr-comment:
   id_tokens:
     DDOCTOSTS_ID_TOKEN:
       aud: dd-octo-sts
+  variables:
+    KUBERNETES_SERVICE_ACCOUNT_OVERWRITE: benchmarking-platform
   needs:
     - job: prepare:start
       artifacts: true
@@ -74,6 +76,8 @@ publish-benchmark-gh-pages:
   id_tokens:
     DDOCTOSTS_ID_TOKEN:
       aud: dd-octo-sts
+  variables:
+    KUBERNETES_SERVICE_ACCOUNT_OVERWRITE: benchmarking-platform
   needs: []
   # Serialize concurrent GH Pages pushes. publish-gh-pages.sh uses
   # 'git push --force'; two concurrent pushes race and the slower one
diff --git a/.gitlab/benchmarks/download-s3-reports.sh b/.gitlab/benchmarks/download-s3-reports.sh
index 51862cdcc..bb7aaafa5 100755
--- a/.gitlab/benchmarks/download-s3-reports.sh
+++ b/.gitlab/benchmarks/download-s3-reports.sh
@@ -6,6 +6,9 @@
 # We pass UPSTREAM_PIPELINE_ID=${CI_PIPELINE_ID} and UPSTREAM_BRANCH=${CI_PIPELINE_ID}
 # to BP; each BP job uploads to its own CI_JOB_ID leaf under that prefix, so
 # syncing the whole prefix captures all arch+mode results.
+#
+# Uses boto3 (Python) rather than the aws CLI — the post-benchmark jobs run in
+# dd-octo-sts-ci-base which ships Python but not the aws CLI binary.
 set -euo pipefail
 
 DEST="${1:-reports}"
@@ -13,16 +16,35 @@ S3_BUCKET="relenv-benchmarking-data"
 S3_PREFIX="java-profiler/${CI_PIPELINE_ID}"
 
 mkdir -p "${DEST}"
-aws s3 sync "s3://${S3_BUCKET}/${S3_PREFIX}/" "${DEST}/" \
-  --exclude "*" \
-  --include "comparison-baseline-vs-candidate_*.md" \
-  --include "comparison-baseline-vs-candidate_*.html" \
-  --include "*.json"
-
-echo "Downloaded from s3://${S3_BUCKET}/${S3_PREFIX}/ → ${DEST}/"
-FILE_COUNT=$(find "${DEST}" -type f | wc -l)
-echo "Files downloaded: ${FILE_COUNT}"
-if [ "${FILE_COUNT}" -eq 0 ]; then
-  echo "ERROR: no benchmark reports found — BP pipeline may not have uploaded yet" >&2
-  exit 1
-fi
+
+# Ensure boto3 is available; install quietly if missing.
+python3 -c "import boto3" 2>/dev/null || pip3 install --quiet boto3
+
+python3 - "${DEST}" "${S3_BUCKET}" "${S3_PREFIX}" <<'PYEOF'
+import boto3, os, sys
+
+dest, bucket, prefix = sys.argv[1], sys.argv[2], sys.argv[3]
+
+INCLUDE_PATTERNS = (".json", "-vs-candidate.md", "-vs-candidate.html")
+
+s3 = boto3.client("s3")
+paginator = s3.get_paginator("list_objects_v2")
+
+downloaded = 0
+for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
+    for obj in page.get("Contents", []):
+        key = obj["Key"]
+        name = os.path.basename(key)
+        if not any(name.endswith(p) for p in INCLUDE_PATTERNS):
+            continue
+        local_path = os.path.join(dest, name)
+        print(f"  {key} → {name}")
+        s3.download_file(bucket, key, local_path)
+        downloaded += 1
+
+print(f"Downloaded {downloaded} file(s) from s3://{bucket}/{prefix}/")
+if downloaded == 0:
+    print("ERROR: no benchmark reports found — BP pipeline may not have uploaded yet",
+          file=sys.stderr)
+    sys.exit(1)
+PYEOF

From 200d347e13a968bd9f134c5dfc5b992552d217ff Mon Sep 17 00:00:00 2001
From: Jaroslav Bachorik <jaroslav.bachorik@datadoghq.com>
Date: Thu, 18 Jun 2026 22:29:43 +0200
Subject: [PATCH 5/8] fix(ci): fetch BP artifacts via GitLab API instead of aws
 s3 sync

---
 .gitlab/benchmarks/.gitlab-ci.yml         |   4 -
 .gitlab/benchmarks/download-s3-reports.sh | 100 ++++++++++++++--------
 2 files changed, 63 insertions(+), 41 deletions(-)

diff --git a/.gitlab/benchmarks/.gitlab-ci.yml b/.gitlab/benchmarks/.gitlab-ci.yml
index 18cd8c693..e322fb131 100644
--- a/.gitlab/benchmarks/.gitlab-ci.yml
+++ b/.gitlab/benchmarks/.gitlab-ci.yml
@@ -48,8 +48,6 @@ post-benchmarks-pr-comment:
   id_tokens:
     DDOCTOSTS_ID_TOKEN:
       aud: dd-octo-sts
-  variables:
-    KUBERNETES_SERVICE_ACCOUNT_OVERWRITE: benchmarking-platform
   needs:
     - job: prepare:start
       artifacts: true
@@ -76,8 +74,6 @@ publish-benchmark-gh-pages:
   id_tokens:
     DDOCTOSTS_ID_TOKEN:
       aud: dd-octo-sts
-  variables:
-    KUBERNETES_SERVICE_ACCOUNT_OVERWRITE: benchmarking-platform
   needs: []
   # Serialize concurrent GH Pages pushes. publish-gh-pages.sh uses
   # 'git push --force'; two concurrent pushes race and the slower one
diff --git a/.gitlab/benchmarks/download-s3-reports.sh b/.gitlab/benchmarks/download-s3-reports.sh
index bb7aaafa5..cef92a9f3 100755
--- a/.gitlab/benchmarks/download-s3-reports.sh
+++ b/.gitlab/benchmarks/download-s3-reports.sh
@@ -1,50 +1,76 @@
 #!/usr/bin/env bash
-# Downloads benchmark reports uploaded by the BP pipeline for this pipeline run.
+# Downloads result JSONs from the BP downstream pipeline via the GitLab CI API.
 #
-# The BP upload key is:
-#   s3://relenv-benchmarking-data/java-profiler/${CI_PIPELINE_ID}/${BP_JOB_ID}/
-# We pass UPSTREAM_PIPELINE_ID=${CI_PIPELINE_ID} and UPSTREAM_BRANCH=${CI_PIPELINE_ID}
-# to BP; each BP job uploads to its own CI_JOB_ID leaf under that prefix, so
-# syncing the whole prefix captures all arch+mode results.
-#
-# Uses boto3 (Python) rather than the aws CLI — the post-benchmark jobs run in
-# dd-octo-sts-ci-base which ships Python but not the aws CLI binary.
+# Requires only curl and python3 (stdlib) — no aws CLI, pip, or boto3 needed.
+# BP jobs already store artifacts in GitLab; this fetches them directly from
+# the downstream pipeline triggered by benchmarks-trigger.
 set -euo pipefail
 
 DEST="${1:-reports}"
-S3_BUCKET="relenv-benchmarking-data"
-S3_PREFIX="java-profiler/${CI_PIPELINE_ID}"
-
 mkdir -p "${DEST}"
 
-# Ensure boto3 is available; install quietly if missing.
-python3 -c "import boto3" 2>/dev/null || pip3 install --quiet boto3
+TMPDIR_LOCAL=$(mktemp -d)
+trap 'rm -rf "${TMPDIR_LOCAL}"' EXIT
+
+# ── 1. find the benchmarks-trigger bridge to get downstream project + pipeline ──
+BRIDGES_FILE="${TMPDIR_LOCAL}/bridges.json"
+curl -sf \
+  --header "JOB-TOKEN: ${CI_JOB_TOKEN}" \
+  "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/bridges" \
+  > "${BRIDGES_FILE}"
 
-python3 - "${DEST}" "${S3_BUCKET}" "${S3_PREFIX}" <<'PYEOF'
-import boto3, os, sys
+read -r BP_PROJECT_ID DOWNSTREAM_PIPELINE_ID < <(python3 - "${BRIDGES_FILE}" <<'PYEOF'
+import json, sys
+with open(sys.argv[1]) as f:
+    bridges = json.load(f)
+for b in bridges:
+    if b.get("name") == "benchmarks-trigger":
+        dp = b.get("downstream_pipeline") or {}
+        if dp.get("id") and dp.get("project_id"):
+            print(dp["project_id"], dp["id"])
+            sys.exit(0)
+print("", "")
+PYEOF
+)
 
-dest, bucket, prefix = sys.argv[1], sys.argv[2], sys.argv[3]
+if [ -z "${DOWNSTREAM_PIPELINE_ID:-}" ]; then
+  echo "No downstream BP pipeline found for benchmarks-trigger — skipping download"
+  exit 0
+fi
+echo "BP downstream pipeline: project=${BP_PROJECT_ID}  pipeline=${DOWNSTREAM_PIPELINE_ID}"
 
-INCLUDE_PATTERNS = (".json", "-vs-candidate.md", "-vs-candidate.html")
+# ── 2. list jobs in the downstream pipeline ──
+JOBS_FILE="${TMPDIR_LOCAL}/jobs.json"
+curl -sf \
+  --header "JOB-TOKEN: ${CI_JOB_TOKEN}" \
+  "${CI_API_V4_URL}/projects/${BP_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?per_page=100" \
+  > "${JOBS_FILE}"
 
-s3 = boto3.client("s3")
-paginator = s3.get_paginator("list_objects_v2")
+JOB_IDS=$(python3 -c "
+import json
+with open('${JOBS_FILE}') as f:
+    print(' '.join(str(j['id']) for j in json.load(f)))
+")
 
-downloaded = 0
-for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
-    for obj in page.get("Contents", []):
-        key = obj["Key"]
-        name = os.path.basename(key)
-        if not any(name.endswith(p) for p in INCLUDE_PATTERNS):
-            continue
-        local_path = os.path.join(dest, name)
-        print(f"  {key} → {name}")
-        s3.download_file(bucket, key, local_path)
-        downloaded += 1
+# ── 3. download result_*.json from each job's artifact zip ──
+DOWNLOADED=0
+for JOB_ID in ${JOB_IDS}; do
+  ART_ZIP="${TMPDIR_LOCAL}/art_${JOB_ID}.zip"
+  if curl -sf \
+    --header "JOB-TOKEN: ${CI_JOB_TOKEN}" \
+    "${CI_API_V4_URL}/projects/${BP_PROJECT_ID}/jobs/${JOB_ID}/artifacts" \
+    --output "${ART_ZIP}" 2>/dev/null; then
+    # -j: junk paths (strip the artifacts/ prefix), quiet, overwrite
+    if unzip -q -j "${ART_ZIP}" "artifacts/result_*.json" -d "${DEST}/" 2>/dev/null; then
+      DOWNLOADED=$((DOWNLOADED + 1))
+    fi
+  fi
+done
 
-print(f"Downloaded {downloaded} file(s) from s3://{bucket}/{prefix}/")
-if downloaded == 0:
-    print("ERROR: no benchmark reports found — BP pipeline may not have uploaded yet",
-          file=sys.stderr)
-    sys.exit(1)
-PYEOF
+RESULT_COUNT=$(find "${DEST}" -name "result_*.json" | wc -l)
+echo "result_*.json files downloaded: ${RESULT_COUNT} (from ${DOWNLOADED} BP job(s))"
+
+if [ "${RESULT_COUNT}" -eq 0 ]; then
+  echo "WARNING: no result JSONs found — BP jobs may not have run yet" >&2
+  exit 1
+fi

From 5f658b0dfd0d6a8de4d52a89cb8c7e421cd1e079 Mon Sep 17 00:00:00 2001
From: Jaroslav Bachorik <jaroslav.bachorik@datadoghq.com>
Date: Thu, 18 Jun 2026 22:38:34 +0200
Subject: [PATCH 6/8] fix(ci): diagnose API errors explicitly, no silent set -e
 kill

---
 .gitlab/benchmarks/download-s3-reports.sh | 55 ++++++++++++++++-------
 1 file changed, 38 insertions(+), 17 deletions(-)

diff --git a/.gitlab/benchmarks/download-s3-reports.sh b/.gitlab/benchmarks/download-s3-reports.sh
index cef92a9f3..762062e7c 100755
--- a/.gitlab/benchmarks/download-s3-reports.sh
+++ b/.gitlab/benchmarks/download-s3-reports.sh
@@ -4,7 +4,7 @@
 # Requires only curl and python3 (stdlib) — no aws CLI, pip, or boto3 needed.
 # BP jobs already store artifacts in GitLab; this fetches them directly from
 # the downstream pipeline triggered by benchmarks-trigger.
-set -euo pipefail
+set -uo pipefail   # intentionally no -e: we handle errors explicitly
 
 DEST="${1:-reports}"
 mkdir -p "${DEST}"
@@ -12,12 +12,30 @@ mkdir -p "${DEST}"
 TMPDIR_LOCAL=$(mktemp -d)
 trap 'rm -rf "${TMPDIR_LOCAL}"' EXIT
 
-# ── 1. find the benchmarks-trigger bridge to get downstream project + pipeline ──
+# ── helper: curl with explicit HTTP status checking ──────────────────────────
+# Usage: api_get <url> <output_file>
+# Returns 0 on 2xx, prints diagnostics and returns 1 otherwise.
+api_get() {
+  local url="$1" out="$2"
+  local http_code
+  http_code=$(curl -s -o "${out}" -w "%{http_code}" \
+    --header "JOB-TOKEN: ${CI_JOB_TOKEN}" "${url}")
+  if [[ "${http_code}" != 2* ]]; then
+    echo "  API ${url##*/}: HTTP ${http_code} — $(cat "${out}" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('message','?'))" 2>/dev/null || echo 'see above')"
+    return 1
+  fi
+  return 0
+}
+
+# ── 1. find the benchmarks-trigger bridge ────────────────────────────────────
 BRIDGES_FILE="${TMPDIR_LOCAL}/bridges.json"
-curl -sf \
-  --header "JOB-TOKEN: ${CI_JOB_TOKEN}" \
+echo "Querying bridges for pipeline ${CI_PIPELINE_ID}…"
+if ! api_get \
   "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/bridges" \
-  > "${BRIDGES_FILE}"
+  "${BRIDGES_FILE}"; then
+  echo "Cannot read pipeline bridges (job token may lack Reporter access) — skipping download"
+  exit 0
+fi
 
 read -r BP_PROJECT_ID DOWNSTREAM_PIPELINE_ID < <(python3 - "${BRIDGES_FILE}" <<'PYEOF'
 import json, sys
@@ -34,17 +52,20 @@ PYEOF
 )
 
 if [ -z "${DOWNSTREAM_PIPELINE_ID:-}" ]; then
-  echo "No downstream BP pipeline found for benchmarks-trigger — skipping download"
+  echo "benchmarks-trigger bridge not found or did not run — skipping download"
   exit 0
 fi
 echo "BP downstream pipeline: project=${BP_PROJECT_ID}  pipeline=${DOWNSTREAM_PIPELINE_ID}"
 
-# ── 2. list jobs in the downstream pipeline ──
+# ── 2. list jobs in the downstream pipeline ──────────────────────────────────
 JOBS_FILE="${TMPDIR_LOCAL}/jobs.json"
-curl -sf \
-  --header "JOB-TOKEN: ${CI_JOB_TOKEN}" \
+echo "Listing BP pipeline jobs…"
+if ! api_get \
   "${CI_API_V4_URL}/projects/${BP_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?per_page=100" \
-  > "${JOBS_FILE}"
+  "${JOBS_FILE}"; then
+  echo "Cannot list BP pipeline jobs — skipping download"
+  exit 0
+fi
 
 JOB_IDS=$(python3 -c "
 import json
@@ -52,15 +73,15 @@ with open('${JOBS_FILE}') as f:
     print(' '.join(str(j['id']) for j in json.load(f)))
 ")
 
-# ── 3. download result_*.json from each job's artifact zip ──
+# ── 3. download result_*.json from each job's artifact zip ───────────────────
 DOWNLOADED=0
 for JOB_ID in ${JOB_IDS}; do
   ART_ZIP="${TMPDIR_LOCAL}/art_${JOB_ID}.zip"
-  if curl -sf \
+  ART_STATUS=$(curl -s -o "${ART_ZIP}" -w "%{http_code}" \
     --header "JOB-TOKEN: ${CI_JOB_TOKEN}" \
-    "${CI_API_V4_URL}/projects/${BP_PROJECT_ID}/jobs/${JOB_ID}/artifacts" \
-    --output "${ART_ZIP}" 2>/dev/null; then
-    # -j: junk paths (strip the artifacts/ prefix), quiet, overwrite
+    "${CI_API_V4_URL}/projects/${BP_PROJECT_ID}/jobs/${JOB_ID}/artifacts" 2>/dev/null)
+  if [[ "${ART_STATUS}" == 2* ]]; then
+    # -j: junk paths (strip artifacts/ prefix), -q: quiet
     if unzip -q -j "${ART_ZIP}" "artifacts/result_*.json" -d "${DEST}/" 2>/dev/null; then
       DOWNLOADED=$((DOWNLOADED + 1))
     fi
@@ -68,9 +89,9 @@ for JOB_ID in ${JOB_IDS}; do
 done
 
 RESULT_COUNT=$(find "${DEST}" -name "result_*.json" | wc -l)
-echo "result_*.json files downloaded: ${RESULT_COUNT} (from ${DOWNLOADED} BP job(s))"
+echo "result_*.json files: ${RESULT_COUNT} (from ${DOWNLOADED} BP job(s))"
 
 if [ "${RESULT_COUNT}" -eq 0 ]; then
-  echo "WARNING: no result JSONs found — BP jobs may not have run yet" >&2
+  echo "WARNING: no result JSONs found — BP jobs may not have run or produced artifacts yet"
   exit 1
 fi

From 7580df27de5bad718f9ea06830053d29d3ed9302 Mon Sep 17 00:00:00 2001
From: Jaroslav Bachorik <jaroslav.bachorik@datadoghq.com>
Date: Fri, 19 Jun 2026 08:56:20 +0200
Subject: [PATCH 7/8] fix: pass MAVEN_REPOSITORY_PROXY to BP trigger

---
 .gitlab/benchmarks/.gitlab-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab/benchmarks/.gitlab-ci.yml b/.gitlab/benchmarks/.gitlab-ci.yml
index e322fb131..318a62fa8 100644
--- a/.gitlab/benchmarks/.gitlab-ci.yml
+++ b/.gitlab/benchmarks/.gitlab-ci.yml
@@ -35,6 +35,7 @@ benchmarks-trigger:
     UPSTREAM_PROJECT_NAME: "java-profiler"
     UPSTREAM_BRANCH: "${CI_PIPELINE_ID}"
     UPSTREAM_PIPELINE_ID: "${CI_PIPELINE_ID}"
+    MAVEN_REPOSITORY_PROXY: "${MAVEN_REPOSITORY_PROXY}"
   trigger:
     project: DataDog/apm-reliability/benchmarking-platform
     branch: java-profiler

From 7d9f6799161980c4ca9257450e604dc5e433fe7c Mon Sep 17 00:00:00 2001
From: Jaroslav Bachorik <jaroslav.bachorik@datadoghq.com>
Date: Fri, 19 Jun 2026 09:49:24 +0200
Subject: [PATCH 8/8] fix: extend octo-sts trust to BP; wait for
 benchmarks-trigger before comment

---
 .github/chainguard/async-profiler-build.ci.sts.yaml | 4 ++--
 .gitlab/benchmarks/.gitlab-ci.yml                   | 6 +++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/.github/chainguard/async-profiler-build.ci.sts.yaml b/.github/chainguard/async-profiler-build.ci.sts.yaml
index 339ec1a03..8d3254f6f 100644
--- a/.github/chainguard/async-profiler-build.ci.sts.yaml
+++ b/.github/chainguard/async-profiler-build.ci.sts.yaml
@@ -1,7 +1,7 @@
-# Allow java-profiler GitLab CI to publish reports and manage issues
+# Allow java-profiler and benchmarking-platform GitLab CI to post reports
 issuer: https://gitlab.ddbuild.io
 
-subject_pattern: "project_path:DataDog/java-profiler:ref_type:branch:ref:.*"
+subject_pattern: "project_path:(DataDog/java-profiler|DataDog/apm-reliability/benchmarking-platform):ref_type:branch:ref:.*"
 
 permissions:
   contents: write
diff --git a/.gitlab/benchmarks/.gitlab-ci.yml b/.gitlab/benchmarks/.gitlab-ci.yml
index 318a62fa8..18359cf02 100644
--- a/.gitlab/benchmarks/.gitlab-ci.yml
+++ b/.gitlab/benchmarks/.gitlab-ci.yml
@@ -52,6 +52,8 @@ post-benchmarks-pr-comment:
   needs:
     - job: prepare:start
       artifacts: true
+    - job: benchmarks-trigger
+      artifacts: false
   rules:
     - if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null'
       when: never
@@ -75,7 +77,9 @@ publish-benchmark-gh-pages:
   id_tokens:
     DDOCTOSTS_ID_TOKEN:
       aud: dd-octo-sts
-  needs: []
+  needs:
+    - job: benchmarks-trigger
+      artifacts: false
   # Serialize concurrent GH Pages pushes. publish-gh-pages.sh uses
   # 'git push --force'; two concurrent pushes race and the slower one
   # silently discards the faster one's history update.