diff --git a/.github/chainguard/async-profiler-build.ci.sts.yaml b/.github/chainguard/async-profiler-build.ci.sts.yaml
index 339ec1a03..8d3254f6f 100644
--- a/.github/chainguard/async-profiler-build.ci.sts.yaml
+++ b/.github/chainguard/async-profiler-build.ci.sts.yaml
@@ -1,7 +1,7 @@
-# Allow java-profiler GitLab CI to publish reports and manage issues
+# Allow java-profiler and benchmarking-platform GitLab CI to post reports
 issuer: https://gitlab.ddbuild.io
 
-subject_pattern: "project_path:DataDog/java-profiler:ref_type:branch:ref:.*"
+subject_pattern: "project_path:(DataDog/java-profiler|DataDog/apm-reliability/benchmarking-platform):ref_type:branch:ref:.*"
 
 permissions:
   contents: write
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 83ce0def5..b0c73be46 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -31,6 +31,7 @@ stages:
   - integration-test
   - reliability
   - benchmarks
+  - post-benchmarks
   - fuzz
   - notify
 
diff --git a/.gitlab/benchmarks/.gitlab-ci.yml b/.gitlab/benchmarks/.gitlab-ci.yml
index 960383af6..d1f5f1b07 100644
--- a/.gitlab/benchmarks/.gitlab-ci.yml
+++ b/.gitlab/benchmarks/.gitlab-ci.yml
@@ -2,13 +2,11 @@ variables:
   PREPARE_IMAGE: registry.ddbuild.io/images/benchmarking-platform-tools-ubuntu:latest
   DD_OCTO_STS_IMAGE: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1
 
-.benchmark_job:
-  extends: .deploy-sa
+# Bridge job: triggers the BP pipeline and blocks until it completes.
+# Bridge jobs cannot appear in other jobs' needs: — downstream jobs use
+# stage ordering (post-benchmarks stage runs after benchmarks stage).
+benchmarks-trigger:
   stage: benchmarks
-  timeout: 6h
-  variables:
-    ITERATIONS: "${BENCHMARK_ITERATIONS:-1}"
-    MODES: "${BENCHMARK_MODES:-cpu,wall,alloc,memleak}"
   needs:
     - job: get-versions
       artifacts: true
@@ -24,113 +22,52 @@ variables:
     - if: '$CI_PIPELINE_SOURCE == "web"'
       when: manual
       allow_failure: true
-    # Run automatically and non-blocking on any other source (push/trigger/api/
-    # etc.) — mirrors the integration-test rules. The before_script CANCELLED
-    # gate skips branches with no open PR.
+    # Run automatically and non-blocking on any other source (push/trigger/api/etc.)
     - when: on_success
       allow_failure: true
-  script: |
-    # setup the env
-    export ARTIFACTS_DIR="$(pwd)/reports" && (mkdir "${ARTIFACTS_DIR}" || :)
-    export CANDIDATE_VERSION=${CURRENT_VERSION}
-    export BASELINE_VERSION=${PREVIOUS_VERSION}
-    export PLATFORM_DIR=".benchmarks/platform"
-
-    # check for missing candidate version
-    if [ -z "${CANDIDATE_VERSION}" ]; then echo "Missing candidate version. Skipping."; exit 0; fi
-
-    # fetch the common platform scripts
-    git -c url."https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/".insteadOf="https://github.com/DataDog/" \
-      clone --branch dd-trace-go https://github.com/DataDog/benchmarking-platform ${PLATFORM_DIR}
-
-    # apply the specific step scripts
-    cp -r .gitlab/benchmarks/steps/* ${PLATFORM_DIR}/steps/
-    chmod a+x ${PLATFORM_DIR}/steps/*
-
-    # check for mode validity
-    ${PLATFORM_DIR}/steps/check_modes.sh
-    if [ "$(cat .job_status)" == "SKIP" ]; then exit 0; fi
-
-    # run benchmarks
-    ${PLATFORM_DIR}/steps/capture-hardware-software-info.sh
-    ${PLATFORM_DIR}/steps/run-benchmarks.sh
-    ${PLATFORM_DIR}/steps/analyze-results.sh
-    ${PLATFORM_DIR}/steps/upload-results-to-s3.sh
-  parallel:
-    matrix:
-      - RUN_MODE: ["cpu", "wall", "alloc", "memleak", "cpu,wall", "memleak,alloc", "cpu,wall,alloc,memleak"]
-  artifacts:
-    when: always
-    name: "reports"
-    paths:
-      - reports/
-    expire_in: 3 months
-
-benchmarks-candidate-amd64:
-  extends: .benchmark_job
-  tags: ["arch:amd64"]
-  image: $BENCHMARK_IMAGE_AMD64
-
-benchmarks-candidate-aarch64:
-  extends: .benchmark_job
-  tags: ["arch:arm64"]
-  image: $BENCHMARK_IMAGE_ARM64
   variables:
-    KUBERNETES_MEMORY_REQUEST: 200Gi
-    KUBERNETES_MEMORY_LIMIT: 200Gi
+    CANDIDATE_VERSION: "${CURRENT_VERSION}"
+    BASELINE_VERSION: "${PREVIOUS_VERSION}"
+    BENCHMARK_ITERATIONS: "${BENCHMARK_ITERATIONS:-5}"
+    BENCHMARK_MODES: "${BENCHMARK_MODES:-cpu,wall,alloc,memleak}"
+    DDPROF_COMMIT_SHA: "${CI_COMMIT_SHA}"
+    DDPROF_COMMIT_BRANCH: "${CI_COMMIT_BRANCH}"
+    UPSTREAM_PROJECT_NAME: "java-profiler"
+    UPSTREAM_BRANCH: "${CI_PIPELINE_ID}"
+    UPSTREAM_PIPELINE_ID: "${CI_PIPELINE_ID}"
+    MAVEN_REPOSITORY_PROXY: "${MAVEN_REPOSITORY_PROXY}"
+  trigger:
+    project: DataDog/apm-reliability/benchmarking-platform
+    branch: java-profiler
+    strategy: depend
 
-post-benchmarks-pr-comment:
-  extends: .retry-config
-  stage: benchmarks
-  tags: ["arch:arm64"]
-  image: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1
-  id_tokens:
-    DDOCTOSTS_ID_TOKEN:
-      aud: dd-octo-sts
-  needs:
-    - job: prepare:start
-      artifacts: true
-    - job: benchmarks-candidate-amd64
-      artifacts: true
-    - job: benchmarks-candidate-aarch64
-      artifacts: true
-  rules:
-    - if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null'
-      when: never
-    - if: '$CI_PIPELINE_SOURCE == "schedule"'
-      when: never
-    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
-      when: never
-    # Always run when the candidate jobs ran, regardless of source, so results
-    # are posted back to the PR.
-    - when: always
-  timeout: 5m
-  script:
-    - .gitlab/benchmarks/post-pr-comment.sh reports
-  allow_failure: true
 
 publish-benchmark-gh-pages:
-  stage: benchmarks
+  stage: post-benchmarks
   tags: ["arch:arm64"]
   image: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1
   id_tokens:
     DDOCTOSTS_ID_TOKEN:
       aud: dd-octo-sts
-  needs:
-    - job: benchmarks-candidate-amd64
-      artifacts: true
-    - job: benchmarks-candidate-aarch64
-      artifacts: true
+  # Rely on stage ordering so this job waits for benchmarks-trigger: an empty
+  # 'needs: []' would start the job at pipeline creation, before any results
+  # exist. 'dependencies: []' only skips artifact downloads from earlier stages.
+  dependencies: []
+  # Serialize concurrent GH Pages pushes. publish-gh-pages.sh uses
+  # 'git push --force'; two concurrent pushes race and the slower one
+  # silently discards the faster one's history update.
+  resource_group: gh-pages-publish
   rules:
     - if: '$CI_PIPELINE_SOURCE == "schedule"'
       when: never
     - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH || $CI_COMMIT_BRANCH == "main"'
       when: always
-  timeout: 10m
+  timeout: 15m
   script:
-    - ./.gitlab/benchmarks/publish-gh-pages.sh
+    - mkdir -p reports
+    - .gitlab/benchmarks/download-s3-reports.sh reports
+    - ./.gitlab/benchmarks/publish-gh-pages.sh reports
   allow_failure: true
 
 include:
   - local: .gitlab/common.yml
-  - local: .gitlab/benchmarks/images.yml
diff --git a/.gitlab/benchmarks/download-s3-reports.sh b/.gitlab/benchmarks/download-s3-reports.sh
new file mode 100755
index 000000000..762062e7c
--- /dev/null
+++ b/.gitlab/benchmarks/download-s3-reports.sh
@@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+# Downloads result JSONs from the BP downstream pipeline via the GitLab CI API.
+#
+# Requires only curl and python3 (stdlib) — no aws CLI, pip, or boto3 needed.
+# BP jobs already store artifacts in GitLab; this fetches them directly from
+# the downstream pipeline triggered by benchmarks-trigger.
+set -uo pipefail  # intentionally no -e: we handle errors explicitly
+
+DEST="${1:-reports}"
+mkdir -p "${DEST}"
+
+TMPDIR_LOCAL=$(mktemp -d)
+trap 'rm -rf "${TMPDIR_LOCAL}"' EXIT
+
+# ── helper: curl with explicit HTTP status checking ──────────────────────────
+# Usage: api_get URL OUTPUT_FILE
+# Returns 0 on 2xx, prints diagnostics and returns 1 otherwise.
+api_get() {
+  local url="$1" out="$2"
+  local http_code
+  http_code=$(curl -s -o "${out}" -w "%{http_code}" \
+    --header "JOB-TOKEN: ${CI_JOB_TOKEN}" "${url}")
+  if [[ "${http_code}" != 2* ]]; then
+    echo "  API ${url##*/}: HTTP ${http_code} — $(cat "${out}" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('message','?'))" 2>/dev/null || echo 'see above')"
+    return 1
+  fi
+  return 0
+}
+
+# ── 1. find the benchmarks-trigger bridge ────────────────────────────────────
+BRIDGES_FILE="${TMPDIR_LOCAL}/bridges.json"
+echo "Querying bridges for pipeline ${CI_PIPELINE_ID}…"
+if ! api_get \
+    "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/bridges" \
+    "${BRIDGES_FILE}"; then
+  echo "Cannot read pipeline bridges (job token may lack Reporter access) — skipping download"
+  exit 0
+fi
+
+read -r BP_PROJECT_ID DOWNSTREAM_PIPELINE_ID < <(python3 - "${BRIDGES_FILE}" <<'PYEOF'
+import json, sys
+with open(sys.argv[1]) as f:
+    bridges = json.load(f)
+for b in bridges:
+    if b.get("name") == "benchmarks-trigger":
+        dp = b.get("downstream_pipeline") or {}
+        if dp.get("id") and dp.get("project_id"):
+            print(dp["project_id"], dp["id"])
+            sys.exit(0)
+print("", "")
+PYEOF
+)
+
+if [ -z "${DOWNSTREAM_PIPELINE_ID:-}" ]; then
+  echo "benchmarks-trigger bridge not found or did not run — skipping download"
+  exit 0
+fi
+echo "BP downstream pipeline: project=${BP_PROJECT_ID} pipeline=${DOWNSTREAM_PIPELINE_ID}"
+
+# ── 2. list jobs in the downstream pipeline ──────────────────────────────────
+# NOTE(review): only the first page (up to 100 jobs) is read; paginate if BP outgrows it.
+JOBS_FILE="${TMPDIR_LOCAL}/jobs.json"
+echo "Listing BP pipeline jobs…"
+if ! api_get \
+    "${CI_API_V4_URL}/projects/${BP_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?per_page=100" \
+    "${JOBS_FILE}"; then
+  echo "Cannot list BP pipeline jobs — skipping download"
+  exit 0
+fi
+
+# Pass the path via argv so it is never spliced into the Python source text.
+JOB_IDS=$(python3 -c "
+import json, sys
+with open(sys.argv[1]) as f:
+    print(' '.join(str(j['id']) for j in json.load(f)))
+" "${JOBS_FILE}")
+
+# ── 3. download result_*.json from each job's artifact zip ───────────────────
+DOWNLOADED=0
+for JOB_ID in ${JOB_IDS}; do
+  ART_ZIP="${TMPDIR_LOCAL}/art_${JOB_ID}.zip"
+  ART_STATUS=$(curl -s -o "${ART_ZIP}" -w "%{http_code}" \
+    --header "JOB-TOKEN: ${CI_JOB_TOKEN}" \
+    "${CI_API_V4_URL}/projects/${BP_PROJECT_ID}/jobs/${JOB_ID}/artifacts" 2>/dev/null)
+  if [[ "${ART_STATUS}" == 2* ]]; then
+    # -j: junk paths (strip artifacts/ prefix), -q: quiet, -o: overwrite
+    if unzip -q -j -o "${ART_ZIP}" "artifacts/result_*.json" -d "${DEST}/" 2>/dev/null; then
+      DOWNLOADED=$((DOWNLOADED + 1))
+    fi
+  fi
+done
+
+RESULT_COUNT=$(find "${DEST}" -name "result_*.json" | wc -l)
+echo "result_*.json files: ${RESULT_COUNT} (from ${DOWNLOADED} BP job(s))"
+
+if [ "${RESULT_COUNT}" -eq 0 ]; then
+  echo "WARNING: no result JSONs found — BP jobs may not have run or produced artifacts yet"
+  exit 1
+fi
diff --git a/.gitlab/benchmarks/post-pr-comment.sh b/.gitlab/benchmarks/post-pr-comment.sh
index de410dff9..40f46a3e2 100755
--- a/.gitlab/benchmarks/post-pr-comment.sh
+++ b/.gitlab/benchmarks/post-pr-comment.sh
@@ -1,8 +1,9 @@
 #!/usr/bin/env bash
-# Post aggregated benchmark comparison results as a single PR comment.
+# Post aggregated benchmark results as a single PR comment.
 #
-# Expects all per-cell comparison-baseline-vs-candidate_*.md reports to be
-# present under REPORTS_DIR (default: reports/).
+# Handles two report types found under REPORTS_DIR (default: reports/):
+#   - comparison-baseline-vs-candidate_*.md (perf comparison benchmarks)
+#   - result_${BENCHMARK}_${JDK}_${LIBRARY}.json (reliability benchmarks)
 #
 # Required env:
 #   DDPROF_COMMIT_BRANCH – branch name used to locate the open PR
@@ -14,7 +15,7 @@ set -euo pipefail
 REPORTS_DIR="${1:-reports}"
 HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
-# Aggregate all per-cell reports into a single comment body
+# --- Perf comparison reports (markdown, one per benchmark cell) ---
 SECTIONS=""
 for md in "${REPORTS_DIR}"/comparison-baseline-vs-candidate_*.md; do
   [ -f "${md}" ] || continue
@@ -28,6 +29,84 @@ $(cat "${md}")
 "
 done
 
+# --- Reliability reports (JSON, one per benchmark×JDK×library) ---
+RELIABILITY_TABLE=$(python3 - "${REPORTS_DIR}" <<'PYEOF'
+import json, sys, glob, collections
+
+reports_dir = sys.argv[1]
+results = collections.defaultdict(dict)
+
+for path in sorted(glob.glob(f"{reports_dir}/result_*.json")):
+    try:
+        with open(path) as f:
+            r = json.load(f)
+        key = (r["benchmark"], r["jdk"])
+        results[key][r["library"]] = r
+    except (OSError, ValueError, KeyError, TypeError) as exc:
+        # Skip unreadable/malformed reports, but leave a trace in the job log
+        # (stderr is not captured into RELIABILITY_TABLE).
+        print(f"skipping {path}: {exc!r}", file=sys.stderr)
+        continue
+
+if not results:
+    sys.exit(0)
+
+def fmt_avg(r):
+    icon = "💥" if r.get("crashed") else "✅"
+    avg = r.get("avg_ms", 0)
+    cnt = r.get("run_count", 0)
+    return f"{icon} {avg} ms ({cnt} iters)"
+
+def fmt_delta(latest, dev):
+    la, da = latest.get("avg_ms", 0), dev.get("avg_ms", 0)
+    if not la or not da:
+        return "—"
+    pct = (da - la) / la * 100
+    # positive = dev is slower (potential regression); {pct:+.1f} already
+    # carries the sign, so the marker must not add a second "+"
+    arrow = "🔴 " if pct > 2 else ("🟢 " if pct < -2 else "")
+    return f"{arrow}{pct:+.1f}%"
+
+def fmt_uploads(r):
+    return str(r.get("upload_count", 0)) if r else "—"
+
+def fmt_issues(r):
+    e = r.get("error_count", 0)
+    w = r.get("warn_count", 0)
+    if e == 0 and w == 0:
+        return "—"
+    parts = []
+    if e: parts.append(f"E:{e}")
+    if w: parts.append(f"W:{w}")
+    return "⚠️ " + " ".join(parts)
+
+lines = [
+    "### Reliability Benchmarks",
+    "",
+    "| Benchmark | JDK | Latest | Dev | Δ% (dev vs latest) | Uploads L/D | Issues L/D |",
+    "|-----------|-----|--------|-----|-------------------|-------------|------------|",
+]
+for (bench, jdk) in sorted(results.keys()):
+    libs = results[(bench, jdk)]
+    latest = libs.get("latest", {})
+    dev = libs.get("dev", {})
+
+    col_latest = fmt_avg(latest) if latest else "—"
+    col_dev = fmt_avg(dev) if dev else "—"
+    col_delta = fmt_delta(latest, dev) if (latest and dev) else "—"
+    col_uploads = f"{fmt_uploads(latest)} / {fmt_uploads(dev)}"
+    col_issues = f"{fmt_issues(latest)} / {fmt_issues(dev)}"
+
+    lines.append(f"| {bench} | {jdk} | {col_latest} | {col_dev} | {col_delta} | {col_uploads} | {col_issues} |")
+
+print("\n".join(lines))
+PYEOF
+) || true
+
+[ -n "${RELIABILITY_TABLE}" ] && SECTIONS="${SECTIONS}
+${RELIABILITY_TABLE}
+"
+
 if [ -z "${SECTIONS}" ]; then
   echo "No benchmark reports found under ${REPORTS_DIR} — skipping comment"
   exit 0