From 10bd343a1edee2d4e3e06dfdcde7c2c0b4e25216 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Feb 2026 13:44:05 +1300 Subject: [PATCH 01/21] [ML] Add AI-powered build failure analysis to CI pipelines When a Buildkite build fails, a new soft-fail step fetches the failed step logs and sends them to Claude for diagnosis. The analysis (root cause, classification, suggested fix, confidence) is posted as a Buildkite annotation directly on the build page. The step uses an `if` guard so it only runs when the build is failing, and the Claude API key is retrieved from Vault at runtime. Co-authored-by: Cursor --- .buildkite/branch.json.py | 3 + .buildkite/hooks/post-checkout | 6 + .buildkite/job-build-test-all-debug.json.py | 3 + .buildkite/pipeline.json.py | 3 + .../pipelines/analyze_build_failure.yml.sh | 27 ++ dev-tools/analyze_build_failure.py | 253 ++++++++++++++++++ 6 files changed, 295 insertions(+) create mode 100755 .buildkite/pipelines/analyze_build_failure.yml.sh create mode 100755 dev-tools/analyze_build_failure.py diff --git a/.buildkite/branch.json.py b/.buildkite/branch.json.py index 4916a89cc..70f8f9754 100755 --- a/.buildkite/branch.json.py +++ b/.buildkite/branch.json.py @@ -46,6 +46,9 @@ def main(): # Ingest step-level timings into Elasticsearch for anomaly detection pipeline_steps.append(pipeline_steps.generate_step("Ingest build timings", ".buildkite/pipelines/ingest_build_timings.yml.sh")) + # Analyze failures with AI if the build failed + pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure", + ".buildkite/pipelines/analyze_build_failure.yml.sh")) # Build the DRA artifacts and upload to S3 and GCS pipeline_steps.append(pipeline_steps.generate_step("Create daily releasable artifacts", diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout index aca27b008..c3006feab 100644 --- a/.buildkite/hooks/post-checkout +++ b/.buildkite/hooks/post-checkout @@ -33,6 +33,12 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then export ES_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/elasticsearch/ci_analytics 2>/dev/null || echo "") fi + if [[ "$BUILDKITE_STEP_KEY" == "analyze_build_failure" ]]; then + export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "") + export ANTHROPIC_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/anthropic/claude 2>/dev/null || echo "") + export SLACK_WEBHOOK_URL=$(vault read -field=url secret/ci/elastic-ml-cpp/slack/build_failure_webhook 2>/dev/null || echo "") + fi + # GCS service account — inject credentials for build and Java IT steps. # Build steps use it for sccache; Java IT steps use it for the Gradle # build cache. The key is stored in Vault. diff --git a/.buildkite/job-build-test-all-debug.json.py b/.buildkite/job-build-test-all-debug.json.py index 7aa0e4a61..39347d086 100755 --- a/.buildkite/job-build-test-all-debug.json.py +++ b/.buildkite/job-build-test-all-debug.json.py @@ -57,6 +57,9 @@ def main(): # Ingest step-level timings into Elasticsearch for anomaly detection pipeline_steps.append(pipeline_steps.generate_step("Ingest build timings", ".buildkite/pipelines/ingest_build_timings.yml.sh")) + # Analyze failures with AI if the build failed + pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure", + ".buildkite/pipelines/analyze_build_failure.yml.sh")) pipeline["env"] = env pipeline["steps"] = pipeline_steps diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py index 1796a665b..c15d6bb84 100755 --- a/.buildkite/pipeline.json.py +++ b/.buildkite/pipeline.json.py @@ -68,6 +68,9 @@ def main(): # Check for build timing regressions against nightly baseline pipeline_steps.append(pipeline_steps.generate_step("Check build timing regressions", ".buildkite/pipelines/check_build_regression.yml.sh")) + # Analyze failures with AI if the build failed + pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure", + ".buildkite/pipelines/analyze_build_failure.yml.sh")) pipeline["env"] = env pipeline["steps"] = pipeline_steps diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh new file mode 100755 index 000000000..162c3fb05 --- /dev/null +++ b/.buildkite/pipelines/analyze_build_failure.yml.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License +# 2.0 and the following additional limitation. Functionality enabled by the +# files subject to the Elastic License 2.0 may only be used in production when +# invoked by an Elasticsearch process with a license key installed that permits +# use of machine learning features. You may not use this file except in +# compliance with the Elastic License 2.0 and the foregoing additional +# limitation. + +cat < + +### Classification + + +### Suggested Fix + + +### Confidence + +""" + + +def get_env_or_file(env_var, file_path): + val = os.environ.get(env_var, "").strip() + if val: + return val + if file_path: + p = Path(file_path).expanduser() + if p.exists(): + return p.read_text().strip() + return None + + +def buildkite_get(path, token): + url = f"https://api.buildkite.com/v2/organizations/{BUILDKITE_ORG}/{path}" + req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"}) + with urllib.request.urlopen(req) as resp: + return json.loads(resp.read()) + + +def get_job_log(log_url, token): + """Fetch the raw log for a Buildkite job.""" + req = urllib.request.Request( + log_url, + headers={ + "Authorization": f"Bearer {token}", + "Accept": "text/plain", + }, + ) + try: + with urllib.request.urlopen(req) as resp: + return resp.read().decode("utf-8", errors="replace") + except urllib.error.HTTPError: + return None + + +def truncate_log(log_text, max_chars=MAX_LOG_CHARS): + """Keep the last max_chars of the log (the end usually has the error).""" + if not log_text or len(log_text) <= max_chars: + return log_text + return f"... [truncated {len(log_text) - max_chars} chars] ...\n" + log_text[-max_chars:] + + +def call_claude(api_key, prompt): + body = json.dumps({ + "model": ANTHROPIC_MODEL, + "max_tokens": MAX_RESPONSE_TOKENS, + "system": SYSTEM_PROMPT, + "messages": [{"role": "user", "content": prompt}], + }).encode("utf-8") + + req = urllib.request.Request( + ANTHROPIC_API_URL, + data=body, + headers={ + "x-api-key": api_key, + "anthropic-version": "2023-06-01", + "Content-Type": "application/json", + }, + ) + with urllib.request.urlopen(req, timeout=60) as resp: + result = json.loads(resp.read()) + + for block in result.get("content", []): + if block.get("type") == "text": + return block["text"] + return "No analysis generated." + + +def main(): + parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude") + parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG")) + parser.add_argument("--build", type=int, default=int(os.environ.get("BUILDKITE_BUILD_NUMBER", "0"))) + parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating") + args = parser.parse_args() + + if not args.pipeline or not args.build: + print("Error: --pipeline and --build required", file=sys.stderr) + sys.exit(1) + + bk_token = (get_env_or_file("BUILDKITE_TOKEN", "~/.buildkite/token") + or get_env_or_file("BUILDKITE_API_READ_TOKEN", "")) + claude_key = get_env_or_file("ANTHROPIC_API_KEY", "~/.elastic/claude_api_key") + + if not bk_token: + print("Error: No Buildkite token available", file=sys.stderr) + sys.exit(1) + if not claude_key: + print("Error: No Anthropic API key available", file=sys.stderr) + sys.exit(1) + + print(f"Analyzing {args.pipeline} build #{args.build}...") + + build = buildkite_get(f"pipelines/{args.pipeline}/builds/{args.build}", bk_token) + + if build.get("state") == "passed": + print("Build passed — nothing to analyze.") + sys.exit(0) + + failed_jobs = [ + j for j in build.get("jobs", []) + if j.get("type") == "script" and j.get("state") == "failed" + ] + + if not failed_jobs: + print("No failed steps found.") + sys.exit(0) + + print(f"Found {len(failed_jobs)} failed step(s)") + + all_analyses = [] + + for job in failed_jobs: + step_key = job.get("step_key", "unknown") + step_label = job.get("name", step_key) + raw_log_url = job.get("raw_log_url", "") + + print(f"\nAnalyzing: {step_label} ({step_key})") + + log_text = get_job_log(raw_log_url, bk_token) if raw_log_url else None + if not log_text: + print(f" Could not fetch log, skipping") + continue + + log_excerpt = truncate_log(log_text) + + prompt = f"""Analyze this CI build failure. + +**Pipeline**: {args.pipeline} +**Build**: #{args.build} +**Branch**: {build.get('branch', 'unknown')} +**Failed step**: {step_label} (key: {step_key}) + +{KNOWN_FAILURE_PATTERNS} + +**Build log (last {MAX_LOG_CHARS} chars)**: +``` +{log_excerpt} +``` + +Analyze the root cause and suggest a fix.""" + + try: + analysis = call_claude(claude_key, prompt) + except Exception as e: + analysis = f"Failed to get analysis: {e}" + + print(f"\n{analysis}") + all_analyses.append(f"## {step_label}\n\n{analysis}") + + if not all_analyses: + print("No analyses generated.") + sys.exit(0) + + full_annotation = f"# 🔍 Build Failure Analysis\n\n" + full_annotation += f"*Pipeline*: `{args.pipeline}` | *Build*: #{args.build} | *Branch*: `{build.get('branch', '?')}`\n\n" + full_annotation += "\n\n---\n\n".join(all_analyses) + full_annotation += "\n\n---\n*Analysis generated by Claude. Verify before acting.*" + + if not args.dry_run: + try: + subprocess.run( + ["buildkite-agent", "annotate", + "--style", "error", + "--context", "build-failure-analysis"], + input=full_annotation.encode(), + check=True, + ) + print("\nAnnotation posted to Buildkite.") + except (FileNotFoundError, subprocess.CalledProcessError) as e: + print(f"\nCould not post annotation: {e}", file=sys.stderr) + print("Full analysis printed above.") + + +if __name__ == "__main__": + main() From 4dd54423e567ee83bb32a894416a4c98f1d77a9c Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Feb 2026 13:58:31 +1300 Subject: [PATCH 02/21] [ML] Add Slack notifications to build failure analyzer When SLACK_WEBHOOK_URL is set, posts a compact summary of each failed step's AI diagnosis to #machine-learn-build. The message includes the classification emoji, root cause, and a link back to the build page. The webhook URL is retrieved from Vault at runtime; if absent, the Slack step is silently skipped and only the Buildkite annotation is posted. Co-authored-by: Cursor --- dev-tools/analyze_build_failure.py | 98 +++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 3 deletions(-) diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py index 7522a6840..cd55769af 100755 --- a/dev-tools/analyze_build_failure.py +++ b/dev-tools/analyze_build_failure.py @@ -2,7 +2,8 @@ """Analyze a Buildkite build failure using Claude and post a diagnosis. Fetches logs from failed build steps, sends them to the Anthropic Claude API -with repository context, and posts the analysis as a Buildkite annotation. +with repository context, and posts the analysis as a Buildkite annotation +and optionally to Slack. Usage: # Analyze the current build (in CI) @@ -11,12 +12,13 @@ # Analyze a specific build python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 - # Dry run (print to stdout, don't annotate) + # Dry run (print to stdout, don't annotate or post to Slack) python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 --dry-run Environment: BUILDKITE_TOKEN / BUILDKITE_API_READ_TOKEN Buildkite API token ANTHROPIC_API_KEY Claude API key + SLACK_WEBHOOK_URL Slack incoming webhook (optional) BUILDKITE_PIPELINE_SLUG Current pipeline (set by Buildkite) BUILDKITE_BUILD_NUMBER Current build number (set by Buildkite) """ @@ -144,11 +146,88 @@ def call_claude(api_key, prompt): return "No analysis generated." +def post_to_slack(webhook_url, pipeline, build_number, branch, build_url, analyses): + """Post a summary of the failure analysis to Slack.""" + # Slack uses mrkdwn, not full markdown — convert minimally + blocks = [ + { + "type": "header", + "text": { + "type": "plain_text", + "text": "Build Failure Analysis", + }, + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + f"*Pipeline:* `{pipeline}` | *Build:* <{build_url}|#{build_number}> | *Branch:* `{branch}`" + ), + }, + }, + ] + + for step_label, analysis in analyses: + # Extract just the classification and root cause for a compact Slack message + lines = analysis.split("\n") + root_cause = "" + classification = "" + for i, line in enumerate(lines): + if line.startswith("### Root Cause"): + root_cause = lines[i + 1].strip() if i + 1 < len(lines) else "" + elif line.startswith("### Classification"): + classification = lines[i + 1].strip() if i + 1 < len(lines) else "" + + emoji = { + "infrastructure/transient": ":cloud:", + "code bug": ":bug:", + "test failure": ":test_tube:", + "configuration": ":gear:", + "dependency": ":package:", + }.get(classification, ":warning:") + + blocks.append({"type": "divider"}) + blocks.append({ + "type": "section", + "text": { + "type": "mrkdwn", + "text": f"{emoji} *{step_label}*\n>{root_cause}\n_Classification: {classification}_", + }, + }) + + blocks.append({"type": "divider"}) + blocks.append({ + "type": "context", + "elements": [ + { + "type": "mrkdwn", + "text": f"<{build_url}|View build> | Analysis by Claude — verify before acting", + } + ], + }) + + payload = json.dumps({"blocks": blocks}).encode("utf-8") + req = urllib.request.Request( + webhook_url, + data=payload, + headers={"Content-Type": "application/json"}, + ) + try: + with urllib.request.urlopen(req, timeout=10) as resp: + if resp.status == 200: + print("Slack notification posted.") + else: + print(f"Slack returned status {resp.status}", file=sys.stderr) + except Exception as e: + print(f"Could not post to Slack: {e}", file=sys.stderr) + + def main(): parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude") parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG")) parser.add_argument("--build", type=int, default=int(os.environ.get("BUILDKITE_BUILD_NUMBER", "0"))) - parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating") + parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating or posting to Slack") args = parser.parse_args() if not args.pipeline or not args.build: @@ -185,7 +264,11 @@ def main(): print(f"Found {len(failed_jobs)} failed step(s)") + slack_webhook = get_env_or_file("SLACK_WEBHOOK_URL", "") + build_url = build.get("web_url", f"https://buildkite.com/{BUILDKITE_ORG}/{args.pipeline}/builds/{args.build}") + all_analyses = [] + slack_analyses = [] for job in failed_jobs: step_key = job.get("step_key", "unknown") @@ -224,6 +307,7 @@ def main(): print(f"\n{analysis}") all_analyses.append(f"## {step_label}\n\n{analysis}") + slack_analyses.append((step_label, analysis)) if not all_analyses: print("No analyses generated.") @@ -248,6 +332,14 @@ def main(): print(f"\nCould not post annotation: {e}", file=sys.stderr) print("Full analysis printed above.") + if slack_webhook: + post_to_slack( + slack_webhook, args.pipeline, args.build, + build.get("branch", "?"), build_url, slack_analyses, + ) + else: + print("No SLACK_WEBHOOK_URL set, skipping Slack notification.") + if __name__ == "__main__": main() From 64b138e027d96947c4863ac2f67e7a5c3064fb0b Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 27 Feb 2026 15:12:07 +1300 Subject: [PATCH 03/21] Post build failure analysis as a GitHub PR comment When the build is a PR build (BUILDKITE_PULL_REQUEST is set), post the Claude analysis as a comment on the GitHub PR in addition to the Buildkite annotation and Slack notification. Uses an HTML comment marker to find and update existing comments on rebuild/retry, avoiding duplicate comments on the same PR. Addresses review feedback from valeriy42 requesting better visibility of failure analysis for PR authors. Made-with: Cursor --- .buildkite/hooks/post-checkout | 1 + dev-tools/analyze_build_failure.py | 76 ++++++++++++++++++++++++++++-- 2 files changed, 74 insertions(+), 3 deletions(-) diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout index c3006feab..07d50cf16 100644 --- a/.buildkite/hooks/post-checkout +++ b/.buildkite/hooks/post-checkout @@ -37,6 +37,7 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "") export ANTHROPIC_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/anthropic/claude 2>/dev/null || echo "") export SLACK_WEBHOOK_URL=$(vault read -field=url secret/ci/elastic-ml-cpp/slack/build_failure_webhook 2>/dev/null || echo "") + export GITHUB_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/github/pr_comment_token 2>/dev/null || echo "") fi # GCS service account — inject credentials for build and Java IT steps. diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py index cd55769af..456fc5348 100755 --- a/dev-tools/analyze_build_failure.py +++ b/dev-tools/analyze_build_failure.py @@ -2,8 +2,8 @@ """Analyze a Buildkite build failure using Claude and post a diagnosis. Fetches logs from failed build steps, sends them to the Anthropic Claude API -with repository context, and posts the analysis as a Buildkite annotation -and optionally to Slack. +with repository context, and posts the analysis as a Buildkite annotation, +a GitHub PR comment (for PR builds), and optionally to Slack. Usage: # Analyze the current build (in CI) @@ -12,15 +12,17 @@ # Analyze a specific build python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 - # Dry run (print to stdout, don't annotate or post to Slack) + # Dry run (print to stdout, don't annotate or post to Slack/GitHub) python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 --dry-run Environment: BUILDKITE_TOKEN / BUILDKITE_API_READ_TOKEN Buildkite API token ANTHROPIC_API_KEY Claude API key + GITHUB_TOKEN GitHub API token (optional, for PR comments) SLACK_WEBHOOK_URL Slack incoming webhook (optional) BUILDKITE_PIPELINE_SLUG Current pipeline (set by Buildkite) BUILDKITE_BUILD_NUMBER Current build number (set by Buildkite) + BUILDKITE_PULL_REQUEST PR number (set by Buildkite for PR builds) """ import argparse @@ -33,6 +35,9 @@ from pathlib import Path BUILDKITE_ORG = "elastic" +GITHUB_REPO = "elastic/ml-cpp" +GITHUB_API_URL = "https://api.github.com" +GITHUB_COMMENT_MARKER = "" ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages" ANTHROPIC_MODEL = "claude-sonnet-4-20250514" MAX_LOG_CHARS = 30000 @@ -223,6 +228,60 @@ def post_to_slack(webhook_url, pipeline, build_number, branch, build_url, analys print(f"Could not post to Slack: {e}", file=sys.stderr) +def github_api(method, path, token, data=None): + """Make a GitHub API request and return the parsed JSON response.""" + url = f"{GITHUB_API_URL}{path}" + body = json.dumps(data).encode("utf-8") if data else None + req = urllib.request.Request(url, data=body, method=method, headers={ + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + "Content-Type": "application/json", + }) + with urllib.request.urlopen(req, timeout=15) as resp: + return json.loads(resp.read()) + + +def find_existing_comment(pr_number, token): + """Find an existing analysis comment on the PR by looking for the marker.""" + page = 1 + while True: + comments = github_api( + "GET", f"/repos/{GITHUB_REPO}/issues/{pr_number}/comments?per_page=100&page={page}", token + ) + if not comments: + break + for comment in comments: + if GITHUB_COMMENT_MARKER in comment.get("body", ""): + return comment["id"] + page += 1 + return None + + +def post_to_github(token, pr_number, build_url, annotation_body): + """Post or update a build failure analysis comment on a GitHub PR.""" + comment_body = ( + f"{GITHUB_COMMENT_MARKER}\n" + f"## :mag: Build Failure Analysis\n\n" + f"{annotation_body}\n\n" + f"---\n" + f"[View Buildkite build]({build_url}) | " + f"*Analysis generated by Claude. Verify before acting.*" + ) + + try: + existing_id = find_existing_comment(pr_number, token) + if existing_id: + github_api("PATCH", f"/repos/{GITHUB_REPO}/issues/comments/{existing_id}", token, + {"body": comment_body}) + print(f"Updated existing GitHub comment on PR #{pr_number}.") + else: + github_api("POST", f"/repos/{GITHUB_REPO}/issues/{pr_number}/comments", token, + {"body": comment_body}) + print(f"Posted GitHub comment on PR #{pr_number}.") + except Exception as e: + print(f"Could not post to GitHub: {e}", file=sys.stderr) + + def main(): parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude") parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG")) @@ -265,6 +324,9 @@ def main(): print(f"Found {len(failed_jobs)} failed step(s)") slack_webhook = get_env_or_file("SLACK_WEBHOOK_URL", "") + github_token = get_env_or_file("GITHUB_TOKEN", "") + pr_number = os.environ.get("BUILDKITE_PULL_REQUEST", "false") + pr_number = int(pr_number) if pr_number not in ("false", "") else None build_url = build.get("web_url", f"https://buildkite.com/{BUILDKITE_ORG}/{args.pipeline}/builds/{args.build}") all_analyses = [] @@ -340,6 +402,14 @@ def main(): else: print("No SLACK_WEBHOOK_URL set, skipping Slack notification.") + if github_token and pr_number: + annotation_body = "\n\n---\n\n".join(all_analyses) + post_to_github(github_token, pr_number, build_url, annotation_body) + elif pr_number: + print("No GITHUB_TOKEN set, skipping GitHub PR comment.") + else: + print("Not a PR build, skipping GitHub PR comment.") + if __name__ == "__main__": main() From b15417fb1cb8952380f760a44f3f14cdd1ffd079 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 27 Feb 2026 15:17:35 +1300 Subject: [PATCH 04/21] Add --pr flag for testing GitHub comment posting Allows overriding the PR number from the command line, useful for local testing of the GitHub comment feature without being in a Buildkite PR build environment. Tested end-to-end against build #2232 (Bayesian test timeout), posting to a throwaway PR. Both initial post and update-in-place (deduplication) verified working. Made-with: Cursor --- dev-tools/analyze_build_failure.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py index 456fc5348..cda73e85c 100755 --- a/dev-tools/analyze_build_failure.py +++ b/dev-tools/analyze_build_failure.py @@ -286,6 +286,8 @@ def main(): parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude") parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG")) parser.add_argument("--build", type=int, default=int(os.environ.get("BUILDKITE_BUILD_NUMBER", "0"))) + parser.add_argument("--pr", type=int, default=None, + help="Override PR number (for testing GitHub comment posting)") parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating or posting to Slack") args = parser.parse_args() @@ -325,8 +327,10 @@ def main(): slack_webhook = get_env_or_file("SLACK_WEBHOOK_URL", "") github_token = get_env_or_file("GITHUB_TOKEN", "") - pr_number = os.environ.get("BUILDKITE_PULL_REQUEST", "false") - pr_number = int(pr_number) if pr_number not in ("false", "") else None + pr_number = args.pr + if pr_number is None: + pr_env = os.environ.get("BUILDKITE_PULL_REQUEST", "false") + pr_number = int(pr_env) if pr_env not in ("false", "") else None build_url = build.get("web_url", f"https://buildkite.com/{BUILDKITE_ORG}/{args.pipeline}/builds/{args.build}") all_analyses = [] From 9ff81a16d0fb1b9618a859139b78dbca713df102 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Mon, 16 Mar 2026 15:39:39 +1300 Subject: [PATCH 05/21] [ML] Make AI failure analysis opt-in for PR builds Failure analysis now only runs on PR builds when triggered by a `buildkite analyze` comment, avoiding unnecessary API token usage. Nightly and debug pipelines retain automatic analysis on failure. Made-with: Cursor --- .buildkite/ml_pipeline/config.py | 4 +++- .buildkite/pipeline.json.py | 6 +++--- .buildkite/pull-requests.json | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.buildkite/ml_pipeline/config.py b/.buildkite/ml_pipeline/config.py index 4669ce8b5..502217c00 100644 --- a/.buildkite/ml_pipeline/config.py +++ b/.buildkite/ml_pipeline/config.py @@ -19,6 +19,7 @@ class Config: build_x86_64: str = "" run_qa_tests: bool = False run_pytorch_tests: bool = False + run_analyze: bool = False action: str = "build" def parse_comment(self): @@ -37,7 +38,8 @@ def parse_comment(self): self.action = os.environ["GITHUB_PR_COMMENT_VAR_ACTION"] self.run_qa_tests = self.action == "run_qa_tests" self.run_pytorch_tests = self.action == "run_pytorch_tests" - if self.run_pytorch_tests or self.run_qa_tests: + self.run_analyze = self.action == "analyze" + if self.run_pytorch_tests or self.run_qa_tests or self.run_analyze: self.action = "build" # If the ACTION is set to "run_qa_tests" then set some optional variables governing the ES branch to build, the diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py index c15d6bb84..f7a48077e 100755 --- a/.buildkite/pipeline.json.py +++ b/.buildkite/pipeline.json.py @@ -68,9 +68,9 @@ def main(): # Check for build timing regressions against nightly baseline pipeline_steps.append(pipeline_steps.generate_step("Check build timing regressions", ".buildkite/pipelines/check_build_regression.yml.sh")) - # Analyze failures with AI if the build failed - pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure", - ".buildkite/pipelines/analyze_build_failure.yml.sh")) + if config.run_analyze: + pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure", + ".buildkite/pipelines/analyze_build_failure.yml.sh")) pipeline["env"] = env pipeline["steps"] = pipeline_steps diff --git a/.buildkite/pull-requests.json b/.buildkite/pull-requests.json index dcea82794..81248a767 100644 --- a/.buildkite/pull-requests.json +++ b/.buildkite/pull-requests.json @@ -9,7 +9,7 @@ "commit_status_context": "ml-cpp-ci", "build_on_commit": true, "build_on_comment": true, - "trigger_comment_regex": "^(?:(?:buildkite +)(?build|debug|run_qa_tests|run_pytorch_tests)(=(?(?:[^ ]+)))? *(?: for ES_BRANCH=(?([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?([.0-9]+)))? *(?: *on *(?(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?(?:[, ]*aarch64|x86_64)+)?$", + "trigger_comment_regex": "^(?:(?:buildkite +)(?build|debug|run_qa_tests|run_pytorch_tests|analyze)(=(?(?:[^ ]+)))? *(?: for ES_BRANCH=(?([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?([.0-9]+)))? *(?: *on *(?(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?(?:[, ]*aarch64|x86_64)+)?$", "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))", "skip_ci_labels": ["skip-ci", "jenkins-ci", ">test-mute", ">docs"], "skip_target_branches": ["6.8", "7.11", "7.12"], From 44594cd5fdfd3191a9fa8824b5ecbef3588dc903 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 19 Mar 2026 14:19:07 +1300 Subject: [PATCH 06/21] [ML] Enable native Buildkite PR comments for build failures Enable the ELASTIC_PR_COMMENTS_ENABLED feature on the PR builds pipeline so that elasticmachine posts a summary comment listing failed steps and build history directly on the GitHub PR. Made-with: Cursor --- catalog-info.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/catalog-info.yaml b/catalog-info.yaml index 17f8aad68..44cd2a4aa 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -38,6 +38,8 @@ spec: publish_commit_status: false publish_commit_status_per_step: true trigger_mode: code + env: + ELASTIC_PR_COMMENTS_ENABLED: 'true' repository: elastic/ml-cpp skip_intermediate_builds: true teams: From 1ea0535f226d640a3e2ee75d34311161c2a03d7a Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 19 Mar 2026 14:34:41 +1300 Subject: [PATCH 07/21] [ML] Post AI analysis as PR comment via GitHub Actions Replace direct GitHub API calls from the Buildkite analyze step with a GitHub Actions workflow that uses the built-in GITHUB_TOKEN. The Buildkite step now saves the analysis as build metadata, and a GitHub Actions workflow triggered by the commit status event fetches it and posts/updates the PR comment. This eliminates the need for a personal access token or GitHub App for PR comments. Made-with: Cursor --- .buildkite/hooks/post-checkout | 1 - .github/workflows/post-build-analysis.yml | 133 ++++++++++++++++++++++ dev-tools/analyze_build_failure.py | 91 +++------------ 3 files changed, 150 insertions(+), 75 deletions(-) create mode 100644 .github/workflows/post-build-analysis.yml diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout index 07d50cf16..c3006feab 100644 --- a/.buildkite/hooks/post-checkout +++ b/.buildkite/hooks/post-checkout @@ -37,7 +37,6 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "") export ANTHROPIC_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/anthropic/claude 2>/dev/null || echo "") export SLACK_WEBHOOK_URL=$(vault read -field=url secret/ci/elastic-ml-cpp/slack/build_failure_webhook 2>/dev/null || echo "") - export GITHUB_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/github/pr_comment_token 2>/dev/null || echo "") fi # GCS service account — inject credentials for build and Java IT steps. diff --git a/.github/workflows/post-build-analysis.yml b/.github/workflows/post-build-analysis.yml new file mode 100644 index 000000000..e3dfdeeb4 --- /dev/null +++ b/.github/workflows/post-build-analysis.yml @@ -0,0 +1,133 @@ +name: Post Build Failure Analysis + +# Triggered by commit status updates from Buildkite. When the +# analyze_build_failure step completes, Buildkite posts a commit status +# which fires this workflow. We fetch the AI analysis from Buildkite +# build metadata and post it as a PR comment using the built-in +# GITHUB_TOKEN (no PAT or GitHub App needed). + +on: + status: + +permissions: + pull-requests: write + statuses: read + +jobs: + post-analysis: + # Only run when the analyze step succeeds (soft_fail means Buildkite + # reports success even if the analysis itself had issues). + if: >- + github.event.state == 'success' && + contains(github.event.context, 'Analyze build failure') + runs-on: ubuntu-latest + steps: + - name: Find PR for commit + id: find-pr + env: + GH_TOKEN: ${{ github.token }} + SHA: ${{ github.event.sha }} + run: | + PR_NUMBER=$(gh api "repos/${{ github.repository }}/commits/${SHA}/pulls" \ + --jq '.[0].number // empty' 2>/dev/null || true) + if [ -z "$PR_NUMBER" ]; then + echo "No PR found for commit ${SHA} — skipping." + echo "skip=true" >> "$GITHUB_OUTPUT" + else + echo "Found PR #${PR_NUMBER}" + echo "skip=false" >> "$GITHUB_OUTPUT" + echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT" + fi + + - name: Extract Buildkite build info + if: steps.find-pr.outputs.skip != 'true' + id: bk-info + env: + TARGET_URL: ${{ github.event.target_url }} + run: | + # target_url looks like: + # https://buildkite.com/elastic/ml-cpp-pr-builds/builds/2361#step-key + # Extract pipeline slug and build number. + PIPELINE=$(echo "$TARGET_URL" | sed -n 's|.*/elastic/\([^/]*\)/builds/.*|\1|p') + BUILD_NUM=$(echo "$TARGET_URL" | sed -n 's|.*/builds/\([0-9]*\).*|\1|p') + if [ -z "$PIPELINE" ] || [ -z "$BUILD_NUM" ]; then + echo "Could not parse Buildkite URL: $TARGET_URL" + echo "skip=true" >> "$GITHUB_OUTPUT" + else + echo "Pipeline: $PIPELINE, Build: $BUILD_NUM" + echo "skip=false" >> "$GITHUB_OUTPUT" + echo "pipeline=${PIPELINE}" >> "$GITHUB_OUTPUT" + echo "build_num=${BUILD_NUM}" >> "$GITHUB_OUTPUT" + fi + + - name: Fetch analysis from Buildkite + if: >- + steps.find-pr.outputs.skip != 'true' && + steps.bk-info.outputs.skip != 'true' + id: fetch + env: + BK_TOKEN: ${{ secrets.BUILDKITE_API_READ_TOKEN }} + PIPELINE: ${{ steps.bk-info.outputs.pipeline }} + BUILD_NUM: ${{ steps.bk-info.outputs.build_num }} + run: | + if [ -z "$BK_TOKEN" ]; then + echo "BUILDKITE_API_READ_TOKEN secret not set — skipping." + echo "skip=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + + # Fetch build metadata containing the analysis. + ANALYSIS=$(curl -sS -f \ + -H "Authorization: Bearer ${BK_TOKEN}" \ + "https://api.buildkite.com/v2/organizations/elastic/pipelines/${PIPELINE}/builds/${BUILD_NUM}/meta-data/build-failure-analysis" \ + 2>/dev/null) || true + + if [ -z "$ANALYSIS" ]; then + echo "No analysis metadata found — skipping." + echo "skip=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + + # Save to file to avoid shell quoting issues. + echo "$ANALYSIS" > /tmp/analysis.md + echo "skip=false" >> "$GITHUB_OUTPUT" + + - name: Post or update PR comment + if: >- + steps.find-pr.outputs.skip != 'true' && + steps.bk-info.outputs.skip != 'true' && + steps.fetch.outputs.skip != 'true' + env: + GH_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ steps.find-pr.outputs.pr_number }} + PIPELINE: ${{ steps.bk-info.outputs.pipeline }} + BUILD_NUM: ${{ steps.bk-info.outputs.build_num }} + run: | + MARKER="" + BUILD_URL="https://buildkite.com/elastic/${PIPELINE}/builds/${BUILD_NUM}" + ANALYSIS=$(cat /tmp/analysis.md) + + BODY=$(cat </dev/null | head -1) + + if [ -n "$EXISTING_ID" ]; then + gh api "repos/${{ github.repository }}/issues/comments/${EXISTING_ID}" \ + -X PATCH -f body="$BODY" + echo "Updated existing comment on PR #${PR_NUMBER}." + else + gh api "repos/${{ github.repository }}/issues/${PR_NUMBER}/comments" \ + -f body="$BODY" + echo "Posted new comment on PR #${PR_NUMBER}." + fi diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py index cda73e85c..ba93e1d33 100755 --- a/dev-tools/analyze_build_failure.py +++ b/dev-tools/analyze_build_failure.py @@ -3,7 +3,8 @@ Fetches logs from failed build steps, sends them to the Anthropic Claude API with repository context, and posts the analysis as a Buildkite annotation, -a GitHub PR comment (for PR builds), and optionally to Slack. +Buildkite build metadata (for the GitHub Actions PR comment workflow), +and optionally to Slack. Usage: # Analyze the current build (in CI) @@ -18,11 +19,9 @@ Environment: BUILDKITE_TOKEN / BUILDKITE_API_READ_TOKEN Buildkite API token ANTHROPIC_API_KEY Claude API key - GITHUB_TOKEN GitHub API token (optional, for PR comments) SLACK_WEBHOOK_URL Slack incoming webhook (optional) BUILDKITE_PIPELINE_SLUG Current pipeline (set by Buildkite) BUILDKITE_BUILD_NUMBER Current build number (set by Buildkite) - BUILDKITE_PULL_REQUEST PR number (set by Buildkite for PR builds) """ import argparse @@ -35,9 +34,6 @@ from pathlib import Path BUILDKITE_ORG = "elastic" -GITHUB_REPO = "elastic/ml-cpp" -GITHUB_API_URL = "https://api.github.com" -GITHUB_COMMENT_MARKER = "" ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages" ANTHROPIC_MODEL = "claude-sonnet-4-20250514" MAX_LOG_CHARS = 30000 @@ -228,66 +224,11 @@ def post_to_slack(webhook_url, pipeline, build_number, branch, build_url, analys print(f"Could not post to Slack: {e}", file=sys.stderr) -def github_api(method, path, token, data=None): - """Make a GitHub API request and return the parsed JSON response.""" - url = f"{GITHUB_API_URL}{path}" - body = json.dumps(data).encode("utf-8") if data else None - req = urllib.request.Request(url, data=body, method=method, headers={ - "Authorization": f"token {token}", - "Accept": "application/vnd.github.v3+json", - "Content-Type": "application/json", - }) - with urllib.request.urlopen(req, timeout=15) as resp: - return json.loads(resp.read()) - - -def find_existing_comment(pr_number, token): - """Find an existing analysis comment on the PR by looking for the marker.""" - page = 1 - while True: - comments = github_api( - "GET", f"/repos/{GITHUB_REPO}/issues/{pr_number}/comments?per_page=100&page={page}", token - ) - if not comments: - break - for comment in comments: - if GITHUB_COMMENT_MARKER in comment.get("body", ""): - return comment["id"] - page += 1 - return None - - -def post_to_github(token, pr_number, build_url, annotation_body): - """Post or update a build failure analysis comment on a GitHub PR.""" - comment_body = ( - f"{GITHUB_COMMENT_MARKER}\n" - f"## :mag: Build Failure Analysis\n\n" - f"{annotation_body}\n\n" - f"---\n" - f"[View Buildkite build]({build_url}) | " - f"*Analysis generated by Claude. Verify before acting.*" - ) - - try: - existing_id = find_existing_comment(pr_number, token) - if existing_id: - github_api("PATCH", f"/repos/{GITHUB_REPO}/issues/comments/{existing_id}", token, - {"body": comment_body}) - print(f"Updated existing GitHub comment on PR #{pr_number}.") - else: - github_api("POST", f"/repos/{GITHUB_REPO}/issues/{pr_number}/comments", token, - {"body": comment_body}) - print(f"Posted GitHub comment on PR #{pr_number}.") - except Exception as e: - print(f"Could not post to GitHub: {e}", file=sys.stderr) - def main(): parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude") parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG")) parser.add_argument("--build", type=int, default=int(os.environ.get("BUILDKITE_BUILD_NUMBER", "0"))) - parser.add_argument("--pr", type=int, default=None, - help="Override PR number (for testing GitHub comment posting)") parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating or posting to Slack") args = parser.parse_args() @@ -326,11 +267,6 @@ def main(): print(f"Found {len(failed_jobs)} failed step(s)") slack_webhook = get_env_or_file("SLACK_WEBHOOK_URL", "") - github_token = get_env_or_file("GITHUB_TOKEN", "") - pr_number = args.pr - if pr_number is None: - pr_env = os.environ.get("BUILDKITE_PULL_REQUEST", "false") - pr_number = int(pr_env) if pr_env not in ("false", "") else None build_url = build.get("web_url", f"https://buildkite.com/{BUILDKITE_ORG}/{args.pipeline}/builds/{args.build}") all_analyses = [] @@ -398,6 +334,21 @@ def main(): print(f"\nCould not post annotation: {e}", file=sys.stderr) print("Full analysis printed above.") + # Store analysis as build metadata so that the GitHub Actions + # workflow (post-build-analysis.yml) can fetch it and post a + # PR comment using the built-in GITHUB_TOKEN. + annotation_body = "\n\n---\n\n".join(all_analyses) + try: + subprocess.run( + ["buildkite-agent", "meta-data", "set", + "build-failure-analysis"], + input=annotation_body.encode(), + check=True, + ) + print("Analysis saved as build metadata.") + except (FileNotFoundError, subprocess.CalledProcessError) as e: + print(f"Could not save build metadata: {e}", file=sys.stderr) + if slack_webhook: post_to_slack( slack_webhook, args.pipeline, args.build, @@ -406,14 +357,6 @@ def main(): else: print("No SLACK_WEBHOOK_URL set, skipping Slack notification.") - if github_token and pr_number: - annotation_body = "\n\n---\n\n".join(all_analyses) - post_to_github(github_token, pr_number, build_url, annotation_body) - elif pr_number: - print("No GITHUB_TOKEN set, skipping GitHub PR comment.") - else: - print("Not a PR build, skipping GitHub PR comment.") - if __name__ == "__main__": main() From 9fec3f6e81cfd53c9c1578a386a93d53dfaf740e Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 19 Mar 2026 14:44:13 +1300 Subject: [PATCH 08/21] [ML] Add temporary workflow to test Vault OIDC for GitHub Actions Made-with: Cursor --- .github/workflows/test-vault-oidc.yml | 55 +++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 .github/workflows/test-vault-oidc.yml diff --git a/.github/workflows/test-vault-oidc.yml b/.github/workflows/test-vault-oidc.yml new file mode 100644 index 000000000..9f484a8fd --- /dev/null +++ b/.github/workflows/test-vault-oidc.yml @@ -0,0 +1,55 @@ +name: Test Vault OIDC + +on: + pull_request: + +permissions: + id-token: write + contents: read + +jobs: + test-vault: + runs-on: ubuntu-latest + steps: + - name: Check Vault JWT auth endpoint + run: | + echo "=== Checking if Vault has a JWT auth method enabled ===" + # This is a public, unauthenticated endpoint that returns config + # if the JWT auth method exists. A 404 means it's not enabled. + for path in jwt github-actions oidc; do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" \ + "https://vault-ci-prod.elastic.dev/v1/auth/${path}/.well-known/openid-configuration" 2>/dev/null) + echo " auth/${path}: HTTP ${STATUS}" + done + + echo "" + echo "=== Requesting GitHub OIDC token ===" + if [ -n "$ACTIONS_ID_TOKEN_REQUEST_URL" ]; then + OIDC_TOKEN=$(curl -sS -H "Authorization: bearer ${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" \ + "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=https://vault-ci-prod.elastic.dev" | jq -r '.value') + if [ -n "$OIDC_TOKEN" ] && [ "$OIDC_TOKEN" != "null" ]; then + echo "Got OIDC token (first 20 chars): ${OIDC_TOKEN:0:20}..." + + # Decode the JWT claims (middle segment) to see what info it carries. + CLAIMS=$(echo "$OIDC_TOKEN" | cut -d. -f2 | base64 -d 2>/dev/null | jq . 2>/dev/null || echo "could not decode") + echo "" + echo "=== OIDC token claims ===" + echo "$CLAIMS" + + echo "" + echo "=== Attempting Vault JWT login ===" + for path in jwt github-actions oidc; do + echo " Trying auth/${path}/login..." + RESPONSE=$(curl -sS -X POST \ + "https://vault-ci-prod.elastic.dev/v1/auth/${path}/login" \ + -H "Content-Type: application/json" \ + -d "{\"jwt\": \"${OIDC_TOKEN}\", \"role\": \"\"}" 2>&1) + echo " Response: $(echo "$RESPONSE" | jq -c '.errors // .auth.policies // .' 2>/dev/null || echo "$RESPONSE")" + echo "" + done + else + echo "Failed to get OIDC token" + fi + else + echo "ACTIONS_ID_TOKEN_REQUEST_URL not set — id-token permission may be missing" + fi From f404519056a6415f1c4efa37a95c35bde78e18e7 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 19 Mar 2026 14:51:26 +1300 Subject: [PATCH 09/21] [ML] Remove temporary Vault OIDC test workflow The test confirmed Vault is reachable from GitHub Actions runners and JWT auth paths exist. Actual OIDC login needs to be verified with the infra team. Made-with: Cursor --- .github/workflows/test-vault-oidc.yml | 55 --------------------------- 1 file changed, 55 deletions(-) delete mode 100644 .github/workflows/test-vault-oidc.yml diff --git a/.github/workflows/test-vault-oidc.yml b/.github/workflows/test-vault-oidc.yml deleted file mode 100644 index 9f484a8fd..000000000 --- a/.github/workflows/test-vault-oidc.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: Test Vault OIDC - -on: - pull_request: - -permissions: - id-token: write - contents: read - -jobs: - test-vault: - runs-on: ubuntu-latest - steps: - - name: Check Vault JWT auth endpoint - run: | - echo "=== Checking if Vault has a JWT auth method enabled ===" - # This is a public, unauthenticated endpoint that returns config - # if the JWT auth method exists. A 404 means it's not enabled. - for path in jwt github-actions oidc; do - STATUS=$(curl -s -o /dev/null -w "%{http_code}" \ - "https://vault-ci-prod.elastic.dev/v1/auth/${path}/.well-known/openid-configuration" 2>/dev/null) - echo " auth/${path}: HTTP ${STATUS}" - done - - echo "" - echo "=== Requesting GitHub OIDC token ===" - if [ -n "$ACTIONS_ID_TOKEN_REQUEST_URL" ]; then - OIDC_TOKEN=$(curl -sS -H "Authorization: bearer ${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" \ - "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=https://vault-ci-prod.elastic.dev" | jq -r '.value') - if [ -n "$OIDC_TOKEN" ] && [ "$OIDC_TOKEN" != "null" ]; then - echo "Got OIDC token (first 20 chars): ${OIDC_TOKEN:0:20}..." - - # Decode the JWT claims (middle segment) to see what info it carries. - CLAIMS=$(echo "$OIDC_TOKEN" | cut -d. -f2 | base64 -d 2>/dev/null | jq . 2>/dev/null || echo "could not decode") - echo "" - echo "=== OIDC token claims ===" - echo "$CLAIMS" - - echo "" - echo "=== Attempting Vault JWT login ===" - for path in jwt github-actions oidc; do - echo " Trying auth/${path}/login..." - RESPONSE=$(curl -sS -X POST \ - "https://vault-ci-prod.elastic.dev/v1/auth/${path}/login" \ - -H "Content-Type: application/json" \ - -d "{\"jwt\": \"${OIDC_TOKEN}\", \"role\": \"\"}" 2>&1) - echo " Response: $(echo "$RESPONSE" | jq -c '.errors // .auth.policies // .' 2>/dev/null || echo "$RESPONSE")" - echo "" - done - else - echo "Failed to get OIDC token" - fi - else - echo "ACTIONS_ID_TOKEN_REQUEST_URL not set — id-token permission may be missing" - fi From 4cc72f30baab1434855ddd85ea5524ee71fa9801 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Mar 2026 13:10:35 +1300 Subject: [PATCH 10/21] [ML] Use dynamic depends_on for analyze_build_failure step Apply the same fix as PR #3003 to the analyze_build_failure step: compute which build step keys will exist based on the platform config and pass them as ML_BUILD_STEP_KEYS for the shell script to use in its depends_on section. This prevents "Step dependencies not found" errors when not all platforms are built. Made-with: Cursor --- .buildkite/branch.json.py | 16 +++++++++++++ .buildkite/job-build-test-all-debug.json.py | 22 ++++++++++++++---- .buildkite/pipeline.json.py | 23 +++++++++++++++---- .../pipelines/analyze_build_failure.yml.sh | 22 ++++++++++++------ 4 files changed, 66 insertions(+), 17 deletions(-) diff --git a/.buildkite/branch.json.py b/.buildkite/branch.json.py index 70f8f9754..5fb9c02c9 100755 --- a/.buildkite/branch.json.py +++ b/.buildkite/branch.json.py @@ -30,6 +30,21 @@ def main(): ".buildkite/pipelines/format_and_validation.yml.sh")) config = buildConfig.Config() config.parse() + + build_step_keys = [] + if config.build_linux and config.build_aarch64: + build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo") + if config.build_linux and config.build_x86_64: + build_step_keys.append("build_test_linux-x86_64-RelWithDebInfo") + if config.build_macos and config.build_aarch64: + build_step_keys.append("build_test_macos-aarch64-RelWithDebInfo") + if config.build_windows and config.build_x86_64: + build_step_keys.append("build_test_Windows-x86_64-RelWithDebInfo") + + env = { + "ML_BUILD_STEP_KEYS": ",".join(build_step_keys), + } + if config.build_windows: build_windows = pipeline_steps.generate_step_template("Windows", "build", "", config.build_x86_64) pipeline_steps.append(build_windows) @@ -58,6 +73,7 @@ def main(): pipeline_steps.append(pipeline_steps.generate_step("Upload daily releasable artifacts to GCS", ".buildkite/pipelines/upload_dra_to_gcs.yml.sh")) + pipeline["env"] = env pipeline["steps"] = pipeline_steps print(json.dumps(pipeline, indent=2)) diff --git a/.buildkite/job-build-test-all-debug.json.py b/.buildkite/job-build-test-all-debug.json.py index 39347d086..13abc2218 100755 --- a/.buildkite/job-build-test-all-debug.json.py +++ b/.buildkite/job-build-test-all-debug.json.py @@ -24,11 +24,6 @@ config as buildConfig, ) -env = { - "BUILD_SNAPSHOT": "true", - "VERSION_QUALIFIER": "" -} - def main(): pipeline = {} pipeline_steps = step.PipelineStep([]) @@ -40,6 +35,23 @@ def main(): ".buildkite/pipelines/format_and_validation.yml.sh")) config = buildConfig.Config() config.parse() + + build_step_keys = [] + if config.build_linux and config.build_aarch64: + build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo") + if config.build_linux and config.build_x86_64: + build_step_keys.append("build_test_linux-x86_64-RelWithDebInfo") + if config.build_macos and config.build_aarch64: + build_step_keys.append("build_test_macos-aarch64-RelWithDebInfo") + if config.build_windows and config.build_x86_64: + build_step_keys.append("build_test_Windows-x86_64-RelWithDebInfo") + + env = { + "BUILD_SNAPSHOT": "true", + "VERSION_QUALIFIER": "", + "ML_BUILD_STEP_KEYS": ",".join(build_step_keys), + } + if config.build_windows: debug_windows = pipeline_steps.generate_step_template("Windows", "debug", "", config.build_x86_64) pipeline_steps.append(debug_windows) diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py index f7a48077e..1e90d3653 100755 --- a/.buildkite/pipeline.json.py +++ b/.buildkite/pipeline.json.py @@ -23,11 +23,6 @@ config as buildConfig, ) -# Ensure VERSION_QUALIFIER is always empty for PR builds -env = { - "VERSION_QUALIFIER": "" -} - def main(): pipeline = {} pipeline_steps = step.PipelineStep([]) @@ -39,6 +34,24 @@ def main(): ".buildkite/pipelines/format_and_validation.yml.sh")) config = buildConfig.Config() config.parse() + + # Compute which build step keys will exist so that analytics and + # failure-analysis steps can emit a correct depends_on list. + build_step_keys = [] + if config.build_linux and config.build_aarch64: + build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo") + if config.build_linux and config.build_x86_64: + build_step_keys.append("build_test_linux-x86_64-RelWithDebInfo") + if config.build_macos and config.build_aarch64: + build_step_keys.append("build_test_macos-aarch64-RelWithDebInfo") + if config.build_windows and config.build_x86_64: + build_step_keys.append("build_test_Windows-x86_64-RelWithDebInfo") + + env = { + "VERSION_QUALIFIER": "", + "ML_BUILD_STEP_KEYS": ",".join(build_step_keys), + } + if config.build_windows: build_windows = pipeline_steps.generate_step_template("Windows", config.action, "", config.build_x86_64) pipeline_steps.append(build_windows) diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh index 162c3fb05..4e74c249c 100755 --- a/.buildkite/pipelines/analyze_build_failure.yml.sh +++ b/.buildkite/pipelines/analyze_build_failure.yml.sh @@ -8,17 +8,25 @@ # compliance with the Elastic License 2.0 and the foregoing additional # limitation. -cat < Date: Fri, 20 Mar 2026 13:32:49 +1300 Subject: [PATCH 11/21] [ML] Always include build failure analysis in PR pipelines The analyze_build_failure step already guards itself with if: "build.state == 'failed' || build.state == 'failing'" so it is automatically skipped for passing builds. Making it always-on (rather than requiring a special "buildkite analyze" comment trigger) ensures it is available whenever a build fails without needing to be requested in advance. Remove the run_analyze config flag and the "analyze" action from the PR comment trigger regex since they are no longer needed. Made-with: Cursor --- .buildkite/ml_pipeline/config.py | 4 +--- .buildkite/pipeline.json.py | 8 +++++--- .buildkite/pull-requests.json | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.buildkite/ml_pipeline/config.py b/.buildkite/ml_pipeline/config.py index 502217c00..4669ce8b5 100644 --- a/.buildkite/ml_pipeline/config.py +++ b/.buildkite/ml_pipeline/config.py @@ -19,7 +19,6 @@ class Config: build_x86_64: str = "" run_qa_tests: bool = False run_pytorch_tests: bool = False - run_analyze: bool = False action: str = "build" def parse_comment(self): @@ -38,8 +37,7 @@ def parse_comment(self): self.action = os.environ["GITHUB_PR_COMMENT_VAR_ACTION"] self.run_qa_tests = self.action == "run_qa_tests" self.run_pytorch_tests = self.action == "run_pytorch_tests" - self.run_analyze = self.action == "analyze" - if self.run_pytorch_tests or self.run_qa_tests or self.run_analyze: + if self.run_pytorch_tests or self.run_qa_tests: self.action = "build" # If the ACTION is set to "run_qa_tests" then set some optional variables governing the ES branch to build, the diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py index 1e90d3653..e26cf8781 100755 --- a/.buildkite/pipeline.json.py +++ b/.buildkite/pipeline.json.py @@ -81,9 +81,11 @@ def main(): # Check for build timing regressions against nightly baseline pipeline_steps.append(pipeline_steps.generate_step("Check build timing regressions", ".buildkite/pipelines/check_build_regression.yml.sh")) - if config.run_analyze: - pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure", - ".buildkite/pipelines/analyze_build_failure.yml.sh")) + # Analyze failures with AI — the step itself uses + # if: "build.state == 'failed' || build.state == 'failing'" + # so it is automatically skipped for passing builds. + pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure", + ".buildkite/pipelines/analyze_build_failure.yml.sh")) pipeline["env"] = env pipeline["steps"] = pipeline_steps diff --git a/.buildkite/pull-requests.json b/.buildkite/pull-requests.json index 81248a767..dcea82794 100644 --- a/.buildkite/pull-requests.json +++ b/.buildkite/pull-requests.json @@ -9,7 +9,7 @@ "commit_status_context": "ml-cpp-ci", "build_on_commit": true, "build_on_comment": true, - "trigger_comment_regex": "^(?:(?:buildkite +)(?build|debug|run_qa_tests|run_pytorch_tests|analyze)(=(?(?:[^ ]+)))? *(?: for ES_BRANCH=(?([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?([.0-9]+)))? *(?: *on *(?(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?(?:[, ]*aarch64|x86_64)+)?$", + "trigger_comment_regex": "^(?:(?:buildkite +)(?build|debug|run_qa_tests|run_pytorch_tests)(=(?(?:[^ ]+)))? *(?: for ES_BRANCH=(?([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?([.0-9]+)))? *(?: *on *(?(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?(?:[, ]*aarch64|x86_64)+)?$", "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))", "skip_ci_labels": ["skip-ci", "jenkins-ci", ">test-mute", ">docs"], "skip_target_branches": ["6.8", "7.11", "7.12"], From 13b1fb23b42198bc15bf0464ce27d71e79c4760a Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Mar 2026 13:35:07 +1300 Subject: [PATCH 12/21] [ML] TEMPORARY: deliberate compile error for CI testing Introduce a compile error to test the build failure analysis step. This commit will be reverted immediately after verifying the step. Made-with: Cursor --- lib/ver/CBuildInfo.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/ver/CBuildInfo.cc b/lib/ver/CBuildInfo.cc index c1c916bdc..8090698d1 100644 --- a/lib/ver/CBuildInfo.cc +++ b/lib/ver/CBuildInfo.cc @@ -10,6 +10,8 @@ */ #include +#error "DELIBERATE FAILURE: testing CI build failure analysis step — will be reverted" + #include #include From 1985f089964a5263824a489b3e5b0fbbb1b70bc9 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Mar 2026 13:51:35 +1300 Subject: [PATCH 13/21] [ML] Fix analyze step and revert deliberate compile error Remove the Buildkite `if` condition from analyze_build_failure.yml.sh. Buildkite evaluates `if` on dynamically uploaded steps at upload time (not at step execution time), so the condition always saw build.state == 'running' and the step was never created. The Python script already checks the build state via the Buildkite API and exits early if the build passed, so the YAML-level `if` is unnecessary. Also reverts the deliberate compile error in CBuildInfo.cc that was used to test the failure analysis flow. Made-with: Cursor --- .buildkite/pipelines/analyze_build_failure.yml.sh | 1 - lib/ver/CBuildInfo.cc | 2 -- 2 files changed, 3 deletions(-) diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh index 4e74c249c..a0c48dd67 100755 --- a/.buildkite/pipelines/analyze_build_failure.yml.sh +++ b/.buildkite/pipelines/analyze_build_failure.yml.sh @@ -28,7 +28,6 @@ fi cat <<'EOL' allow_dependency_failure: true - if: "build.state == 'failed' || build.state == 'failing'" soft_fail: true agents: image: "python:3-slim" diff --git a/lib/ver/CBuildInfo.cc b/lib/ver/CBuildInfo.cc index 8090698d1..c1c916bdc 100644 --- a/lib/ver/CBuildInfo.cc +++ b/lib/ver/CBuildInfo.cc @@ -10,8 +10,6 @@ */ #include -#error "DELIBERATE FAILURE: testing CI build failure analysis step — will be reverted" - #include #include From 74290242ebf359e4b539c6c75ea2ae4fbe257c51 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Mar 2026 13:53:17 +1300 Subject: [PATCH 14/21] [ML] TEMPORARY: deliberate compile error for CI testing (take 2) Made-with: Cursor --- lib/ver/CBuildInfo.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/ver/CBuildInfo.cc b/lib/ver/CBuildInfo.cc index c1c916bdc..8090698d1 100644 --- a/lib/ver/CBuildInfo.cc +++ b/lib/ver/CBuildInfo.cc @@ -10,6 +10,8 @@ */ #include +#error "DELIBERATE FAILURE: testing CI build failure analysis step — will be reverted" + #include #include From 8466537ebf4a397d4ac1afb16970c8c1653d0015 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Mar 2026 14:11:30 +1300 Subject: [PATCH 15/21] [ML] Fix analyze step Docker image and revert compile error Use python:3 instead of python:3-slim for the analyze_build_failure step. The slim image lacks curl and git which the Buildkite agent hooks require. Also reverts the deliberate compile error. Made-with: Cursor --- .buildkite/pipelines/analyze_build_failure.yml.sh | 2 +- lib/ver/CBuildInfo.cc | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh index a0c48dd67..8bdce871b 100755 --- a/.buildkite/pipelines/analyze_build_failure.yml.sh +++ b/.buildkite/pipelines/analyze_build_failure.yml.sh @@ -30,5 +30,5 @@ cat <<'EOL' allow_dependency_failure: true soft_fail: true agents: - image: "python:3-slim" + image: "python:3" EOL diff --git a/lib/ver/CBuildInfo.cc b/lib/ver/CBuildInfo.cc index 8090698d1..c1c916bdc 100644 --- a/lib/ver/CBuildInfo.cc +++ b/lib/ver/CBuildInfo.cc @@ -10,8 +10,6 @@ */ #include -#error "DELIBERATE FAILURE: testing CI build failure analysis step — will be reverted" - #include #include From 925c2d65e4b82a372d473160f10a81c8b2f1da25 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Mar 2026 14:12:35 +1300 Subject: [PATCH 16/21] [ML] TEMPORARY: deliberate compile error for CI testing (take 3) Made-with: Cursor --- lib/ver/CBuildInfo.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/ver/CBuildInfo.cc b/lib/ver/CBuildInfo.cc index c1c916bdc..8090698d1 100644 --- a/lib/ver/CBuildInfo.cc +++ b/lib/ver/CBuildInfo.cc @@ -10,6 +10,8 @@ */ #include +#error "DELIBERATE FAILURE: testing CI build failure analysis step — will be reverted" + #include #include From 0d804154c3784f68e6b07f85aa423099e2d35133 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Mar 2026 14:31:27 +1300 Subject: [PATCH 17/21] [ML] Revert deliberate compile error after successful CI test The "Analyze build failure" step ran successfully on Build #2385, correctly identifying the deliberate #error as a code bug with high confidence. Reverting to restore normal builds. Made-with: Cursor --- lib/ver/CBuildInfo.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/ver/CBuildInfo.cc b/lib/ver/CBuildInfo.cc index 8090698d1..c1c916bdc 100644 --- a/lib/ver/CBuildInfo.cc +++ b/lib/ver/CBuildInfo.cc @@ -10,8 +10,6 @@ */ #include -#error "DELIBERATE FAILURE: testing CI build failure analysis step — will be reverted" - #include #include From bdea42c38b29fdc6677e7e2ddb0d29a21c1596a1 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Mar 2026 15:00:51 +1300 Subject: [PATCH 18/21] [ML] Make analyze step opt-in via "buildkite analyze" PR comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of always including the analysis step or requiring a full rebuild, "buildkite analyze" now triggers a lightweight pipeline that finds the most recent failed build for the branch via the Buildkite API and analyzes it retroactively — no recompilation needed. Also improves log extraction: instead of blindly taking the last 30K chars (which often misses the actual error), the script now scans for error patterns and extracts matching lines with surrounding context. Made-with: Cursor --- .buildkite/ml_pipeline/config.py | 4 +- .buildkite/pipeline.json.py | 21 ++- .../pipelines/analyze_build_failure.yml.sh | 12 +- .buildkite/pull-requests.json | 2 +- dev-tools/analyze_build_failure.py | 125 +++++++++++++++++- 5 files changed, 146 insertions(+), 18 deletions(-) diff --git a/.buildkite/ml_pipeline/config.py b/.buildkite/ml_pipeline/config.py index 4669ce8b5..502217c00 100644 --- a/.buildkite/ml_pipeline/config.py +++ b/.buildkite/ml_pipeline/config.py @@ -19,6 +19,7 @@ class Config: build_x86_64: str = "" run_qa_tests: bool = False run_pytorch_tests: bool = False + run_analyze: bool = False action: str = "build" def parse_comment(self): @@ -37,7 +38,8 @@ def parse_comment(self): self.action = os.environ["GITHUB_PR_COMMENT_VAR_ACTION"] self.run_qa_tests = self.action == "run_qa_tests" self.run_pytorch_tests = self.action == "run_pytorch_tests" - if self.run_pytorch_tests or self.run_qa_tests: + self.run_analyze = self.action == "analyze" + if self.run_pytorch_tests or self.run_qa_tests or self.run_analyze: self.action = "build" # If the ACTION is set to "run_qa_tests" then set some optional variables governing the ES branch to build, the diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py index e26cf8781..87fd84f95 100755 --- a/.buildkite/pipeline.json.py +++ b/.buildkite/pipeline.json.py @@ -24,16 +24,28 @@ ) def main(): + config = buildConfig.Config() + config.parse() + pipeline = {} pipeline_steps = step.PipelineStep([]) + + # "buildkite analyze" triggers a lightweight pipeline that finds and + # analyzes the most recent failed build for this branch — no compilation. + if config.run_analyze: + pipeline["env"] = {"ML_ANALYZE_PREVIOUS": "true"} + pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure", + ".buildkite/pipelines/analyze_build_failure.yml.sh")) + pipeline["steps"] = pipeline_steps + print(json.dumps(pipeline, indent=2)) + return + pipeline_steps.append(pipeline_steps.generate_step("Queue a :slack: notification for the pipeline", ".buildkite/pipelines/send_slack_notification.sh")) pipeline_steps.append(pipeline_steps.generate_step("Queue a :email: notification for the pipeline", ".buildkite/pipelines/send_email_notification.sh")) pipeline_steps.append(pipeline_steps.generate_step("Upload clang-format validation", ".buildkite/pipelines/format_and_validation.yml.sh")) - config = buildConfig.Config() - config.parse() # Compute which build step keys will exist so that analytics and # failure-analysis steps can emit a correct depends_on list. @@ -81,11 +93,6 @@ def main(): # Check for build timing regressions against nightly baseline pipeline_steps.append(pipeline_steps.generate_step("Check build timing regressions", ".buildkite/pipelines/check_build_regression.yml.sh")) - # Analyze failures with AI — the step itself uses - # if: "build.state == 'failed' || build.state == 'failing'" - # so it is automatically skipped for passing builds. - pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure", - ".buildkite/pipelines/analyze_build_failure.yml.sh")) pipeline["env"] = env pipeline["steps"] = pipeline_steps diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh index 8bdce871b..984b0c5bd 100755 --- a/.buildkite/pipelines/analyze_build_failure.yml.sh +++ b/.buildkite/pipelines/analyze_build_failure.yml.sh @@ -8,16 +8,22 @@ # compliance with the Elastic License 2.0 and the foregoing additional # limitation. -cat <<'EOL' +EXTRA_FLAGS="" +if [ "${ML_ANALYZE_PREVIOUS:-}" = "true" ]; then + EXTRA_FLAGS=" --find-previous-failure" +fi + +cat <build|debug|run_qa_tests|run_pytorch_tests)(=(?(?:[^ ]+)))? *(?: for ES_BRANCH=(?([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?([.0-9]+)))? *(?: *on *(?(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?(?:[, ]*aarch64|x86_64)+)?$", + "trigger_comment_regex": "^(?:(?:buildkite +)(?build|debug|run_qa_tests|run_pytorch_tests|analyze)(=(?(?:[^ ]+)))? *(?: for ES_BRANCH=(?([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?([.0-9]+)))? *(?: *on *(?(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?(?:[, ]*aarch64|x86_64)+)?$", "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))", "skip_ci_labels": ["skip-ci", "jenkins-ci", ">test-mute", ">docs"], "skip_target_branches": ["6.8", "7.11", "7.12"], diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py index ba93e1d33..e097771e1 100755 --- a/dev-tools/analyze_build_failure.py +++ b/dev-tools/analyze_build_failure.py @@ -13,6 +13,10 @@ # Analyze a specific build python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 + # Find and analyze the most recent failed build for the current branch + # (used by "buildkite analyze" PR comment — no rebuild needed) + python3 dev-tools/analyze_build_failure.py --find-previous-failure + # Dry run (print to stdout, don't annotate or post to Slack/GitHub) python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 --dry-run @@ -27,8 +31,10 @@ import argparse import json import os +import re import subprocess import sys +import urllib.parse import urllib.request import urllib.error from pathlib import Path @@ -98,6 +104,20 @@ def buildkite_get(path, token): return json.loads(resp.read()) +def find_previous_failed_build(pipeline, token, branch=None, exclude_build=None): + """Find the most recent failed build for a pipeline, optionally filtered by branch.""" + params = {"state": "failed", "per_page": "5"} + if branch: + params["branch"] = branch + query = urllib.parse.urlencode(params) + builds = buildkite_get(f"pipelines/{pipeline}/builds?{query}", token) + for build in builds: + if exclude_build and build.get("number") == exclude_build: + continue + return build + return None + + def get_job_log(log_url, token): """Fetch the raw log for a Buildkite job.""" req = urllib.request.Request( @@ -114,11 +134,92 @@ def get_job_log(log_url, token): return None -def truncate_log(log_text, max_chars=MAX_LOG_CHARS): - """Keep the last max_chars of the log (the end usually has the error).""" - if not log_text or len(log_text) <= max_chars: +ERROR_PATTERNS = re.compile( + r"(?i)" + r"(?:^|\s)error(?:\s|:|\[|C\d)" # "error:", "error C2338", "error[E" + r"|fatal error" + r"|^#error\b" + r"|FAILED" + r"|BOOST_ERROR" + r"|BOOST_FAIL" + r"|: fatal:" # linker fatal + r"|ninja: build stopped" + r"|make.*\*\*\*" # make: *** [target] Error + r"|CMake Error" + r"|assertion failed" + r"|LINK : fatal" # MSVC linker + r"|unresolved external" + r"|cannot find -l" # linker: cannot find library + r"|undefined reference" + r"|Segmentation fault" + r"|signal \d+" + r"|exit code \d+" + r"|Exit status: \d+(?!.*exit code 0)" +) + +ANSI_ESCAPE = re.compile(r"\x1b\[[0-9;]*[A-Za-z]|\x1b\].*?\x07") +BK_TIMESTAMP = re.compile(r"_bk;t=\d+") + + +def strip_terminal_noise(log_text): + """Remove ANSI escapes and Buildkite timestamp markers.""" + text = ANSI_ESCAPE.sub("", log_text) + return BK_TIMESTAMP.sub("", text) + + +def extract_error_context(log_text, context_lines=10, max_chars=MAX_LOG_CHARS): + """Extract error-relevant sections from a build log. + + Scans every line for error patterns and collects matching lines with + surrounding context. Always appends the tail of the log (which + typically contains the build summary / exit code). The combined + output is capped at *max_chars*. + """ + if not log_text: + return log_text + + log_text = strip_terminal_noise(log_text) + lines = log_text.splitlines() + + if len(log_text) <= max_chars: return log_text - return f"... [truncated {len(log_text) - max_chars} chars] ...\n" + log_text[-max_chars:] + + # Find line indices that match error patterns. + error_indices = set() + for i, line in enumerate(lines): + if ERROR_PATTERNS.search(line): + error_indices.add(i) + + # Expand each match with context_lines before/after, merging overlaps. + include = set() + for idx in sorted(error_indices): + for j in range(max(0, idx - context_lines), min(len(lines), idx + context_lines + 1)): + include.add(j) + + # Always include the last 80 lines (build summary / exit info). + tail_start = max(0, len(lines) - 80) + for j in range(tail_start, len(lines)): + include.add(j) + + # Build the excerpt, inserting "..." markers for skipped regions. + sections = [] + prev = -2 + for i in sorted(include): + if i != prev + 1: + sections.append("... [skipped] ...") + sections.append(lines[i]) + prev = i + + excerpt = "\n".join(sections) + + # Final safety cap — if still too long, keep the head and tail. + if len(excerpt) > max_chars: + half = max_chars // 2 + excerpt = (excerpt[:half] + + f"\n... [trimmed {len(excerpt) - max_chars} chars] ...\n" + + excerpt[-half:]) + + return excerpt def call_claude(api_key, prompt): @@ -229,6 +330,8 @@ def main(): parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude") parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG")) parser.add_argument("--build", type=int, default=int(os.environ.get("BUILDKITE_BUILD_NUMBER", "0"))) + parser.add_argument("--find-previous-failure", action="store_true", + help="Find and analyze the most recent failed build for the current branch") parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating or posting to Slack") args = parser.parse_args() @@ -247,6 +350,16 @@ def main(): print("Error: No Anthropic API key available", file=sys.stderr) sys.exit(1) + if args.find_previous_failure: + branch = os.environ.get("BUILDKITE_BRANCH") + print(f"Searching for previous failed build on branch '{branch}'...") + prev = find_previous_failed_build(args.pipeline, bk_token, branch, args.build) + if not prev: + print(f"No previous failed build found for branch '{branch}' — nothing to analyze.") + sys.exit(0) + args.build = prev["number"] + print(f"Found failed build #{args.build}: {prev.get('web_url', '')}") + print(f"Analyzing {args.pipeline} build #{args.build}...") build = buildkite_get(f"pipelines/{args.pipeline}/builds/{args.build}", bk_token) @@ -284,7 +397,7 @@ def main(): print(f" Could not fetch log, skipping") continue - log_excerpt = truncate_log(log_text) + log_excerpt = extract_error_context(log_text) prompt = f"""Analyze this CI build failure. @@ -295,7 +408,7 @@ def main(): {KNOWN_FAILURE_PATTERNS} -**Build log (last {MAX_LOG_CHARS} chars)**: +**Build log (error-relevant sections extracted from full log)**: ``` {log_excerpt} ``` From b3c7cb114965c479a34ad0f0209b1bdea4f91475 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Mar 2026 15:37:42 +1300 Subject: [PATCH 19/21] [ML] Improve Boost.Test failure detection in log extraction Replace BOOST_ERROR/BOOST_FAIL patterns (source-code macro names that don't appear in logs) with a pattern matching the actual Boost.Test summary output: "*** N failure(s) detected in test suite". Made-with: Cursor --- dev-tools/analyze_build_failure.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py index e097771e1..43d6b449c 100755 --- a/dev-tools/analyze_build_failure.py +++ b/dev-tools/analyze_build_failure.py @@ -140,8 +140,7 @@ def get_job_log(log_url, token): r"|fatal error" r"|^#error\b" r"|FAILED" - r"|BOOST_ERROR" - r"|BOOST_FAIL" + r"|\*\*\* \d+ failure" # Boost.Test: *** N failure(s) detected r"|: fatal:" # linker fatal r"|ninja: build stopped" r"|make.*\*\*\*" # make: *** [target] Error From c9d9ef029efc125cbc8744a50cbf8aa836824399 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Mar 2026 15:39:57 +1300 Subject: [PATCH 20/21] =?UTF-8?q?[ML]=20TEMPORARY:=20deliberate=20test=20f?= =?UTF-8?q?ailure=20for=20CI=20analysis=20testing=20=E2=80=94=20will=20be?= =?UTF-8?q?=20reverted?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made-with: Cursor --- lib/ver/unittest/CBuildInfoTest.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/ver/unittest/CBuildInfoTest.cc b/lib/ver/unittest/CBuildInfoTest.cc index b1382a905..bb2bc337d 100644 --- a/lib/ver/unittest/CBuildInfoTest.cc +++ b/lib/ver/unittest/CBuildInfoTest.cc @@ -19,6 +19,10 @@ BOOST_AUTO_TEST_SUITE(CBuildInfoTest) +BOOST_AUTO_TEST_CASE(testDeliberateFailure) { + BOOST_FAIL("DELIBERATE TEST FAILURE: testing CI build failure analysis step — will be reverted"); +} + BOOST_AUTO_TEST_CASE(testFullInfo) { std::string fullInfo(ml::ver::CBuildInfo::fullInfo()); LOG_DEBUG(<< fullInfo); From bfe59eb8b2f54646ce2ea21419ce9c26ef30a5aa Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Mar 2026 16:13:52 +1300 Subject: [PATCH 21/21] [ML] Revert deliberate test failure after successful analysis testing The analysis step correctly identified the Boost.Test failure on all platforms. Reverting to restore normal test behaviour. Made-with: Cursor --- lib/ver/unittest/CBuildInfoTest.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/ver/unittest/CBuildInfoTest.cc b/lib/ver/unittest/CBuildInfoTest.cc index bb2bc337d..b1382a905 100644 --- a/lib/ver/unittest/CBuildInfoTest.cc +++ b/lib/ver/unittest/CBuildInfoTest.cc @@ -19,10 +19,6 @@ BOOST_AUTO_TEST_SUITE(CBuildInfoTest) -BOOST_AUTO_TEST_CASE(testDeliberateFailure) { - BOOST_FAIL("DELIBERATE TEST FAILURE: testing CI build failure analysis step — will be reverted"); -} - BOOST_AUTO_TEST_CASE(testFullInfo) { std::string fullInfo(ml::ver::CBuildInfo::fullInfo()); LOG_DEBUG(<< fullInfo);