From 10bd343a1edee2d4e3e06dfdcde7c2c0b4e25216 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Feb 2026 13:44:05 +1300
Subject: [PATCH 01/21] [ML] Add AI-powered build failure analysis to CI
 pipelines

When a Buildkite build fails, a new soft-fail step fetches the failed
step logs and sends them to Claude for diagnosis.  The analysis
(root cause, classification, suggested fix, confidence) is posted as
a Buildkite annotation directly on the build page.

The step uses an `if` guard so it only runs when the build is
failing, and the Claude API key is retrieved from Vault at runtime.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .buildkite/branch.json.py                     |   3 +
 .buildkite/hooks/post-checkout                |   6 +
 .buildkite/job-build-test-all-debug.json.py   |   3 +
 .buildkite/pipeline.json.py                   |   3 +
 .../pipelines/analyze_build_failure.yml.sh    |  27 ++
 dev-tools/analyze_build_failure.py            | 253 ++++++++++++++++++
 6 files changed, 295 insertions(+)
 create mode 100755 .buildkite/pipelines/analyze_build_failure.yml.sh
 create mode 100755 dev-tools/analyze_build_failure.py

diff --git a/.buildkite/branch.json.py b/.buildkite/branch.json.py
index 4916a89cc..70f8f9754 100755
--- a/.buildkite/branch.json.py
+++ b/.buildkite/branch.json.py
@@ -46,6 +46,9 @@ def main():
     # Ingest step-level timings into Elasticsearch for anomaly detection
     pipeline_steps.append(pipeline_steps.generate_step("Ingest build timings",
                                                        ".buildkite/pipelines/ingest_build_timings.yml.sh"))
+    # Analyze failures with AI if the build failed
+    pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
+                                                       ".buildkite/pipelines/analyze_build_failure.yml.sh"))
 
     # Build the DRA artifacts and upload to S3 and GCS
     pipeline_steps.append(pipeline_steps.generate_step("Create daily releasable artifacts",
diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout
index aca27b008..c3006feab 100644
--- a/.buildkite/hooks/post-checkout
+++ b/.buildkite/hooks/post-checkout
@@ -33,6 +33,12 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then
     export ES_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/elasticsearch/ci_analytics 2>/dev/null || echo "")
   fi
 
+  if [[ "$BUILDKITE_STEP_KEY" == "analyze_build_failure" ]]; then
+    export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "")
+    export ANTHROPIC_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/anthropic/claude 2>/dev/null || echo "")
+    export SLACK_WEBHOOK_URL=$(vault read -field=url secret/ci/elastic-ml-cpp/slack/build_failure_webhook 2>/dev/null || echo "")
+  fi
+
   # GCS service account — inject credentials for build and Java IT steps.
   # Build steps use it for sccache; Java IT steps use it for the Gradle
   # build cache.  The key is stored in Vault.
diff --git a/.buildkite/job-build-test-all-debug.json.py b/.buildkite/job-build-test-all-debug.json.py
index 7aa0e4a61..39347d086 100755
--- a/.buildkite/job-build-test-all-debug.json.py
+++ b/.buildkite/job-build-test-all-debug.json.py
@@ -57,6 +57,9 @@ def main():
     # Ingest step-level timings into Elasticsearch for anomaly detection
     pipeline_steps.append(pipeline_steps.generate_step("Ingest build timings",
                                                        ".buildkite/pipelines/ingest_build_timings.yml.sh"))
+    # Analyze failures with AI if the build failed
+    pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
+                                                       ".buildkite/pipelines/analyze_build_failure.yml.sh"))
 
     pipeline["env"] = env
     pipeline["steps"] = pipeline_steps
diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py
index 1796a665b..c15d6bb84 100755
--- a/.buildkite/pipeline.json.py
+++ b/.buildkite/pipeline.json.py
@@ -68,6 +68,9 @@ def main():
     # Check for build timing regressions against nightly baseline
     pipeline_steps.append(pipeline_steps.generate_step("Check build timing regressions",
                                                        ".buildkite/pipelines/check_build_regression.yml.sh"))
+    # Analyze failures with AI if the build failed
+    pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
+                                                       ".buildkite/pipelines/analyze_build_failure.yml.sh"))
 
     pipeline["env"] = env
     pipeline["steps"] = pipeline_steps
diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh
new file mode 100755
index 000000000..162c3fb05
--- /dev/null
+++ b/.buildkite/pipelines/analyze_build_failure.yml.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License
+# 2.0 and the following additional limitation. Functionality enabled by the
+# files subject to the Elastic License 2.0 may only be used in production when
+# invoked by an Elasticsearch process with a license key installed that permits
+# use of machine learning features. You may not use this file except in
+# compliance with the Elastic License 2.0 and the foregoing additional
+# limitation.
+
+cat <<EOL
+steps:
+  - label: "Analyze build failure :mag:"
+    key: "analyze_build_failure"
+    command:
+        - "python3 dev-tools/analyze_build_failure.py --pipeline \$BUILDKITE_PIPELINE_SLUG --build \$BUILDKITE_BUILD_NUMBER"
+    depends_on:
+        - "build_test_linux-aarch64-RelWithDebInfo"
+        - "build_test_linux-x86_64-RelWithDebInfo"
+        - "build_test_macos-aarch64-RelWithDebInfo"
+        - "build_test_Windows-x86_64-RelWithDebInfo"
+    allow_dependency_failure: true
+    if: "build.state == 'failed' || build.state == 'failing'"
+    soft_fail: true
+    agents:
+      image: "python:3-slim"
+EOL
diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py
new file mode 100755
index 000000000..7522a6840
--- /dev/null
+++ b/dev-tools/analyze_build_failure.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+"""Analyze a Buildkite build failure using Claude and post a diagnosis.
+
+Fetches logs from failed build steps, sends them to the Anthropic Claude API
+with repository context, and posts the analysis as a Buildkite annotation.
+
+Usage:
+    # Analyze the current build (in CI)
+    python3 dev-tools/analyze_build_failure.py
+
+    # Analyze a specific build
+    python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819
+
+    # Dry run (print to stdout, don't annotate)
+    python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 --dry-run
+
+Environment:
+    BUILDKITE_TOKEN / BUILDKITE_API_READ_TOKEN   Buildkite API token
+    ANTHROPIC_API_KEY                             Claude API key
+    BUILDKITE_PIPELINE_SLUG                       Current pipeline (set by Buildkite)
+    BUILDKITE_BUILD_NUMBER                        Current build number (set by Buildkite)
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+import urllib.request
+import urllib.error
+from pathlib import Path
+
+BUILDKITE_ORG = "elastic"
+ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
+ANTHROPIC_MODEL = "claude-sonnet-4-20250514"
+MAX_LOG_CHARS = 30000
+MAX_RESPONSE_TOKENS = 2048
+
+KNOWN_FAILURE_PATTERNS = """
+Known transient/infrastructure failures:
+- "Unable to download toolchain" / Adoptium JDK download failure: transient, retry usually fixes it
+- "Exceeded maximum artifact size limit of 10 GiB": artifact_paths glob is too broad
+- "sccache: error: couldn't connect to server": sccache server not running, check setup_sccache.sh
+- CKMostCorrelatedTest/testScale timeout: CPU contention on low-core machines, check test parallelism
+- CMultiFileDataAdderTest collision: test isolation bug with temp file naming
+
+Known compilation patterns:
+- "redefinition of" in unity builds: file needs SKIP_UNITY_BUILD_INCLUSION or unity disabled for library
+- boost/unordered_map.hpp conflicts: remove from PCH list
+- "mspdbsrv.exe" errors on Windows: switch from /Zi to /Z7
+"""
+
+SYSTEM_PROMPT = """You are a CI build failure analyst for the elastic/ml-cpp repository.
+This is a C++ codebase that builds on Linux (x86_64, aarch64), macOS (aarch64), and Windows (x86_64).
+Build system: CMake with Boost, uses Docker for Linux builds, Gradle for macOS/Windows, Buildkite for CI.
+
+Your job is to:
+1. Identify the root cause of the failure from the build log
+2. Classify it as: code bug, test failure, infrastructure/transient, configuration issue, or dependency issue
+3. Suggest a specific fix or workaround
+4. If it's transient, say so clearly — don't over-diagnose
+
+Be concise and actionable. Use markdown formatting.
+Format your response as:
+
+### Root Cause
+<1-2 sentences>
+
+### Classification
+<one of: code bug | test failure | infrastructure/transient | configuration | dependency>
+
+### Suggested Fix
+<specific actionable steps>
+
+### Confidence
+<high | medium | low> — <brief justification>
+"""
+
+
+def get_env_or_file(env_var, file_path):
+    val = os.environ.get(env_var, "").strip()
+    if val:
+        return val
+    if file_path:
+        p = Path(file_path).expanduser()
+        if p.exists():
+            return p.read_text().strip()
+    return None
+
+
+def buildkite_get(path, token):
+    url = f"https://api.buildkite.com/v2/organizations/{BUILDKITE_ORG}/{path}"
+    req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
+    with urllib.request.urlopen(req) as resp:
+        return json.loads(resp.read())
+
+
+def get_job_log(log_url, token):
+    """Fetch the raw log for a Buildkite job."""
+    req = urllib.request.Request(
+        log_url,
+        headers={
+            "Authorization": f"Bearer {token}",
+            "Accept": "text/plain",
+        },
+    )
+    try:
+        with urllib.request.urlopen(req) as resp:
+            return resp.read().decode("utf-8", errors="replace")
+    except urllib.error.HTTPError:
+        return None
+
+
+def truncate_log(log_text, max_chars=MAX_LOG_CHARS):
+    """Keep the last max_chars of the log (the end usually has the error)."""
+    if not log_text or len(log_text) <= max_chars:
+        return log_text
+    return f"... [truncated {len(log_text) - max_chars} chars] ...\n" + log_text[-max_chars:]
+
+
+def call_claude(api_key, prompt):
+    body = json.dumps({
+        "model": ANTHROPIC_MODEL,
+        "max_tokens": MAX_RESPONSE_TOKENS,
+        "system": SYSTEM_PROMPT,
+        "messages": [{"role": "user", "content": prompt}],
+    }).encode("utf-8")
+
+    req = urllib.request.Request(
+        ANTHROPIC_API_URL,
+        data=body,
+        headers={
+            "x-api-key": api_key,
+            "anthropic-version": "2023-06-01",
+            "Content-Type": "application/json",
+        },
+    )
+    with urllib.request.urlopen(req, timeout=60) as resp:
+        result = json.loads(resp.read())
+
+    for block in result.get("content", []):
+        if block.get("type") == "text":
+            return block["text"]
+    return "No analysis generated."
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude")
+    parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG"))
+    parser.add_argument("--build", type=int, default=int(os.environ.get("BUILDKITE_BUILD_NUMBER", "0")))
+    parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating")
+    args = parser.parse_args()
+
+    if not args.pipeline or not args.build:
+        print("Error: --pipeline and --build required", file=sys.stderr)
+        sys.exit(1)
+
+    bk_token = (get_env_or_file("BUILDKITE_TOKEN", "~/.buildkite/token")
+                or get_env_or_file("BUILDKITE_API_READ_TOKEN", ""))
+    claude_key = get_env_or_file("ANTHROPIC_API_KEY", "~/.elastic/claude_api_key")
+
+    if not bk_token:
+        print("Error: No Buildkite token available", file=sys.stderr)
+        sys.exit(1)
+    if not claude_key:
+        print("Error: No Anthropic API key available", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Analyzing {args.pipeline} build #{args.build}...")
+
+    build = buildkite_get(f"pipelines/{args.pipeline}/builds/{args.build}", bk_token)
+
+    if build.get("state") == "passed":
+        print("Build passed — nothing to analyze.")
+        sys.exit(0)
+
+    failed_jobs = [
+        j for j in build.get("jobs", [])
+        if j.get("type") == "script" and j.get("state") == "failed"
+    ]
+
+    if not failed_jobs:
+        print("No failed steps found.")
+        sys.exit(0)
+
+    print(f"Found {len(failed_jobs)} failed step(s)")
+
+    all_analyses = []
+
+    for job in failed_jobs:
+        step_key = job.get("step_key", "unknown")
+        step_label = job.get("name", step_key)
+        raw_log_url = job.get("raw_log_url", "")
+
+        print(f"\nAnalyzing: {step_label} ({step_key})")
+
+        log_text = get_job_log(raw_log_url, bk_token) if raw_log_url else None
+        if not log_text:
+            print(f"  Could not fetch log, skipping")
+            continue
+
+        log_excerpt = truncate_log(log_text)
+
+        prompt = f"""Analyze this CI build failure.
+
+**Pipeline**: {args.pipeline}
+**Build**: #{args.build}
+**Branch**: {build.get('branch', 'unknown')}
+**Failed step**: {step_label} (key: {step_key})
+
+{KNOWN_FAILURE_PATTERNS}
+
+**Build log (last {MAX_LOG_CHARS} chars)**:
+```
+{log_excerpt}
+```
+
+Analyze the root cause and suggest a fix."""
+
+        try:
+            analysis = call_claude(claude_key, prompt)
+        except Exception as e:
+            analysis = f"Failed to get analysis: {e}"
+
+        print(f"\n{analysis}")
+        all_analyses.append(f"## {step_label}\n\n{analysis}")
+
+    if not all_analyses:
+        print("No analyses generated.")
+        sys.exit(0)
+
+    full_annotation = f"# 🔍 Build Failure Analysis\n\n"
+    full_annotation += f"*Pipeline*: `{args.pipeline}` | *Build*: #{args.build} | *Branch*: `{build.get('branch', '?')}`\n\n"
+    full_annotation += "\n\n---\n\n".join(all_analyses)
+    full_annotation += "\n\n---\n*Analysis generated by Claude. Verify before acting.*"
+
+    if not args.dry_run:
+        try:
+            subprocess.run(
+                ["buildkite-agent", "annotate",
+                 "--style", "error",
+                 "--context", "build-failure-analysis"],
+                input=full_annotation.encode(),
+                check=True,
+            )
+            print("\nAnnotation posted to Buildkite.")
+        except (FileNotFoundError, subprocess.CalledProcessError) as e:
+            print(f"\nCould not post annotation: {e}", file=sys.stderr)
+            print("Full analysis printed above.")
+
+
+if __name__ == "__main__":
+    main()

From 4dd54423e567ee83bb32a894416a4c98f1d77a9c Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Feb 2026 13:58:31 +1300
Subject: [PATCH 02/21] [ML] Add Slack notifications to build failure analyzer

When SLACK_WEBHOOK_URL is set, posts a compact summary of each failed
step's AI diagnosis to #machine-learn-build.  The message includes the
classification emoji, root cause, and a link back to the build page.

The webhook URL is retrieved from Vault at runtime; if absent, the
Slack notification is skipped (a message is logged) and only the
Buildkite annotation is posted.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 dev-tools/analyze_build_failure.py | 98 +++++++++++++++++++++++++++++-
 1 file changed, 95 insertions(+), 3 deletions(-)

diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py
index 7522a6840..cd55769af 100755
--- a/dev-tools/analyze_build_failure.py
+++ b/dev-tools/analyze_build_failure.py
@@ -2,7 +2,8 @@
 """Analyze a Buildkite build failure using Claude and post a diagnosis.
 
 Fetches logs from failed build steps, sends them to the Anthropic Claude API
-with repository context, and posts the analysis as a Buildkite annotation.
+with repository context, and posts the analysis as a Buildkite annotation
+and optionally to Slack.
 
 Usage:
     # Analyze the current build (in CI)
@@ -11,12 +12,13 @@
     # Analyze a specific build
     python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819
 
-    # Dry run (print to stdout, don't annotate)
+    # Dry run (print to stdout, don't annotate or post to Slack)
     python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 --dry-run
 
 Environment:
     BUILDKITE_TOKEN / BUILDKITE_API_READ_TOKEN   Buildkite API token
     ANTHROPIC_API_KEY                             Claude API key
+    SLACK_WEBHOOK_URL                             Slack incoming webhook (optional)
     BUILDKITE_PIPELINE_SLUG                       Current pipeline (set by Buildkite)
     BUILDKITE_BUILD_NUMBER                        Current build number (set by Buildkite)
 """
@@ -144,11 +146,88 @@ def call_claude(api_key, prompt):
     return "No analysis generated."
 
 
+def post_to_slack(webhook_url, pipeline, build_number, branch, build_url, analyses):
+    """Post a summary of the failure analysis to Slack."""
+    # Slack uses mrkdwn, not full markdown — convert minimally
+    blocks = [
+        {
+            "type": "header",
+            "text": {
+                "type": "plain_text",
+                "text": "Build Failure Analysis",
+            },
+        },
+        {
+            "type": "section",
+            "text": {
+                "type": "mrkdwn",
+                "text": (
+                    f"*Pipeline:* `{pipeline}` | *Build:* <{build_url}|#{build_number}> | *Branch:* `{branch}`"
+                ),
+            },
+        },
+    ]
+
+    for step_label, analysis in analyses:
+        # Extract just the classification and root cause for a compact Slack message
+        lines = analysis.split("\n")
+        root_cause = ""
+        classification = ""
+        for i, line in enumerate(lines):
+            if line.startswith("### Root Cause"):
+                root_cause = lines[i + 1].strip() if i + 1 < len(lines) else ""
+            elif line.startswith("### Classification"):
+                classification = lines[i + 1].strip() if i + 1 < len(lines) else ""
+
+        emoji = {
+            "infrastructure/transient": ":cloud:",
+            "code bug": ":bug:",
+            "test failure": ":test_tube:",
+            "configuration": ":gear:",
+            "dependency": ":package:",
+        }.get(classification, ":warning:")
+
+        blocks.append({"type": "divider"})
+        blocks.append({
+            "type": "section",
+            "text": {
+                "type": "mrkdwn",
+                "text": f"{emoji} *{step_label}*\n>{root_cause}\n_Classification: {classification}_",
+            },
+        })
+
+    blocks.append({"type": "divider"})
+    blocks.append({
+        "type": "context",
+        "elements": [
+            {
+                "type": "mrkdwn",
+                "text": f"<{build_url}|View build> | Analysis by Claude — verify before acting",
+            }
+        ],
+    })
+
+    payload = json.dumps({"blocks": blocks}).encode("utf-8")
+    req = urllib.request.Request(
+        webhook_url,
+        data=payload,
+        headers={"Content-Type": "application/json"},
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            if resp.status == 200:
+                print("Slack notification posted.")
+            else:
+                print(f"Slack returned status {resp.status}", file=sys.stderr)
+    except Exception as e:
+        print(f"Could not post to Slack: {e}", file=sys.stderr)
+
+
 def main():
     parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude")
     parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG"))
     parser.add_argument("--build", type=int, default=int(os.environ.get("BUILDKITE_BUILD_NUMBER", "0")))
-    parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating")
+    parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating or posting to Slack")
     args = parser.parse_args()
 
     if not args.pipeline or not args.build:
@@ -185,7 +264,11 @@ def main():
 
     print(f"Found {len(failed_jobs)} failed step(s)")
 
+    slack_webhook = get_env_or_file("SLACK_WEBHOOK_URL", "")
+    build_url = build.get("web_url", f"https://buildkite.com/{BUILDKITE_ORG}/{args.pipeline}/builds/{args.build}")
+
     all_analyses = []
+    slack_analyses = []
 
     for job in failed_jobs:
         step_key = job.get("step_key", "unknown")
@@ -224,6 +307,7 @@ def main():
 
         print(f"\n{analysis}")
         all_analyses.append(f"## {step_label}\n\n{analysis}")
+        slack_analyses.append((step_label, analysis))
 
     if not all_analyses:
         print("No analyses generated.")
@@ -248,6 +332,14 @@ def main():
             print(f"\nCould not post annotation: {e}", file=sys.stderr)
             print("Full analysis printed above.")
 
+        if slack_webhook:
+            post_to_slack(
+                slack_webhook, args.pipeline, args.build,
+                build.get("branch", "?"), build_url, slack_analyses,
+            )
+        else:
+            print("No SLACK_WEBHOOK_URL set, skipping Slack notification.")
+
 
 if __name__ == "__main__":
     main()

From 64b138e027d96947c4863ac2f67e7a5c3064fb0b Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 27 Feb 2026 15:12:07 +1300
Subject: [PATCH 03/21] Post build failure analysis as a GitHub PR comment

When the build is a PR build (BUILDKITE_PULL_REQUEST holds a PR number
rather than "false") and a GITHUB_TOKEN is available, post the Claude
analysis as a comment on the GitHub PR in addition to the Buildkite
annotation and Slack notification.

Uses an HTML comment marker to find and update existing comments on
rebuild/retry, avoiding duplicate comments on the same PR.

Addresses review feedback from valeriy42 requesting better visibility
of failure analysis for PR authors.

Made-with: Cursor
---
 .buildkite/hooks/post-checkout     |  1 +
 dev-tools/analyze_build_failure.py | 76 ++++++++++++++++++++++++++++--
 2 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout
index c3006feab..07d50cf16 100644
--- a/.buildkite/hooks/post-checkout
+++ b/.buildkite/hooks/post-checkout
@@ -37,6 +37,7 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then
     export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "")
     export ANTHROPIC_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/anthropic/claude 2>/dev/null || echo "")
     export SLACK_WEBHOOK_URL=$(vault read -field=url secret/ci/elastic-ml-cpp/slack/build_failure_webhook 2>/dev/null || echo "")
+    export GITHUB_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/github/pr_comment_token 2>/dev/null || echo "")
   fi
 
   # GCS service account — inject credentials for build and Java IT steps.
diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py
index cd55769af..456fc5348 100755
--- a/dev-tools/analyze_build_failure.py
+++ b/dev-tools/analyze_build_failure.py
@@ -2,8 +2,8 @@
 """Analyze a Buildkite build failure using Claude and post a diagnosis.
 
 Fetches logs from failed build steps, sends them to the Anthropic Claude API
-with repository context, and posts the analysis as a Buildkite annotation
-and optionally to Slack.
+with repository context, and posts the analysis as a Buildkite annotation,
+a GitHub PR comment (for PR builds), and optionally to Slack.
 
 Usage:
     # Analyze the current build (in CI)
@@ -12,15 +12,17 @@
     # Analyze a specific build
     python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819
 
-    # Dry run (print to stdout, don't annotate or post to Slack)
+    # Dry run (print to stdout, don't annotate or post to Slack/GitHub)
     python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 --dry-run
 
 Environment:
     BUILDKITE_TOKEN / BUILDKITE_API_READ_TOKEN   Buildkite API token
     ANTHROPIC_API_KEY                             Claude API key
+    GITHUB_TOKEN                                  GitHub API token (optional, for PR comments)
     SLACK_WEBHOOK_URL                             Slack incoming webhook (optional)
     BUILDKITE_PIPELINE_SLUG                       Current pipeline (set by Buildkite)
     BUILDKITE_BUILD_NUMBER                        Current build number (set by Buildkite)
+    BUILDKITE_PULL_REQUEST                        PR number (set by Buildkite for PR builds)
 """
 
 import argparse
@@ -33,6 +35,9 @@
 from pathlib import Path
 
 BUILDKITE_ORG = "elastic"
+GITHUB_REPO = "elastic/ml-cpp"
+GITHUB_API_URL = "https://api.github.com"
+GITHUB_COMMENT_MARKER = "<!-- build-failure-analysis -->"
 ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
 ANTHROPIC_MODEL = "claude-sonnet-4-20250514"
 MAX_LOG_CHARS = 30000
@@ -223,6 +228,60 @@ def post_to_slack(webhook_url, pipeline, build_number, branch, build_url, analys
         print(f"Could not post to Slack: {e}", file=sys.stderr)
 
 
+def github_api(method, path, token, data=None):
+    """Make a GitHub API request and return the parsed JSON response."""
+    url = f"{GITHUB_API_URL}{path}"
+    body = json.dumps(data).encode("utf-8") if data else None
+    req = urllib.request.Request(url, data=body, method=method, headers={
+        "Authorization": f"token {token}",
+        "Accept": "application/vnd.github.v3+json",
+        "Content-Type": "application/json",
+    })
+    with urllib.request.urlopen(req, timeout=15) as resp:
+        return json.loads(resp.read())
+
+
+def find_existing_comment(pr_number, token):
+    """Find an existing analysis comment on the PR by looking for the marker."""
+    page = 1
+    while True:
+        comments = github_api(
+            "GET", f"/repos/{GITHUB_REPO}/issues/{pr_number}/comments?per_page=100&page={page}", token
+        )
+        if not comments:
+            break
+        for comment in comments:
+            if GITHUB_COMMENT_MARKER in comment.get("body", ""):
+                return comment["id"]
+        page += 1
+    return None
+
+
+def post_to_github(token, pr_number, build_url, annotation_body):
+    """Post or update a build failure analysis comment on a GitHub PR."""
+    comment_body = (
+        f"{GITHUB_COMMENT_MARKER}\n"
+        f"## :mag: Build Failure Analysis\n\n"
+        f"{annotation_body}\n\n"
+        f"---\n"
+        f"[View Buildkite build]({build_url}) | "
+        f"*Analysis generated by Claude. Verify before acting.*"
+    )
+
+    try:
+        existing_id = find_existing_comment(pr_number, token)
+        if existing_id:
+            github_api("PATCH", f"/repos/{GITHUB_REPO}/issues/comments/{existing_id}", token,
+                       {"body": comment_body})
+            print(f"Updated existing GitHub comment on PR #{pr_number}.")
+        else:
+            github_api("POST", f"/repos/{GITHUB_REPO}/issues/{pr_number}/comments", token,
+                       {"body": comment_body})
+            print(f"Posted GitHub comment on PR #{pr_number}.")
+    except Exception as e:
+        print(f"Could not post to GitHub: {e}", file=sys.stderr)
+
+
 def main():
     parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude")
     parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG"))
@@ -265,6 +324,9 @@ def main():
     print(f"Found {len(failed_jobs)} failed step(s)")
 
     slack_webhook = get_env_or_file("SLACK_WEBHOOK_URL", "")
+    github_token = get_env_or_file("GITHUB_TOKEN", "")
+    pr_number = os.environ.get("BUILDKITE_PULL_REQUEST", "false")
+    pr_number = int(pr_number) if pr_number not in ("false", "") else None
     build_url = build.get("web_url", f"https://buildkite.com/{BUILDKITE_ORG}/{args.pipeline}/builds/{args.build}")
 
     all_analyses = []
@@ -340,6 +402,14 @@ def main():
         else:
             print("No SLACK_WEBHOOK_URL set, skipping Slack notification.")
 
+        if github_token and pr_number:
+            annotation_body = "\n\n---\n\n".join(all_analyses)
+            post_to_github(github_token, pr_number, build_url, annotation_body)
+        elif pr_number:
+            print("No GITHUB_TOKEN set, skipping GitHub PR comment.")
+        else:
+            print("Not a PR build, skipping GitHub PR comment.")
+
 
 if __name__ == "__main__":
     main()

From b15417fb1cb8952380f760a44f3f14cdd1ffd079 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 27 Feb 2026 15:17:35 +1300
Subject: [PATCH 04/21] Add --pr flag for testing GitHub comment posting

Allows overriding the PR number from the command line, useful for
local testing of the GitHub comment feature without being in a
Buildkite PR build environment.

Tested end-to-end against build #2232 (Bayesian test timeout),
posting to a throwaway PR. Both initial post and update-in-place
(deduplication) verified working.

Made-with: Cursor
---
 dev-tools/analyze_build_failure.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py
index 456fc5348..cda73e85c 100755
--- a/dev-tools/analyze_build_failure.py
+++ b/dev-tools/analyze_build_failure.py
@@ -286,6 +286,8 @@ def main():
     parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude")
     parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG"))
     parser.add_argument("--build", type=int, default=int(os.environ.get("BUILDKITE_BUILD_NUMBER", "0")))
+    parser.add_argument("--pr", type=int, default=None,
+                        help="Override PR number (for testing GitHub comment posting)")
     parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating or posting to Slack")
     args = parser.parse_args()
 
@@ -325,8 +327,10 @@ def main():
 
     slack_webhook = get_env_or_file("SLACK_WEBHOOK_URL", "")
     github_token = get_env_or_file("GITHUB_TOKEN", "")
-    pr_number = os.environ.get("BUILDKITE_PULL_REQUEST", "false")
-    pr_number = int(pr_number) if pr_number not in ("false", "") else None
+    pr_number = args.pr
+    if pr_number is None:
+        pr_env = os.environ.get("BUILDKITE_PULL_REQUEST", "false")
+        pr_number = int(pr_env) if pr_env not in ("false", "") else None
     build_url = build.get("web_url", f"https://buildkite.com/{BUILDKITE_ORG}/{args.pipeline}/builds/{args.build}")
 
     all_analyses = []

From 9ff81a16d0fb1b9618a859139b78dbca713df102 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Mon, 16 Mar 2026 15:39:39 +1300
Subject: [PATCH 05/21] [ML] Make AI failure analysis opt-in for PR builds

Failure analysis now only runs on PR builds when triggered by a
`buildkite analyze` comment, avoiding unnecessary API token usage.
Nightly and debug pipelines retain automatic analysis on failure.

Made-with: Cursor
---
 .buildkite/ml_pipeline/config.py | 4 +++-
 .buildkite/pipeline.json.py      | 6 +++---
 .buildkite/pull-requests.json    | 2 +-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.buildkite/ml_pipeline/config.py b/.buildkite/ml_pipeline/config.py
index 4669ce8b5..502217c00 100644
--- a/.buildkite/ml_pipeline/config.py
+++ b/.buildkite/ml_pipeline/config.py
@@ -19,6 +19,7 @@ class Config:
     build_x86_64: str = ""
     run_qa_tests: bool = False
     run_pytorch_tests: bool = False
+    run_analyze: bool = False
     action: str = "build"
 
     def parse_comment(self):
@@ -37,7 +38,8 @@ def parse_comment(self):
             self.action = os.environ["GITHUB_PR_COMMENT_VAR_ACTION"]
             self.run_qa_tests = self.action == "run_qa_tests"
             self.run_pytorch_tests = self.action == "run_pytorch_tests"
-            if self.run_pytorch_tests or self.run_qa_tests:
+            self.run_analyze = self.action == "analyze"
+            if self.run_pytorch_tests or self.run_qa_tests or self.run_analyze:
                 self.action = "build"
 
         # If the ACTION is set to "run_qa_tests" then set some optional variables governing the ES branch to build, the
diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py
index c15d6bb84..f7a48077e 100755
--- a/.buildkite/pipeline.json.py
+++ b/.buildkite/pipeline.json.py
@@ -68,9 +68,9 @@ def main():
     # Check for build timing regressions against nightly baseline
     pipeline_steps.append(pipeline_steps.generate_step("Check build timing regressions",
                                                        ".buildkite/pipelines/check_build_regression.yml.sh"))
-    # Analyze failures with AI if the build failed
-    pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
-                                                       ".buildkite/pipelines/analyze_build_failure.yml.sh"))
+    if config.run_analyze:
+        pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
+                                                           ".buildkite/pipelines/analyze_build_failure.yml.sh"))
 
     pipeline["env"] = env
     pipeline["steps"] = pipeline_steps
diff --git a/.buildkite/pull-requests.json b/.buildkite/pull-requests.json
index dcea82794..81248a767 100644
--- a/.buildkite/pull-requests.json
+++ b/.buildkite/pull-requests.json
@@ -9,7 +9,7 @@
       "commit_status_context": "ml-cpp-ci",
       "build_on_commit": true,
       "build_on_comment": true,
-      "trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests|run_pytorch_tests)(=(?<args>(?:[^ ]+)))? *(?: for ES_BRANCH=(?<branch>([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?<version>([.0-9]+)))? *(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?<arch>(?:[, ]*aarch64|x86_64)+)?$",
+      "trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests|run_pytorch_tests|analyze)(=(?<args>(?:[^ ]+)))? *(?: for ES_BRANCH=(?<branch>([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?<version>([.0-9]+)))? *(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?<arch>(?:[, ]*aarch64|x86_64)+)?$",
       "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
       "skip_ci_labels": ["skip-ci", "jenkins-ci", ">test-mute", ">docs"],
       "skip_target_branches": ["6.8", "7.11", "7.12"],

From 44594cd5fdfd3191a9fa8824b5ecbef3588dc903 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Thu, 19 Mar 2026 14:19:07 +1300
Subject: [PATCH 06/21] [ML] Enable native Buildkite PR comments for build
 failures

Enable the ELASTIC_PR_COMMENTS_ENABLED feature on the PR builds
pipeline so that elasticmachine posts a summary comment listing
failed steps and build history directly on the GitHub PR.

Made-with: Cursor
---
 catalog-info.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/catalog-info.yaml b/catalog-info.yaml
index 17f8aad68..44cd2a4aa 100644
--- a/catalog-info.yaml
+++ b/catalog-info.yaml
@@ -38,6 +38,8 @@ spec:
         publish_commit_status: false
         publish_commit_status_per_step: true
         trigger_mode: code
+      env:
+        ELASTIC_PR_COMMENTS_ENABLED: 'true'
       repository: elastic/ml-cpp
       skip_intermediate_builds: true
       teams:

From 1ea0535f226d640a3e2ee75d34311161c2a03d7a Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Thu, 19 Mar 2026 14:34:41 +1300
Subject: [PATCH 07/21] [ML] Post AI analysis as PR comment via GitHub Actions

Replace direct GitHub API calls from the Buildkite analyze step with
a GitHub Actions workflow that uses the built-in GITHUB_TOKEN. The
Buildkite step now saves the analysis as build metadata, and a
GitHub Actions workflow triggered by the commit status event fetches
it and posts/updates the PR comment. This eliminates the need for a
personal access token or GitHub App for PR comments.

Made-with: Cursor
---
 .buildkite/hooks/post-checkout            |   1 -
 .github/workflows/post-build-analysis.yml | 133 ++++++++++++++++++++++
 dev-tools/analyze_build_failure.py        |  91 +++------------
 3 files changed, 150 insertions(+), 75 deletions(-)
 create mode 100644 .github/workflows/post-build-analysis.yml

diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout
index 07d50cf16..c3006feab 100644
--- a/.buildkite/hooks/post-checkout
+++ b/.buildkite/hooks/post-checkout
@@ -37,7 +37,6 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then
     export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "")
     export ANTHROPIC_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/anthropic/claude 2>/dev/null || echo "")
     export SLACK_WEBHOOK_URL=$(vault read -field=url secret/ci/elastic-ml-cpp/slack/build_failure_webhook 2>/dev/null || echo "")
-    export GITHUB_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/github/pr_comment_token 2>/dev/null || echo "")
   fi
 
   # GCS service account — inject credentials for build and Java IT steps.
diff --git a/.github/workflows/post-build-analysis.yml b/.github/workflows/post-build-analysis.yml
new file mode 100644
index 000000000..e3dfdeeb4
--- /dev/null
+++ b/.github/workflows/post-build-analysis.yml
@@ -0,0 +1,133 @@
+name: Post Build Failure Analysis
+
+# Triggered by commit status updates from Buildkite. When the
+# analyze_build_failure step completes, Buildkite posts a commit status
+# which fires this workflow. We fetch the AI analysis from Buildkite
+# build metadata and post it as a PR comment using the built-in
+# GITHUB_TOKEN (no PAT or GitHub App needed).
+
+on:
+  status:
+
+permissions:
+  pull-requests: write
+  statuses: read
+
+jobs:
+  post-analysis:
+    # Only run when the analyze step succeeds (soft_fail means Buildkite
+    # reports success even if the analysis itself had issues).
+    if: >-
+      github.event.state == 'success' &&
+      contains(github.event.context, 'Analyze build failure')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Find PR for commit
+        id: find-pr
+        env:
+          GH_TOKEN: ${{ github.token }}
+          SHA: ${{ github.event.sha }}
+        run: |
+          PR_NUMBER=$(gh api "repos/${{ github.repository }}/commits/${SHA}/pulls" \
+            --jq '.[0].number // empty' 2>/dev/null || true)
+          if [ -z "$PR_NUMBER" ]; then
+            echo "No PR found for commit ${SHA} — skipping."
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "Found PR #${PR_NUMBER}"
+            echo "skip=false" >> "$GITHUB_OUTPUT"
+            echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Extract Buildkite build info
+        if: steps.find-pr.outputs.skip != 'true'
+        id: bk-info
+        env:
+          TARGET_URL: ${{ github.event.target_url }}
+        run: |
+          # target_url looks like:
+          # https://buildkite.com/elastic/ml-cpp-pr-builds/builds/2361#step-key
+          # Extract pipeline slug and build number.
+          PIPELINE=$(echo "$TARGET_URL" | sed -n 's|.*/elastic/\([^/]*\)/builds/.*|\1|p')
+          BUILD_NUM=$(echo "$TARGET_URL" | sed -n 's|.*/builds/\([0-9]*\).*|\1|p')
+          if [ -z "$PIPELINE" ] || [ -z "$BUILD_NUM" ]; then
+            echo "Could not parse Buildkite URL: $TARGET_URL"
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "Pipeline: $PIPELINE, Build: $BUILD_NUM"
+            echo "skip=false" >> "$GITHUB_OUTPUT"
+            echo "pipeline=${PIPELINE}" >> "$GITHUB_OUTPUT"
+            echo "build_num=${BUILD_NUM}" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Fetch analysis from Buildkite
+        if: >-
+          steps.find-pr.outputs.skip != 'true' &&
+          steps.bk-info.outputs.skip != 'true'
+        id: fetch
+        env:
+          BK_TOKEN: ${{ secrets.BUILDKITE_API_READ_TOKEN }}
+          PIPELINE: ${{ steps.bk-info.outputs.pipeline }}
+          BUILD_NUM: ${{ steps.bk-info.outputs.build_num }}
+        run: |
+          if [ -z "$BK_TOKEN" ]; then
+            echo "BUILDKITE_API_READ_TOKEN secret not set — skipping."
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # Fetch build metadata containing the analysis.
+          ANALYSIS=$(curl -sS -f \
+            -H "Authorization: Bearer ${BK_TOKEN}" \
+            "https://api.buildkite.com/v2/organizations/elastic/pipelines/${PIPELINE}/builds/${BUILD_NUM}/meta-data/build-failure-analysis" \
+            2>/dev/null) || true
+
+          if [ -z "$ANALYSIS" ]; then
+            echo "No analysis metadata found — skipping."
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # Save to file to avoid shell quoting issues.
+          echo "$ANALYSIS" > /tmp/analysis.md
+          echo "skip=false" >> "$GITHUB_OUTPUT"
+
+      - name: Post or update PR comment
+        if: >-
+          steps.find-pr.outputs.skip != 'true' &&
+          steps.bk-info.outputs.skip != 'true' &&
+          steps.fetch.outputs.skip != 'true'
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUMBER: ${{ steps.find-pr.outputs.pr_number }}
+          PIPELINE: ${{ steps.bk-info.outputs.pipeline }}
+          BUILD_NUM: ${{ steps.bk-info.outputs.build_num }}
+        run: |
+          MARKER="<!-- build-failure-analysis -->"
+          BUILD_URL="https://buildkite.com/elastic/${PIPELINE}/builds/${BUILD_NUM}"
+          ANALYSIS=$(cat /tmp/analysis.md)
+
+          BODY=$(cat <<EOF
+          ${MARKER}
+          ## :mag: Build Failure Analysis
+
+          ${ANALYSIS}
+
+          ---
+          [View Buildkite build](${BUILD_URL}) | *Analysis generated by Claude. Verify before acting.*
+          EOF
+          )
+
+          # Find an existing marker comment to update; --paginate scans every page, not just the first 100.
+          EXISTING_ID=$(gh api --paginate "repos/${{ github.repository }}/issues/${PR_NUMBER}/comments?per_page=100" \
+            --jq ".[] | select(.body | contains(\"${MARKER}\")) | .id" 2>/dev/null | head -1)
+
+          if [ -n "$EXISTING_ID" ]; then
+            gh api "repos/${{ github.repository }}/issues/comments/${EXISTING_ID}" \
+              -X PATCH -f body="$BODY"
+            echo "Updated existing comment on PR #${PR_NUMBER}."
+          else
+            gh api "repos/${{ github.repository }}/issues/${PR_NUMBER}/comments" \
+              -f body="$BODY"
+            echo "Posted new comment on PR #${PR_NUMBER}."
+          fi
diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py
index cda73e85c..ba93e1d33 100755
--- a/dev-tools/analyze_build_failure.py
+++ b/dev-tools/analyze_build_failure.py
@@ -3,7 +3,8 @@
 
 Fetches logs from failed build steps, sends them to the Anthropic Claude API
 with repository context, and posts the analysis as a Buildkite annotation,
-a GitHub PR comment (for PR builds), and optionally to Slack.
+Buildkite build metadata (for the GitHub Actions PR comment workflow),
+and optionally to Slack.
 
 Usage:
     # Analyze the current build (in CI)
@@ -18,11 +19,9 @@
 Environment:
     BUILDKITE_TOKEN / BUILDKITE_API_READ_TOKEN   Buildkite API token
     ANTHROPIC_API_KEY                             Claude API key
-    GITHUB_TOKEN                                  GitHub API token (optional, for PR comments)
     SLACK_WEBHOOK_URL                             Slack incoming webhook (optional)
     BUILDKITE_PIPELINE_SLUG                       Current pipeline (set by Buildkite)
     BUILDKITE_BUILD_NUMBER                        Current build number (set by Buildkite)
-    BUILDKITE_PULL_REQUEST                        PR number (set by Buildkite for PR builds)
 """
 
 import argparse
@@ -35,9 +34,6 @@
 from pathlib import Path
 
 BUILDKITE_ORG = "elastic"
-GITHUB_REPO = "elastic/ml-cpp"
-GITHUB_API_URL = "https://api.github.com"
-GITHUB_COMMENT_MARKER = "<!-- build-failure-analysis -->"
 ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
 ANTHROPIC_MODEL = "claude-sonnet-4-20250514"
 MAX_LOG_CHARS = 30000
@@ -228,66 +224,11 @@ def post_to_slack(webhook_url, pipeline, build_number, branch, build_url, analys
         print(f"Could not post to Slack: {e}", file=sys.stderr)
 
 
-def github_api(method, path, token, data=None):
-    """Make a GitHub API request and return the parsed JSON response."""
-    url = f"{GITHUB_API_URL}{path}"
-    body = json.dumps(data).encode("utf-8") if data else None
-    req = urllib.request.Request(url, data=body, method=method, headers={
-        "Authorization": f"token {token}",
-        "Accept": "application/vnd.github.v3+json",
-        "Content-Type": "application/json",
-    })
-    with urllib.request.urlopen(req, timeout=15) as resp:
-        return json.loads(resp.read())
-
-
-def find_existing_comment(pr_number, token):
-    """Find an existing analysis comment on the PR by looking for the marker."""
-    page = 1
-    while True:
-        comments = github_api(
-            "GET", f"/repos/{GITHUB_REPO}/issues/{pr_number}/comments?per_page=100&page={page}", token
-        )
-        if not comments:
-            break
-        for comment in comments:
-            if GITHUB_COMMENT_MARKER in comment.get("body", ""):
-                return comment["id"]
-        page += 1
-    return None
-
-
-def post_to_github(token, pr_number, build_url, annotation_body):
-    """Post or update a build failure analysis comment on a GitHub PR."""
-    comment_body = (
-        f"{GITHUB_COMMENT_MARKER}\n"
-        f"## :mag: Build Failure Analysis\n\n"
-        f"{annotation_body}\n\n"
-        f"---\n"
-        f"[View Buildkite build]({build_url}) | "
-        f"*Analysis generated by Claude. Verify before acting.*"
-    )
-
-    try:
-        existing_id = find_existing_comment(pr_number, token)
-        if existing_id:
-            github_api("PATCH", f"/repos/{GITHUB_REPO}/issues/comments/{existing_id}", token,
-                       {"body": comment_body})
-            print(f"Updated existing GitHub comment on PR #{pr_number}.")
-        else:
-            github_api("POST", f"/repos/{GITHUB_REPO}/issues/{pr_number}/comments", token,
-                       {"body": comment_body})
-            print(f"Posted GitHub comment on PR #{pr_number}.")
-    except Exception as e:
-        print(f"Could not post to GitHub: {e}", file=sys.stderr)
-
 
 def main():
     parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude")
     parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG"))
     parser.add_argument("--build", type=int, default=int(os.environ.get("BUILDKITE_BUILD_NUMBER", "0")))
-    parser.add_argument("--pr", type=int, default=None,
-                        help="Override PR number (for testing GitHub comment posting)")
     parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating or posting to Slack")
     args = parser.parse_args()
 
@@ -326,11 +267,6 @@ def main():
     print(f"Found {len(failed_jobs)} failed step(s)")
 
     slack_webhook = get_env_or_file("SLACK_WEBHOOK_URL", "")
-    github_token = get_env_or_file("GITHUB_TOKEN", "")
-    pr_number = args.pr
-    if pr_number is None:
-        pr_env = os.environ.get("BUILDKITE_PULL_REQUEST", "false")
-        pr_number = int(pr_env) if pr_env not in ("false", "") else None
     build_url = build.get("web_url", f"https://buildkite.com/{BUILDKITE_ORG}/{args.pipeline}/builds/{args.build}")
 
     all_analyses = []
@@ -398,6 +334,21 @@ def main():
             print(f"\nCould not post annotation: {e}", file=sys.stderr)
             print("Full analysis printed above.")
 
+        # Store analysis as build metadata so that the GitHub Actions
+        # workflow (post-build-analysis.yml) can fetch it and post a
+        # PR comment using the built-in GITHUB_TOKEN.
+        annotation_body = "\n\n---\n\n".join(all_analyses)
+        try:
+            subprocess.run(
+                ["buildkite-agent", "meta-data", "set",
+                 "build-failure-analysis"],
+                input=annotation_body.encode(),
+                check=True,
+            )
+            print("Analysis saved as build metadata.")
+        except (FileNotFoundError, subprocess.CalledProcessError) as e:
+            print(f"Could not save build metadata: {e}", file=sys.stderr)
+
         if slack_webhook:
             post_to_slack(
                 slack_webhook, args.pipeline, args.build,
@@ -406,14 +357,6 @@ def main():
         else:
             print("No SLACK_WEBHOOK_URL set, skipping Slack notification.")
 
-        if github_token and pr_number:
-            annotation_body = "\n\n---\n\n".join(all_analyses)
-            post_to_github(github_token, pr_number, build_url, annotation_body)
-        elif pr_number:
-            print("No GITHUB_TOKEN set, skipping GitHub PR comment.")
-        else:
-            print("Not a PR build, skipping GitHub PR comment.")
-
 
 if __name__ == "__main__":
     main()

From 9fec3f6e81cfd53c9c1578a386a93d53dfaf740e Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Thu, 19 Mar 2026 14:44:13 +1300
Subject: [PATCH 08/21] [ML] Add temporary workflow to test Vault OIDC for
 GitHub Actions

Made-with: Cursor
---
 .github/workflows/test-vault-oidc.yml | 55 +++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 .github/workflows/test-vault-oidc.yml

diff --git a/.github/workflows/test-vault-oidc.yml b/.github/workflows/test-vault-oidc.yml
new file mode 100644
index 000000000..9f484a8fd
--- /dev/null
+++ b/.github/workflows/test-vault-oidc.yml
@@ -0,0 +1,55 @@
+name: Test Vault OIDC
+
+on:
+  pull_request:
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  test-vault:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check Vault JWT auth endpoint
+        run: |
+          echo "=== Checking if Vault has a JWT auth method enabled ==="
+          # This is a public, unauthenticated endpoint that returns config
+          # if the JWT auth method exists. A 404 means it's not enabled.
+          for path in jwt github-actions oidc; do
+            STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
+              "https://vault-ci-prod.elastic.dev/v1/auth/${path}/.well-known/openid-configuration" 2>/dev/null)
+            echo "  auth/${path}: HTTP ${STATUS}"
+          done
+
+          echo ""
+          echo "=== Requesting GitHub OIDC token ==="
+          if [ -n "$ACTIONS_ID_TOKEN_REQUEST_URL" ]; then
+            OIDC_TOKEN=$(curl -sS -H "Authorization: bearer ${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" \
+              "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=https://vault-ci-prod.elastic.dev" | jq -r '.value')
+            if [ -n "$OIDC_TOKEN" ] && [ "$OIDC_TOKEN" != "null" ]; then
+              echo "Got OIDC token (first 20 chars): ${OIDC_TOKEN:0:20}..."
+
+              # Decode the JWT claims (middle segment) to see what info it carries.
+              CLAIMS=$(echo "$OIDC_TOKEN" | cut -d. -f2 | base64 -d 2>/dev/null | jq . 2>/dev/null || echo "could not decode")
+              echo ""
+              echo "=== OIDC token claims ==="
+              echo "$CLAIMS"
+
+              echo ""
+              echo "=== Attempting Vault JWT login ==="
+              for path in jwt github-actions oidc; do
+                echo "  Trying auth/${path}/login..."
+                RESPONSE=$(curl -sS -X POST \
+                  "https://vault-ci-prod.elastic.dev/v1/auth/${path}/login" \
+                  -H "Content-Type: application/json" \
+                  -d "{\"jwt\": \"${OIDC_TOKEN}\", \"role\": \"\"}" 2>&1)
+                echo "  Response: $(echo "$RESPONSE" | jq -c '.errors // .auth.policies // .' 2>/dev/null || echo "$RESPONSE")"
+                echo ""
+              done
+            else
+              echo "Failed to get OIDC token"
+            fi
+          else
+            echo "ACTIONS_ID_TOKEN_REQUEST_URL not set — id-token permission may be missing"
+          fi

From f404519056a6415f1c4efa37a95c35bde78e18e7 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Thu, 19 Mar 2026 14:51:26 +1300
Subject: [PATCH 09/21] [ML] Remove temporary Vault OIDC test workflow

The test confirmed Vault is reachable from GitHub Actions runners
and JWT auth paths exist. Actual OIDC login needs to be verified
with the infra team.

Made-with: Cursor
---
 .github/workflows/test-vault-oidc.yml | 55 ---------------------------
 1 file changed, 55 deletions(-)
 delete mode 100644 .github/workflows/test-vault-oidc.yml

diff --git a/.github/workflows/test-vault-oidc.yml b/.github/workflows/test-vault-oidc.yml
deleted file mode 100644
index 9f484a8fd..000000000
--- a/.github/workflows/test-vault-oidc.yml
+++ /dev/null
@@ -1,55 +0,0 @@
-name: Test Vault OIDC
-
-on:
-  pull_request:
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  test-vault:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check Vault JWT auth endpoint
-        run: |
-          echo "=== Checking if Vault has a JWT auth method enabled ==="
-          # This is a public, unauthenticated endpoint that returns config
-          # if the JWT auth method exists. A 404 means it's not enabled.
-          for path in jwt github-actions oidc; do
-            STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
-              "https://vault-ci-prod.elastic.dev/v1/auth/${path}/.well-known/openid-configuration" 2>/dev/null)
-            echo "  auth/${path}: HTTP ${STATUS}"
-          done
-
-          echo ""
-          echo "=== Requesting GitHub OIDC token ==="
-          if [ -n "$ACTIONS_ID_TOKEN_REQUEST_URL" ]; then
-            OIDC_TOKEN=$(curl -sS -H "Authorization: bearer ${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" \
-              "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=https://vault-ci-prod.elastic.dev" | jq -r '.value')
-            if [ -n "$OIDC_TOKEN" ] && [ "$OIDC_TOKEN" != "null" ]; then
-              echo "Got OIDC token (first 20 chars): ${OIDC_TOKEN:0:20}..."
-
-              # Decode the JWT claims (middle segment) to see what info it carries.
-              CLAIMS=$(echo "$OIDC_TOKEN" | cut -d. -f2 | base64 -d 2>/dev/null | jq . 2>/dev/null || echo "could not decode")
-              echo ""
-              echo "=== OIDC token claims ==="
-              echo "$CLAIMS"
-
-              echo ""
-              echo "=== Attempting Vault JWT login ==="
-              for path in jwt github-actions oidc; do
-                echo "  Trying auth/${path}/login..."
-                RESPONSE=$(curl -sS -X POST \
-                  "https://vault-ci-prod.elastic.dev/v1/auth/${path}/login" \
-                  -H "Content-Type: application/json" \
-                  -d "{\"jwt\": \"${OIDC_TOKEN}\", \"role\": \"\"}" 2>&1)
-                echo "  Response: $(echo "$RESPONSE" | jq -c '.errors // .auth.policies // .' 2>/dev/null || echo "$RESPONSE")"
-                echo ""
-              done
-            else
-              echo "Failed to get OIDC token"
-            fi
-          else
-            echo "ACTIONS_ID_TOKEN_REQUEST_URL not set — id-token permission may be missing"
-          fi

From 4cc72f30baab1434855ddd85ea5524ee71fa9801 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 13:10:35 +1300
Subject: [PATCH 10/21] [ML] Use dynamic depends_on for analyze_build_failure
 step

Apply the same fix as PR #3003 to the analyze_build_failure step:
compute which build step keys will exist based on the platform config
and pass them as ML_BUILD_STEP_KEYS for the shell script to use in
its depends_on section.  This prevents "Step dependencies not found"
errors when not all platforms are built.

Made-with: Cursor
---
 .buildkite/branch.json.py                     | 16 +++++++++++++
 .buildkite/job-build-test-all-debug.json.py   | 22 ++++++++++++++----
 .buildkite/pipeline.json.py                   | 23 +++++++++++++++----
 .../pipelines/analyze_build_failure.yml.sh    | 22 ++++++++++++------
 4 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/.buildkite/branch.json.py b/.buildkite/branch.json.py
index 70f8f9754..5fb9c02c9 100755
--- a/.buildkite/branch.json.py
+++ b/.buildkite/branch.json.py
@@ -30,6 +30,21 @@ def main():
                                                        ".buildkite/pipelines/format_and_validation.yml.sh"))
     config = buildConfig.Config()
     config.parse()
+
+    build_step_keys = []
+    if config.build_linux and config.build_aarch64:
+        build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo")
+    if config.build_linux and config.build_x86_64:
+        build_step_keys.append("build_test_linux-x86_64-RelWithDebInfo")
+    if config.build_macos and config.build_aarch64:
+        build_step_keys.append("build_test_macos-aarch64-RelWithDebInfo")
+    if config.build_windows and config.build_x86_64:
+        build_step_keys.append("build_test_Windows-x86_64-RelWithDebInfo")
+
+    env = {
+        "ML_BUILD_STEP_KEYS": ",".join(build_step_keys),
+    }
+
     if config.build_windows:
         build_windows = pipeline_steps.generate_step_template("Windows", "build", "", config.build_x86_64)
         pipeline_steps.append(build_windows)
@@ -58,6 +73,7 @@ def main():
     pipeline_steps.append(pipeline_steps.generate_step("Upload daily releasable artifacts to GCS",
                                                        ".buildkite/pipelines/upload_dra_to_gcs.yml.sh"))
 
+    pipeline["env"] = env
     pipeline["steps"] = pipeline_steps
     print(json.dumps(pipeline, indent=2))
 
diff --git a/.buildkite/job-build-test-all-debug.json.py b/.buildkite/job-build-test-all-debug.json.py
index 39347d086..13abc2218 100755
--- a/.buildkite/job-build-test-all-debug.json.py
+++ b/.buildkite/job-build-test-all-debug.json.py
@@ -24,11 +24,6 @@
     config as buildConfig,
 )
 
-env = {
-  "BUILD_SNAPSHOT": "true",
-  "VERSION_QUALIFIER": ""
-}
-
 def main():
     pipeline = {}
     pipeline_steps = step.PipelineStep([])
@@ -40,6 +35,23 @@ def main():
                                                        ".buildkite/pipelines/format_and_validation.yml.sh"))
     config = buildConfig.Config()
     config.parse()
+
+    build_step_keys = []
+    if config.build_linux and config.build_aarch64:
+        build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo")
+    if config.build_linux and config.build_x86_64:
+        build_step_keys.append("build_test_linux-x86_64-RelWithDebInfo")
+    if config.build_macos and config.build_aarch64:
+        build_step_keys.append("build_test_macos-aarch64-RelWithDebInfo")
+    if config.build_windows and config.build_x86_64:
+        build_step_keys.append("build_test_Windows-x86_64-RelWithDebInfo")
+
+    env = {
+        "BUILD_SNAPSHOT": "true",
+        "VERSION_QUALIFIER": "",
+        "ML_BUILD_STEP_KEYS": ",".join(build_step_keys),
+    }
+
     if config.build_windows:
         debug_windows = pipeline_steps.generate_step_template("Windows", "debug", "", config.build_x86_64)
         pipeline_steps.append(debug_windows)
diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py
index f7a48077e..1e90d3653 100755
--- a/.buildkite/pipeline.json.py
+++ b/.buildkite/pipeline.json.py
@@ -23,11 +23,6 @@
     config as buildConfig,
 )
 
-# Ensure VERSION_QUALIFIER is always empty for PR builds
-env = {
-    "VERSION_QUALIFIER": ""
-}
-
 def main():
     pipeline = {}
     pipeline_steps = step.PipelineStep([])
@@ -39,6 +34,24 @@ def main():
                                                        ".buildkite/pipelines/format_and_validation.yml.sh"))
     config = buildConfig.Config()
     config.parse()
+
+    # Compute which build step keys will exist so that analytics and
+    # failure-analysis steps can emit a correct depends_on list.
+    build_step_keys = []
+    if config.build_linux and config.build_aarch64:
+        build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo")
+    if config.build_linux and config.build_x86_64:
+        build_step_keys.append("build_test_linux-x86_64-RelWithDebInfo")
+    if config.build_macos and config.build_aarch64:
+        build_step_keys.append("build_test_macos-aarch64-RelWithDebInfo")
+    if config.build_windows and config.build_x86_64:
+        build_step_keys.append("build_test_Windows-x86_64-RelWithDebInfo")
+
+    env = {
+        "VERSION_QUALIFIER": "",
+        "ML_BUILD_STEP_KEYS": ",".join(build_step_keys),
+    }
+
     if config.build_windows:
         build_windows = pipeline_steps.generate_step_template("Windows", config.action, "", config.build_x86_64)
         pipeline_steps.append(build_windows)
diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh
index 162c3fb05..4e74c249c 100755
--- a/.buildkite/pipelines/analyze_build_failure.yml.sh
+++ b/.buildkite/pipelines/analyze_build_failure.yml.sh
@@ -8,17 +8,25 @@
 # compliance with the Elastic License 2.0 and the foregoing additional
 # limitation.
 
-cat <<EOL
+cat <<'EOL'
 steps:
   - label: "Analyze build failure :mag:"
     key: "analyze_build_failure"
     command:
-        - "python3 dev-tools/analyze_build_failure.py --pipeline \$BUILDKITE_PIPELINE_SLUG --build \$BUILDKITE_BUILD_NUMBER"
-    depends_on:
-        - "build_test_linux-aarch64-RelWithDebInfo"
-        - "build_test_linux-x86_64-RelWithDebInfo"
-        - "build_test_macos-aarch64-RelWithDebInfo"
-        - "build_test_Windows-x86_64-RelWithDebInfo"
+        - "python3 dev-tools/analyze_build_failure.py --pipeline $BUILDKITE_PIPELINE_SLUG --build $BUILDKITE_BUILD_NUMBER"
+EOL
+
+# Emit depends_on dynamically — ML_BUILD_STEP_KEYS is a comma-separated
+# list of step keys set by the pipeline generator.
+if [ -n "${ML_BUILD_STEP_KEYS:-}" ]; then
+    echo '    depends_on:'
+    IFS=',' read -ra STEP_KEYS <<< "$ML_BUILD_STEP_KEYS"
+    for key in "${STEP_KEYS[@]}"; do
+        echo "        - \"${key}\""
+    done
+fi
+
+cat <<'EOL'
     allow_dependency_failure: true
     if: "build.state == 'failed' || build.state == 'failing'"
     soft_fail: true

From 25c7ca0f5cb9faae503d395c4b5d51e771e8db1f Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 13:32:49 +1300
Subject: [PATCH 11/21] [ML] Always include build failure analysis in PR
 pipelines

The analyze_build_failure step already guards itself with
  if: "build.state == 'failed' || build.state == 'failing'"
so it is automatically skipped for passing builds.  Making it
always-on (rather than requiring a special "buildkite analyze"
comment trigger) ensures it is available whenever a build fails
without needing to be requested in advance.

Remove the run_analyze config flag and the "analyze" action from
the PR comment trigger regex since they are no longer needed.

Made-with: Cursor
---
 .buildkite/ml_pipeline/config.py | 4 +---
 .buildkite/pipeline.json.py      | 8 +++++---
 .buildkite/pull-requests.json    | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.buildkite/ml_pipeline/config.py b/.buildkite/ml_pipeline/config.py
index 502217c00..4669ce8b5 100644
--- a/.buildkite/ml_pipeline/config.py
+++ b/.buildkite/ml_pipeline/config.py
@@ -19,7 +19,6 @@ class Config:
     build_x86_64: str = ""
     run_qa_tests: bool = False
     run_pytorch_tests: bool = False
-    run_analyze: bool = False
     action: str = "build"
 
     def parse_comment(self):
@@ -38,8 +37,7 @@ def parse_comment(self):
             self.action = os.environ["GITHUB_PR_COMMENT_VAR_ACTION"]
             self.run_qa_tests = self.action == "run_qa_tests"
             self.run_pytorch_tests = self.action == "run_pytorch_tests"
-            self.run_analyze = self.action == "analyze"
-            if self.run_pytorch_tests or self.run_qa_tests or self.run_analyze:
+            if self.run_pytorch_tests or self.run_qa_tests:
                 self.action = "build"
 
         # If the ACTION is set to "run_qa_tests" then set some optional variables governing the ES branch to build, the
diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py
index 1e90d3653..e26cf8781 100755
--- a/.buildkite/pipeline.json.py
+++ b/.buildkite/pipeline.json.py
@@ -81,9 +81,11 @@ def main():
     # Check for build timing regressions against nightly baseline
     pipeline_steps.append(pipeline_steps.generate_step("Check build timing regressions",
                                                        ".buildkite/pipelines/check_build_regression.yml.sh"))
-    if config.run_analyze:
-        pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
-                                                           ".buildkite/pipelines/analyze_build_failure.yml.sh"))
+    # Analyze failures with AI — the step itself uses
+    # if: "build.state == 'failed' || build.state == 'failing'"
+    # so it is automatically skipped for passing builds.
+    pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
+                                                       ".buildkite/pipelines/analyze_build_failure.yml.sh"))
 
     pipeline["env"] = env
     pipeline["steps"] = pipeline_steps
diff --git a/.buildkite/pull-requests.json b/.buildkite/pull-requests.json
index 81248a767..dcea82794 100644
--- a/.buildkite/pull-requests.json
+++ b/.buildkite/pull-requests.json
@@ -9,7 +9,7 @@
       "commit_status_context": "ml-cpp-ci",
       "build_on_commit": true,
       "build_on_comment": true,
-      "trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests|run_pytorch_tests|analyze)(=(?<args>(?:[^ ]+)))? *(?: for ES_BRANCH=(?<branch>([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?<version>([.0-9]+)))? *(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?<arch>(?:[, ]*aarch64|x86_64)+)?$",
+      "trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests|run_pytorch_tests)(=(?<args>(?:[^ ]+)))? *(?: for ES_BRANCH=(?<branch>([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?<version>([.0-9]+)))? *(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?<arch>(?:[, ]*aarch64|x86_64)+)?$",
       "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
       "skip_ci_labels": ["skip-ci", "jenkins-ci", ">test-mute", ">docs"],
       "skip_target_branches": ["6.8", "7.11", "7.12"],

From 13b1fb23b42198bc15bf0464ce27d71e79c4760a Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 13:35:07 +1300
Subject: [PATCH 12/21] [ML] TEMPORARY: deliberate compile error for CI testing

Introduce a compile error to test the build failure analysis step.
This commit will be reverted immediately after verifying the step.

Made-with: Cursor
---
 lib/ver/CBuildInfo.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/ver/CBuildInfo.cc b/lib/ver/CBuildInfo.cc
index c1c916bdc..8090698d1 100644
--- a/lib/ver/CBuildInfo.cc
+++ b/lib/ver/CBuildInfo.cc
@@ -10,6 +10,8 @@
  */
 #include <ver/CBuildInfo.h>
 
+#error "DELIBERATE FAILURE: testing CI build failure analysis step — will be reverted"
+
 #include <core/CProgName.h>
 #include <core/CStringUtils.h>
 

From 1985f089964a5263824a489b3e5b0fbbb1b70bc9 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 13:51:35 +1300
Subject: [PATCH 13/21] [ML] Fix analyze step and revert deliberate compile
 error

Remove the Buildkite `if` condition from analyze_build_failure.yml.sh.
Buildkite evaluates `if` on dynamically uploaded steps at upload time
(not at step execution time), so the condition always saw
build.state == 'running' and the step was never created.

The Python script already checks the build state via the Buildkite
API and exits early if the build passed, so the YAML-level `if` is
unnecessary.

Also reverts the deliberate compile error in CBuildInfo.cc that was
used to test the failure analysis flow.

Made-with: Cursor
---
 .buildkite/pipelines/analyze_build_failure.yml.sh | 1 -
 lib/ver/CBuildInfo.cc                             | 2 --
 2 files changed, 3 deletions(-)

diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh
index 4e74c249c..a0c48dd67 100755
--- a/.buildkite/pipelines/analyze_build_failure.yml.sh
+++ b/.buildkite/pipelines/analyze_build_failure.yml.sh
@@ -28,7 +28,6 @@ fi
 
 cat <<'EOL'
     allow_dependency_failure: true
-    if: "build.state == 'failed' || build.state == 'failing'"
     soft_fail: true
     agents:
       image: "python:3-slim"
diff --git a/lib/ver/CBuildInfo.cc b/lib/ver/CBuildInfo.cc
index 8090698d1..c1c916bdc 100644
--- a/lib/ver/CBuildInfo.cc
+++ b/lib/ver/CBuildInfo.cc
@@ -10,8 +10,6 @@
  */
 #include <ver/CBuildInfo.h>
 
-#error "DELIBERATE FAILURE: testing CI build failure analysis step — will be reverted"
-
 #include <core/CProgName.h>
 #include <core/CStringUtils.h>
 

From 74290242ebf359e4b539c6c75ea2ae4fbe257c51 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 13:53:17 +1300
Subject: [PATCH 14/21] [ML] TEMPORARY: deliberate compile error for CI testing
 (take 2)

Made-with: Cursor
---
 lib/ver/CBuildInfo.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/ver/CBuildInfo.cc b/lib/ver/CBuildInfo.cc
index c1c916bdc..8090698d1 100644
--- a/lib/ver/CBuildInfo.cc
+++ b/lib/ver/CBuildInfo.cc
@@ -10,6 +10,8 @@
  */
 #include <ver/CBuildInfo.h>
 
+#error "DELIBERATE FAILURE: testing CI build failure analysis step — will be reverted"
+
 #include <core/CProgName.h>
 #include <core/CStringUtils.h>
 

From 8466537ebf4a397d4ac1afb16970c8c1653d0015 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 14:11:30 +1300
Subject: [PATCH 15/21] [ML] Fix analyze step Docker image and revert compile
 error

Use python:3 instead of python:3-slim for the analyze_build_failure
step. The slim image lacks curl and git, which the Buildkite agent
hooks require.

Also reverts the deliberate compile error.

Made-with: Cursor
---
 .buildkite/pipelines/analyze_build_failure.yml.sh | 2 +-
 lib/ver/CBuildInfo.cc                             | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh
index a0c48dd67..8bdce871b 100755
--- a/.buildkite/pipelines/analyze_build_failure.yml.sh
+++ b/.buildkite/pipelines/analyze_build_failure.yml.sh
@@ -30,5 +30,5 @@ cat <<'EOL'
     allow_dependency_failure: true
     soft_fail: true
     agents:
-      image: "python:3-slim"
+      image: "python:3"
 EOL
diff --git a/lib/ver/CBuildInfo.cc b/lib/ver/CBuildInfo.cc
index 8090698d1..c1c916bdc 100644
--- a/lib/ver/CBuildInfo.cc
+++ b/lib/ver/CBuildInfo.cc
@@ -10,8 +10,6 @@
  */
 #include <ver/CBuildInfo.h>
 
-#error "DELIBERATE FAILURE: testing CI build failure analysis step — will be reverted"
-
 #include <core/CProgName.h>
 #include <core/CStringUtils.h>
 

From 925c2d65e4b82a372d473160f10a81c8b2f1da25 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 14:12:35 +1300
Subject: [PATCH 16/21] [ML] TEMPORARY: deliberate compile error for CI testing
 (take 3)

Made-with: Cursor
---
 lib/ver/CBuildInfo.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/ver/CBuildInfo.cc b/lib/ver/CBuildInfo.cc
index c1c916bdc..8090698d1 100644
--- a/lib/ver/CBuildInfo.cc
+++ b/lib/ver/CBuildInfo.cc
@@ -10,6 +10,8 @@
  */
 #include <ver/CBuildInfo.h>
 
+#error "DELIBERATE FAILURE: testing CI build failure analysis step — will be reverted"
+
 #include <core/CProgName.h>
 #include <core/CStringUtils.h>
 

From 0d804154c3784f68e6b07f85aa423099e2d35133 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 14:31:27 +1300
Subject: [PATCH 17/21] [ML] Revert deliberate compile error after successful
 CI test

The "Analyze build failure" step ran successfully on Build #2385,
correctly identifying the deliberate #error as a code bug with high
confidence. Reverting to restore normal builds.

Made-with: Cursor
---
 lib/ver/CBuildInfo.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/ver/CBuildInfo.cc b/lib/ver/CBuildInfo.cc
index 8090698d1..c1c916bdc 100644
--- a/lib/ver/CBuildInfo.cc
+++ b/lib/ver/CBuildInfo.cc
@@ -10,8 +10,6 @@
  */
 #include <ver/CBuildInfo.h>
 
-#error "DELIBERATE FAILURE: testing CI build failure analysis step — will be reverted"
-
 #include <core/CProgName.h>
 #include <core/CStringUtils.h>
 

From bdea42c38b29fdc6677e7e2ddb0d29a21c1596a1 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 15:00:51 +1300
Subject: [PATCH 18/21] [ML] Make analyze step opt-in via "buildkite analyze"
 PR comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of always including the analysis step or requiring a full
rebuild, "buildkite analyze" now triggers a lightweight pipeline that
finds the most recent failed build for the branch via the Buildkite
API and analyzes it retroactively — no recompilation needed.

Also improves log extraction: instead of blindly taking the last 30K
chars (which often misses the actual error), the script now scans for
error patterns and extracts matching lines with surrounding context.

Made-with: Cursor
---
 .buildkite/ml_pipeline/config.py              |   4 +-
 .buildkite/pipeline.json.py                   |  21 ++-
 .../pipelines/analyze_build_failure.yml.sh    |  12 +-
 .buildkite/pull-requests.json                 |   2 +-
 dev-tools/analyze_build_failure.py            | 125 +++++++++++++++++-
 5 files changed, 146 insertions(+), 18 deletions(-)

diff --git a/.buildkite/ml_pipeline/config.py b/.buildkite/ml_pipeline/config.py
index 4669ce8b5..502217c00 100644
--- a/.buildkite/ml_pipeline/config.py
+++ b/.buildkite/ml_pipeline/config.py
@@ -19,6 +19,7 @@ class Config:
     build_x86_64: str = ""
     run_qa_tests: bool = False
     run_pytorch_tests: bool = False
+    run_analyze: bool = False
     action: str = "build"
 
     def parse_comment(self):
@@ -37,7 +38,8 @@ def parse_comment(self):
             self.action = os.environ["GITHUB_PR_COMMENT_VAR_ACTION"]
             self.run_qa_tests = self.action == "run_qa_tests"
             self.run_pytorch_tests = self.action == "run_pytorch_tests"
-            if self.run_pytorch_tests or self.run_qa_tests:
+            self.run_analyze = self.action == "analyze"
+            if self.run_pytorch_tests or self.run_qa_tests or self.run_analyze:
                 self.action = "build"
 
         # If the ACTION is set to "run_qa_tests" then set some optional variables governing the ES branch to build, the
diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py
index e26cf8781..87fd84f95 100755
--- a/.buildkite/pipeline.json.py
+++ b/.buildkite/pipeline.json.py
@@ -24,16 +24,28 @@
 )
 
 def main():
+    config = buildConfig.Config()
+    config.parse()
+
     pipeline = {}
     pipeline_steps = step.PipelineStep([])
+
+    # "buildkite analyze" triggers a lightweight pipeline that finds and
+    # analyzes the most recent failed build for this branch — no compilation.
+    if config.run_analyze:
+        pipeline["env"] = {"ML_ANALYZE_PREVIOUS": "true"}
+        pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
+                                                           ".buildkite/pipelines/analyze_build_failure.yml.sh"))
+        pipeline["steps"] = pipeline_steps
+        print(json.dumps(pipeline, indent=2))
+        return
+
     pipeline_steps.append(pipeline_steps.generate_step("Queue a :slack: notification for the pipeline",
                                                        ".buildkite/pipelines/send_slack_notification.sh"))
     pipeline_steps.append(pipeline_steps.generate_step("Queue a :email: notification for the pipeline",
                                                        ".buildkite/pipelines/send_email_notification.sh"))
     pipeline_steps.append(pipeline_steps.generate_step("Upload clang-format validation",
                                                        ".buildkite/pipelines/format_and_validation.yml.sh"))
-    config = buildConfig.Config()
-    config.parse()
 
     # Compute which build step keys will exist so that analytics and
     # failure-analysis steps can emit a correct depends_on list.
@@ -81,11 +93,6 @@ def main():
     # Check for build timing regressions against nightly baseline
     pipeline_steps.append(pipeline_steps.generate_step("Check build timing regressions",
                                                        ".buildkite/pipelines/check_build_regression.yml.sh"))
-    # Analyze failures with AI — the step itself uses
-    # if: "build.state == 'failed' || build.state == 'failing'"
-    # so it is automatically skipped for passing builds.
-    pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
-                                                       ".buildkite/pipelines/analyze_build_failure.yml.sh"))
 
     pipeline["env"] = env
     pipeline["steps"] = pipeline_steps
diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh
index 8bdce871b..984b0c5bd 100755
--- a/.buildkite/pipelines/analyze_build_failure.yml.sh
+++ b/.buildkite/pipelines/analyze_build_failure.yml.sh
@@ -8,16 +8,22 @@
 # compliance with the Elastic License 2.0 and the foregoing additional
 # limitation.
 
-cat <<'EOL'
+EXTRA_FLAGS=""
+if [ "${ML_ANALYZE_PREVIOUS:-}" = "true" ]; then
+    EXTRA_FLAGS=" --find-previous-failure"
+fi
+
+cat <<EOL
 steps:
   - label: "Analyze build failure :mag:"
     key: "analyze_build_failure"
     command:
-        - "python3 dev-tools/analyze_build_failure.py --pipeline $BUILDKITE_PIPELINE_SLUG --build $BUILDKITE_BUILD_NUMBER"
+        - "python3 dev-tools/analyze_build_failure.py --pipeline \$BUILDKITE_PIPELINE_SLUG --build \$BUILDKITE_BUILD_NUMBER${EXTRA_FLAGS}"
 EOL
 
 # Emit depends_on dynamically — ML_BUILD_STEP_KEYS is a comma-separated
-# list of step keys set by the pipeline generator.
+# list of step keys set by the pipeline generator.  In analyze-previous
+# mode there are no build steps so this block is skipped.
 if [ -n "${ML_BUILD_STEP_KEYS:-}" ]; then
     echo '    depends_on:'
     IFS=',' read -ra STEP_KEYS <<< "$ML_BUILD_STEP_KEYS"
diff --git a/.buildkite/pull-requests.json b/.buildkite/pull-requests.json
index dcea82794..81248a767 100644
--- a/.buildkite/pull-requests.json
+++ b/.buildkite/pull-requests.json
@@ -9,7 +9,7 @@
       "commit_status_context": "ml-cpp-ci",
       "build_on_commit": true,
       "build_on_comment": true,
-      "trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests|run_pytorch_tests)(=(?<args>(?:[^ ]+)))? *(?: for ES_BRANCH=(?<branch>([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?<version>([.0-9]+)))? *(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?<arch>(?:[, ]*aarch64|x86_64)+)?$",
+      "trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests|run_pytorch_tests|analyze)(=(?<args>(?:[^ ]+)))? *(?: for ES_BRANCH=(?<branch>([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?<version>([.0-9]+)))? *(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?<arch>(?:[, ]*aarch64|x86_64)+)?$",
       "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
       "skip_ci_labels": ["skip-ci", "jenkins-ci", ">test-mute", ">docs"],
       "skip_target_branches": ["6.8", "7.11", "7.12"],
diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py
index ba93e1d33..e097771e1 100755
--- a/dev-tools/analyze_build_failure.py
+++ b/dev-tools/analyze_build_failure.py
@@ -13,6 +13,10 @@
     # Analyze a specific build
     python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819
 
+    # Find and analyze the most recent failed build for the current branch
+    # (used by "buildkite analyze" PR comment — no rebuild needed)
+    python3 dev-tools/analyze_build_failure.py --find-previous-failure
+
     # Dry run (print to stdout, don't annotate or post to Slack/GitHub)
     python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 --dry-run
 
@@ -27,8 +31,10 @@
 import argparse
 import json
 import os
+import re
 import subprocess
 import sys
+import urllib.parse
 import urllib.request
 import urllib.error
 from pathlib import Path
@@ -98,6 +104,20 @@ def buildkite_get(path, token):
         return json.loads(resp.read())
 
 
+def find_previous_failed_build(pipeline, token, branch=None, exclude_build=None):
+    """Find the most recent failed build for a pipeline, optionally filtered by branch."""
+    params = {"state": "failed", "per_page": "5"}
+    if branch:
+        params["branch"] = branch
+    query = urllib.parse.urlencode(params)
+    builds = buildkite_get(f"pipelines/{pipeline}/builds?{query}", token)
+    for build in builds:
+        if exclude_build and build.get("number") == exclude_build:
+            continue
+        return build
+    return None
+
+
 def get_job_log(log_url, token):
     """Fetch the raw log for a Buildkite job."""
     req = urllib.request.Request(
@@ -114,11 +134,92 @@ def get_job_log(log_url, token):
         return None
 
 
-def truncate_log(log_text, max_chars=MAX_LOG_CHARS):
-    """Keep the last max_chars of the log (the end usually has the error)."""
-    if not log_text or len(log_text) <= max_chars:
+ERROR_PATTERNS = re.compile(
+    r"(?i)"
+    r"(?:^|\s)error(?:\s|:|\[|C\d)"    # "error:", "error C2338", "error[E"
+    r"|fatal error"
+    r"|^#error\b"
+    r"|FAILED"
+    r"|BOOST_ERROR"
+    r"|BOOST_FAIL"
+    r"|: fatal:"                         # linker fatal
+    r"|ninja: build stopped"
+    r"|make.*\*\*\*"                     # make: *** [target] Error
+    r"|CMake Error"
+    r"|assertion failed"
+    r"|LINK : fatal"                     # MSVC linker
+    r"|unresolved external"
+    r"|cannot find -l"                   # linker: cannot find library
+    r"|undefined reference"
+    r"|Segmentation fault"
+    r"|signal \d+"
+    r"|exit code \d+"
+    r"|Exit status: \d+(?!.*exit code 0)"
+)
+
+ANSI_ESCAPE = re.compile(r"\x1b\[[0-9;]*[A-Za-z]|\x1b\].*?\x07")
+BK_TIMESTAMP = re.compile(r"_bk;t=\d+")
+
+
+def strip_terminal_noise(log_text):
+    """Remove ANSI escapes and Buildkite timestamp markers."""
+    text = ANSI_ESCAPE.sub("", log_text)
+    return BK_TIMESTAMP.sub("", text)
+
+
+def extract_error_context(log_text, context_lines=10, max_chars=MAX_LOG_CHARS):
+    """Extract error-relevant sections from a build log.
+
+    Logs that fit within *max_chars* once terminal noise is stripped are
+    returned whole.  Otherwise, lines matching error patterns are kept
+    with surrounding context, the last 80 lines (build summary / exit
+    code) are always included, and the result is cut to ~*max_chars*.
+    """
+    if not log_text:
+        return log_text
+
+    log_text = strip_terminal_noise(log_text)
+    lines = log_text.splitlines()
+
+    if len(log_text) <= max_chars:
         return log_text
-    return f"... [truncated {len(log_text) - max_chars} chars] ...\n" + log_text[-max_chars:]
+
+    # Find line indices that match error patterns.
+    error_indices = set()
+    for i, line in enumerate(lines):
+        if ERROR_PATTERNS.search(line):
+            error_indices.add(i)
+
+    # Expand each match with context_lines before/after, merging overlaps.
+    include = set()
+    for idx in sorted(error_indices):
+        for j in range(max(0, idx - context_lines), min(len(lines), idx + context_lines + 1)):
+            include.add(j)
+
+    # Always include the last 80 lines (build summary / exit info).
+    tail_start = max(0, len(lines) - 80)
+    for j in range(tail_start, len(lines)):
+        include.add(j)
+
+    # Build the excerpt, inserting "..." markers for skipped regions.
+    sections = []
+    prev = -2
+    for i in sorted(include):
+        if i != prev + 1:
+            sections.append("... [skipped] ...")
+        sections.append(lines[i])
+        prev = i
+
+    excerpt = "\n".join(sections)
+
+    # Final safety cap — if still too long, keep the head and tail.
+    if len(excerpt) > max_chars:
+        half = max_chars // 2
+        excerpt = (excerpt[:half]
+                   + f"\n... [trimmed {len(excerpt) - max_chars} chars] ...\n"
+                   + excerpt[-half:])
+
+    return excerpt
 
 
 def call_claude(api_key, prompt):
@@ -229,6 +330,8 @@ def main():
     parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude")
     parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG"))
     parser.add_argument("--build", type=int, default=int(os.environ.get("BUILDKITE_BUILD_NUMBER", "0")))
+    parser.add_argument("--find-previous-failure", action="store_true",
+                        help="Find and analyze the most recent failed build for the current branch")
     parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating or posting to Slack")
     args = parser.parse_args()
 
@@ -247,6 +350,16 @@ def main():
         print("Error: No Anthropic API key available", file=sys.stderr)
         sys.exit(1)
 
+    if args.find_previous_failure:
+        branch = os.environ.get("BUILDKITE_BRANCH")
+        print(f"Searching for previous failed build on branch '{branch}'...")
+        prev = find_previous_failed_build(args.pipeline, bk_token, branch, args.build)
+        if not prev:
+            print(f"No previous failed build found for branch '{branch}' — nothing to analyze.")
+            sys.exit(0)
+        args.build = prev["number"]
+        print(f"Found failed build #{args.build}: {prev.get('web_url', '')}")
+
     print(f"Analyzing {args.pipeline} build #{args.build}...")
 
     build = buildkite_get(f"pipelines/{args.pipeline}/builds/{args.build}", bk_token)
@@ -284,7 +397,7 @@ def main():
             print(f"  Could not fetch log, skipping")
             continue
 
-        log_excerpt = truncate_log(log_text)
+        log_excerpt = extract_error_context(log_text)
 
         prompt = f"""Analyze this CI build failure.
 
@@ -295,7 +408,7 @@ def main():
 
 {KNOWN_FAILURE_PATTERNS}
 
-**Build log (last {MAX_LOG_CHARS} chars)**:
+**Build log (error-relevant sections extracted from full log)**:
 ```
 {log_excerpt}
 ```

From b3c7cb114965c479a34ad0f0209b1bdea4f91475 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 15:37:42 +1300
Subject: [PATCH 19/21] [ML] Improve Boost.Test failure detection in log
 extraction

Replace BOOST_ERROR/BOOST_FAIL patterns (source-code macro names that
don't appear in logs) with a pattern matching the actual Boost.Test
summary output: "*** N failure(s) detected in test suite".

Made-with: Cursor
---
 dev-tools/analyze_build_failure.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py
index e097771e1..43d6b449c 100755
--- a/dev-tools/analyze_build_failure.py
+++ b/dev-tools/analyze_build_failure.py
@@ -140,8 +140,7 @@ def get_job_log(log_url, token):
     r"|fatal error"
     r"|^#error\b"
     r"|FAILED"
-    r"|BOOST_ERROR"
-    r"|BOOST_FAIL"
+    r"|\*\*\* \d+ failure"              # Boost.Test: *** N failure(s) detected
     r"|: fatal:"                         # linker fatal
     r"|ninja: build stopped"
     r"|make.*\*\*\*"                     # make: *** [target] Error

From c9d9ef029efc125cbc8744a50cbf8aa836824399 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 15:39:57 +1300
Subject: [PATCH 20/21] =?UTF-8?q?[ML]=20TEMPORARY:=20deliberate=20test=20f?=
 =?UTF-8?q?ailure=20for=20CI=20analysis=20testing=20=E2=80=94=20will=20be?=
 =?UTF-8?q?=20reverted?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Made-with: Cursor
---
 lib/ver/unittest/CBuildInfoTest.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/ver/unittest/CBuildInfoTest.cc b/lib/ver/unittest/CBuildInfoTest.cc
index b1382a905..bb2bc337d 100644
--- a/lib/ver/unittest/CBuildInfoTest.cc
+++ b/lib/ver/unittest/CBuildInfoTest.cc
@@ -19,6 +19,10 @@
 
 BOOST_AUTO_TEST_SUITE(CBuildInfoTest)
 
+BOOST_AUTO_TEST_CASE(testDeliberateFailure) {
+    BOOST_FAIL("DELIBERATE TEST FAILURE: testing CI build failure analysis step — will be reverted");
+}
+
 BOOST_AUTO_TEST_CASE(testFullInfo) {
     std::string fullInfo(ml::ver::CBuildInfo::fullInfo());
     LOG_DEBUG(<< fullInfo);

From bfe59eb8b2f54646ce2ea21419ce9c26ef30a5aa Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 16:13:52 +1300
Subject: [PATCH 21/21] [ML] Revert deliberate test failure after successful
 analysis testing

The analysis step correctly identified the Boost.Test failure on all
platforms. Reverting to restore normal test behaviour.

Made-with: Cursor
---
 lib/ver/unittest/CBuildInfoTest.cc | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/lib/ver/unittest/CBuildInfoTest.cc b/lib/ver/unittest/CBuildInfoTest.cc
index bb2bc337d..b1382a905 100644
--- a/lib/ver/unittest/CBuildInfoTest.cc
+++ b/lib/ver/unittest/CBuildInfoTest.cc
@@ -19,10 +19,6 @@
 
 BOOST_AUTO_TEST_SUITE(CBuildInfoTest)
 
-BOOST_AUTO_TEST_CASE(testDeliberateFailure) {
-    BOOST_FAIL("DELIBERATE TEST FAILURE: testing CI build failure analysis step — will be reverted");
-}
-
 BOOST_AUTO_TEST_CASE(testFullInfo) {
     std::string fullInfo(ml::ver::CBuildInfo::fullInfo());
     LOG_DEBUG(<< fullInfo);