diff --git a/.buildkite/branch.json.py b/.buildkite/branch.json.py
index 4f5a8ac95..8c7585819 100755
--- a/.buildkite/branch.json.py
+++ b/.buildkite/branch.json.py
@@ -67,6 +67,9 @@ def main():
     # Ingest step-level timings into Elasticsearch for anomaly detection
     pipeline_steps.append(pipeline_steps.generate_step("Ingest build timings",
                                                        ".buildkite/pipelines/ingest_build_timings.yml.sh"))
+    # Analyze failures with AI if the build failed
+    pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
+                                                       ".buildkite/pipelines/analyze_build_failure.yml.sh"))
 
     # Build the DRA artifacts and upload to S3 and GCS
     pipeline_steps.append(pipeline_steps.generate_step("Create daily releasable artifacts",
diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout
index aca27b008..c3006feab 100644
--- a/.buildkite/hooks/post-checkout
+++ b/.buildkite/hooks/post-checkout
@@ -33,6 +33,12 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then
     export ES_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/elasticsearch/ci_analytics 2>/dev/null || echo "")
   fi
 
+  if [[ "$BUILDKITE_STEP_KEY" == "analyze_build_failure" ]]; then
+    export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "")
+    export ANTHROPIC_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/anthropic/claude 2>/dev/null || echo "")
+    export SLACK_WEBHOOK_URL=$(vault read -field=url secret/ci/elastic-ml-cpp/slack/build_failure_webhook 2>/dev/null || echo "")
+  fi
+
   # GCS service account — inject credentials for build and Java IT steps.
   # Build steps use it for sccache; Java IT steps use it for the Gradle
   # build cache.  The key is stored in Vault.
diff --git a/.buildkite/job-build-test-all-debug.json.py b/.buildkite/job-build-test-all-debug.json.py
index 7aa0e4a61..13abc2218 100755
--- a/.buildkite/job-build-test-all-debug.json.py
+++ b/.buildkite/job-build-test-all-debug.json.py
@@ -24,11 +24,6 @@
     config as buildConfig,
 )
 
-env = {
-  "BUILD_SNAPSHOT": "true",
-  "VERSION_QUALIFIER": ""
-}
-
 def main():
     pipeline = {}
     pipeline_steps = step.PipelineStep([])
@@ -40,6 +35,23 @@ def main():
                                                        ".buildkite/pipelines/format_and_validation.yml.sh"))
     config = buildConfig.Config()
     config.parse()
+
+    build_step_keys = []
+    if config.build_linux and config.build_aarch64:
+        build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo")
+    if config.build_linux and config.build_x86_64:
+        build_step_keys.append("build_test_linux-x86_64-RelWithDebInfo")
+    if config.build_macos and config.build_aarch64:
+        build_step_keys.append("build_test_macos-aarch64-RelWithDebInfo")
+    if config.build_windows and config.build_x86_64:
+        build_step_keys.append("build_test_Windows-x86_64-RelWithDebInfo")
+
+    env = {
+        "BUILD_SNAPSHOT": "true",
+        "VERSION_QUALIFIER": "",
+        "ML_BUILD_STEP_KEYS": ",".join(build_step_keys),
+    }
+
     if config.build_windows:
         debug_windows = pipeline_steps.generate_step_template("Windows", "debug", "", config.build_x86_64)
         pipeline_steps.append(debug_windows)
@@ -57,6 +69,9 @@ def main():
     # Ingest step-level timings into Elasticsearch for anomaly detection
     pipeline_steps.append(pipeline_steps.generate_step("Ingest build timings",
                                                        ".buildkite/pipelines/ingest_build_timings.yml.sh"))
+    # Analyze failures with AI if the build failed
+    pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
+                                                       ".buildkite/pipelines/analyze_build_failure.yml.sh"))
 
     pipeline["env"] = env
     pipeline["steps"] = pipeline_steps
diff --git a/.buildkite/ml_pipeline/config.py b/.buildkite/ml_pipeline/config.py
index 7abb4a537..acc399c2b 100644
--- a/.buildkite/ml_pipeline/config.py
+++ b/.buildkite/ml_pipeline/config.py
@@ -19,6 +19,7 @@ class Config:
     build_x86_64: str = ""
     run_qa_tests: bool = False
     run_pytorch_tests: bool = False
+    run_analyze: bool = False
     action: str = "build"
 
     def parse_comment(self):
@@ -37,7 +38,8 @@ def parse_comment(self):
             self.action = os.environ["GITHUB_PR_COMMENT_VAR_ACTION"]
             self.run_qa_tests = self.action == "run_qa_tests"
             self.run_pytorch_tests = self.action == "run_pytorch_tests"
-            if self.run_pytorch_tests or self.run_qa_tests:
+            self.run_analyze = self.action == "analyze"
+            if self.run_pytorch_tests or self.run_qa_tests or self.run_analyze:
                 self.action = "build"
 
         # If the ACTION is set to "run_qa_tests" then set some optional variables governing the ES branch to build, the
diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py
index 0ae577685..b0f002b49 100755
--- a/.buildkite/pipeline.json.py
+++ b/.buildkite/pipeline.json.py
@@ -24,20 +24,31 @@
 )
 
 def main():
+    config = buildConfig.Config()
+    config.parse()
+
     pipeline = {}
     pipeline_steps = step.PipelineStep([])
+
+    # "buildkite analyze" triggers a lightweight pipeline that finds and
+    # analyzes the most recent failed build for this branch — no compilation.
+    if config.run_analyze:
+        pipeline["env"] = {"ML_ANALYZE_PREVIOUS": "true"}
+        pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
+                                                           ".buildkite/pipelines/analyze_build_failure.yml.sh"))
+        pipeline["steps"] = pipeline_steps
+        print(json.dumps(pipeline, indent=2))
+        return
+
     pipeline_steps.append(pipeline_steps.generate_step("Queue a :slack: notification for the pipeline",
                                                        ".buildkite/pipelines/send_slack_notification.sh"))
     pipeline_steps.append(pipeline_steps.generate_step("Queue a :email: notification for the pipeline",
                                                        ".buildkite/pipelines/send_email_notification.sh"))
     pipeline_steps.append(pipeline_steps.generate_step("Upload clang-format validation",
                                                        ".buildkite/pipelines/format_and_validation.yml.sh"))
-    config = buildConfig.Config()
-    config.parse()
 
-    # Compute which build step keys will exist so that analytics steps
-    # can emit a correct depends_on list (not all platforms are built
-    # for every PR, depending on labels/comments).
+    # Compute which build step keys will exist so that analytics and
+    # failure-analysis steps can emit a correct depends_on list.
     build_step_keys = []
     if config.build_linux and config.build_aarch64:
         build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo")
diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh
new file mode 100755
index 000000000..984b0c5bd
--- /dev/null
+++ b/.buildkite/pipelines/analyze_build_failure.yml.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License
+# 2.0 and the following additional limitation. Functionality enabled by the
+# files subject to the Elastic License 2.0 may only be used in production when
+# invoked by an Elasticsearch process with a license key installed that permits
+# use of machine learning features. You may not use this file except in
+# compliance with the Elastic License 2.0 and the foregoing additional
+# limitation.
+
+# Optional extra flag for the analysis script.  ML_ANALYZE_PREVIOUS is set in
+# the pipeline env by pipeline.json.py for the "buildkite analyze" PR comment.
+EXTRA_FLAGS=""
+if [ "${ML_ANALYZE_PREVIOUS:-}" = "true" ]; then
+    EXTRA_FLAGS=" --find-previous-failure"
+fi
+
+# Unquoted heredoc: ${EXTRA_FLAGS} is expanded while this script runs, whereas
+# the backslash-escaped \$BUILDKITE_* references are emitted literally into
+# the generated step definition.
+cat <<EOL
+steps:
+  - label: "Analyze build failure :mag:"
+    key: "analyze_build_failure"
+    command:
+        - "python3 dev-tools/analyze_build_failure.py --pipeline \$BUILDKITE_PIPELINE_SLUG --build \$BUILDKITE_BUILD_NUMBER${EXTRA_FLAGS}"
+EOL
+
+# Emit depends_on dynamically — ML_BUILD_STEP_KEYS is a comma-separated
+# list of step keys set by the pipeline generator.  In analyze-previous
+# mode there are no build steps so this block is skipped.
+if [ -n "${ML_BUILD_STEP_KEYS:-}" ]; then
+    echo '    depends_on:'
+    # Split the comma-separated list into an array without touching the
+    # global IFS (the assignment only applies to this read).
+    IFS=',' read -ra STEP_KEYS <<< "$ML_BUILD_STEP_KEYS"
+    for key in "${STEP_KEYS[@]}"; do
+        echo "        - \"${key}\""
+    done
+fi
+
+# Quoted heredoc: the remaining step attributes are emitted verbatim, at the
+# same indentation as the keys printed above so they belong to the same step.
+cat <<'EOL'
+    allow_dependency_failure: true
+    soft_fail: true
+    agents:
+      image: "python:3"
+EOL
diff --git a/.buildkite/pull-requests.json b/.buildkite/pull-requests.json
index dcea82794..81248a767 100644
--- a/.buildkite/pull-requests.json
+++ b/.buildkite/pull-requests.json
@@ -9,7 +9,7 @@
       "commit_status_context": "ml-cpp-ci",
       "build_on_commit": true,
       "build_on_comment": true,
-      "trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests|run_pytorch_tests)(=(?<args>(?:[^ ]+)))? *(?: for ES_BRANCH=(?<branch>([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?<version>([.0-9]+)))? *(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?<arch>(?:[, ]*aarch64|x86_64)+)?$",
+      "trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests|run_pytorch_tests|analyze)(=(?<args>(?:[^ ]+)))? *(?: for ES_BRANCH=(?<branch>([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?<version>([.0-9]+)))? *(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?<arch>(?:[, ]*aarch64|x86_64)+)?$",
       "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
       "skip_ci_labels": ["skip-ci", "jenkins-ci", ">test-mute", ">docs"],
       "skip_target_branches": ["6.8", "7.11", "7.12"],
diff --git a/.github/workflows/post-build-analysis.yml b/.github/workflows/post-build-analysis.yml
new file mode 100644
index 000000000..e3dfdeeb4
--- /dev/null
+++ b/.github/workflows/post-build-analysis.yml
@@ -0,0 +1,133 @@
+name: Post Build Failure Analysis
+
+# Triggered by commit status updates from Buildkite. When the
+# analyze_build_failure step completes, Buildkite posts a commit status
+# which fires this workflow. We fetch the AI analysis from Buildkite
+# build metadata and post it as a PR comment using the built-in
+# GITHUB_TOKEN (no PAT or GitHub App needed).
+
+on:
+  status:
+
+# pull-requests: write is what allows the last step of the job to create or
+# edit the analysis comment on the PR through `gh api`.
+permissions:
+  pull-requests: write
+  statuses: read
+
+jobs:
+  post-analysis:
+    # Only run when the analyze step succeeds (soft_fail means Buildkite
+    # reports success even if the analysis itself had issues).
+    # NOTE(review): this filter assumes the status context published by
+    # Buildkite contains the literal step label "Analyze build failure" —
+    # confirm against a real status payload.
+    if: >-
+      github.event.state == 'success' &&
+      contains(github.event.context, 'Analyze build failure')
+    runs-on: ubuntu-latest
+    steps:
+    steps:
+      # Asks the GitHub API for the pull requests associated with the commit
+      # the status was posted on and keeps the number of the first one
+      # returned.  Any API error is swallowed (`|| true`) and treated like
+      # "no PR": the step then sets skip=true so later steps do nothing.
+      - name: Find PR for commit
+        id: find-pr
+        env:
+          GH_TOKEN: ${{ github.token }}
+          SHA: ${{ github.event.sha }}
+        run: |
+          PR_NUMBER=$(gh api "repos/${{ github.repository }}/commits/${SHA}/pulls" \
+            --jq '.[0].number // empty' 2>/dev/null || true)
+          if [ -z "$PR_NUMBER" ]; then
+            echo "No PR found for commit ${SHA} — skipping."
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "Found PR #${PR_NUMBER}"
+            echo "skip=false" >> "$GITHUB_OUTPUT"
+            echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
+          fi
+
+      # Derives the Buildkite pipeline slug and build number from the status'
+      # target_url with two sed expressions.  The slug pattern only matches
+      # URLs under the "elastic" organization; if either value cannot be
+      # extracted the step sets skip=true.
+      - name: Extract Buildkite build info
+        if: steps.find-pr.outputs.skip != 'true'
+        id: bk-info
+        env:
+          TARGET_URL: ${{ github.event.target_url }}
+        run: |
+          # target_url looks like:
+          # https://buildkite.com/elastic/ml-cpp-pr-builds/builds/2361#step-key
+          # Extract pipeline slug and build number.
+          PIPELINE=$(echo "$TARGET_URL" | sed -n 's|.*/elastic/\([^/]*\)/builds/.*|\1|p')
+          BUILD_NUM=$(echo "$TARGET_URL" | sed -n 's|.*/builds/\([0-9]*\).*|\1|p')
+          if [ -z "$PIPELINE" ] || [ -z "$BUILD_NUM" ]; then
+            echo "Could not parse Buildkite URL: $TARGET_URL"
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "Pipeline: $PIPELINE, Build: $BUILD_NUM"
+            echo "skip=false" >> "$GITHUB_OUTPUT"
+            echo "pipeline=${PIPELINE}" >> "$GITHUB_OUTPUT"
+            echo "build_num=${BUILD_NUM}" >> "$GITHUB_OUTPUT"
+          fi
+
+      # Downloads the "build-failure-analysis" value that
+      # dev-tools/analyze_build_failure.py stores with
+      # `buildkite-agent meta-data set`, and writes it to /tmp/analysis.md for
+      # the next step.  A missing token, a curl failure (swallowed by
+      # `|| true`) or an empty response all end in skip=true.
+      # NOTE(review): assumes the Buildkite REST API serves a single meta-data
+      # key at .../builds/<number>/meta-data/<key> — confirm against the API
+      # documentation; otherwise this step always skips.
+      - name: Fetch analysis from Buildkite
+        if: >-
+          steps.find-pr.outputs.skip != 'true' &&
+          steps.bk-info.outputs.skip != 'true'
+        id: fetch
+        env:
+          BK_TOKEN: ${{ secrets.BUILDKITE_API_READ_TOKEN }}
+          PIPELINE: ${{ steps.bk-info.outputs.pipeline }}
+          BUILD_NUM: ${{ steps.bk-info.outputs.build_num }}
+        run: |
+          if [ -z "$BK_TOKEN" ]; then
+            echo "BUILDKITE_API_READ_TOKEN secret not set — skipping."
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # Fetch build metadata containing the analysis.
+          ANALYSIS=$(curl -sS -f \
+            -H "Authorization: Bearer ${BK_TOKEN}" \
+            "https://api.buildkite.com/v2/organizations/elastic/pipelines/${PIPELINE}/builds/${BUILD_NUM}/meta-data/build-failure-analysis" \
+            2>/dev/null) || true
+
+          if [ -z "$ANALYSIS" ]; then
+            echo "No analysis metadata found — skipping."
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # Save to file to avoid shell quoting issues.
+          echo "$ANALYSIS" > /tmp/analysis.md
+          echo "skip=false" >> "$GITHUB_OUTPUT"
+
+      # Publishes the analysis saved in /tmp/analysis.md on the PR.  The
+      # comment body starts with a hidden HTML marker so that a later run can
+      # find the comment again and edit it instead of adding another one.
+      - name: Post or update PR comment
+        if: >-
+          steps.find-pr.outputs.skip != 'true' &&
+          steps.bk-info.outputs.skip != 'true' &&
+          steps.fetch.outputs.skip != 'true'
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUMBER: ${{ steps.find-pr.outputs.pr_number }}
+          PIPELINE: ${{ steps.bk-info.outputs.pipeline }}
+          BUILD_NUM: ${{ steps.bk-info.outputs.build_num }}
+        run: |
+          MARKER="<!-- build-failure-analysis -->"
+          BUILD_URL="https://buildkite.com/elastic/${PIPELINE}/builds/${BUILD_NUM}"
+          ANALYSIS=$(cat /tmp/analysis.md)
+
+          BODY=$(cat <<EOF
+          ${MARKER}
+          ## :mag: Build Failure Analysis
+
+          ${ANALYSIS}
+
+          ---
+          [View Buildkite build](${BUILD_URL}) | *Analysis generated by Claude. Verify before acting.*
+          EOF
+          )
+
+          # Check for an existing comment to update.  --paginate walks every
+          # page of comments: with a single page of 100 the marker comment
+          # would be missed on busy PRs and a duplicate would be posted.
+          EXISTING_ID=$(gh api --paginate "repos/${{ github.repository }}/issues/${PR_NUMBER}/comments?per_page=100" \
+            --jq ".[] | select(.body | contains(\"${MARKER}\")) | .id" 2>/dev/null | head -1)
+
+          if [ -n "$EXISTING_ID" ]; then
+            gh api "repos/${{ github.repository }}/issues/comments/${EXISTING_ID}" \
+              -X PATCH -f body="$BODY"
+            echo "Updated existing comment on PR #${PR_NUMBER}."
+          else
+            gh api "repos/${{ github.repository }}/issues/${PR_NUMBER}/comments" \
+              -f body="$BODY"
+            echo "Posted new comment on PR #${PR_NUMBER}."
+          fi
diff --git a/catalog-info.yaml b/catalog-info.yaml
index 17f8aad68..44cd2a4aa 100644
--- a/catalog-info.yaml
+++ b/catalog-info.yaml
@@ -38,6 +38,8 @@ spec:
         publish_commit_status: false
         publish_commit_status_per_step: true
         trigger_mode: code
+      env:
+        ELASTIC_PR_COMMENTS_ENABLED: 'true'
       repository: elastic/ml-cpp
       skip_intermediate_builds: true
       teams:
diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py
new file mode 100755
index 000000000..43d6b449c
--- /dev/null
+++ b/dev-tools/analyze_build_failure.py
@@ -0,0 +1,474 @@
+#!/usr/bin/env python3
+"""Analyze a Buildkite build failure using Claude and post a diagnosis.
+
+Fetches logs from failed build steps, sends them to the Anthropic Claude API
+with repository context, and posts the analysis as a Buildkite annotation,
+Buildkite build metadata (for the GitHub Actions PR comment workflow),
+and optionally to Slack.
+
+Usage:
+    # Analyze the current build (in CI)
+    python3 dev-tools/analyze_build_failure.py
+
+    # Analyze a specific build
+    python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819
+
+    # Find and analyze the most recent failed build for the current branch
+    # (used by "buildkite analyze" PR comment — no rebuild needed)
+    python3 dev-tools/analyze_build_failure.py --find-previous-failure
+
+    # Dry run (print to stdout, don't annotate or post to Slack/GitHub)
+    python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 --dry-run
+
+Environment:
+    BUILDKITE_TOKEN / BUILDKITE_API_READ_TOKEN   Buildkite API token
+    ANTHROPIC_API_KEY                             Claude API key
+    SLACK_WEBHOOK_URL                             Slack incoming webhook (optional)
+    BUILDKITE_PIPELINE_SLUG                       Current pipeline (set by Buildkite)
+    BUILDKITE_BUILD_NUMBER                        Current build number (set by Buildkite)
+"""
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+import urllib.parse
+import urllib.request
+import urllib.error
+from pathlib import Path
+
+# Buildkite organization slug; used to build REST API URLs in buildkite_get()
+# and the fallback build link in main().
+BUILDKITE_ORG = "elastic"
+# Endpoint and model identifier used for every request made by call_claude().
+ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
+ANTHROPIC_MODEL = "claude-sonnet-4-20250514"
+# Default character budget of the log excerpt built by extract_error_context().
+MAX_LOG_CHARS = 30000
+# Sent as "max_tokens" in the Claude request body.
+MAX_RESPONSE_TOKENS = 2048
+
+# Free-text catalogue of failures seen before.  main() interpolates it
+# verbatim into the per-step prompt sent to Claude, so edits here change the
+# model input directly.
+KNOWN_FAILURE_PATTERNS = """
+Known transient/infrastructure failures:
+- "Unable to download toolchain" / Adoptium JDK download failure: transient, retry usually fixes it
+- "Exceeded maximum artifact size limit of 10 GiB": artifact_paths glob is too broad
+- "sccache: error: couldn't connect to server": sccache server not running, check setup_sccache.sh
+- CKMostCorrelatedTest/testScale timeout: CPU contention on low-core machines, check test parallelism
+- CMultiFileDataAdderTest collision: test isolation bug with temp file naming
+
+Known compilation patterns:
+- "redefinition of" in unity builds: file needs SKIP_UNITY_BUILD_INCLUSION or unity disabled for library
+- boost/unordered_map.hpp conflicts: remove from PCH list
+- "mspdbsrv.exe" errors on Windows: switch from /Zi to /Z7
+"""
+
+# System prompt sent as the "system" field of every Claude request.
+# post_to_slack() looks for the "### Root Cause" and "### Classification"
+# headings requested here and maps the classification values listed below to
+# emoji, so keep the response format and that function in sync.
+SYSTEM_PROMPT = """You are a CI build failure analyst for the elastic/ml-cpp repository.
+This is a C++ codebase that builds on Linux (x86_64, aarch64), macOS (aarch64), and Windows (x86_64).
+Build system: CMake with Boost, uses Docker for Linux builds, Gradle for macOS/Windows, Buildkite for CI.
+
+Your job is to:
+1. Identify the root cause of the failure from the build log
+2. Classify it as: code bug, test failure, infrastructure/transient, configuration issue, or dependency issue
+3. Suggest a specific fix or workaround
+4. If it's transient, say so clearly — don't over-diagnose
+
+Be concise and actionable. Use markdown formatting.
+Format your response as:
+
+### Root Cause
+<1-2 sentences>
+
+### Classification
+<one of: code bug | test failure | infrastructure/transient | configuration | dependency>
+
+### Suggested Fix
+<specific actionable steps>
+
+### Confidence
+<high | medium | low> — <brief justification>
+"""
+
+
+def get_env_or_file(env_var, file_path):
+    """Return a setting from the environment, falling back to a file.
+
+    Args:
+        env_var: Name of the environment variable to try first.
+        file_path: Path of a fallback file ("~" is expanded); a falsy value
+            disables the fallback.
+
+    Returns:
+        The whitespace-stripped value, or None when neither source yields a
+        non-empty value.
+    """
+    value = os.environ.get(env_var, "").strip()
+    if value:
+        return value
+    if not file_path:
+        return None
+    candidate = Path(file_path).expanduser()
+    # is_file() rather than exists(): a directory at that path would make
+    # read_text() raise instead of being treated as "no value".
+    if not candidate.is_file():
+        return None
+    # An empty file is "no value" too, so callers always see None or text.
+    return candidate.read_text().strip() or None
+
+
+def buildkite_get(path, token, timeout=60):
+    """GET a Buildkite REST API resource of the organization and decode its JSON.
+
+    Args:
+        path: Resource path below ``/v2/organizations/<BUILDKITE_ORG>/``,
+            including any query string.
+        token: Buildkite API token, sent as a bearer token.
+        timeout: Seconds to wait on the connection before giving up.
+
+    Returns:
+        The decoded JSON document.
+
+    Raises:
+        urllib.error.URLError: on HTTP or connection errors (not handled here).
+    """
+    url = f"https://api.buildkite.com/v2/organizations/{BUILDKITE_ORG}/{path}"
+    req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
+    # Without a timeout a stalled connection would block the CI step
+    # indefinitely; the other HTTP calls in this file are bounded as well.
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read())
+
+
+def find_previous_failed_build(pipeline, token, branch=None, exclude_build=None):
+    """Look up a failed build of *pipeline* to analyze.
+
+    Requests up to five builds in state "failed" (restricted to *branch*
+    when given) and returns the first one in the response whose number
+    differs from *exclude_build*, or None if no such build is listed.
+    Presumably Buildkite lists builds newest first, which makes the result
+    the most recent failure — verify against the API documentation.
+    """
+    filters = {"state": "failed", "per_page": "5"}
+    if branch:
+        filters["branch"] = branch
+    listing = buildkite_get(
+        f"pipelines/{pipeline}/builds?{urllib.parse.urlencode(filters)}", token)
+    # A falsy exclude_build (None or 0) disables the exclusion entirely.
+    candidates = (
+        entry for entry in listing
+        if not (exclude_build and entry.get("number") == exclude_build)
+    )
+    return next(candidates, None)
+
+
+def get_job_log(log_url, token, timeout=60):
+    """Fetch the raw log for a Buildkite job.
+
+    Args:
+        log_url: URL of the job's raw log.
+        token: Buildkite API token, sent as a bearer token.
+        timeout: Seconds to wait on the connection before giving up.
+
+    Returns:
+        The log decoded as UTF-8 (undecodable bytes are replaced), or None
+        when the log cannot be retrieved.
+    """
+    req = urllib.request.Request(
+        log_url,
+        headers={
+            "Authorization": f"Bearer {token}",
+            "Accept": "text/plain",
+        },
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return resp.read().decode("utf-8", errors="replace")
+    except OSError as err:
+        # OSError covers HTTPError, URLError (DNS/connection failures) and
+        # socket timeouts: one unreachable log must only skip that step, not
+        # abort the analysis of the remaining ones.
+        print(f"  Log fetch failed: {err}", file=sys.stderr)
+        return None
+
+
+# Case-insensitive pattern (global "(?i)" flag) that marks a log line as
+# error-relevant.  extract_error_context() applies it with search() to one
+# line at a time, so the "^" anchors refer to the start of a single log line.
+# NOTE(review): because of "(?i)" the "FAILED" alternative also matches
+# lower-case "failed", and "exit code \d+" also matches "exit code 0" —
+# confirm both are intended.
+ERROR_PATTERNS = re.compile(
+    r"(?i)"
+    r"(?:^|\s)error(?:\s|:|\[|C\d)"    # "error:", "error C2338", "error[E"
+    r"|fatal error"
+    r"|^#error\b"
+    r"|FAILED"
+    r"|\*\*\* \d+ failure"              # Boost.Test: *** N failure(s) detected
+    r"|: fatal:"                         # linker fatal
+    r"|ninja: build stopped"
+    r"|make.*\*\*\*"                     # make: *** [target] Error
+    r"|CMake Error"
+    r"|assertion failed"
+    r"|LINK : fatal"                     # MSVC linker
+    r"|unresolved external"
+    r"|cannot find -l"                   # linker: cannot find library
+    r"|undefined reference"
+    r"|Segmentation fault"
+    r"|signal \d+"
+    r"|exit code \d+"
+    r"|Exit status: \d+(?!.*exit code 0)"
+)
+
+# CSI sequences (colours, cursor movement) and BEL-terminated OSC sequences.
+ANSI_ESCAPE = re.compile(r"\x1b\[[0-9;]*[A-Za-z]|\x1b\].*?\x07")
+# Buildkite timestamp marker.  In raw logs the marker is framed by ESC and
+# BEL (ESC "_bk;t=<millis>" BEL); both are matched optionally so that the
+# control bytes are removed along with the marker while bare "_bk;t=<millis>"
+# occurrences are still stripped.
+BK_TIMESTAMP = re.compile(r"\x1b?_bk;t=\d+\x07?")
+
+
+def strip_terminal_noise(log_text):
+    """Remove ANSI escapes and Buildkite timestamp markers.
+
+    Args:
+        log_text: Raw log text.
+
+    Returns:
+        The text without CSI/OSC escape sequences and without Buildkite
+        timestamp markers, including the ESC/BEL bytes framing the latter.
+    """
+    text = ANSI_ESCAPE.sub("", log_text)
+    return BK_TIMESTAMP.sub("", text)
+
+
+def extract_error_context(log_text, context_lines=10, max_chars=MAX_LOG_CHARS):
+    """Condense a build log to the parts most likely to explain a failure.
+
+    Terminal noise is stripped first; a log that then fits within
+    *max_chars* is returned whole.  Otherwise every line matching
+    ERROR_PATTERNS is kept together with *context_lines* lines on either
+    side, plus the final 80 lines of the log, and a "... [skipped] ..."
+    marker precedes each run of kept lines.  If that excerpt is still
+    longer than *max_chars*, only its first and last ``max_chars // 2``
+    characters are kept, joined by a "trimmed" marker.
+
+    A falsy *log_text* (None or "") is returned unchanged.
+    """
+    if not log_text:
+        return log_text
+
+    cleaned = strip_terminal_noise(log_text)
+    if len(cleaned) <= max_chars:
+        return cleaned
+
+    rows = cleaned.splitlines()
+    total = len(rows)
+
+    # Line numbers to keep: the tail of the log (build summary / exit info)
+    # plus a window around every error-looking line.  A set merges overlaps.
+    keep = set(range(max(0, total - 80), total))
+    for hit, row in enumerate(rows):
+        if ERROR_PATTERNS.search(row):
+            keep.update(range(max(0, hit - context_lines),
+                              min(total, hit + context_lines + 1)))
+
+    # Stitch the kept lines together; a marker goes in front of every run,
+    # including the first one.
+    pieces = []
+    expected = -1
+    for number in sorted(keep):
+        if number != expected:
+            pieces.append("... [skipped] ...")
+        pieces.append(rows[number])
+        expected = number + 1
+    excerpt = "\n".join(pieces)
+
+    # Final safety cap: keep the head and the tail of the excerpt.
+    overflow = len(excerpt) - max_chars
+    if overflow <= 0:
+        return excerpt
+    half = max_chars // 2
+    return "".join((
+        excerpt[:half],
+        f"\n... [trimmed {overflow} chars] ...\n",
+        excerpt[-half:],
+    ))
+
+
+def call_claude(api_key, prompt):
+    """Send *prompt* to the Anthropic Messages API and return the reply text.
+
+    The request uses ANTHROPIC_MODEL, MAX_RESPONSE_TOKENS and SYSTEM_PROMPT
+    and waits at most 60 seconds.  The text of the first content block of
+    type "text" is returned; if the response has none, the placeholder
+    "No analysis generated." is returned instead.  HTTP and connection
+    errors propagate to the caller.
+    """
+    payload = {
+        "model": ANTHROPIC_MODEL,
+        "max_tokens": MAX_RESPONSE_TOKENS,
+        "system": SYSTEM_PROMPT,
+        "messages": [{"role": "user", "content": prompt}],
+    }
+    headers = {
+        "x-api-key": api_key,
+        "anthropic-version": "2023-06-01",
+        "Content-Type": "application/json",
+    }
+    request = urllib.request.Request(
+        ANTHROPIC_API_URL,
+        data=json.dumps(payload).encode("utf-8"),
+        headers=headers,
+    )
+    with urllib.request.urlopen(request, timeout=60) as resp:
+        reply = json.loads(resp.read())
+
+    texts = (
+        part["text"] for part in reply.get("content", [])
+        if part.get("type") == "text"
+    )
+    return next(texts, "No analysis generated.")
+
+
+# Emoji shown in Slack for each classification value requested in SYSTEM_PROMPT.
+_CLASSIFICATION_EMOJI = {
+    "infrastructure/transient": ":cloud:",
+    "code bug": ":bug:",
+    "test failure": ":test_tube:",
+    "configuration": ":gear:",
+    "dependency": ":package:",
+}
+
+
+def _first_line_under(heading, lines):
+    """Return the first non-blank line after the line starting with *heading*.
+
+    Returns "" when the heading is absent or when the next non-blank line is
+    itself a markdown heading (i.e. the section is empty).
+    """
+    for pos, line in enumerate(lines):
+        if not line.startswith(heading):
+            continue
+        for follower in lines[pos + 1:]:
+            text = follower.strip()
+            if text:
+                return "" if text.startswith("#") else text
+        return ""
+    return ""
+
+
+def post_to_slack(webhook_url, pipeline, build_number, branch, build_url, analyses):
+    """Post a summary of the failure analysis to Slack.
+
+    Args:
+        webhook_url: Slack incoming-webhook URL.
+        pipeline: Pipeline slug shown in the message header.
+        build_number: Build number shown (and linked) in the header.
+        branch: Branch name shown in the header.
+        build_url: Link target for the build.
+        analyses: Iterable of ``(step_label, analysis_markdown)`` pairs.
+
+    Posting is best effort: failures are reported on stderr and never raised.
+    """
+    # Slack uses mrkdwn, not full markdown — convert minimally
+    blocks = [
+        {
+            "type": "header",
+            "text": {
+                "type": "plain_text",
+                "text": "Build Failure Analysis",
+            },
+        },
+        {
+            "type": "section",
+            "text": {
+                "type": "mrkdwn",
+                "text": (
+                    f"*Pipeline:* `{pipeline}` | *Build:* <{build_url}|#{build_number}> | *Branch:* `{branch}`"
+                ),
+            },
+        },
+    ]
+
+    for step_label, analysis in analyses:
+        # Extract just the classification and root cause for a compact Slack
+        # message.  Blank lines between a heading and its text are skipped,
+        # since markdown output commonly contains them.
+        lines = analysis.split("\n")
+        root_cause = _first_line_under("### Root Cause", lines)
+        classification = _first_line_under("### Classification", lines)
+
+        if not root_cause:
+            # No recognizable structure (for example the "Failed to get
+            # analysis" message): show the first non-blank line rather than
+            # an empty quote.
+            root_cause = next((ln.strip() for ln in lines if ln.strip()), "")
+
+        # Match case-insensitively, ignoring markdown emphasis and anything
+        # the model appended after the classification keyword.
+        wanted = classification.strip(" *_`").lower()
+        emoji = next(
+            (icon for label, icon in _CLASSIFICATION_EMOJI.items() if wanted.startswith(label)),
+            ":warning:",
+        )
+
+        blocks.append({"type": "divider"})
+        blocks.append({
+            "type": "section",
+            "text": {
+                "type": "mrkdwn",
+                "text": f"{emoji} *{step_label}*\n>{root_cause}\n_Classification: {classification or 'unknown'}_",
+            },
+        })
+
+    blocks.append({"type": "divider"})
+    blocks.append({
+        "type": "context",
+        "elements": [
+            {
+                "type": "mrkdwn",
+                "text": f"<{build_url}|View build> | Analysis by Claude — verify before acting",
+            }
+        ],
+    })
+
+    payload = json.dumps({"blocks": blocks}).encode("utf-8")
+    req = urllib.request.Request(
+        webhook_url,
+        data=payload,
+        headers={"Content-Type": "application/json"},
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            if resp.status == 200:
+                print("Slack notification posted.")
+            else:
+                print(f"Slack returned status {resp.status}", file=sys.stderr)
+    except Exception as e:
+        # Deliberately broad: the Slack message is optional and must never
+        # fail the analysis step.
+        print(f"Could not post to Slack: {e}", file=sys.stderr)
+
+
+
+def main():
+    """Command-line entry point: diagnose the failed steps of one Buildkite build.
+
+    Resolves the target build (the one given by --pipeline/--build, or the
+    one returned by find_previous_failed_build() when
+    --find-previous-failure is set), asks Claude for an analysis of every
+    failed script job whose log can be fetched, and prints the results.
+    Unless --dry-run is given the combined analysis is also published as a
+    Buildkite annotation, stored as the "build-failure-analysis" build
+    meta-data value and, when SLACK_WEBHOOK_URL is set, posted to Slack.
+
+    Exits with status 1 when arguments or credentials are missing and with
+    status 0 when there is nothing to analyze.
+    """
+    parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude")
+    parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG"))
+    # "or" instead of a .get() default: a variable that is exported but empty
+    # must not reach int() as "".
+    parser.add_argument("--build", type=int, default=int(os.environ.get("BUILDKITE_BUILD_NUMBER") or "0"))
+    parser.add_argument("--find-previous-failure", action="store_true",
+                        help="Find and analyze the most recent failed build for the current branch")
+    parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating or posting to Slack")
+    args = parser.parse_args()
+
+    # --find-previous-failure looks the build number up itself, so --build is
+    # only mandatory when a specific build is analyzed.
+    if not args.pipeline or not (args.build or args.find_previous_failure):
+        required = "--pipeline" if args.find_previous_failure else "--pipeline and --build"
+        print(f"Error: {required} required", file=sys.stderr)
+        sys.exit(1)
+
+    bk_token = (get_env_or_file("BUILDKITE_TOKEN", "~/.buildkite/token")
+                or get_env_or_file("BUILDKITE_API_READ_TOKEN", ""))
+    claude_key = get_env_or_file("ANTHROPIC_API_KEY", "~/.elastic/claude_api_key")
+
+    if not bk_token:
+        print("Error: No Buildkite token available", file=sys.stderr)
+        sys.exit(1)
+    if not claude_key:
+        print("Error: No Anthropic API key available", file=sys.stderr)
+        sys.exit(1)
+
+    if args.find_previous_failure:
+        branch = os.environ.get("BUILDKITE_BRANCH")
+        print(f"Searching for previous failed build on branch '{branch}'...")
+        # The current build (if any) is excluded so the search cannot return
+        # the build this script is running in.
+        prev = find_previous_failed_build(args.pipeline, bk_token, branch, args.build)
+        if not prev:
+            print(f"No previous failed build found for branch '{branch}' — nothing to analyze.")
+            sys.exit(0)
+        args.build = prev["number"]
+        print(f"Found failed build #{args.build}: {prev.get('web_url', '')}")
+
+    print(f"Analyzing {args.pipeline} build #{args.build}...")
+
+    build = buildkite_get(f"pipelines/{args.pipeline}/builds/{args.build}", bk_token)
+
+    if build.get("state") == "passed":
+        print("Build passed — nothing to analyze.")
+        sys.exit(0)
+
+    failed_jobs = [
+        j for j in build.get("jobs", [])
+        if j.get("type") == "script" and j.get("state") == "failed"
+    ]
+
+    if not failed_jobs:
+        print("No failed steps found.")
+        sys.exit(0)
+
+    print(f"Found {len(failed_jobs)} failed step(s)")
+
+    slack_webhook = get_env_or_file("SLACK_WEBHOOK_URL", "")
+    build_url = build.get("web_url", f"https://buildkite.com/{BUILDKITE_ORG}/{args.pipeline}/builds/{args.build}")
+
+    all_analyses = []
+    slack_analyses = []
+
+    for job in failed_jobs:
+        # "or" also covers keys that are present with a null value, which
+        # would otherwise be printed as "None".
+        step_key = job.get("step_key") or "unknown"
+        step_label = job.get("name") or step_key
+        raw_log_url = job.get("raw_log_url", "")
+
+        print(f"\nAnalyzing: {step_label} ({step_key})")
+
+        log_text = get_job_log(raw_log_url, bk_token) if raw_log_url else None
+        if not log_text:
+            print("  Could not fetch log, skipping")
+            continue
+
+        log_excerpt = extract_error_context(log_text)
+
+        prompt = f"""Analyze this CI build failure.
+
+**Pipeline**: {args.pipeline}
+**Build**: #{args.build}
+**Branch**: {build.get('branch', 'unknown')}
+**Failed step**: {step_label} (key: {step_key})
+
+{KNOWN_FAILURE_PATTERNS}
+
+**Build log (error-relevant sections extracted from full log)**:
+```
+{log_excerpt}
+```
+
+Analyze the root cause and suggest a fix."""
+
+        try:
+            analysis = call_claude(claude_key, prompt)
+        except Exception as e:
+            # Keep going: a failed API call for one step is reported in place
+            # of its analysis so the other steps are still covered.
+            analysis = f"Failed to get analysis: {e}"
+
+        print(f"\n{analysis}")
+        all_analyses.append(f"## {step_label}\n\n{analysis}")
+        slack_analyses.append((step_label, analysis))
+
+    if not all_analyses:
+        print("No analyses generated.")
+        sys.exit(0)
+
+    full_annotation = "# 🔍 Build Failure Analysis\n\n"
+    full_annotation += f"*Pipeline*: `{args.pipeline}` | *Build*: #{args.build} | *Branch*: `{build.get('branch', '?')}`\n\n"
+    full_annotation += "\n\n---\n\n".join(all_analyses)
+    full_annotation += "\n\n---\n*Analysis generated by Claude. Verify before acting.*"
+
+    if not args.dry_run:
+        try:
+            subprocess.run(
+                ["buildkite-agent", "annotate",
+                 "--style", "error",
+                 "--context", "build-failure-analysis"],
+                input=full_annotation.encode(),
+                check=True,
+            )
+            print("\nAnnotation posted to Buildkite.")
+        except (FileNotFoundError, subprocess.CalledProcessError) as e:
+            print(f"\nCould not post annotation: {e}", file=sys.stderr)
+            print("Full analysis printed above.")
+
+        # Store analysis as build metadata so that the GitHub Actions
+        # workflow (post-build-analysis.yml) can fetch it and post a
+        # PR comment using the built-in GITHUB_TOKEN.
+        annotation_body = "\n\n---\n\n".join(all_analyses)
+        try:
+            subprocess.run(
+                ["buildkite-agent", "meta-data", "set",
+                 "build-failure-analysis"],
+                input=annotation_body.encode(),
+                check=True,
+            )
+            print("Analysis saved as build metadata.")
+        except (FileNotFoundError, subprocess.CalledProcessError) as e:
+            print(f"Could not save build metadata: {e}", file=sys.stderr)
+
+        if slack_webhook:
+            post_to_slack(
+                slack_webhook, args.pipeline, args.build,
+                build.get("branch", "?"), build_url, slack_analyses,
+            )
+        else:
+            print("No SLACK_WEBHOOK_URL set, skipping Slack notification.")
+
+
+if __name__ == "__main__":
+    main()