diff --git a/.buildkite/branch.json.py b/.buildkite/branch.json.py index 4f5a8ac95..8c7585819 100755 --- a/.buildkite/branch.json.py +++ b/.buildkite/branch.json.py @@ -67,6 +67,9 @@ def main(): # Ingest step-level timings into Elasticsearch for anomaly detection pipeline_steps.append(pipeline_steps.generate_step("Ingest build timings", ".buildkite/pipelines/ingest_build_timings.yml.sh")) + # Analyze failures with AI if the build failed + pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure", + ".buildkite/pipelines/analyze_build_failure.yml.sh")) # Build the DRA artifacts and upload to S3 and GCS pipeline_steps.append(pipeline_steps.generate_step("Create daily releasable artifacts", diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout index aca27b008..c3006feab 100644 --- a/.buildkite/hooks/post-checkout +++ b/.buildkite/hooks/post-checkout @@ -33,6 +33,12 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then export ES_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/elasticsearch/ci_analytics 2>/dev/null || echo "") fi + if [[ "$BUILDKITE_STEP_KEY" == "analyze_build_failure" ]]; then + export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "") + export ANTHROPIC_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/anthropic/claude 2>/dev/null || echo "") + export SLACK_WEBHOOK_URL=$(vault read -field=url secret/ci/elastic-ml-cpp/slack/build_failure_webhook 2>/dev/null || echo "") + fi + # GCS service account — inject credentials for build and Java IT steps. # Build steps use it for sccache; Java IT steps use it for the Gradle # build cache. The key is stored in Vault. 
diff --git a/.buildkite/job-build-test-all-debug.json.py b/.buildkite/job-build-test-all-debug.json.py index 7aa0e4a61..13abc2218 100755 --- a/.buildkite/job-build-test-all-debug.json.py +++ b/.buildkite/job-build-test-all-debug.json.py @@ -24,11 +24,6 @@ config as buildConfig, ) -env = { - "BUILD_SNAPSHOT": "true", - "VERSION_QUALIFIER": "" -} - def main(): pipeline = {} pipeline_steps = step.PipelineStep([]) @@ -40,6 +35,23 @@ def main(): ".buildkite/pipelines/format_and_validation.yml.sh")) config = buildConfig.Config() config.parse() + + build_step_keys = [] + if config.build_linux and config.build_aarch64: + build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo") + if config.build_linux and config.build_x86_64: + build_step_keys.append("build_test_linux-x86_64-RelWithDebInfo") + if config.build_macos and config.build_aarch64: + build_step_keys.append("build_test_macos-aarch64-RelWithDebInfo") + if config.build_windows and config.build_x86_64: + build_step_keys.append("build_test_Windows-x86_64-RelWithDebInfo") + + env = { + "BUILD_SNAPSHOT": "true", + "VERSION_QUALIFIER": "", + "ML_BUILD_STEP_KEYS": ",".join(build_step_keys), + } + if config.build_windows: debug_windows = pipeline_steps.generate_step_template("Windows", "debug", "", config.build_x86_64) pipeline_steps.append(debug_windows) @@ -57,6 +69,9 @@ def main(): # Ingest step-level timings into Elasticsearch for anomaly detection pipeline_steps.append(pipeline_steps.generate_step("Ingest build timings", ".buildkite/pipelines/ingest_build_timings.yml.sh")) + # Analyze failures with AI if the build failed + pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure", + ".buildkite/pipelines/analyze_build_failure.yml.sh")) pipeline["env"] = env pipeline["steps"] = pipeline_steps diff --git a/.buildkite/ml_pipeline/config.py b/.buildkite/ml_pipeline/config.py index 7abb4a537..acc399c2b 100644 --- a/.buildkite/ml_pipeline/config.py +++ b/.buildkite/ml_pipeline/config.py @@ -19,6 
+19,7 @@ class Config: build_x86_64: str = "" run_qa_tests: bool = False run_pytorch_tests: bool = False + run_analyze: bool = False action: str = "build" def parse_comment(self): @@ -37,7 +38,8 @@ def parse_comment(self): self.action = os.environ["GITHUB_PR_COMMENT_VAR_ACTION"] self.run_qa_tests = self.action == "run_qa_tests" self.run_pytorch_tests = self.action == "run_pytorch_tests" - if self.run_pytorch_tests or self.run_qa_tests: + self.run_analyze = self.action == "analyze" + if self.run_pytorch_tests or self.run_qa_tests or self.run_analyze: self.action = "build" # If the ACTION is set to "run_qa_tests" then set some optional variables governing the ES branch to build, the diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py index 0ae577685..b0f002b49 100755 --- a/.buildkite/pipeline.json.py +++ b/.buildkite/pipeline.json.py @@ -24,20 +24,31 @@ ) def main(): + config = buildConfig.Config() + config.parse() + pipeline = {} pipeline_steps = step.PipelineStep([]) + + # "buildkite analyze" triggers a lightweight pipeline that finds and + # analyzes the most recent failed build for this branch — no compilation. 
+ if config.run_analyze: + pipeline["env"] = {"ML_ANALYZE_PREVIOUS": "true"} + pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure", + ".buildkite/pipelines/analyze_build_failure.yml.sh")) + pipeline["steps"] = pipeline_steps + print(json.dumps(pipeline, indent=2)) + return + pipeline_steps.append(pipeline_steps.generate_step("Queue a :slack: notification for the pipeline", ".buildkite/pipelines/send_slack_notification.sh")) pipeline_steps.append(pipeline_steps.generate_step("Queue a :email: notification for the pipeline", ".buildkite/pipelines/send_email_notification.sh")) pipeline_steps.append(pipeline_steps.generate_step("Upload clang-format validation", ".buildkite/pipelines/format_and_validation.yml.sh")) - config = buildConfig.Config() - config.parse() - # Compute which build step keys will exist so that analytics steps - # can emit a correct depends_on list (not all platforms are built - # for every PR, depending on labels/comments). + # Compute which build step keys will exist so that analytics and + # failure-analysis steps can emit a correct depends_on list. build_step_keys = [] if config.build_linux and config.build_aarch64: build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo") diff --git a/.buildkite/pipelines/analyze_build_failure.yml.sh b/.buildkite/pipelines/analyze_build_failure.yml.sh new file mode 100755 index 000000000..984b0c5bd --- /dev/null +++ b/.buildkite/pipelines/analyze_build_failure.yml.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License +# 2.0 and the following additional limitation. Functionality enabled by the +# files subject to the Elastic License 2.0 may only be used in production when +# invoked by an Elasticsearch process with a license key installed that permits +# use of machine learning features. 
You may not use this file except in +# compliance with the Elastic License 2.0 and the foregoing additional +# limitation. + +EXTRA_FLAGS="" +if [ "${ML_ANALYZE_PREVIOUS:-}" = "true" ]; then + EXTRA_FLAGS=" --find-previous-failure" +fi + +cat <build|debug|run_qa_tests|run_pytorch_tests)(=(?(?:[^ ]+)))? *(?: for ES_BRANCH=(?([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?([.0-9]+)))? *(?: *on *(?(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?(?:[, ]*aarch64|x86_64)+)?$", + "trigger_comment_regex": "^(?:(?:buildkite +)(?build|debug|run_qa_tests|run_pytorch_tests|analyze)(=(?(?:[^ ]+)))? *(?: for ES_BRANCH=(?([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?([.0-9]+)))? *(?: *on *(?(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?(?:[, ]*aarch64|x86_64)+)?$", "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))", "skip_ci_labels": ["skip-ci", "jenkins-ci", ">test-mute", ">docs"], "skip_target_branches": ["6.8", "7.11", "7.12"], diff --git a/.github/workflows/post-build-analysis.yml b/.github/workflows/post-build-analysis.yml new file mode 100644 index 000000000..e3dfdeeb4 --- /dev/null +++ b/.github/workflows/post-build-analysis.yml @@ -0,0 +1,133 @@ +name: Post Build Failure Analysis + +# Triggered by commit status updates from Buildkite. When the +# analyze_build_failure step completes, Buildkite posts a commit status +# which fires this workflow. We fetch the AI analysis from Buildkite +# build metadata and post it as a PR comment using the built-in +# GITHUB_TOKEN (no PAT or GitHub App needed). + +on: + status: + +permissions: + pull-requests: write + statuses: read + +jobs: + post-analysis: + # Only run when the analyze step succeeds (soft_fail means Buildkite + # reports success even if the analysis itself had issues). 
+ if: >- + github.event.state == 'success' && + contains(github.event.context, 'Analyze build failure') + runs-on: ubuntu-latest + steps: + - name: Find PR for commit + id: find-pr + env: + GH_TOKEN: ${{ github.token }} + SHA: ${{ github.event.sha }} + run: | + PR_NUMBER=$(gh api "repos/${{ github.repository }}/commits/${SHA}/pulls" \ + --jq '.[0].number // empty' 2>/dev/null || true) + if [ -z "$PR_NUMBER" ]; then + echo "No PR found for commit ${SHA} — skipping." + echo "skip=true" >> "$GITHUB_OUTPUT" + else + echo "Found PR #${PR_NUMBER}" + echo "skip=false" >> "$GITHUB_OUTPUT" + echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT" + fi + + - name: Extract Buildkite build info + if: steps.find-pr.outputs.skip != 'true' + id: bk-info + env: + TARGET_URL: ${{ github.event.target_url }} + run: | + # target_url looks like: + # https://buildkite.com/elastic/ml-cpp-pr-builds/builds/2361#step-key + # Extract pipeline slug and build number. + PIPELINE=$(echo "$TARGET_URL" | sed -n 's|.*/elastic/\([^/]*\)/builds/.*|\1|p') + BUILD_NUM=$(echo "$TARGET_URL" | sed -n 's|.*/builds/\([0-9]*\).*|\1|p') + if [ -z "$PIPELINE" ] || [ -z "$BUILD_NUM" ]; then + echo "Could not parse Buildkite URL: $TARGET_URL" + echo "skip=true" >> "$GITHUB_OUTPUT" + else + echo "Pipeline: $PIPELINE, Build: $BUILD_NUM" + echo "skip=false" >> "$GITHUB_OUTPUT" + echo "pipeline=${PIPELINE}" >> "$GITHUB_OUTPUT" + echo "build_num=${BUILD_NUM}" >> "$GITHUB_OUTPUT" + fi + + - name: Fetch analysis from Buildkite + if: >- + steps.find-pr.outputs.skip != 'true' && + steps.bk-info.outputs.skip != 'true' + id: fetch + env: + BK_TOKEN: ${{ secrets.BUILDKITE_API_READ_TOKEN }} + PIPELINE: ${{ steps.bk-info.outputs.pipeline }} + BUILD_NUM: ${{ steps.bk-info.outputs.build_num }} + run: | + if [ -z "$BK_TOKEN" ]; then + echo "BUILDKITE_API_READ_TOKEN secret not set — skipping." + echo "skip=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + + # Fetch build metadata containing the analysis. 
+ ANALYSIS=$(curl -sS -f \ + -H "Authorization: Bearer ${BK_TOKEN}" \ + "https://api.buildkite.com/v2/organizations/elastic/pipelines/${PIPELINE}/builds/${BUILD_NUM}/meta-data/build-failure-analysis" \ + 2>/dev/null) || true + + if [ -z "$ANALYSIS" ]; then + echo "No analysis metadata found — skipping." + echo "skip=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + + # Save to file to avoid shell quoting issues. + echo "$ANALYSIS" > /tmp/analysis.md + echo "skip=false" >> "$GITHUB_OUTPUT" + + - name: Post or update PR comment + if: >- + steps.find-pr.outputs.skip != 'true' && + steps.bk-info.outputs.skip != 'true' && + steps.fetch.outputs.skip != 'true' + env: + GH_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ steps.find-pr.outputs.pr_number }} + PIPELINE: ${{ steps.bk-info.outputs.pipeline }} + BUILD_NUM: ${{ steps.bk-info.outputs.build_num }} + run: | + MARKER="" + BUILD_URL="https://buildkite.com/elastic/${PIPELINE}/builds/${BUILD_NUM}" + ANALYSIS=$(cat /tmp/analysis.md) + + BODY=$(cat </dev/null | head -1) + + if [ -n "$EXISTING_ID" ]; then + gh api "repos/${{ github.repository }}/issues/comments/${EXISTING_ID}" \ + -X PATCH -f body="$BODY" + echo "Updated existing comment on PR #${PR_NUMBER}." + else + gh api "repos/${{ github.repository }}/issues/${PR_NUMBER}/comments" \ + -f body="$BODY" + echo "Posted new comment on PR #${PR_NUMBER}." 
+ fi diff --git a/catalog-info.yaml b/catalog-info.yaml index 17f8aad68..44cd2a4aa 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -38,6 +38,8 @@ spec: publish_commit_status: false publish_commit_status_per_step: true trigger_mode: code + env: + ELASTIC_PR_COMMENTS_ENABLED: 'true' repository: elastic/ml-cpp skip_intermediate_builds: true teams: diff --git a/dev-tools/analyze_build_failure.py b/dev-tools/analyze_build_failure.py new file mode 100755 index 000000000..43d6b449c --- /dev/null +++ b/dev-tools/analyze_build_failure.py @@ -0,0 +1,474 @@ +#!/usr/bin/env python3 +"""Analyze a Buildkite build failure using Claude and post a diagnosis. + +Fetches logs from failed build steps, sends them to the Anthropic Claude API +with repository context, and posts the analysis as a Buildkite annotation, +Buildkite build metadata (for the GitHub Actions PR comment workflow), +and optionally to Slack. + +Usage: + # Analyze the current build (in CI) + python3 dev-tools/analyze_build_failure.py + + # Analyze a specific build + python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 + + # Find and analyze the most recent failed build for the current branch + # (used by "buildkite analyze" PR comment — no rebuild needed) + python3 dev-tools/analyze_build_failure.py --find-previous-failure + + # Dry run (print to stdout, don't annotate or post to Slack/GitHub) + python3 dev-tools/analyze_build_failure.py --pipeline ml-cpp-snapshot-builds --build 5819 --dry-run + +Environment: + BUILDKITE_TOKEN / BUILDKITE_API_READ_TOKEN Buildkite API token + ANTHROPIC_API_KEY Claude API key + SLACK_WEBHOOK_URL Slack incoming webhook (optional) + BUILDKITE_PIPELINE_SLUG Current pipeline (set by Buildkite) + BUILDKITE_BUILD_NUMBER Current build number (set by Buildkite) +""" + +import argparse +import json +import os +import re +import subprocess +import sys +import urllib.parse +import urllib.request +import urllib.error +from pathlib import Path + 
BUILDKITE_ORG = "elastic"
ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-20250514"
MAX_LOG_CHARS = 30000
MAX_RESPONSE_TOKENS = 2048

# Network timeout applied to Buildkite API and raw-log requests so that a
# stalled connection cannot hang the CI step indefinitely.
HTTP_TIMEOUT_SECONDS = 30
# Number of trailing log lines that are always kept in an excerpt.
TAIL_LINES = 80
# Marker inserted wherever non-adjacent log regions are joined.
SKIP_MARKER = "... [skipped] ..."
# Slack Block Kit rejects section text longer than this many characters.
SLACK_SECTION_TEXT_LIMIT = 3000

KNOWN_FAILURE_PATTERNS = """
Known transient/infrastructure failures:
- "Unable to download toolchain" / Adoptium JDK download failure: transient, retry usually fixes it
- "Exceeded maximum artifact size limit of 10 GiB": artifact_paths glob is too broad
- "sccache: error: couldn't connect to server": sccache server not running, check setup_sccache.sh
- CKMostCorrelatedTest/testScale timeout: CPU contention on low-core machines, check test parallelism
- CMultiFileDataAdderTest collision: test isolation bug with temp file naming

Known compilation patterns:
- "redefinition of" in unity builds: file needs SKIP_UNITY_BUILD_INCLUSION or unity disabled for library
- boost/unordered_map.hpp conflicts: remove from PCH list
- "mspdbsrv.exe" errors on Windows: switch from /Zi to /Z7
"""

# NOTE(review): the Classification / Suggested Fix / Confidence sections below
# carry no placeholder text, unlike Root Cause -- confirm this is intentional.
SYSTEM_PROMPT = """You are a CI build failure analyst for the elastic/ml-cpp repository.
This is a C++ codebase that builds on Linux (x86_64, aarch64), macOS (aarch64), and Windows (x86_64).
Build system: CMake with Boost, uses Docker for Linux builds, Gradle for macOS/Windows, Buildkite for CI.

Your job is to:
1. Identify the root cause of the failure from the build log
2. Classify it as: code bug, test failure, infrastructure/transient, configuration issue, or dependency issue
3. Suggest a specific fix or workaround
4. If it's transient, say so clearly — don't over-diagnose

Be concise and actionable. Use markdown formatting.
Format your response as:

### Root Cause
<1-2 sentences>

### Classification


### Suggested Fix


### Confidence

"""


def get_env_or_file(env_var, file_path):
    """Return a secret from the environment, falling back to a file.

    Args:
        env_var: Name of the environment variable to read first.
        file_path: Optional path (``~`` is expanded) read when the variable
            is unset or blank. An empty value disables the fallback.

    Returns:
        The stripped value, or ``None`` when neither source provides one.
    """
    val = os.environ.get(env_var, "").strip()
    if val:
        return val
    if file_path:
        p = Path(file_path).expanduser()
        # is_file() rather than exists(): a directory at this path must not
        # make read_text() raise.
        if p.is_file():
            return p.read_text().strip()
    return None


def buildkite_get(path, token, timeout=HTTP_TIMEOUT_SECONDS):
    """GET a Buildkite REST endpoint under the organization and decode JSON.

    Args:
        path: Path relative to ``/v2/organizations/<org>/``.
        token: Buildkite API token sent as a Bearer credential.
        timeout: Socket timeout in seconds.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.URLError: On HTTP or network failure.
        ValueError: If the response body is not valid JSON.
    """
    url = f"https://api.buildkite.com/v2/organizations/{BUILDKITE_ORG}/{path}"
    req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())


def find_previous_failed_build(pipeline, token, branch=None, exclude_build=None):
    """Find the most recent failed build for a pipeline, optionally filtered by branch.

    Args:
        pipeline: Buildkite pipeline slug.
        token: Buildkite API token.
        branch: When set, only builds of this branch are considered.
        exclude_build: Build number to ignore (typically the build that is
            running the analysis); a falsy value excludes nothing.

    Returns:
        The first build object returned by the API that is not excluded, or
        ``None``. Only the first five failed builds are inspected.
    """
    params = {"state": "failed", "per_page": "5"}
    if branch:
        params["branch"] = branch
    query = urllib.parse.urlencode(params)
    builds = buildkite_get(f"pipelines/{pipeline}/builds?{query}", token)
    for build in builds:
        if exclude_build and build.get("number") == exclude_build:
            continue
        return build
    return None


def get_job_log(log_url, token, timeout=HTTP_TIMEOUT_SECONDS):
    """Fetch the raw log for a Buildkite job.

    Args:
        log_url: The job's ``raw_log_url``.
        token: Buildkite API token.
        timeout: Socket timeout in seconds.

    Returns:
        The log decoded as UTF-8 (undecodable bytes replaced), or ``None``
        when the request fails for any HTTP, network or timeout reason.
    """
    req = urllib.request.Request(
        log_url,
        headers={
            "Authorization": f"Bearer {token}",
            "Accept": "text/plain",
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except (urllib.error.URLError, OSError) as exc:
        # HTTPError is a URLError; OSError also covers socket timeouts. One
        # unreadable log must not abort the analysis of the other steps.
        print(f"  Log request failed: {exc}", file=sys.stderr)
        return None


ERROR_PATTERNS = re.compile(
    r"(?i)"
    r"(?:^|\s)error(?:\s|:|\[|C\d)"  # "error:", "error C2338", "error[E"
    r"|fatal error"
    r"|^#error\b"
    r"|FAILED"
    r"|\*\*\* \d+ failure"  # Boost.Test: *** N failure(s) detected
    r"|: fatal:"  # linker fatal
    r"|ninja: build stopped"
    r"|make.*\*\*\*"  # make: *** [target] Error
    r"|CMake Error"
    r"|assertion failed"
    r"|LINK : fatal"  # MSVC linker
    r"|unresolved external"
    r"|cannot find -l"  # linker: cannot find library
    r"|undefined reference"
    r"|Segmentation fault"
    r"|signal \d+"
    r"|exit code \d+"
    r"|Exit status: \d+(?!.*exit code 0)"
)

ANSI_ESCAPE = re.compile(r"\x1b\[[0-9;]*[A-Za-z]|\x1b\].*?\x07")
# Buildkite timestamp markers. Buildkite emits them wrapped in ESC ... BEL;
# both wrapper bytes are optional here so bare "_bk;t=N" text is removed too
# and no stray control characters are left behind.
BK_TIMESTAMP = re.compile(r"\x1b?_bk;t=\d+\x07?")


def strip_terminal_noise(log_text):
    """Remove ANSI escapes and Buildkite timestamp markers."""
    # Timestamps go first so that their trailing BEL cannot be consumed as
    # the terminator of an unrelated OSC escape.
    text = BK_TIMESTAMP.sub("", log_text)
    return ANSI_ESCAPE.sub("", text)


def _truncate_middle(text, max_chars):
    """Keep at most *max_chars* of *text* (head and tail) around a trim marker."""
    budget = max(0, max_chars)
    if len(text) <= budget:
        return text
    head = budget // 2
    tail = budget - head
    marker = f"\n... [trimmed {len(text) - budget} chars] ...\n"
    # text[-0:] would be the whole string, so an empty tail is spelled out.
    return text[:head] + marker + (text[-tail:] if tail else "")


def extract_error_context(log_text, context_lines=10, max_chars=MAX_LOG_CHARS):
    """Extract error-relevant sections from a build log.

    Scans every line for error patterns and collects matching lines with
    surrounding context. Always appends the tail of the log (which
    typically contains the build summary / exit code).

    Args:
        log_text: Raw job log; falsy values are returned unchanged.
        context_lines: Lines kept before and after each matching line.
        max_chars: Logs no longer than this (after noise stripping) are
            returned whole. Longer excerpts keep at most this many log
            characters, split between head and tail around a short
            "trimmed" marker.

    Returns:
        The cleaned log or an excerpt of it, with ``SKIP_MARKER`` lines
        wherever omitted regions were joined.
    """
    if not log_text:
        return log_text

    log_text = strip_terminal_noise(log_text)
    if len(log_text) <= max_chars:
        return log_text

    lines = log_text.splitlines()
    total = len(lines)

    # Line indices to keep: every match plus its context window...
    include = set()
    for idx, line in enumerate(lines):
        if ERROR_PATTERNS.search(line):
            include.update(range(max(0, idx - context_lines),
                                 min(total, idx + context_lines + 1)))

    # ...and always the last lines (build summary / exit info).
    include.update(range(max(0, total - TAIL_LINES), total))

    # Build the excerpt, marking each gap. Starting at -1 means no marker is
    # emitted when the excerpt begins at the first line of the log.
    sections = []
    prev = -1
    for i in sorted(include):
        if i != prev + 1:
            sections.append(SKIP_MARKER)
        sections.append(lines[i])
        prev = i

    # Final safety cap — if still too long, keep the head and tail.
    return _truncate_middle("\n".join(sections), max_chars)


def call_claude(api_key, prompt):
    """Send *prompt* to the Anthropic Messages API and return the reply text.

    Args:
        api_key: Anthropic API key.
        prompt: User message content.

    Returns:
        The text of the first ``text`` content block, or
        ``"No analysis generated."`` if the response contains none.

    Raises:
        urllib.error.URLError: On HTTP or network failure.
    """
    body = json.dumps({
        "model": ANTHROPIC_MODEL,
        "max_tokens": MAX_RESPONSE_TOKENS,
        "system": SYSTEM_PROMPT,
        "messages": [{"role": "user", "content": prompt}],
    }).encode("utf-8")

    req = urllib.request.Request(
        ANTHROPIC_API_URL,
        data=body,
        headers={
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=60) as resp:
        result = json.loads(resp.read())

    for block in result.get("content", []):
        if block.get("type") == "text":
            return block["text"]
    return "No analysis generated."


# Keyword -> emoji, checked in order against the lower-cased classification.
# Substring matching lets "configuration issue" / "dependency issue" (the
# wording requested in SYSTEM_PROMPT) and capitalised variants resolve.
_CLASSIFICATION_EMOJI = (
    ("infrastructure", ":cloud:"),
    ("transient", ":cloud:"),
    ("code bug", ":bug:"),
    ("test failure", ":test_tube:"),
    ("configuration", ":gear:"),
    ("dependency", ":package:"),
)
_DEFAULT_EMOJI = ":warning:"


def _section_first_line(analysis, heading):
    """Return the first non-blank line of the section that starts with *heading*."""
    lines = analysis.splitlines()
    for i, line in enumerate(lines):
        if not line.startswith(heading):
            continue
        for following in lines[i + 1:]:
            text = following.strip()
            if text.startswith("#"):
                return ""  # reached the next heading: the section is empty
            if text:
                return text
        return ""
    return ""


def _classification_emoji(classification):
    """Map a free-text classification to a Slack emoji code."""
    normalized = classification.lower()
    for keyword, emoji in _CLASSIFICATION_EMOJI:
        if keyword in normalized:
            return emoji
    return _DEFAULT_EMOJI


def _truncate(text, limit):
    """Shorten *text* to at most *limit* characters, ending with an ellipsis."""
    if len(text) <= limit:
        return text
    return text[:max(0, limit - 1)] + "…"


def post_to_slack(webhook_url, pipeline, build_number, branch, build_url, analyses):
    """Post a summary of the failure analysis to Slack.

    Best effort: any failure is reported on stderr and never raised.

    Args:
        webhook_url: Slack incoming-webhook URL.
        pipeline: Buildkite pipeline slug.
        build_number: Number of the analyzed build.
        branch: Branch of the analyzed build.
        build_url: Web URL of the analyzed build.
        analyses: Iterable of ``(step_label, analysis_markdown)`` pairs.
    """
    # Slack uses mrkdwn, not full markdown — convert minimally
    blocks = [
        {
            "type": "header",
            "text": {
                "type": "plain_text",
                "text": "Build Failure Analysis",
            },
        },
        {
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": (
                    f"*Pipeline:* `{pipeline}` | *Build:* <{build_url}|#{build_number}> | *Branch:* `{branch}`"
                ),
            },
        },
    ]

    for step_label, analysis in analyses:
        # Extract just the classification and root cause for a compact Slack message
        root_cause = _section_first_line(analysis, "### Root Cause")
        # Drop markdown emphasis the model may wrap around the label.
        classification = _section_first_line(analysis, "### Classification").strip("*_` ")
        emoji = _classification_emoji(classification)

        text = f"{emoji} *{step_label}*\n>{root_cause}\n_Classification: {classification}_"
        blocks.append({"type": "divider"})
        blocks.append({
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": _truncate(text, SLACK_SECTION_TEXT_LIMIT),
            },
        })

    blocks.append({"type": "divider"})
    blocks.append({
        "type": "context",
        "elements": [
            {
                "type": "mrkdwn",
                "text": f"<{build_url}|View build> | Analysis by Claude — verify before acting",
            }
        ],
    })

    payload = json.dumps({"blocks": blocks}).encode("utf-8")
    try:
        # Request() itself raises ValueError on a malformed URL, so it is
        # built inside the try to keep the notification best-effort.
        req = urllib.request.Request(
            webhook_url,
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=10) as resp:
            if resp.status == 200:
                print("Slack notification posted.")
            else:
                print(f"Slack returned status {resp.status}", file=sys.stderr)
    except (ValueError, OSError) as e:
        print(f"Could not post to Slack: {e}", file=sys.stderr)


def _parse_args():
    """Parse command-line arguments, defaulting to Buildkite's environment."""
    parser = argparse.ArgumentParser(description="Analyze Buildkite build failures with Claude")
    parser.add_argument("--pipeline", default=os.environ.get("BUILDKITE_PIPELINE_SLUG"))
    # "or 0" also covers a variable that is set but empty.
    parser.add_argument("--build", type=int,
                        default=int(os.environ.get("BUILDKITE_BUILD_NUMBER") or 0))
    parser.add_argument("--find-previous-failure", action="store_true",
                        help="Find and analyze the most recent failed build for the current branch")
    parser.add_argument("--dry-run", action="store_true", help="Print analysis without annotating or posting to Slack")
    return parser.parse_args()


def _build_prompt(pipeline, build_number, branch, step_label, step_key, log_excerpt):
    """Compose the user message sent to Claude for one failed step."""
    return f"""Analyze this CI build failure.

**Pipeline**: {pipeline}
**Build**: #{build_number}
**Branch**: {branch}
**Failed step**: {step_label} (key: {step_key})

{KNOWN_FAILURE_PATTERNS}

**Build log (error-relevant sections extracted from full log)**:
```
{log_excerpt}
```

Analyze the root cause and suggest a fix."""


def _publish(full_annotation, metadata_body, slack_webhook, slack_args):
    """Post the analysis as an annotation, as build metadata, and to Slack."""
    try:
        subprocess.run(
            ["buildkite-agent", "annotate",
             "--style", "error",
             "--context", "build-failure-analysis"],
            input=full_annotation.encode(),
            check=True,
        )
        print("\nAnnotation posted to Buildkite.")
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        print(f"\nCould not post annotation: {e}", file=sys.stderr)
        print("Full analysis printed above.")

    # Store analysis as build metadata so that the GitHub Actions
    # workflow (post-build-analysis.yml) can fetch it and post a
    # PR comment using the built-in GITHUB_TOKEN.
    try:
        subprocess.run(
            ["buildkite-agent", "meta-data", "set",
             "build-failure-analysis"],
            input=metadata_body.encode(),
            check=True,
        )
        print("Analysis saved as build metadata.")
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        print(f"Could not save build metadata: {e}", file=sys.stderr)

    if slack_webhook:
        post_to_slack(slack_webhook, *slack_args)
    else:
        print("No SLACK_WEBHOOK_URL set, skipping Slack notification.")


def main():
    """Analyze the failed steps of one build and publish the diagnosis.

    Exits with status 1 on missing arguments/credentials or when the
    Buildkite API cannot be reached, and with status 0 when there is
    nothing to analyze.
    """
    args = _parse_args()

    # With --find-previous-failure the build number is only used to exclude
    # the current build from the search, so it is optional in that mode.
    if not args.pipeline or not (args.build or args.find_previous_failure):
        print("Error: --pipeline and --build (or --find-previous-failure) required",
              file=sys.stderr)
        sys.exit(1)

    bk_token = (get_env_or_file("BUILDKITE_TOKEN", "~/.buildkite/token")
                or get_env_or_file("BUILDKITE_API_READ_TOKEN", ""))
    claude_key = get_env_or_file("ANTHROPIC_API_KEY", "~/.elastic/claude_api_key")

    if not bk_token:
        print("Error: No Buildkite token available", file=sys.stderr)
        sys.exit(1)
    if not claude_key:
        print("Error: No Anthropic API key available", file=sys.stderr)
        sys.exit(1)

    try:
        if args.find_previous_failure:
            branch = os.environ.get("BUILDKITE_BRANCH")
            print(f"Searching for previous failed build on branch '{branch}'...")
            prev = find_previous_failed_build(args.pipeline, bk_token, branch, args.build)
            if not prev:
                print(f"No previous failed build found for branch '{branch}' — nothing to analyze.")
                sys.exit(0)
            args.build = prev["number"]
            print(f"Found failed build #{args.build}: {prev.get('web_url', '')}")

        print(f"Analyzing {args.pipeline} build #{args.build}...")

        build = buildkite_get(f"pipelines/{args.pipeline}/builds/{args.build}", bk_token)
    except (urllib.error.URLError, OSError, ValueError) as exc:
        print(f"Error: Buildkite API request failed: {exc}", file=sys.stderr)
        sys.exit(1)

    if build.get("state") == "passed":
        print("Build passed — nothing to analyze.")
        sys.exit(0)

    failed_jobs = [
        j for j in build.get("jobs", [])
        if j.get("type") == "script" and j.get("state") == "failed"
    ]

    if not failed_jobs:
        print("No failed steps found.")
        sys.exit(0)

    print(f"Found {len(failed_jobs)} failed step(s)")

    slack_webhook = get_env_or_file("SLACK_WEBHOOK_URL", "")
    build_url = build.get("web_url", f"https://buildkite.com/{BUILDKITE_ORG}/{args.pipeline}/builds/{args.build}")

    all_analyses = []
    slack_analyses = []

    for job in failed_jobs:
        # "or" rather than a .get() default: the API may return an explicit
        # null for steps that have no key or name.
        step_key = job.get("step_key") or "unknown"
        step_label = job.get("name") or step_key
        raw_log_url = job.get("raw_log_url", "")

        print(f"\nAnalyzing: {step_label} ({step_key})")

        log_text = get_job_log(raw_log_url, bk_token) if raw_log_url else None
        if not log_text:
            print(" Could not fetch log, skipping")
            continue

        prompt = _build_prompt(
            args.pipeline, args.build, build.get('branch', 'unknown'),
            step_label, step_key, extract_error_context(log_text),
        )

        try:
            analysis = call_claude(claude_key, prompt)
        except Exception as e:
            # Deliberately broad: one failed API call is reported in place
            # of the analysis and must not stop the remaining steps.
            analysis = f"Failed to get analysis: {e}"

        print(f"\n{analysis}")
        all_analyses.append(f"## {step_label}\n\n{analysis}")
        slack_analyses.append((step_label, analysis))

    if not all_analyses:
        print("No analyses generated.")
        sys.exit(0)

    metadata_body = "\n\n---\n\n".join(all_analyses)
    full_annotation = "# 🔍 Build Failure Analysis\n\n"
    full_annotation += f"*Pipeline*: `{args.pipeline}` | *Build*: #{args.build} | *Branch*: `{build.get('branch', '?')}`\n\n"
    full_annotation += metadata_body
    full_annotation += "\n\n---\n*Analysis generated by Claude. Verify before acting.*"

    if not args.dry_run:
        _publish(
            full_annotation, metadata_body, slack_webhook,
            (args.pipeline, args.build, build.get("branch", "?"), build_url, slack_analyses),
        )


if __name__ == "__main__":
    main()