Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
10bd343
[ML] Add AI-powered build failure analysis to CI pipelines
edsavage Feb 20, 2026
4dd5442
[ML] Add Slack notifications to build failure analyzer
edsavage Feb 20, 2026
64b138e
Post build failure analysis as a GitHub PR comment
edsavage Feb 27, 2026
b15417f
Add --pr flag for testing GitHub comment posting
edsavage Feb 27, 2026
9ff81a1
[ML] Make AI failure analysis opt-in for PR builds
edsavage Mar 16, 2026
44594cd
[ML] Enable native Buildkite PR comments for build failures
edsavage Mar 19, 2026
1ea0535
[ML] Post AI analysis as PR comment via GitHub Actions
edsavage Mar 19, 2026
9fec3f6
[ML] Add temporary workflow to test Vault OIDC for GitHub Actions
edsavage Mar 19, 2026
f404519
[ML] Remove temporary Vault OIDC test workflow
edsavage Mar 19, 2026
4cc72f3
[ML] Use dynamic depends_on for analyze_build_failure step
edsavage Mar 20, 2026
25c7ca0
[ML] Always include build failure analysis in PR pipelines
edsavage Mar 20, 2026
13b1fb2
[ML] TEMPORARY: deliberate compile error for CI testing
edsavage Mar 20, 2026
1985f08
[ML] Fix analyze step and revert deliberate compile error
edsavage Mar 20, 2026
7429024
[ML] TEMPORARY: deliberate compile error for CI testing (take 2)
edsavage Mar 20, 2026
8466537
[ML] Fix analyze step Docker image and revert compile error
edsavage Mar 20, 2026
925c2d6
[ML] TEMPORARY: deliberate compile error for CI testing (take 3)
edsavage Mar 20, 2026
0d80415
[ML] Revert deliberate compile error after successful CI test
edsavage Mar 20, 2026
bdea42c
[ML] Make analyze step opt-in via "buildkite analyze" PR comment
edsavage Mar 20, 2026
b3c7cb1
[ML] Improve Boost.Test failure detection in log extraction
edsavage Mar 20, 2026
c9d9ef0
[ML] TEMPORARY: deliberate test failure for CI analysis testing — wil…
edsavage Mar 20, 2026
bfe59eb
[ML] Revert deliberate test failure after successful analysis testing
edsavage Mar 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .buildkite/branch.json.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,21 @@ def main():
".buildkite/pipelines/format_and_validation.yml.sh"))
config = buildConfig.Config()
config.parse()

build_step_keys = []
if config.build_linux and config.build_aarch64:
build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo")
if config.build_linux and config.build_x86_64:
build_step_keys.append("build_test_linux-x86_64-RelWithDebInfo")
if config.build_macos and config.build_aarch64:
build_step_keys.append("build_test_macos-aarch64-RelWithDebInfo")
if config.build_windows and config.build_x86_64:
build_step_keys.append("build_test_Windows-x86_64-RelWithDebInfo")

env = {
"ML_BUILD_STEP_KEYS": ",".join(build_step_keys),
}

if config.build_windows:
build_windows = pipeline_steps.generate_step_template("Windows", "build", "", config.build_x86_64)
pipeline_steps.append(build_windows)
Expand All @@ -46,6 +61,9 @@ def main():
# Ingest step-level timings into Elasticsearch for anomaly detection
pipeline_steps.append(pipeline_steps.generate_step("Ingest build timings",
".buildkite/pipelines/ingest_build_timings.yml.sh"))
# Analyze failures with AI if the build failed
pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
".buildkite/pipelines/analyze_build_failure.yml.sh"))

# Build the DRA artifacts and upload to S3 and GCS
pipeline_steps.append(pipeline_steps.generate_step("Create daily releasable artifacts",
Expand All @@ -55,6 +73,7 @@ def main():
pipeline_steps.append(pipeline_steps.generate_step("Upload daily releasable artifacts to GCS",
".buildkite/pipelines/upload_dra_to_gcs.yml.sh"))

pipeline["env"] = env
pipeline["steps"] = pipeline_steps
print(json.dumps(pipeline, indent=2))

Expand Down
6 changes: 6 additions & 0 deletions .buildkite/hooks/post-checkout
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then
export ES_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/elasticsearch/ci_analytics 2>/dev/null || echo "")
fi

if [[ "$BUILDKITE_STEP_KEY" == "analyze_build_failure" ]]; then
export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "")
export ANTHROPIC_API_KEY=$(vault read -field=api_key secret/ci/elastic-ml-cpp/anthropic/claude 2>/dev/null || echo "")
export SLACK_WEBHOOK_URL=$(vault read -field=url secret/ci/elastic-ml-cpp/slack/build_failure_webhook 2>/dev/null || echo "")
fi

# GCS service account — inject credentials for build and Java IT steps.
# Build steps use it for sccache; Java IT steps use it for the Gradle
# build cache. The key is stored in Vault.
Expand Down
25 changes: 20 additions & 5 deletions .buildkite/job-build-test-all-debug.json.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,6 @@
config as buildConfig,
)

env = {
"BUILD_SNAPSHOT": "true",
"VERSION_QUALIFIER": ""
}

def main():
pipeline = {}
pipeline_steps = step.PipelineStep([])
Expand All @@ -40,6 +35,23 @@ def main():
".buildkite/pipelines/format_and_validation.yml.sh"))
config = buildConfig.Config()
config.parse()

build_step_keys = []
if config.build_linux and config.build_aarch64:
build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo")
if config.build_linux and config.build_x86_64:
build_step_keys.append("build_test_linux-x86_64-RelWithDebInfo")
if config.build_macos and config.build_aarch64:
build_step_keys.append("build_test_macos-aarch64-RelWithDebInfo")
if config.build_windows and config.build_x86_64:
build_step_keys.append("build_test_Windows-x86_64-RelWithDebInfo")

env = {
"BUILD_SNAPSHOT": "true",
"VERSION_QUALIFIER": "",
"ML_BUILD_STEP_KEYS": ",".join(build_step_keys),
}

if config.build_windows:
debug_windows = pipeline_steps.generate_step_template("Windows", "debug", "", config.build_x86_64)
pipeline_steps.append(debug_windows)
Expand All @@ -57,6 +69,9 @@ def main():
# Ingest step-level timings into Elasticsearch for anomaly detection
pipeline_steps.append(pipeline_steps.generate_step("Ingest build timings",
".buildkite/pipelines/ingest_build_timings.yml.sh"))
# Analyze failures with AI if the build failed
pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
".buildkite/pipelines/analyze_build_failure.yml.sh"))

pipeline["env"] = env
pipeline["steps"] = pipeline_steps
Expand Down
4 changes: 3 additions & 1 deletion .buildkite/ml_pipeline/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class Config:
build_x86_64: str = ""
run_qa_tests: bool = False
run_pytorch_tests: bool = False
run_analyze: bool = False
action: str = "build"

def parse_comment(self):
Expand All @@ -37,7 +38,8 @@ def parse_comment(self):
self.action = os.environ["GITHUB_PR_COMMENT_VAR_ACTION"]
self.run_qa_tests = self.action == "run_qa_tests"
self.run_pytorch_tests = self.action == "run_pytorch_tests"
if self.run_pytorch_tests or self.run_qa_tests:
self.run_analyze = self.action == "analyze"
if self.run_pytorch_tests or self.run_qa_tests or self.run_analyze:
self.action = "build"

# If the ACTION is set to "run_qa_tests" then set some optional variables governing the ES branch to build, the
Expand Down
39 changes: 32 additions & 7 deletions .buildkite/pipeline.json.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,22 +23,47 @@
config as buildConfig,
)

# Ensure VERSION_QUALIFIER is always empty for PR builds
env = {
"VERSION_QUALIFIER": ""
}

def main():
config = buildConfig.Config()
config.parse()

pipeline = {}
pipeline_steps = step.PipelineStep([])

# "buildkite analyze" triggers a lightweight pipeline that finds and
# analyzes the most recent failed build for this branch — no compilation.
if config.run_analyze:
pipeline["env"] = {"ML_ANALYZE_PREVIOUS": "true"}
pipeline_steps.append(pipeline_steps.generate_step("Analyze build failure",
".buildkite/pipelines/analyze_build_failure.yml.sh"))
pipeline["steps"] = pipeline_steps
print(json.dumps(pipeline, indent=2))
return

pipeline_steps.append(pipeline_steps.generate_step("Queue a :slack: notification for the pipeline",
".buildkite/pipelines/send_slack_notification.sh"))
pipeline_steps.append(pipeline_steps.generate_step("Queue a :email: notification for the pipeline",
".buildkite/pipelines/send_email_notification.sh"))
pipeline_steps.append(pipeline_steps.generate_step("Upload clang-format validation",
".buildkite/pipelines/format_and_validation.yml.sh"))
config = buildConfig.Config()
config.parse()

# Compute which build step keys will exist so that analytics and
# failure-analysis steps can emit a correct depends_on list.
build_step_keys = []
if config.build_linux and config.build_aarch64:
build_step_keys.append("build_test_linux-aarch64-RelWithDebInfo")
if config.build_linux and config.build_x86_64:
build_step_keys.append("build_test_linux-x86_64-RelWithDebInfo")
if config.build_macos and config.build_aarch64:
build_step_keys.append("build_test_macos-aarch64-RelWithDebInfo")
if config.build_windows and config.build_x86_64:
build_step_keys.append("build_test_Windows-x86_64-RelWithDebInfo")

env = {
"VERSION_QUALIFIER": "",
"ML_BUILD_STEP_KEYS": ",".join(build_step_keys),
}

if config.build_windows:
build_windows = pipeline_steps.generate_step_template("Windows", config.action, "", config.build_x86_64)
pipeline_steps.append(build_windows)
Expand Down
40 changes: 40 additions & 0 deletions .buildkite/pipelines/analyze_build_failure.yml.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License
# 2.0 and the following additional limitation. Functionality enabled by the
# files subject to the Elastic License 2.0 may only be used in production when
# invoked by an Elasticsearch process with a license key installed that permits
# use of machine learning features. You may not use this file except in
# compliance with the Elastic License 2.0 and the foregoing additional
# limitation.

# Emits the Buildkite pipeline YAML for the "Analyze build failure" step on
# stdout.  Inputs (environment):
#   ML_ANALYZE_PREVIOUS  "true" => pass --find-previous-failure so the
#                        analyzer looks up the most recent failed build
#                        instead of the current one.
#   ML_BUILD_STEP_KEYS   comma-separated build step keys set by the pipeline
#                        generator; becomes this step's depends_on list.
# NOTE(review): the API tokens this step needs appear to be injected by
# .buildkite/hooks/post-checkout keyed on the step key
# "analyze_build_failure" — keep the key emitted below in sync with that hook.

EXTRA_FLAGS=""
if [ "${ML_ANALYZE_PREVIOUS:-}" = "true" ]; then
EXTRA_FLAGS=" --find-previous-failure"
fi

# Unquoted heredoc: ${EXTRA_FLAGS} expands now, at generation time, while the
# \$-escaped BUILDKITE_* variables are left for the agent to expand when the
# step actually runs.
cat <<EOL
steps:
- label: "Analyze build failure :mag:"
key: "analyze_build_failure"
command:
- "python3 dev-tools/analyze_build_failure.py --pipeline \$BUILDKITE_PIPELINE_SLUG --build \$BUILDKITE_BUILD_NUMBER${EXTRA_FLAGS}"
EOL

# Emit depends_on dynamically — ML_BUILD_STEP_KEYS is a comma-separated
# list of step keys set by the pipeline generator. In analyze-previous
# mode there are no build steps so this block is skipped.
if [ -n "${ML_BUILD_STEP_KEYS:-}" ]; then
echo ' depends_on:'
IFS=',' read -ra STEP_KEYS <<< "$ML_BUILD_STEP_KEYS"
for key in "${STEP_KEYS[@]}"; do
echo " - \"${key}\""
done
fi

# allow_dependency_failure lets this step run even when the builds it depends
# on fail — which is exactly when it is needed — and soft_fail keeps a failure
# of the analysis itself from failing the overall build.  Quoted 'EOL' here:
# no expansion, the trailer is emitted verbatim.
cat <<'EOL'
allow_dependency_failure: true
soft_fail: true
agents:
image: "python:3"
EOL
2 changes: 1 addition & 1 deletion .buildkite/pull-requests.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"commit_status_context": "ml-cpp-ci",
"build_on_commit": true,
"build_on_comment": true,
"trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests|run_pytorch_tests)(=(?<args>(?:[^ ]+)))? *(?: for ES_BRANCH=(?<branch>([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?<version>([.0-9]+)))? *(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?<arch>(?:[, ]*aarch64|x86_64)+)?$",
"trigger_comment_regex": "^(?:(?:buildkite +)(?<action>build|debug|run_qa_tests|run_pytorch_tests|analyze)(=(?<args>(?:[^ ]+)))? *(?: for ES_BRANCH=(?<branch>([.0-9a-zA-Z]+)))? *(?:with STACK_VERSION=(?<version>([.0-9]+)))? *(?: *on *(?<platform>(?:[ ,]*(?:windows|linux|mac(os)?))+))?) *(?<arch>(?:[, ]*aarch64|x86_64)+)?$",
"always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
"skip_ci_labels": ["skip-ci", "jenkins-ci", ">test-mute", ">docs"],
"skip_target_branches": ["6.8", "7.11", "7.12"],
Expand Down
133 changes: 133 additions & 0 deletions .github/workflows/post-build-analysis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
name: Post Build Failure Analysis

# Triggered by commit status updates from Buildkite. When the
# analyze_build_failure step completes, Buildkite posts a commit status
# which fires this workflow. We fetch the AI analysis from Buildkite
# build metadata and post it as a PR comment using the built-in
# GITHUB_TOKEN (no PAT or GitHub App needed).

on:
status:

# Least-privilege grant for the built-in GITHUB_TOKEN: write PR comments,
# read commit statuses — nothing else.
permissions:
pull-requests: write
statuses: read

jobs:
post-analysis:
# Only run when the analyze step succeeds (soft_fail means Buildkite
# reports success even if the analysis itself had issues).
if: >-
github.event.state == 'success' &&
contains(github.event.context, 'Analyze build failure')
runs-on: ubuntu-latest
steps:
# Map the commit the status was reported for to its first associated
# pull request; every later step is skipped when no PR is found.
- name: Find PR for commit
id: find-pr
env:
GH_TOKEN: ${{ github.token }}
SHA: ${{ github.event.sha }}
run: |
PR_NUMBER=$(gh api "repos/${{ github.repository }}/commits/${SHA}/pulls" \
--jq '.[0].number // empty' 2>/dev/null || true)
if [ -z "$PR_NUMBER" ]; then
echo "No PR found for commit ${SHA} — skipping."
echo "skip=true" >> "$GITHUB_OUTPUT"
else
echo "Found PR #${PR_NUMBER}"
echo "skip=false" >> "$GITHUB_OUTPUT"
echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"
fi

# Recover the Buildkite pipeline slug and build number from the commit
# status target_url, so the analysis can be fetched over the REST API.
- name: Extract Buildkite build info
if: steps.find-pr.outputs.skip != 'true'
id: bk-info
env:
TARGET_URL: ${{ github.event.target_url }}
run: |
# target_url looks like:
# https://buildkite.com/elastic/ml-cpp-pr-builds/builds/2361#step-key
# Extract pipeline slug and build number.
PIPELINE=$(echo "$TARGET_URL" | sed -n 's|.*/elastic/\([^/]*\)/builds/.*|\1|p')
BUILD_NUM=$(echo "$TARGET_URL" | sed -n 's|.*/builds/\([0-9]*\).*|\1|p')
if [ -z "$PIPELINE" ] || [ -z "$BUILD_NUM" ]; then
echo "Could not parse Buildkite URL: $TARGET_URL"
echo "skip=true" >> "$GITHUB_OUTPUT"
else
echo "Pipeline: $PIPELINE, Build: $BUILD_NUM"
echo "skip=false" >> "$GITHUB_OUTPUT"
echo "pipeline=${PIPELINE}" >> "$GITHUB_OUTPUT"
echo "build_num=${BUILD_NUM}" >> "$GITHUB_OUTPUT"
fi

# Pull the analysis text that the Buildkite step stored under the
# "build-failure-analysis" meta-data key; skip quietly when the token
# secret is unset or no analysis exists for this build.
- name: Fetch analysis from Buildkite
if: >-
steps.find-pr.outputs.skip != 'true' &&
steps.bk-info.outputs.skip != 'true'
id: fetch
env:
BK_TOKEN: ${{ secrets.BUILDKITE_API_READ_TOKEN }}
PIPELINE: ${{ steps.bk-info.outputs.pipeline }}
BUILD_NUM: ${{ steps.bk-info.outputs.build_num }}
run: |
if [ -z "$BK_TOKEN" ]; then
echo "BUILDKITE_API_READ_TOKEN secret not set — skipping."
echo "skip=true" >> "$GITHUB_OUTPUT"
exit 0
fi

# Fetch build metadata containing the analysis.
ANALYSIS=$(curl -sS -f \
-H "Authorization: Bearer ${BK_TOKEN}" \
"https://api.buildkite.com/v2/organizations/elastic/pipelines/${PIPELINE}/builds/${BUILD_NUM}/meta-data/build-failure-analysis" \
2>/dev/null) || true

if [ -z "$ANALYSIS" ]; then
echo "No analysis metadata found — skipping."
echo "skip=true" >> "$GITHUB_OUTPUT"
exit 0
fi

# Save to file to avoid shell quoting issues.
echo "$ANALYSIS" > /tmp/analysis.md
echo "skip=false" >> "$GITHUB_OUTPUT"

# Upsert behaviour: the hidden HTML marker identifies our comment, so a
# re-run PATCHes the existing comment rather than posting a duplicate.
- name: Post or update PR comment
if: >-
steps.find-pr.outputs.skip != 'true' &&
steps.bk-info.outputs.skip != 'true' &&
steps.fetch.outputs.skip != 'true'
env:
GH_TOKEN: ${{ github.token }}
PR_NUMBER: ${{ steps.find-pr.outputs.pr_number }}
PIPELINE: ${{ steps.bk-info.outputs.pipeline }}
BUILD_NUM: ${{ steps.bk-info.outputs.build_num }}
run: |
MARKER="<!-- build-failure-analysis -->"
BUILD_URL="https://buildkite.com/elastic/${PIPELINE}/builds/${BUILD_NUM}"
ANALYSIS=$(cat /tmp/analysis.md)

BODY=$(cat <<EOF
${MARKER}
## :mag: Build Failure Analysis

${ANALYSIS}

---
[View Buildkite build](${BUILD_URL}) | *Analysis generated by Claude. Verify before acting.*
EOF
)

# Check for an existing comment to update.
EXISTING_ID=$(gh api "repos/${{ github.repository }}/issues/${PR_NUMBER}/comments?per_page=100" \
--jq ".[] | select(.body | contains(\"${MARKER}\")) | .id" 2>/dev/null | head -1)

if [ -n "$EXISTING_ID" ]; then
gh api "repos/${{ github.repository }}/issues/comments/${EXISTING_ID}" \
-X PATCH -f body="$BODY"
echo "Updated existing comment on PR #${PR_NUMBER}."
else
gh api "repos/${{ github.repository }}/issues/${PR_NUMBER}/comments" \
-f body="$BODY"
echo "Posted new comment on PR #${PR_NUMBER}."
fi
2 changes: 2 additions & 0 deletions catalog-info.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ spec:
publish_commit_status: false
publish_commit_status_per_step: true
trigger_mode: code
env:
ELASTIC_PR_COMMENTS_ENABLED: 'true'
repository: elastic/ml-cpp
skip_intermediate_builds: true
teams:
Expand Down
Loading