From d63b1661a216f2a6c67517b5d6c364deb0e2fabf Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Fri, 20 Mar 2026 10:48:41 +1300
Subject: [PATCH 1/3] [ML] Make build timing regression check a soft failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "Check build timing regressions" pipeline step should never cause
the overall build to be marked as failed.  The generated analytics step
already had soft_fail: true, but the upload step that creates it did
not — so errors during pipeline upload (e.g. missing step dependencies)
would still produce a hard failure.

Add a soft_fail parameter to generate_step() and use it for the
regression check, making both the upload and the analytics step
soft-fail on any error.

Made-with: Cursor
---
 .buildkite/ml_pipeline/step.py | 4 +++-
 .buildkite/pipeline.json.py    | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.buildkite/ml_pipeline/step.py b/.buildkite/ml_pipeline/step.py
index ed2fbb0997..cb7db14d08 100644
--- a/.buildkite/ml_pipeline/step.py
+++ b/.buildkite/ml_pipeline/step.py
@@ -9,7 +9,7 @@
 # limitation.
 
 class PipelineStep(list):
-  def generate_step(self, label, command):
+  def generate_step(self, label, command, soft_fail=False):
     command = command + " | buildkite-agent pipeline upload"
     step = {
       "label": label,
@@ -19,6 +19,8 @@ def generate_step(self, label, command):
         "image": "python",
       }
     }
+    if soft_fail:
+      step["soft_fail"] = True
     return step
 
   def generate_step_template(self, platform, action, build_aarch64, build_x86_64):
diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py
index 56b02f4579..37cc4537de 100755
--- a/.buildkite/pipeline.json.py
+++ b/.buildkite/pipeline.json.py
@@ -65,7 +65,8 @@ def main():
 
     # Check for build timing regressions against nightly baseline
     pipeline_steps.append(pipeline_steps.generate_step("Check build timing regressions",
-                                                       ".buildkite/pipelines/check_build_regression.yml.sh"))
+                                                       ".buildkite/pipelines/check_build_regression.yml.sh",
+                                                       soft_fail=True))
 
     pipeline["env"] = env
     pipeline["steps"] = pipeline_steps

From f3fda7646b6b322b9d9dc7ffe5e7972fd4c50a87 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Mon, 23 Mar 2026 14:48:27 +1300
Subject: [PATCH 2/3] [ML] Fix PyTorch allowlist validation failing when Python
 3 is unavailable

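Add the PyTorch allowlist validation to the CI test runner, run only
when the build is triggered with the run_pytorch_tests action. The
run-validation.cmake script supports an OPTIONAL flag to skip
gracefully when Python 3 is not on the PATH. Pass it from the CI
test runner so that environments without Python (e.g. the Linux
Docker build image) don't fail the entire test step.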

Made-with: Cursor
---
 .buildkite/scripts/steps/run_tests.sh | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/.buildkite/scripts/steps/run_tests.sh b/.buildkite/scripts/steps/run_tests.sh
index 12b88c1bb5..36a49133af 100755
--- a/.buildkite/scripts/steps/run_tests.sh
+++ b/.buildkite/scripts/steps/run_tests.sh
@@ -105,6 +105,28 @@ else
         -P cmake/run-all-tests-parallel.cmake || TEST_OUTCOME=$?
 fi
 
+# --- PyTorch allowlist validation ---
+# When triggered from the PyTorch edge pipeline, run the Python-based
+# allowlist validation which traces live HuggingFace models with the
+# new PyTorch version and verifies every op is in ALLOWED_OPERATIONS.
+VALIDATION_OUTCOME=0
+if [[ "${GITHUB_PR_COMMENT_VAR_ACTION:-}" == "run_pytorch_tests" ]]; then
+    echo "--- Validating PyTorch allowlist against HuggingFace models"
+    cmake \
+        -DSOURCE_DIR="$(pwd)" \
+        -DVALIDATE_CONFIG="$(pwd)/dev-tools/extract_model_ops/validation_models.json" \
+        -DVALIDATE_PT_DIR="$(pwd)/dev-tools/extract_model_ops/es_it_models" \
+        -DVALIDATE_VERBOSE=TRUE \
+        -DOPTIONAL=TRUE \
+        -P cmake/run-validation.cmake || VALIDATION_OUTCOME=$?
+
+    if [[ $VALIDATION_OUTCOME -ne 0 ]]; then
+        echo "^^^ +++"
+        echo "Allowlist validation failed — the new PyTorch version may introduce ops not in ALLOWED_OPERATIONS."
+        echo "See dev-tools/extract_model_ops/README.md for how to update the allowlist."
+    fi
+fi
+
 # Upload test results
 echo "--- Uploading test results"
 TEST_RESULTS_ARCHIVE=${OS}-${HARDWARE_ARCH}-unit_test_results.tgz
@@ -117,4 +139,6 @@ else
     echo "No test results archive created"
 fi
 
-exit $TEST_OUTCOME
+if [[ $TEST_OUTCOME -ne 0 || $VALIDATION_OUTCOME -ne 0 ]]; then
+    exit 1
+fi

From 9c296618261efef51fd8449206d9e8261a658944 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Tue, 24 Mar 2026 14:32:20 +1300
Subject: [PATCH 3/3] [ML] Guard PyTorch validation on existence of cmake
 script

Skip the allowlist validation block if cmake/run-validation.cmake
does not exist, so this change remains safe even if the graph
validation feature is reverted independently.

Made-with: Cursor
---
 .buildkite/scripts/steps/run_tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/scripts/steps/run_tests.sh b/.buildkite/scripts/steps/run_tests.sh
index 36a49133af..0c5c08125a 100755
--- a/.buildkite/scripts/steps/run_tests.sh
+++ b/.buildkite/scripts/steps/run_tests.sh
@@ -110,7 +110,7 @@ fi
 # allowlist validation which traces live HuggingFace models with the
 # new PyTorch version and verifies every op is in ALLOWED_OPERATIONS.
 VALIDATION_OUTCOME=0
-if [[ "${GITHUB_PR_COMMENT_VAR_ACTION:-}" == "run_pytorch_tests" ]]; then
+if [[ "${GITHUB_PR_COMMENT_VAR_ACTION:-}" == "run_pytorch_tests" ]] && [ -f cmake/run-validation.cmake ]; then
     echo "--- Validating PyTorch allowlist against HuggingFace models"
     cmake \
         -DSOURCE_DIR="$(pwd)" \