From bcd592025efa9e1771ff1130af1fc874553c8709 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Wed, 18 Feb 2026 09:37:15 +1300 Subject: [PATCH 01/19] [ML] Replace shell test runner with portable CMake/CTest infrastructure Replace run_tests_as_seperate_processes.sh with cmake/run-tests-individually.cmake for portable cross-platform parallel test execution. Enable CTest integration and rename the 'test' target to 'ml_test' to avoid conflicts. Co-authored-by: Cursor --- .cursor/rules/ml-cpp-build-system.mdc | 167 +++++--- .cursor/rules/ml-cpp-coding-conventions.mdc | 105 +++-- CMakeLists.txt | 2 + cmake/functions.cmake | 22 +- cmake/run-tests-individually.cmake | 363 ++++++++++++++++++ ...on-notes-2026-02-16-test-infrastructure.md | 132 +++++++ test/CMakeLists.txt | 4 +- 7 files changed, 699 insertions(+), 96 deletions(-) create mode 100644 cmake/run-tests-individually.cmake create mode 100644 session-notes-2026-02-16-test-infrastructure.md diff --git a/.cursor/rules/ml-cpp-build-system.mdc b/.cursor/rules/ml-cpp-build-system.mdc index f97b27823..1fdc310ce 100644 --- a/.cursor/rules/ml-cpp-build-system.mdc +++ b/.cursor/rules/ml-cpp-build-system.mdc @@ -1,69 +1,136 @@ --- -description: ml-cpp build system knowledge — CMake, Gradle, Docker, CI -globs: CMakeLists.txt, cmake/**, build.gradle, dev-tools/**, .buildkite/** +description: Build system, test infrastructure, and Docker environment for Elasticsearch ml-cpp +globs: ["CMakeLists.txt", "cmake/**", "**/*Test.cc", "**/*Test.h"] +alwaysApply: false --- -# ml-cpp Build System +# ml-cpp Build System & Test Infrastructure -## CMake Structure +## Project Structure -- Top-level `CMakeLists.txt` configures the project, includes `CTest`, detects compiler cache -- `cmake/functions.cmake` defines `ml_add_library`, `ml_add_test_executable`, `ml_add_test`, `ml_install` -- `cmake/variables.cmake` defines compiler flags per platform -- Toolchain files: `cmake/{darwin-aarch64,linux-x86_64,linux-aarch64,windows-x86_64}.cmake` -- `test/CMakeLists.txt` defines test targets: `ml_test`, `test_individually`, `build_tests`, `run_tests` +- **Source headers:** `include/` (e.g. `include/core/`, `include/model/`) +- **Libraries:** `lib/` with sub-libraries: `core`, `model`, `api`, `maths/common`, `maths/time_series`, `maths/analytics`, `ver`, `seccomp`, `test` +- **Applications:** `bin/autodetect`, `bin/normalize`, `bin/controller`, `bin/categorize`, `bin/data_frame_analyzer`, `bin/pytorch_inference` +- **Unit tests:** co-located at `lib//unittest/` and `bin//unittest/` +- **CMake modules:** `cmake/` -- toolchain files, functions, test runners +- **3rd party:** `3rd_party/` -- Eigen, RapidJSON headers -## Important: CTest and Target Names +## CMake Conventions -- `include(CTest)` reserves the `test` target name — custom targets must not use it -- Our monolithic test target is named `ml_test` (not `test`) -- `test_individually` runs tests via CTest with parallel execution +### Custom Functions (cmake/functions.cmake) -## Build Acceleration +- `ml_add_test_executable(_target ...)` -- creates test executable `ml_test_`, links with Boost.Test, creates `test_` and `test__individually` custom targets +- `ml_add_test(_directory _target)` -- registers a test suite; populates `ML_BUILD_TEST_DEPENDS`, `ML_TEST_DEPENDS`, `ML_TEST_INDIVIDUALLY_DEPENDS` +- `ml_codesign(_target)` -- macOS code signing -### Unity Builds (`-DCMAKE_UNITY_BUILD=ON`) -- Combines multiple source files into single translation units -- Effective on x86_64 (~41% faster), minimal on aarch64 -- Conflicts from anonymous-namespace symbols need `SKIP_UNITY_BUILD_INCLUSION` -- Disabled entirely for: `MlMathsTimeSeries`, `MlMathsAnalytics`, `MlApi`, `ml_test_maths_common`, `ml_test_api` +### Key CMake Targets -### Precompiled Headers (`-DML_PCH=ON`) -- Custom option, applied in `cmake/functions.cmake` via `target_precompile_headers()` -- STL headers + `` for test targets -- Do NOT include `` — conflicts with `boost/json.hpp` on GCC +- `build_tests` -- builds all test executables without running them +- `test_` -- build + run a single test suite (e.g. `test_core`) +- `test__individually` -- build + run a single suite with tests in separate processes +- `run_tests` -- runs all test suites sequentially +- `run_tests_individually` -- runs all test suites individually (parallel within each suite) +- `test` -- run all tests + check for failures +- `test_individually` -- run all tests individually + check for failures -### sccache (GCS Backend) -- `dev-tools/setup_sccache.sh` / `setup_sccache.ps1` — downloads, configures, starts -- GCS bucket: `gs://elastic-ml-cpp-sccache`, per-platform prefix (`linux-x86_64/`, etc.) -- Vault: `secret/ci/elastic-ml-cpp/sccache/gcs_service_account` -- Requires `-DCMAKE_CXX_COMPILER_LAUNCHER=sccache` — top-level CMakeLists.txt respects existing launcher +### Toolchain Files -### Compiler Launcher Precedence -If `CMAKE_CXX_COMPILER_LAUNCHER` is already set (e.g. sccache), the ccache auto-detection in `CMakeLists.txt` is skipped. +- `cmake/linux-x86_64.cmake` -- GCC 13.3 at `/usr/local/gcc133/` +- `cmake/linux-aarch64.cmake` -- GCC 13.3 at `/usr/local/gcc133/` +- `cmake/darwin-aarch64.cmake` -- Clang (Xcode) +- Auto-detected from `CMAKE_HOST_SYSTEM_NAME` and `uname -m` if not specified -### MSVC `/Z7` vs `/Zi` -- `/Zi`: Debug info via shared PDB (`mspdbsrv.exe`) — serializes parallel compilation -- `/Z7`: Debug info embedded in `.obj` — fully parallel, sccache-compatible -- We use `/Z7` for all Windows configurations; `/FS` flag removed as unnecessary +### Build Types -## Gradle Integration +- Default: `RelWithDebInfo` (optimized with debug symbols) +- `ML_DEBUG=1` environment variable switches to `Debug` +- Flags defined in `cmake/variables.cmake` -- `build.gradle` invokes CMake for macOS and Windows builds -- `task test` calls `cmake --build ... -t ml_test` -- `task check` depends on `test` -- `testParallel` formula: `numCpus <= 4 ? 2 : Math.ceil(numCpus / 2.0)` (Unix), `2` (Windows) -- Environment `CMAKE_FLAGS` are appended to Gradle's internal cmake flags (stripping duplicate toolchain) +## Test Framework -## Docker Builds (Linux) +### Boost.Test Conventions -- `dev-tools/docker/docker_entrypoint.sh` — main build/test script inside containers -- `dev-tools/docker_build.sh` / `docker_test.sh` — host orchestration -- Linux aarch64 builds run in Docker; x86_64 runs `docker_entrypoint.sh` directly -- cgroup-aware CPU detection: check `/sys/fs/cgroup/cpu.max` (cgroups v2) or `/sys/fs/cgroup/cpu/cpu.cfs_{quota,period}_us` (v1) -- `ZIP_COMPRESSION_LEVEL`: 1 for PR/debug builds, 9 for release branches +- Test suites named `CTest` (e.g. `CStateFileRemoverTest`) +- Test cases named `test` (e.g. `testDeleteOnDestruction`) +- Files named `CTest.cc` +- Must be added to the `SRCS` list in the unittest's `CMakeLists.txt` +- Copyright header required on all files -## Test Parallelism +### Running Tests -- Test parallelism formula: `numCpus <= 4 ? 2 : ceil(numCpus / 2)` -- CKMostCorrelatedTest/testScale is CPU-time-sensitive — keep parallelism conservative on low-core machines -- Each test suite internally uses `ctest --parallel ` for individual test case parallelism +```bash +# Single suite +cmake --build --target test_core + +# Single suite, tests in separate processes (parallel via CTest) +cmake --build --target test_core_individually + +# All suites +cmake --build --target test + +# All suites individually (parallel within each) +cmake --build --target test_individually + +# All suites individually with outer parallelism (optimal on 14-core: -j5) +cmake --build --target test_individually -j5 + +# Specific test case +TESTS=CStateFileRemoverTest cmake --build --target test_core + +# Direct executable invocation +/test/lib/core/unittest/ml_test_core --run_test=CStateFileRemoverTest +``` + +### Environment Variables for Test Runner + +- `BOOST_TEST_MAX_ARGS` -- tests per batch (default: 2) +- `BOOST_TEST_MAX_PROCS` -- max parallel processes (default: nproc) +- `BOOST_TEST_MIXED_MODE` -- if "true", batch tests run in one Boost process +- `BOOST_TEST_OUTPUT_FORMAT_FLAGS` -- JUnit output flags +- `TEST_FLAGS` -- additional flags passed to test executable +- `RUN_BOOST_TESTS_IN_FOREGROUND` -- run tests with output to terminal + +## Docker Build/Test Environments + +### Images + +| Image | Arch | Tag Pattern | +|---|---|---| +| `docker.elastic.co/ml-dev/ml-linux-build` | x86_64 | `:34` | +| `docker.elastic.co/ml-dev/ml-linux-aarch64-native-build` | aarch64 | `:17` | + +### Required Environment + +```bash +export CPP_SRC_HOME=/path/to/ml-cpp +export PATH="/usr/local/gcc133/bin:$PATH" +export LD_LIBRARY_PATH="/usr/local/gcc133/lib64:/usr/local/gcc133/lib" +``` + +### Docker Build Example + +```bash +docker run --rm \ + -v /path/to/ml-cpp:/ml-cpp \ + -e CPP_SRC_HOME=/ml-cpp \ + -e PATH="/usr/local/gcc133/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" \ + -e LD_LIBRARY_PATH="/usr/local/gcc133/lib64:/usr/local/gcc133/lib" \ + -w /ml-cpp \ + docker.elastic.co/ml-dev/ml-linux-aarch64-native-build:17 \ + bash -c "mkdir -p /tmp/build && cd /tmp/build && \ + cmake -DCMAKE_TOOLCHAIN_FILE=/ml-cpp/cmake/linux-aarch64.cmake /ml-cpp && \ + cmake --build . --target test_individually -j5" +``` + +### Build Performance (ARM Mac host) + +- **aarch64 image:** runs natively, full build ~7min at `-j4`, limited by Docker 8GB RAM +- **x86_64 image:** runs under QEMU emulation, full build 30+ min at `-j1`, OOM risk at `-j2+` + +## Dependencies + +- **Boost 1.86.0** -- found via CMake `find_package`, libraries at `/usr/local/gcc133/lib/` +- **PyTorch (libtorch)** -- headers at `/usr/local/gcc133/include/pytorch`, libs at `/usr/local/gcc133/lib/` +- **libxml2** -- at `/usr/local/gcc133/lib/` +- **Eigen** -- header-only at `3rd_party/eigen/` +- **RapidJSON** -- header-only at `3rd_party/include/` diff --git a/.cursor/rules/ml-cpp-coding-conventions.mdc b/.cursor/rules/ml-cpp-coding-conventions.mdc index 2a56a83ac..4f047bedb 100644 --- a/.cursor/rules/ml-cpp-coding-conventions.mdc +++ b/.cursor/rules/ml-cpp-coding-conventions.mdc @@ -1,54 +1,87 @@ --- -description: ml-cpp coding conventions and cross-platform considerations -globs: "**/*.cc", "**/*.h" +description: Coding conventions and patterns for Elasticsearch ml-cpp C++ code +globs: ["**/*.cc", "**/*.h", "include/**", "lib/**", "bin/**"] +alwaysApply: false --- # ml-cpp Coding Conventions ## Naming -- Classes: `CUpperCamelCase` (C prefix) -- Methods: `lowerCamelCase` -- Member variables: `m_UpperCamelCase` -- Static member variables: `ms_UpperCamelCase` -- Types: `TUpperCamelCase` (T prefix for typedefs) -- Test files: `CClassNameTest.cc` -- Namespaces: `ml::module::submodule` +- **Namespaces:** `ml::core`, `ml::model`, `ml::api`, `ml::maths::common`, `ml::maths::time_series`, `ml::maths::analytics` +- **Classes:** `C` prefix (e.g. `CStateFileRemover`, `CAnomalyDetector`) +- **Type aliases:** `T` prefix (e.g. `TDouble3Vec`, `TStrVec`) +- **Enums:** `E` prefix (e.g. `EFloatingPointErrorStatus`) +- **Member variables:** `m_` prefix (e.g. `m_DeleteStateFiles`, `m_QuantilesStateFile`) +- **Test files:** `CTest.cc` with `BOOST_AUTO_TEST_SUITE(CTest)` +- **Test cases:** `testSomethingDescriptive` -## Commit Messages +## C++ Standard -Format: `[ML] Short description` — 1-2 sentences explaining the "why". +- **C++20** (`CMAKE_CXX_STANDARD 20`) +- Smart pointers preferred (`std::unique_ptr`, `std::make_unique`) +- RAII pattern for resource management -## Boost Test Framework +## Header Guards -- Tests use `BOOST_AUTO_TEST_SUITE` / `BOOST_AUTO_TEST_CASE` -- `BOOST_TEST_DONT_PRINT_LOG_VALUE` for types without operator<< -- JUnit output: `boost_test_results.junit` per test suite -- Seeded RNG: `maths::common::CSampling::seed()` at test start +The codebase uses `#ifndef` / `#define` / `#endif` style header guards (not `#pragma once`). -## Cross-Platform Considerations +## RAII Pattern -### Stream/IO -- `std::istream::eof()` behaves differently across platforms -- Use `peek() == std::char_traits::eof()` for portable end-of-stream detection -- `CJsonStateRestoreTraverser::isEof()` uses both checks for portability +Resources that need cleanup on scope exit should use RAII wrappers. Example: `CStateFileRemover` deletes quantiles state files in its destructor. Use `std::unique_ptr` to manage lifetime; prefer `reset()` over `release()` to avoid memory leaks. -### Timing in Tests -- **Never use wall-clock time** (`CStopWatch`, `CLOCK_MONOTONIC`) for performance assertions in unit tests — flaky under parallel execution -- Use `std::clock()` (CPU time) for scaling/benchmark assertions -- `std::clock()` measures process CPU time on all platforms (POSIX + Windows) +## Logging -### Temporary Files in Tests -- Use process ID (`ml::core::CProcess::instance().id()`) for unique temp names -- Do NOT use small random ranges (e.g. `1-100`) — causes collisions under parallel CTest +Use the ML logging macros: +- `LOG_TRACE` / `LOG_DEBUG` / `LOG_INFO` / `LOG_WARN` / `LOG_ERROR` / `LOG_FATAL` +- Example: `LOG_WARN(<< "Failed to delete file '" << filename << "': " << strerror(errno));` +- Logger is configured via Boost.Log -### Unity Build Conflicts -- Anonymous-namespace constants (`EMPTY_STRING`, `*_TAG`) cause redefinition errors -- Fix: rename to be unique, or add file to `SKIP_UNITY_BUILD_INCLUSION` -- `BOOST_TEST_DONT_PRINT_LOG_VALUE` macros also conflict in unity builds +## Unit Tests -## RAII Patterns +### Structure -- `std::unique_ptr` with custom deleters for resource cleanup -- Use `reset()` not `release()` + manual cleanup — avoids leaks on exception paths -- `CStateFileRemover` is the RAII helper for state file deletion +```cpp +#include + +#include + +BOOST_AUTO_TEST_SUITE(CStateFileRemoverTest) + +BOOST_AUTO_TEST_CASE(testDeleteOnDestruction) { + // Test body +} + +BOOST_AUTO_TEST_SUITE_END() +``` + +### Conventions + +- Each test file tests one class +- Comment each test case explaining which production code path it mirrors +- Clean up test artifacts (temp files) in each test case +- Use `BOOST_TEST_REQUIRE` for preconditions, `BOOST_CHECK` / `BOOST_TEST` for assertions +- Tests discover via `--list_content` and run via `--run_test=Suite/Case` + +## License Header + +Every file must start with the Elastic License 2.0 header: + +``` +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the following additional limitation. ... + */ +``` + +## Include Order + +1. Primary header for the implementation file +2. Project headers (`#include `, `#include `) +3. Boost headers +4. Standard library headers + +## Warnings + +The build uses `-Wall -Wextra -Wconversion -Wold-style-cast` and many more. Third-party headers are included via `-isystem` to suppress their warnings. Eigen generates `-Wmaybe-uninitialized` and `-Warray-bounds` warnings at `-O3` with GCC 13.3 but these are not errors. diff --git a/CMakeLists.txt b/CMakeLists.txt index 9154df820..09ca5d546 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,8 @@ endif() project("ML") +include(CTest) + include(CheckPIESupported) check_pie_supported() diff --git a/cmake/functions.cmake b/cmake/functions.cmake index 3e7f5481e..e5f225cd6 100644 --- a/cmake/functions.cmake +++ b/cmake/functions.cmake @@ -365,7 +365,9 @@ function(ml_add_test_executable _target) target_link_libraries(ml_test_${_target} ${ML_LINK_LIBRARIES}) - add_test(ml_test_${_target} ml_test_${_target}) + add_test(NAME ml_test_${_target} COMMAND ml_test_${_target} + --logger=JUNIT,warning,${CMAKE_CURRENT_BINARY_DIR}/ml_test_${_target}_junit.xml + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) @@ -392,14 +394,18 @@ function(ml_add_test_executable _target) COMMENT "Running test: ml_test_${_target}" WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ) - - add_custom_target(test_${_target}_individually - DEPENDS ml_test_${_target} - COMMAND ${CMAKE_SOURCE_DIR}/run_tests_as_seperate_processes.sh ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR} test_${_target} - COMMENT "Running test: ml_test_${_target}_individually" - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - ) endif() + + add_custom_target(test_${_target}_individually + DEPENDS ml_test_${_target} + COMMAND ${CMAKE_COMMAND} + -DBINARY_DIR=${CMAKE_CURRENT_BINARY_DIR} + -DTEST_SUITE=test_${_target} + -DTEST_DIR=${CMAKE_CURRENT_SOURCE_DIR} + -P ${CMAKE_SOURCE_DIR}/cmake/run-tests-individually.cmake + COMMENT "Running test: ml_test_${_target}_individually" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) endfunction() function(ml_codesign _target) diff --git a/cmake/run-tests-individually.cmake b/cmake/run-tests-individually.cmake new file mode 100644 index 000000000..f6fd416a8 --- /dev/null +++ b/cmake/run-tests-individually.cmake @@ -0,0 +1,363 @@ +# +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License +# 2.0 and the following additional limitation. Functionality enabled by the +# files subject to the Elastic License 2.0 may only be used in production when +# invoked by an Elasticsearch process with a license key installed that permits +# use of machine learning features. You may not use this file except in +# compliance with the Elastic License 2.0 and the foregoing additional +# limitation. +# + +# run-tests-individually.cmake +# +# Portable replacement for run_tests_as_seperate_processes.sh. +# Discovers Boost.Test cases, generates a temporary CTest project, and +# runs them in parallel via ctest --parallel. Works on Linux, macOS and +# Windows using only CMake built-in functionality (no bash/sed/awk/xargs). +# +# Required -D parameters: +# BINARY_DIR - directory containing the test executable +# TEST_SUITE - test suite target name without "ml_" prefix, e.g. "test_api" +# TEST_DIR - source-side unittest directory (for working directory) +# +# Optional environment variables (same semantics as the shell script): +# BOOST_TEST_MAX_ARGS - max test cases per batch (default: 1) +# BOOST_TEST_MAX_PROCS - max parallel processes (default: logical CPU count) +# BOOST_TEST_MIXED_MODE - if "true", batch tests run in one Boost process +# using colon-separated --run_test= syntax +# BOOST_TEST_OUTPUT_FORMAT_FLAGS - passed through to the test executable +# TEST_FLAGS - additional flags passed to the test executable + +cmake_minimum_required(VERSION 3.19) + +# --------------------------------------------------------------------------- +# Validate required parameters +# --------------------------------------------------------------------------- +foreach(_var BINARY_DIR TEST_SUITE TEST_DIR) + if(NOT DEFINED ${_var} OR "${${_var}}" STREQUAL "") + message(FATAL_ERROR "${_var} must be defined. " + "Usage: cmake -DBINARY_DIR=... -DTEST_SUITE=... -DTEST_DIR=... " + "-P run-tests-individually.cmake") + endif() +endforeach() + +# --------------------------------------------------------------------------- +# Derive paths +# --------------------------------------------------------------------------- +set(TEST_EXECUTABLE "${BINARY_DIR}/ml_${TEST_SUITE}") +set(LOG_DIR "${BINARY_DIR}/test_logs") +set(CTEST_PROJECT_DIR "${BINARY_DIR}/_ctest_individual") + +# --------------------------------------------------------------------------- +# Parallelism and batching settings +# --------------------------------------------------------------------------- +set(MAX_ARGS 2) +if(DEFINED ENV{BOOST_TEST_MAX_ARGS} AND NOT "$ENV{BOOST_TEST_MAX_ARGS}" STREQUAL "") + set(MAX_ARGS "$ENV{BOOST_TEST_MAX_ARGS}") +endif() + +cmake_host_system_information(RESULT _num_cpus QUERY NUMBER_OF_LOGICAL_CORES) +set(MAX_PROCS ${_num_cpus}) +if(DEFINED ENV{BOOST_TEST_MAX_PROCS} AND NOT "$ENV{BOOST_TEST_MAX_PROCS}" STREQUAL "") + set(MAX_PROCS "$ENV{BOOST_TEST_MAX_PROCS}") +endif() + +set(MIXED_MODE FALSE) +if(DEFINED ENV{BOOST_TEST_MIXED_MODE} AND "$ENV{BOOST_TEST_MIXED_MODE}" STREQUAL "true") + set(MIXED_MODE TRUE) +endif() + +# Collect extra flags from the environment +set(EXTRA_TEST_FLAGS "") +if(DEFINED ENV{TEST_FLAGS} AND NOT "$ENV{TEST_FLAGS}" STREQUAL "") + string(REPLACE " " ";" EXTRA_TEST_FLAGS "$ENV{TEST_FLAGS}") +endif() + +set(BOOST_OUTPUT_FLAGS "") +if(DEFINED ENV{BOOST_TEST_OUTPUT_FORMAT_FLAGS} AND NOT "$ENV{BOOST_TEST_OUTPUT_FORMAT_FLAGS}" STREQUAL "") + set(BOOST_OUTPUT_FLAGS "$ENV{BOOST_TEST_OUTPUT_FORMAT_FLAGS}") +endif() + +# The seccomp test activates a sandbox that restricts system calls, so we must +# force Human Readable Format (HRF) logging instead of XML/JUNIT which may +# attempt I/O operations the sandbox does not permit. +set(IS_SECCOMP_TEST FALSE) +if(TEST_SUITE STREQUAL "test_seccomp") + set(IS_SECCOMP_TEST TRUE) +endif() + +# --------------------------------------------------------------------------- +# Prepare directories +# --------------------------------------------------------------------------- +file(REMOVE_RECURSE "${LOG_DIR}") +file(MAKE_DIRECTORY "${LOG_DIR}") +file(REMOVE_RECURSE "${CTEST_PROJECT_DIR}") +file(MAKE_DIRECTORY "${CTEST_PROJECT_DIR}") + +# --------------------------------------------------------------------------- +# Discover tests via --list_content +# --------------------------------------------------------------------------- +message(STATUS "Discovering tests from ${TEST_EXECUTABLE}...") +execute_process( + COMMAND "${TEST_EXECUTABLE}" --list_content + OUTPUT_VARIABLE _list_output + ERROR_VARIABLE _list_output + RESULT_VARIABLE _list_result +) + +if(NOT _list_result EQUAL 0 AND "${_list_output}" STREQUAL "") + message(FATAL_ERROR "Failed to discover tests from ${TEST_EXECUTABLE}") +endif() + +# Parse Suite/Case names from --list_content output. +# Boost.Test --list_content produces output like: +# CSomeTest* +# testSomething* +# testAnotherThing* +# CAnotherTest* +# testFoo* +set(ALL_TEST_NAMES "") +set(_current_suite "") +string(REPLACE "\n" ";" _lines "${_list_output}") +foreach(_line IN LISTS _lines) + string(STRIP "${_line}" _stripped) + if(_stripped MATCHES "^(C.*Test)\\*$") + set(_current_suite "${CMAKE_MATCH_1}") + elseif(_stripped MATCHES "^(test.*)\\*$" AND NOT "${_current_suite}" STREQUAL "") + list(APPEND ALL_TEST_NAMES "${_current_suite}/${CMAKE_MATCH_1}") + endif() +endforeach() + +list(LENGTH ALL_TEST_NAMES _num_tests) +if(_num_tests EQUAL 0) + message(FATAL_ERROR "No tests found to run or error in test discovery.") +endif() +message(STATUS "Discovered ${_num_tests} test(s)") + +# --------------------------------------------------------------------------- +# Group tests into batches of MAX_ARGS +# +# We use a pipe "|" delimiter within each batch because semicolons are +# CMake list separators and colons are Boost.Test --run_test separators. +# --------------------------------------------------------------------------- +set(_batches "") +set(_batch_idx 0) +set(_count 0) +set(_current_batch "") + +foreach(_test IN LISTS ALL_TEST_NAMES) + if(_count GREATER_EQUAL ${MAX_ARGS}) + list(APPEND _batches "${_current_batch}") + set(_current_batch "") + set(_count 0) + math(EXPR _batch_idx "${_batch_idx} + 1") + endif() + if("${_current_batch}" STREQUAL "") + set(_current_batch "${_test}") + else() + set(_current_batch "${_current_batch}|${_test}") + endif() + math(EXPR _count "${_count} + 1") +endforeach() +if(NOT "${_current_batch}" STREQUAL "") + list(APPEND _batches "${_current_batch}") +endif() + +list(LENGTH _batches _num_batches) +message(STATUS "Running ${_num_tests} test(s) in ${_num_batches} batch(es), " + "max ${MAX_PROCS} parallel process(es)") + +# --------------------------------------------------------------------------- +# Generate a per-batch runner script invoked by each CTest test entry +# --------------------------------------------------------------------------- +set(_batch_runner "${CTEST_PROJECT_DIR}/_run_one_batch.cmake") +file(WRITE "${_batch_runner}" [=[ +# Per-batch runner script. +# Invoked by CTest with -D parameters for each batch. +cmake_minimum_required(VERSION 3.19) + +# Build the command line +set(_cmd "${TEST_EXECUTABLE}" "--run_test=${RUN_TEST_ARG}" --no_color_output) + +# The seccomp test activates a sandbox that restricts system calls. +# Force HRF logging to avoid I/O operations the sandbox does not permit. +if(IS_SECCOMP_TEST) + list(APPEND _cmd --logger=HRF,all --report_format=HRF --show_progress=no) +endif() + +# Append extra test flags if provided +if(NOT "${EXTRA_TEST_FLAGS}" STREQUAL "") + string(REPLACE ";" " " _flags_str "${EXTRA_TEST_FLAGS}") + string(REPLACE " " ";" _flags_list "${_flags_str}") + list(APPEND _cmd ${_flags_list}) +endif() + +# Append Boost output format flags if provided (skip for seccomp as it uses HRF) +if(NOT "${BOOST_OUTPUT_FLAGS}" STREQUAL "" AND NOT IS_SECCOMP_TEST) + # Substitute the test name into the output format flags so each + # batch writes to its own results file + string(REGEX REPLACE "[^a-zA-Z0-9_]" "_" _safe_name "${RUN_TEST_ARG}") + string(REPLACE "boost_test_results" "boost_test_results_${_safe_name}" _output_flags "${BOOST_OUTPUT_FLAGS}") + string(REPLACE " " ";" _output_flags_list "${_output_flags}") + list(APPEND _cmd ${_output_flags_list}) +endif() + +execute_process( + COMMAND ${_cmd} + OUTPUT_FILE "${LOG_FILE}" + ERROR_FILE "${LOG_FILE}" + RESULT_VARIABLE _result + WORKING_DIRECTORY "${WORKING_DIR}" +) + +if(NOT _result EQUAL 0) + file(READ "${LOG_FILE}" _log_content) + message("${_log_content}") + message(FATAL_ERROR "Test(s) '${RUN_TEST_ARG}' FAILED with exit code ${_result}") +endif() +]=]) + +# --------------------------------------------------------------------------- +# Generate CTestTestfile.cmake with one add_test() per batch +# --------------------------------------------------------------------------- +set(_ctest_file "${CTEST_PROJECT_DIR}/CTestTestfile.cmake") +file(WRITE "${_ctest_file}" "# Auto-generated by run-tests-individually.cmake\n\n") + +set(_idx 0) +foreach(_batch IN LISTS _batches) + # In mixed mode or multi-test batches, join with ":" for Boost.Test + if(MIXED_MODE) + string(REPLACE "|" ":" _run_test_arg "${_batch}") + else() + # With MAX_ARGS=1, _batch is just a single test name. + # With MAX_ARGS>1 and not mixed mode, each test in the batch + # still needs to run individually. However, CTest gives us + # per-entry parallelism, so for simplicity (matching the shell + # script behaviour) we join with ":" and run in one Boost process. + string(REPLACE "|" ":" _run_test_arg "${_batch}") + endif() + + # Safe log filename + string(REGEX REPLACE "[^a-zA-Z0-9_]" "_" _safe_name "${_run_test_arg}") + string(SUBSTRING "${_safe_name}" 0 100 _safe_name) + set(_log_file "${LOG_DIR}/${_safe_name}.log") + + # Use the test name as the CTest test name for readable output + set(_test_label "${_run_test_arg}") + + # Escape semicolons in EXTRA_TEST_FLAGS for -D passing + string(REPLACE ";" "\\;" _escaped_flags "${EXTRA_TEST_FLAGS}") + + file(APPEND "${_ctest_file}" + "add_test(\"${_test_label}\" \"${CMAKE_COMMAND}\"" + " \"-DRUN_TEST_ARG=${_run_test_arg}\"" + " \"-DTEST_EXECUTABLE=${TEST_EXECUTABLE}\"" + " \"-DLOG_FILE=${_log_file}\"" + " \"-DWORKING_DIR=${TEST_DIR}\"" + " \"-DEXTRA_TEST_FLAGS=${_escaped_flags}\"" + " \"-DBOOST_OUTPUT_FLAGS=${BOOST_OUTPUT_FLAGS}\"" + " \"-DIS_SECCOMP_TEST=${IS_SECCOMP_TEST}\"" + " -P \"${_batch_runner}\")\n" + "set_tests_properties(\"${_test_label}\" PROPERTIES WORKING_DIRECTORY \"${TEST_DIR}\")\n\n" + ) + + math(EXPR _idx "${_idx} + 1") +endforeach() + +message(STATUS "Generated CTest project with ${_num_batches} test(s)") +message(STATUS "Running with ctest --parallel ${MAX_PROCS}...") +message(STATUS "--------------------------------------------------") + +# --------------------------------------------------------------------------- +# Run ctest --parallel for true concurrent execution +# --------------------------------------------------------------------------- +execute_process( + COMMAND "${CMAKE_CTEST_COMMAND}" + --test-dir "${CTEST_PROJECT_DIR}" + --parallel ${MAX_PROCS} + --output-on-failure + --no-label-summary + --progress + RESULT_VARIABLE _ctest_result + WORKING_DIRECTORY "${TEST_DIR}" +) + +message(STATUS "--------------------------------------------------") + +if(NOT _ctest_result EQUAL 0) + message(STATUS "${TEST_SUITE}: Some individual tests FAILED. Check logs in '${LOG_DIR}'.") +else() + message(STATUS "${TEST_SUITE}: All individual tests PASSED.") +endif() + +# --------------------------------------------------------------------------- +# Clean up temporary CTest project +# --------------------------------------------------------------------------- +file(REMOVE_RECURSE "${CTEST_PROJECT_DIR}") + +# --------------------------------------------------------------------------- +# Merge JUnit results if requested +# --------------------------------------------------------------------------- +if(NOT "${BOOST_OUTPUT_FLAGS}" STREQUAL "") + string(FIND "${BOOST_OUTPUT_FLAGS}" "junit" _junit_pos) + if(NOT _junit_pos EQUAL -1) + file(GLOB _junit_files "${TEST_DIR}/boost_test_results_C*.junit") + list(LENGTH _junit_files _num_junit) + if(_num_junit GREATER 0) + message(STATUS "Merging ${_num_junit} JUnit result file(s)...") + + set(_total_tests 0) + set(_total_errors 0) + set(_total_failures 0) + set(_suite_name "") + set(_suite_id "") + set(_all_testcases "") + + foreach(_jf IN LISTS _junit_files) + file(READ "${_jf}" _jc) + + if(_jc MATCHES "tests=\"([0-9]+)\"") + math(EXPR _total_tests "${_total_tests} + ${CMAKE_MATCH_1}") + endif() + if(_jc MATCHES "errors=\"([0-9]+)\"") + math(EXPR _total_errors "${_total_errors} + ${CMAKE_MATCH_1}") + endif() + if(_jc MATCHES "failures=\"([0-9]+)\"") + math(EXPR _total_failures "${_total_failures} + ${CMAKE_MATCH_1}") + endif() + if("${_suite_name}" STREQUAL "" AND _jc MATCHES "name=\"([a-zA-Z.]+)\"") + set(_suite_name "${CMAKE_MATCH_1}") + endif() + if("${_suite_id}" STREQUAL "" AND _jc MATCHES "id=\"([0-9]+)\"") + set(_suite_id "${CMAKE_MATCH_1}") + endif() + + # Extract non-skipped testcase elements + string(REGEX MATCHALL "]*" _cases "${_jc}") + foreach(_case IN LISTS _cases) + string(FIND "${_case}" "skipped" _skip_pos) + if(_skip_pos EQUAL -1) + string(APPEND _all_testcases "${_case}\n") + endif() + endforeach() + endforeach() + + set(_merged_file "${TEST_DIR}/boost_test_results.junit") + file(WRITE "${_merged_file}" + "\n" + "\n" + "${_all_testcases}" + "\n" + ) + message(STATUS "Merged JUnit results written to ${_merged_file}") + endif() + endif() +endif() + +# Propagate failure to the calling build system +if(NOT _ctest_result EQUAL 0) + message(FATAL_ERROR "Test failures detected") +endif() diff --git a/session-notes-2026-02-16-test-infrastructure.md b/session-notes-2026-02-16-test-infrastructure.md new file mode 100644 index 000000000..b191b1c4f --- /dev/null +++ b/session-notes-2026-02-16-test-infrastructure.md @@ -0,0 +1,132 @@ +# Session Notes: Test Infrastructure & CStateFileRemover Bug Fix + +**Date:** 16-17 February 2026 +**Branch:** `quantile_state_deleter_bug_fix` + +## Executive Summary + +This session delivered two improvements to the ml-cpp codebase: + +1. **Bug fix:** Eliminated a memory leak and redundant file deletion in `autodetect` and `normalize` applications. The `CStateFileRemover` RAII helper's `unique_ptr` was being `release()`'d on the happy path, leaking memory and duplicating the manual `std::remove()` logic that the destructor already handles. The fix removes the `release()` + manual deletion block, letting the `unique_ptr` destructor handle cleanup in all paths. + +2. **Portable test runner:** Replaced the bash/sed/awk-based `run_tests_as_seperate_processes.sh` with a pure-CMake script (`cmake/run-tests-individually.cmake`) that uses CTest for parallel execution. This is **2-5x faster** than the shell script and works on all platforms without Unix tool dependencies. + +Both changes were validated on **linux-x86_64** (`ml-linux-build:34`) and **linux-aarch64** (`ml-linux-aarch64-native-build:17`) Docker containers, in addition to the local macOS development environment. + +--- + +## 1. CStateFileRemover Bug Fix + +### Problem + +In both `bin/autodetect/Main.cc` and `bin/normalize/Main.cc`, the happy path called: + +```cpp +removeQuantilesStateOnFailure.release(); // leaks the CStateFileRemover +if (deleteStateFiles) { + std::remove(quantilesStateFile.c_str()); // duplicates destructor logic +} +``` + +`unique_ptr::release()` relinquishes ownership without calling the destructor, causing a memory leak. The manual `std::remove()` duplicated the logic already in `CStateFileRemover::~CStateFileRemover()`. + +### Fix + +Removed the `release()` + manual deletion block entirely. The `unique_ptr` destructor now handles file cleanup on both success and failure paths. Also removed the unused `#include `. + +### Files Changed (committed) + +- `bin/autodetect/Main.cc` -- removed 10 lines of redundant cleanup +- `bin/normalize/Main.cc` -- same change +- `include/core/CStateFileRemover.h` -- updated class comment +- `lib/core/unittest/CStateFileRemoverTest.cc` -- **new**, 6 Boost unit tests +- `lib/core/unittest/CMakeLists.txt` -- added `CStateFileRemoverTest.cc` + +### Commits + +- `3f00438dd6` -- Add CStateFileRemover tests and fix happy-path memory leak +- `dfd4718667` -- Remove unused disarm() from CStateFileRemover + +--- + +## 2. Portable CMake Test Runner + +### Problem + +`run_tests_as_seperate_processes.sh` relies on bash, sed, awk, grep, and xargs. It invokes `cmake --build` for every test batch, which re-checks the build system each time -- adding significant per-batch overhead. + +### Solution + +`cmake/run-tests-individually.cmake` -- a pure-CMake script that: + +1. Discovers tests via `--list_content` +2. Batches them (default `MAX_ARGS=2`) +3. Generates a temporary CTest project +4. Runs batches in parallel via `ctest --parallel` +5. Optionally merges JUnit XML results + +The `test_*_individually` target in `cmake/functions.cmake` was updated to invoke this script and moved outside the `if(isMultiConfig)` block so it's available for all generators. + +### Files Changed (uncommitted) + +- `cmake/functions.cmake` -- rewired `test_*_individually` targets +- `cmake/run-tests-individually.cmake` -- **new**, the portable test runner + +### Performance Results + +#### `test_individually` target (all 10 test suites) + +| Approach | aarch64 (native) | x86_64 (emulated) | +|---|---|---| +| Shell script, j=1 | 277s | 1029s | +| CMake script, j=1 | 132s (2.1x faster) | 204s (5.0x faster) | +| CMake script, j=5 | 71s (3.9x faster) | not tested | + +#### `-j N` parallelism (aarch64, 14 CPUs, 8GB RAM) + +| j | Median (s) | Reliable | +|---|---|---| +| 1 | 132 | 3/3 | +| 2 | 80 | 1/3 | +| 3 | 137 | 1/3 | +| 4 | -- | 0/2 | +| 5 | **71** | **3/3** | +| 10 | 71 | 1/3 | + +**Optimal: `-j 5`** (100% reliable, 1.9x over j=1). + +Sporadic failures at j=2,3,4,10 are caused by `CStateFileRemoverTest` batches sharing the same temp file when run concurrently -- a pre-existing test isolation issue, not introduced by this change. + +### Known Limitation + +The seccomp test (`CSystemCallFilterTest`) fails under the CMake script because `test-runner.cmake` passes special flags (`--logger=HRF,all --report_format=HRF --show_progress=no`) for seccomp that `run-tests-individually.cmake` does not yet replicate. + +--- + +## 3. Docker Testing + +### Images Used + +| Image | Architecture | OS | Compiler | +|---|---|---|---| +| `docker.elastic.co/ml-dev/ml-linux-build:34` | x86_64 | Rocky Linux 8.10 | GCC 13.3 | +| `docker.elastic.co/ml-dev/ml-linux-aarch64-native-build:17` | aarch64 | Rocky Linux 8.10 | GCC 13.3 | + +### Environment Requirements + +Both images require `LD_LIBRARY_PATH=/usr/local/gcc133/lib64:/usr/local/gcc133/lib` for test execution. This applies to all test targets (not specific to our changes). CMake and other build tools live at `/usr/local/gcc133/bin/`. + +### Build Notes + +- aarch64 image runs natively on ARM Mac -- builds complete in ~6-7 minutes with `-j4` +- x86_64 image runs under Rosetta/QEMU emulation on ARM Mac -- builds take 30+ minutes with `-j1`; `-j2` or higher risks OOM kills (Docker default 8GB RAM) + +--- + +## 4. Remaining Work + +- [ ] Commit `cmake/functions.cmake` and `cmake/run-tests-individually.cmake` +- [ ] Consider adding seccomp-specific flag handling to `run-tests-individually.cmake` +- [ ] Consider fixing `CStateFileRemoverTest` to use unique temp filenames per test case (would make `-j N` more reliable for all N) +- [ ] Consider deleting `run_tests_as_seperate_processes.sh` once the CMake replacement is proven in CI +- [ ] Revert the stray `include(CTest)` in top-level `CMakeLists.txt` if not already done diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3dba76157..5e571c729 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -35,7 +35,7 @@ add_custom_target(run_tests_individually DEPENDS clean_test_results ${ML_TEST_INDIVIDUALLY_DEPENDS} WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} ) -add_custom_target(test +add_custom_target(ml_test DEPENDS run_tests COMMAND ${CMAKE_COMMAND} -DTEST_DIR=${CMAKE_BINARY_DIR} -P ${CMAKE_SOURCE_DIR}/cmake/test-check-success.cmake WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} @@ -65,4 +65,4 @@ add_custom_target(test_all_parallel ${_build_type_arg} -P ${CMAKE_SOURCE_DIR}/cmake/run-all-tests-parallel.cmake WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} -) \ No newline at end of file +) From 067bdae1a69c113303b8eebb4754002d55cea217 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Wed, 18 Feb 2026 11:31:19 +1300 Subject: [PATCH 02/19] [ML] Fix test_individually target for multi-config generators (Windows) Copy test executable from config subdirectory before running the individually test runner, matching the existing test_ target behaviour. Also handle .exe extension on Windows in the CMake runner script. Co-authored-by: Cursor --- cmake/functions.cmake | 38 ++++++++++++++++++++++-------- cmake/run-tests-individually.cmake | 23 ++++++++++++++++++ 2 files changed, 51 insertions(+), 10 deletions(-) diff --git a/cmake/functions.cmake b/cmake/functions.cmake index e5f225cd6..c4f1d985c 100644 --- a/cmake/functions.cmake +++ b/cmake/functions.cmake @@ -396,16 +396,34 @@ function(ml_add_test_executable _target) ) endif() - add_custom_target(test_${_target}_individually - DEPENDS ml_test_${_target} - COMMAND ${CMAKE_COMMAND} - -DBINARY_DIR=${CMAKE_CURRENT_BINARY_DIR} - -DTEST_SUITE=test_${_target} - -DTEST_DIR=${CMAKE_CURRENT_SOURCE_DIR} - -P ${CMAKE_SOURCE_DIR}/cmake/run-tests-individually.cmake - COMMENT "Running test: ml_test_${_target}_individually" - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - ) + if(isMultiConfig) + add_custom_target(test_${_target}_individually + DEPENDS ml_test_${_target} + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_BINARY_DIR}/$/ml_test_${_target}${CMAKE_EXECUTABLE_SUFFIX} + ${CMAKE_CURRENT_BINARY_DIR}/ml_test_${_target}${CMAKE_EXECUTABLE_SUFFIX} + COMMAND ${CMAKE_COMMAND} + -DBINARY_DIR=${CMAKE_CURRENT_BINARY_DIR} + -DTEST_SUITE=test_${_target} + -DTEST_DIR=${CMAKE_CURRENT_SOURCE_DIR} + -DSOURCE_DIR=${CMAKE_SOURCE_DIR} + -P ${CMAKE_SOURCE_DIR}/cmake/run-tests-individually.cmake + COMMENT "Running test: ml_test_${_target}_individually" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + else() + add_custom_target(test_${_target}_individually + DEPENDS ml_test_${_target} + COMMAND ${CMAKE_COMMAND} + -DBINARY_DIR=${CMAKE_CURRENT_BINARY_DIR} + -DTEST_SUITE=test_${_target} + -DTEST_DIR=${CMAKE_CURRENT_SOURCE_DIR} + -DSOURCE_DIR=${CMAKE_SOURCE_DIR} + -P ${CMAKE_SOURCE_DIR}/cmake/run-tests-individually.cmake + COMMENT "Running test: ml_test_${_target}_individually" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + endif() endfunction() function(ml_codesign _target) diff --git a/cmake/run-tests-individually.cmake b/cmake/run-tests-individually.cmake index f6fd416a8..68936a5ef 100644 --- a/cmake/run-tests-individually.cmake +++ b/cmake/run-tests-individually.cmake @@ -46,9 +46,32 @@ endforeach() # Derive paths # --------------------------------------------------------------------------- set(TEST_EXECUTABLE "${BINARY_DIR}/ml_${TEST_SUITE}") +if(CMAKE_HOST_WIN32 AND NOT TEST_EXECUTABLE MATCHES "\\.exe$") + set(TEST_EXECUTABLE "${TEST_EXECUTABLE}.exe") +endif() set(LOG_DIR "${BINARY_DIR}/test_logs") set(CTEST_PROJECT_DIR "${BINARY_DIR}/_ctest_individual") +# --------------------------------------------------------------------------- +# On Windows, ensure DLLs are discoverable by prepending the distribution +# bin directory to PATH. This mirrors what set_env.ps1 does. +# --------------------------------------------------------------------------- +if(CMAKE_HOST_WIN32 AND DEFINED SOURCE_DIR) + set(_dist_bin "${SOURCE_DIR}/build/distribution/platform/windows-x86_64/bin") + if(IS_DIRECTORY "${_dist_bin}") + set(ENV{PATH} "${_dist_bin};$ENV{PATH}") + message(STATUS "Prepended ${_dist_bin} to PATH") + endif() +endif() + +# --------------------------------------------------------------------------- +# Set CPP_SRC_HOME so that CResourceLocator can find resource files +# (dictionaries, timezone DB, etc.) when running from the build tree. +# --------------------------------------------------------------------------- +if(DEFINED SOURCE_DIR AND NOT DEFINED ENV{CPP_SRC_HOME}) + set(ENV{CPP_SRC_HOME} "${SOURCE_DIR}") +endif() + # --------------------------------------------------------------------------- # Parallelism and batching settings # --------------------------------------------------------------------------- From 42ecc002fd22b500b3fdbd4dcde5b5959f4cdd38 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Wed, 18 Feb 2026 15:35:24 +1300 Subject: [PATCH 03/19] Remove session notes and IDE config files from PR Co-authored-by: Cursor --- .cursor/rules/ml-cpp-build-system.mdc | 136 ------------------ .cursor/rules/ml-cpp-coding-conventions.mdc | 87 ----------- ...on-notes-2026-02-16-test-infrastructure.md | 132 ----------------- 3 files changed, 355 deletions(-) delete mode 100644 .cursor/rules/ml-cpp-build-system.mdc delete mode 100644 .cursor/rules/ml-cpp-coding-conventions.mdc delete mode 100644 session-notes-2026-02-16-test-infrastructure.md diff --git a/.cursor/rules/ml-cpp-build-system.mdc b/.cursor/rules/ml-cpp-build-system.mdc deleted file mode 100644 index 1fdc310ce..000000000 --- a/.cursor/rules/ml-cpp-build-system.mdc +++ /dev/null @@ -1,136 +0,0 @@ ---- -description: Build system, test infrastructure, and Docker environment for Elasticsearch ml-cpp -globs: ["CMakeLists.txt", "cmake/**", "**/*Test.cc", "**/*Test.h"] -alwaysApply: false ---- - -# ml-cpp Build System & Test Infrastructure - -## Project Structure - -- **Source headers:** `include/` (e.g. `include/core/`, `include/model/`) -- **Libraries:** `lib/` with sub-libraries: `core`, `model`, `api`, `maths/common`, `maths/time_series`, `maths/analytics`, `ver`, `seccomp`, `test` -- **Applications:** `bin/autodetect`, `bin/normalize`, `bin/controller`, `bin/categorize`, `bin/data_frame_analyzer`, `bin/pytorch_inference` -- **Unit tests:** co-located at `lib//unittest/` and `bin//unittest/` -- **CMake modules:** `cmake/` -- toolchain files, functions, test runners -- **3rd party:** `3rd_party/` -- Eigen, RapidJSON headers - -## CMake Conventions - -### Custom Functions (cmake/functions.cmake) - -- `ml_add_test_executable(_target ...)` -- creates test executable `ml_test_`, links with Boost.Test, creates `test_` and `test__individually` custom targets -- `ml_add_test(_directory _target)` -- registers a test suite; populates `ML_BUILD_TEST_DEPENDS`, `ML_TEST_DEPENDS`, `ML_TEST_INDIVIDUALLY_DEPENDS` -- `ml_codesign(_target)` -- macOS code signing - -### Key CMake Targets - -- `build_tests` -- builds all test executables without running them -- `test_` -- build + run a single test suite (e.g. `test_core`) -- `test__individually` -- build + run a single suite with tests in separate processes -- `run_tests` -- runs all test suites sequentially -- `run_tests_individually` -- runs all test suites individually (parallel within each suite) -- `test` -- run all tests + check for failures -- `test_individually` -- run all tests individually + check for failures - -### Toolchain Files - -- `cmake/linux-x86_64.cmake` -- GCC 13.3 at `/usr/local/gcc133/` -- `cmake/linux-aarch64.cmake` -- GCC 13.3 at `/usr/local/gcc133/` -- `cmake/darwin-aarch64.cmake` -- Clang (Xcode) -- Auto-detected from `CMAKE_HOST_SYSTEM_NAME` and `uname -m` if not specified - -### Build Types - -- Default: `RelWithDebInfo` (optimized with debug symbols) -- `ML_DEBUG=1` environment variable switches to `Debug` -- Flags defined in `cmake/variables.cmake` - -## Test Framework - -### Boost.Test Conventions - -- Test suites named `CTest` (e.g. `CStateFileRemoverTest`) -- Test cases named `test` (e.g. `testDeleteOnDestruction`) -- Files named `CTest.cc` -- Must be added to the `SRCS` list in the unittest's `CMakeLists.txt` -- Copyright header required on all files - -### Running Tests - -```bash -# Single suite -cmake --build --target test_core - -# Single suite, tests in separate processes (parallel via CTest) -cmake --build --target test_core_individually - -# All suites -cmake --build --target test - -# All suites individually (parallel within each) -cmake --build --target test_individually - -# All suites individually with outer parallelism (optimal on 14-core: -j5) -cmake --build --target test_individually -j5 - -# Specific test case -TESTS=CStateFileRemoverTest cmake --build --target test_core - -# Direct executable invocation -/test/lib/core/unittest/ml_test_core --run_test=CStateFileRemoverTest -``` - -### Environment Variables for Test Runner - -- `BOOST_TEST_MAX_ARGS` -- tests per batch (default: 2) -- `BOOST_TEST_MAX_PROCS` -- max parallel processes (default: nproc) -- `BOOST_TEST_MIXED_MODE` -- if "true", batch tests run in one Boost process -- `BOOST_TEST_OUTPUT_FORMAT_FLAGS` -- JUnit output flags -- `TEST_FLAGS` -- additional flags passed to test executable -- `RUN_BOOST_TESTS_IN_FOREGROUND` -- run tests with output to terminal - -## Docker Build/Test Environments - -### Images - -| Image | Arch | Tag Pattern | -|---|---|---| -| `docker.elastic.co/ml-dev/ml-linux-build` | x86_64 | `:34` | -| `docker.elastic.co/ml-dev/ml-linux-aarch64-native-build` | aarch64 | `:17` | - -### Required Environment - -```bash -export CPP_SRC_HOME=/path/to/ml-cpp -export PATH="/usr/local/gcc133/bin:$PATH" -export LD_LIBRARY_PATH="/usr/local/gcc133/lib64:/usr/local/gcc133/lib" -``` - -### Docker Build Example - -```bash -docker run --rm \ - -v /path/to/ml-cpp:/ml-cpp \ - -e CPP_SRC_HOME=/ml-cpp \ - -e PATH="/usr/local/gcc133/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" \ - -e LD_LIBRARY_PATH="/usr/local/gcc133/lib64:/usr/local/gcc133/lib" \ - -w /ml-cpp \ - docker.elastic.co/ml-dev/ml-linux-aarch64-native-build:17 \ - bash -c "mkdir -p /tmp/build && cd /tmp/build && \ - cmake -DCMAKE_TOOLCHAIN_FILE=/ml-cpp/cmake/linux-aarch64.cmake /ml-cpp && \ - cmake --build . --target test_individually -j5" -``` - -### Build Performance (ARM Mac host) - -- **aarch64 image:** runs natively, full build ~7min at `-j4`, limited by Docker 8GB RAM -- **x86_64 image:** runs under QEMU emulation, full build 30+ min at `-j1`, OOM risk at `-j2+` - -## Dependencies - -- **Boost 1.86.0** -- found via CMake `find_package`, libraries at `/usr/local/gcc133/lib/` -- **PyTorch (libtorch)** -- headers at `/usr/local/gcc133/include/pytorch`, libs at `/usr/local/gcc133/lib/` -- **libxml2** -- at `/usr/local/gcc133/lib/` -- **Eigen** -- header-only at `3rd_party/eigen/` -- **RapidJSON** -- header-only at `3rd_party/include/` diff --git a/.cursor/rules/ml-cpp-coding-conventions.mdc b/.cursor/rules/ml-cpp-coding-conventions.mdc deleted file mode 100644 index 4f047bedb..000000000 --- a/.cursor/rules/ml-cpp-coding-conventions.mdc +++ /dev/null @@ -1,87 +0,0 @@ ---- -description: Coding conventions and patterns for Elasticsearch ml-cpp C++ code -globs: ["**/*.cc", "**/*.h", "include/**", "lib/**", "bin/**"] -alwaysApply: false ---- - -# ml-cpp Coding Conventions - -## Naming - -- **Namespaces:** `ml::core`, `ml::model`, `ml::api`, `ml::maths::common`, `ml::maths::time_series`, `ml::maths::analytics` -- **Classes:** `C` prefix (e.g. `CStateFileRemover`, `CAnomalyDetector`) -- **Type aliases:** `T` prefix (e.g. `TDouble3Vec`, `TStrVec`) -- **Enums:** `E` prefix (e.g. `EFloatingPointErrorStatus`) -- **Member variables:** `m_` prefix (e.g. `m_DeleteStateFiles`, `m_QuantilesStateFile`) -- **Test files:** `CTest.cc` with `BOOST_AUTO_TEST_SUITE(CTest)` -- **Test cases:** `testSomethingDescriptive` - -## C++ Standard - -- **C++20** (`CMAKE_CXX_STANDARD 20`) -- Smart pointers preferred (`std::unique_ptr`, `std::make_unique`) -- RAII pattern for resource management - -## Header Guards - -The codebase uses `#ifndef` / `#define` / `#endif` style header guards (not `#pragma once`). - -## RAII Pattern - -Resources that need cleanup on scope exit should use RAII wrappers. Example: `CStateFileRemover` deletes quantiles state files in its destructor. Use `std::unique_ptr` to manage lifetime; prefer `reset()` over `release()` to avoid memory leaks. - -## Logging - -Use the ML logging macros: -- `LOG_TRACE` / `LOG_DEBUG` / `LOG_INFO` / `LOG_WARN` / `LOG_ERROR` / `LOG_FATAL` -- Example: `LOG_WARN(<< "Failed to delete file '" << filename << "': " << strerror(errno));` -- Logger is configured via Boost.Log - -## Unit Tests - -### Structure - -```cpp -#include - -#include - -BOOST_AUTO_TEST_SUITE(CStateFileRemoverTest) - -BOOST_AUTO_TEST_CASE(testDeleteOnDestruction) { - // Test body -} - -BOOST_AUTO_TEST_SUITE_END() -``` - -### Conventions - -- Each test file tests one class -- Comment each test case explaining which production code path it mirrors -- Clean up test artifacts (temp files) in each test case -- Use `BOOST_TEST_REQUIRE` for preconditions, `BOOST_CHECK` / `BOOST_TEST` for assertions -- Tests discover via `--list_content` and run via `--run_test=Suite/Case` - -## License Header - -Every file must start with the Elastic License 2.0 header: - -``` -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0 and the following additional limitation. ... - */ -``` - -## Include Order - -1. Primary header for the implementation file -2. Project headers (`#include `, `#include `) -3. Boost headers -4. Standard library headers - -## Warnings - -The build uses `-Wall -Wextra -Wconversion -Wold-style-cast` and many more. Third-party headers are included via `-isystem` to suppress their warnings. Eigen generates `-Wmaybe-uninitialized` and `-Warray-bounds` warnings at `-O3` with GCC 13.3 but these are not errors. diff --git a/session-notes-2026-02-16-test-infrastructure.md b/session-notes-2026-02-16-test-infrastructure.md deleted file mode 100644 index b191b1c4f..000000000 --- a/session-notes-2026-02-16-test-infrastructure.md +++ /dev/null @@ -1,132 +0,0 @@ -# Session Notes: Test Infrastructure & CStateFileRemover Bug Fix - -**Date:** 16-17 February 2026 -**Branch:** `quantile_state_deleter_bug_fix` - -## Executive Summary - -This session delivered two improvements to the ml-cpp codebase: - -1. **Bug fix:** Eliminated a memory leak and redundant file deletion in `autodetect` and `normalize` applications. The `CStateFileRemover` RAII helper's `unique_ptr` was being `release()`'d on the happy path, leaking memory and duplicating the manual `std::remove()` logic that the destructor already handles. The fix removes the `release()` + manual deletion block, letting the `unique_ptr` destructor handle cleanup in all paths. - -2. **Portable test runner:** Replaced the bash/sed/awk-based `run_tests_as_seperate_processes.sh` with a pure-CMake script (`cmake/run-tests-individually.cmake`) that uses CTest for parallel execution. This is **2-5x faster** than the shell script and works on all platforms without Unix tool dependencies. - -Both changes were validated on **linux-x86_64** (`ml-linux-build:34`) and **linux-aarch64** (`ml-linux-aarch64-native-build:17`) Docker containers, in addition to the local macOS development environment. - ---- - -## 1. CStateFileRemover Bug Fix - -### Problem - -In both `bin/autodetect/Main.cc` and `bin/normalize/Main.cc`, the happy path called: - -```cpp -removeQuantilesStateOnFailure.release(); // leaks the CStateFileRemover -if (deleteStateFiles) { - std::remove(quantilesStateFile.c_str()); // duplicates destructor logic -} -``` - -`unique_ptr::release()` relinquishes ownership without calling the destructor, causing a memory leak. The manual `std::remove()` duplicated the logic already in `CStateFileRemover::~CStateFileRemover()`. - -### Fix - -Removed the `release()` + manual deletion block entirely. The `unique_ptr` destructor now handles file cleanup on both success and failure paths. Also removed the unused `#include `. - -### Files Changed (committed) - -- `bin/autodetect/Main.cc` -- removed 10 lines of redundant cleanup -- `bin/normalize/Main.cc` -- same change -- `include/core/CStateFileRemover.h` -- updated class comment -- `lib/core/unittest/CStateFileRemoverTest.cc` -- **new**, 6 Boost unit tests -- `lib/core/unittest/CMakeLists.txt` -- added `CStateFileRemoverTest.cc` - -### Commits - -- `3f00438dd6` -- Add CStateFileRemover tests and fix happy-path memory leak -- `dfd4718667` -- Remove unused disarm() from CStateFileRemover - ---- - -## 2. Portable CMake Test Runner - -### Problem - -`run_tests_as_seperate_processes.sh` relies on bash, sed, awk, grep, and xargs. It invokes `cmake --build` for every test batch, which re-checks the build system each time -- adding significant per-batch overhead. - -### Solution - -`cmake/run-tests-individually.cmake` -- a pure-CMake script that: - -1. Discovers tests via `--list_content` -2. Batches them (default `MAX_ARGS=2`) -3. Generates a temporary CTest project -4. Runs batches in parallel via `ctest --parallel` -5. Optionally merges JUnit XML results - -The `test_*_individually` target in `cmake/functions.cmake` was updated to invoke this script and moved outside the `if(isMultiConfig)` block so it's available for all generators. - -### Files Changed (uncommitted) - -- `cmake/functions.cmake` -- rewired `test_*_individually` targets -- `cmake/run-tests-individually.cmake` -- **new**, the portable test runner - -### Performance Results - -#### `test_individually` target (all 10 test suites) - -| Approach | aarch64 (native) | x86_64 (emulated) | -|---|---|---| -| Shell script, j=1 | 277s | 1029s | -| CMake script, j=1 | 132s (2.1x faster) | 204s (5.0x faster) | -| CMake script, j=5 | 71s (3.9x faster) | not tested | - -#### `-j N` parallelism (aarch64, 14 CPUs, 8GB RAM) - -| j | Median (s) | Reliable | -|---|---|---| -| 1 | 132 | 3/3 | -| 2 | 80 | 1/3 | -| 3 | 137 | 1/3 | -| 4 | -- | 0/2 | -| 5 | **71** | **3/3** | -| 10 | 71 | 1/3 | - -**Optimal: `-j 5`** (100% reliable, 1.9x over j=1). - -Sporadic failures at j=2,3,4,10 are caused by `CStateFileRemoverTest` batches sharing the same temp file when run concurrently -- a pre-existing test isolation issue, not introduced by this change. - -### Known Limitation - -The seccomp test (`CSystemCallFilterTest`) fails under the CMake script because `test-runner.cmake` passes special flags (`--logger=HRF,all --report_format=HRF --show_progress=no`) for seccomp that `run-tests-individually.cmake` does not yet replicate. - ---- - -## 3. Docker Testing - -### Images Used - -| Image | Architecture | OS | Compiler | -|---|---|---|---| -| `docker.elastic.co/ml-dev/ml-linux-build:34` | x86_64 | Rocky Linux 8.10 | GCC 13.3 | -| `docker.elastic.co/ml-dev/ml-linux-aarch64-native-build:17` | aarch64 | Rocky Linux 8.10 | GCC 13.3 | - -### Environment Requirements - -Both images require `LD_LIBRARY_PATH=/usr/local/gcc133/lib64:/usr/local/gcc133/lib` for test execution. This applies to all test targets (not specific to our changes). CMake and other build tools live at `/usr/local/gcc133/bin/`. - -### Build Notes - -- aarch64 image runs natively on ARM Mac -- builds complete in ~6-7 minutes with `-j4` -- x86_64 image runs under Rosetta/QEMU emulation on ARM Mac -- builds take 30+ minutes with `-j1`; `-j2` or higher risks OOM kills (Docker default 8GB RAM) - ---- - -## 4. Remaining Work - -- [ ] Commit `cmake/functions.cmake` and `cmake/run-tests-individually.cmake` -- [ ] Consider adding seccomp-specific flag handling to `run-tests-individually.cmake` -- [ ] Consider fixing `CStateFileRemoverTest` to use unique temp filenames per test case (would make `-j N` more reliable for all N) -- [ ] Consider deleting `run_tests_as_seperate_processes.sh` once the CMake replacement is proven in CI -- [ ] Revert the stray `include(CTest)` in top-level `CMakeLists.txt` if not already done From 9777471e8954baa7ed8c18c7ed410813cfd92b93 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Wed, 18 Feb 2026 15:38:03 +1300 Subject: [PATCH 04/19] [ML] Remove legacy bash test runner replaced by CMake/CTest run_tests_as_seperate_processes.sh is superseded by the portable cmake/run-tests-individually.cmake runner. Co-authored-by: Cursor --- run_tests_as_seperate_processes.sh | 179 ----------------------------- 1 file changed, 179 deletions(-) delete mode 100755 run_tests_as_seperate_processes.sh diff --git a/run_tests_as_seperate_processes.sh b/run_tests_as_seperate_processes.sh deleted file mode 100755 index 4208be8fd..000000000 --- a/run_tests_as_seperate_processes.sh +++ /dev/null @@ -1,179 +0,0 @@ -#!/bin/bash -# -# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -# or more contributor license agreements. Licensed under the Elastic License -# 2.0 and the following additional limitation. Functionality enabled by the -# files subject to the Elastic License 2.0 may only be used in production when -# invoked by an Elasticsearch process with a license key installed that permits -# use of machine learning features. You may not use this file except in -# compliance with the Elastic License 2.0 and the foregoing additional -# limitation. -# - -# This script ultimately gets called from within the docker entry point script. -# It provides a wrapper around the call to "cmake" that runs the test cases -# and provides some flexibility as to how the tests should be run in terms of how they -# are spread across processes. This is necessary when trying to isolate the impact memory -# usage of tests have upon one another. -# -# It is intended to be called as part of the CI build/test process but should be able to be run manually. -# -# It should be called with 3 parameters -# cmake_build_dir: The directory that cmake is using for build outputs, i.e. that passed to cmake's --build argument -# cmake_current_binary_dir: The directory containing the current test suite executable e.g. /test/lib/api/unittest -# test_suite: The name of the test suite to run, minus any leading "ml_", e.g. "test_api" -# -# In addition to the required parameters there are several environment variables that control the script's behaviour -# BOOST_TEST_MAX_ARGS: The maximum number of test cases to be passed off to a sub shell -# BOOST_TEST_MAX_PROCS: The maximum number of sub shells to use -# BOOST_TEST_MIXED_MODE: If set to "true" then rather than iterating over each individual test passed to a sub-shell -# run them all in the same BOOST test executable process. -# -# Design decisions: The script relies upon the simplest tools available on most unix like platforms - bash, sed and -# awk (the awk script does not use any GNU extensions for maximum portability). This is to keep the number of dependencies -# required by CI build images to a minimum (so e.g. no python etc.) - -if [ $# -lt 3 ]; then - echo "Usage: $0 " - echo "e.g.: $0 ${CPP_SRC_HOME}/cmake-build-relwithdebinfo-local ${CPP_SRC_HOME}/cmake-build-relwithdebinfo-local/test/lib/api/unittest test_api" - exit -fi - -export BUILD_DIR=$( echo $1 | sed 's|/$||' ) -export BINARY_DIR=$( echo $2 | sed 's|/$||' ) -export TEST_SUITE=$3 - -TEST_DIR=${CPP_SRC_HOME}/$(echo $BINARY_DIR | sed -e "s|$BUILD_DIR/test/||" -e 's|unittest.*|unittest|') - -export TEST_EXECUTABLE="$2/ml_$3" -export LOG_DIR="$2/test_logs" - -function num_procs() { - if [ `uname` = "Darwin" ]; then - sysctl -n hw.logicalcpu - else - nproc - fi -} - -MAX_ARGS=1 -MAX_PROCS=$(num_procs) - -if [[ -n "$BOOST_TEST_MAX_ARGS" ]]; then - MAX_ARGS=$BOOST_TEST_MAX_ARGS -fi - -if [[ -n "$BOOST_TEST_MAX_PROCS" ]]; then - MAX_PROCS=$BOOST_TEST_MAX_PROCS -fi - -rm -rf "$LOG_DIR" -mkdir -p "$LOG_DIR" - -function get_qualified_test_names() { - executable_path=$1 - - output_lines=$($executable_path --list_content 2>&1) - - while IFS= read -r line; do - match=$(grep -w '^[ ]*C.*Test' <<< "$line"); - if [ $? -eq 0 ]; then - suite=$match - continue - fi - match=$(grep -w 'test.*\*$' <<< "$line"); - if [ $? -eq 0 ]; then - case=$(sed 's/[ \*]//g' <<< "$suite/$match") - echo "$case" - fi - done <<< "$output_lines" -} - -# get the fully qualified test names -echo "Discovering tests..." -ALL_TEST_NAMES=$(get_qualified_test_names "$TEST_EXECUTABLE") - -if [ -z "$ALL_TEST_NAMES" ]; then - echo "No tests found to run or error in test discovery." - exit 1 -fi - -function execute_tests() { - - if [[ "$BOOST_TEST_MIXED_MODE" == "true" ]]; then - TEST_CASES=$(sed 's/ /:/g' <<< $@) - else - TEST_CASES=$@ - fi - - # Loop through each test - for TEST_NAME in $TEST_CASES; do - echo "--------------------------------------------------" - echo "Running test: $TEST_NAME" - - # Replace slashes and potentially other special chars for a safe filename - SAFE_TEST_LOG_FILENAME=$(echo "$TEST_NAME" | sed 's/[^a-zA-Z0-9_]/_/g' | cut -c-100) - LOG_FILE="$LOG_DIR/${SAFE_TEST_LOG_FILENAME}.log" - - # Execute the test in a separate process - TESTS=$TEST_NAME cmake --build $BUILD_DIR -t $TEST_SUITE > "$LOG_FILE" 2>&1 - TEST_STATUS=$? - - if [ $TEST_STATUS -eq 0 ]; then - echo "Test '$TEST_NAME' PASSED." - else - echo "Test '$TEST_NAME' FAILED with exit code $TEST_STATUS. Check '$LOG_FILE' for details." - fi - done -} - -export -f execute_tests - -RESULT=$(echo $ALL_TEST_NAMES | xargs -n $MAX_ARGS -P $MAX_PROCS bash -c 'execute_tests "$@"' _) - -echo "--------------------------------------------------" - -grep 'FAILED with exit code' <<< $RESULT -if [ $? -eq 0 ] -then - echo "$TEST_SUITE: Some individual tests FAILED. Check logs in '$LOG_DIR'." - echo found -else - echo "$TEST_SUITE: All individual tests PASSED." -fi - -function merge_junit_results() { - JUNIT_FILES="$@" - echo "" - cat $JUNIT_FILES | \ - awk ' - BEGIN{tests=0; skipped=0; errors=0; failures=0; id=""; time=0.0; name=""} - $0 ~ /"}' - - cat $JUNIT_FILES | sed -e '/xml/d' -e '/testsuite/d' -e '//{H;d;};x;/skipped/d' | grep '.' -echo "" -echo -} - -if [[ $BOOST_TEST_OUTPUT_FORMAT_FLAGS =~ junit ]]; then - merge_junit_results $TEST_DIR/boost_test_results_C*.junit > $TEST_DIR/boost_test_results.junit -fi - From 62cc3bf41db65d4260b8bae034afce5148e72aaa Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 19 Feb 2026 10:01:45 +1300 Subject: [PATCH 05/19] [ML] Fix test parallelism for low-core CI machines and add diagnostics The ceil(nproc/3) formula was too conservative on 4-core machines (macOS CI Orka VMs), yielding -j 2 which serialised test suites into 5 waves. On <=4 cores, CTest internal parallelism is modest enough that using all cores avoids unnecessary serialisation. Also adds diagnostic logging of CPU count and parallelism settings to build.gradle, docker_entrypoint.sh, and run-tests-individually.cmake to make CI performance easier to analyse. Co-authored-by: Cursor --- cmake/run-tests-individually.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/run-tests-individually.cmake b/cmake/run-tests-individually.cmake index 68936a5ef..918b9db92 100644 --- a/cmake/run-tests-individually.cmake +++ b/cmake/run-tests-individually.cmake @@ -188,8 +188,8 @@ if(NOT "${_current_batch}" STREQUAL "") endif() list(LENGTH _batches _num_batches) -message(STATUS "Running ${_num_tests} test(s) in ${_num_batches} batch(es), " - "max ${MAX_PROCS} parallel process(es)") +message(STATUS "${TEST_SUITE}: ${_num_tests} test(s) in ${_num_batches} batch(es), " + "MAX_ARGS=${MAX_ARGS}, MAX_PROCS=${MAX_PROCS} (${_num_cpus} logical CPUs)") # --------------------------------------------------------------------------- # Generate a per-batch runner script invoked by each CTest test entry From 8408457978373e4c826e2a536eaf3715a923611f Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 19 Feb 2026 11:37:34 +1300 Subject: [PATCH 06/19] [ML] Add daily build timing analysis step to snapshot pipeline Adds a post-build step to the branch/snapshot pipeline that analyses build+test timings for each platform, compares against the last 14 builds, and creates a Buildkite annotation flagging regressions (>2 std deviations above mean) or improvements (>1 std dev below). The step depends on all build+test jobs, uses allow_dependency_failure and soft_fail so it never blocks the pipeline, and runs on a lightweight python:3-slim Docker image. Requires a Buildkite API token with read_builds scope stored in Vault at secret/ci/elastic-ml-cpp/buildkite/api_read_token. Co-authored-by: Cursor --- .buildkite/branch.json.py | 4 + .buildkite/hooks/post-checkout | 4 + .../pipelines/analyze_build_timings.yml.sh | 26 +++ .../scripts/steps/analyze_build_timings.py | 185 ++++++++++++++++++ 4 files changed, 219 insertions(+) create mode 100755 .buildkite/pipelines/analyze_build_timings.yml.sh create mode 100755 .buildkite/scripts/steps/analyze_build_timings.py diff --git a/.buildkite/branch.json.py b/.buildkite/branch.json.py index 374326ddd..e97daf26a 100755 --- a/.buildkite/branch.json.py +++ b/.buildkite/branch.json.py @@ -40,6 +40,10 @@ def main(): build_linux = pipeline_steps.generate_step_template("Linux", "build", config.build_aarch64, config.build_x86_64) pipeline_steps.append(build_linux) + # Analyse build timings after all build+test steps complete + pipeline_steps.append(pipeline_steps.generate_step("Analyse build timings", + ".buildkite/pipelines/analyze_build_timings.yml.sh")) + # Build the DRA artifacts and upload to S3 and GCS pipeline_steps.append(pipeline_steps.generate_step("Create daily releasable artifacts", ".buildkite/pipelines/create_dra.yml.sh")) diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout index 0d0253484..5998e8198 100644 --- a/.buildkite/hooks/post-checkout +++ b/.buildkite/hooks/post-checkout @@ -27,6 +27,10 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then export BUILDKITE_ANALYTICS_TOKEN=$(vault read secret/ci/elastic-ml-cpp/buildkite/test_analytics/windows_x86_64 | awk '/^token/ {print $2;}') fi + if [[ "$BUILDKITE_STEP_KEY" == "analyze_build_timings" ]]; then + export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "") + fi + if [[ "$BUILDKITE_STEP_KEY" == "build_pytorch_docker_image" ]]; then export DOCKER_REGISTRY_USERNAME=$(vault read --field=username secret/ci/elastic-ml-cpp/prod_docker_registry_credentials) export DOCKER_REGISTRY_PASSWORD=$(vault read --field=password secret/ci/elastic-ml-cpp/prod_docker_registry_credentials) diff --git a/.buildkite/pipelines/analyze_build_timings.yml.sh b/.buildkite/pipelines/analyze_build_timings.yml.sh new file mode 100755 index 000000000..f59f31fbb --- /dev/null +++ b/.buildkite/pipelines/analyze_build_timings.yml.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License +# 2.0 and the following additional limitation. Functionality enabled by the +# files subject to the Elastic License 2.0 may only be used in production when +# invoked by an Elasticsearch process with a license key installed that permits +# use of machine learning features. You may not use this file except in +# compliance with the Elastic License 2.0 and the foregoing additional +# limitation. + +cat < 0 else 0 + sign = "+" if delta >= 0 else "" + + if avg > 0 and sd > 0 and cur > avg + 2 * sd: + status = ":rotating_light: Regression" + has_regression = True + elif avg > 0 and cur < avg - sd: + status = ":rocket: Faster" + else: + status = ":white_check_mark: Normal" + + lines.append( + f"| {platform_labels[plat]} | **{cur:.1f}** | {avg:.1f} | {sd:.1f} " + f"| {sign}{delta:.1f} ({sign}{delta_pct:.0f}%) | {status} |" + ) + + n_hist = len(history_builds) + lines.append("") + lines.append(f"_Compared against {n_hist} recent `{branch}` builds._") + + markdown = "\n".join(lines) + print(markdown) + + style = "warning" if has_regression else "info" + annotate(markdown, style) + + +if __name__ == "__main__": + main() From 2bdb82e7d50239e24d51046b21c26915a45ba31d Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 19 Feb 2026 13:20:11 +1300 Subject: [PATCH 07/19] [ML] Fix CMultiFileDataAdderTest parallel isolation using PID The test used a random number from 1-100 for temp directory names, which caused ~14% collision probability when CTest runs 3 batches concurrently (birthday problem). Use the process ID instead, which is guaranteed unique per CTest batch. Co-authored-by: Cursor --- lib/api/unittest/CMultiFileDataAdderTest.cc | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/lib/api/unittest/CMultiFileDataAdderTest.cc b/lib/api/unittest/CMultiFileDataAdderTest.cc index f37d11c1f..a88636e98 100644 --- a/lib/api/unittest/CMultiFileDataAdderTest.cc +++ b/lib/api/unittest/CMultiFileDataAdderTest.cc @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -37,7 +38,6 @@ #include #include #include -#include // For random number generation facilities #include #include #include @@ -102,13 +102,8 @@ void detectorPersistHelper(const std::string& configFileName, // Persist the detector state to file(s) - // Create a random number to use to generate a unique file name for each test - // this allows tests to be run successfully in parallel - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution<> distrib(1, 100); std::ostringstream oss; - oss << distrib(gen); + oss << ml::core::CProcess::instance().id(); std::string baseOrigOutputFilename(ml::test::CTestTmpDir::tmpDir() + "/orig_" + oss.str()); From 5f0022519b255cb4416af94909addd32705e05c9 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 19 Feb 2026 14:05:33 +1300 Subject: [PATCH 08/19] [ML] Enable CMake unity builds to speed up compilation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable CMAKE_UNITY_BUILD across all CI platforms (Linux, macOS, Windows) with a batch size of 16. Unity builds combine multiple source files into fewer translation units, reducing redundant header parsing — especially beneficial for MSVC which had 32+ minute compile times. Targets with identically-named symbols in anonymous namespaces (e.g. EMPTY_STRING, persistence tags) are excluded: MlMathsCommon, MlMathsTimeSeries, MlMathsAnalytics, MlModel, MlApi, and their test counterparts where applicable. The remaining ~316 files (including MlCore at 121 files and several large test targets) benefit from unity. Also fixes symbol collisions in MlCore: rename DO_NOT_USE_THIS_VARIABLE to unique names per file, and EMPTY_STRING to EMPTY_PREFIX in CFlatPrefixTree.cc. Gradle's build.gradle now reads CMAKE_FLAGS from the environment so pipeline-defined flags flow through. Co-authored-by: Cursor --- .buildkite/pipelines/build_linux.json.py | 12 ++++++------ .buildkite/pipelines/build_macos.json.py | 2 +- .buildkite/pipelines/build_windows.json.py | 2 +- CMakeLists.txt | 7 +++++++ build.gradle | 9 +++++++++ lib/api/CMakeLists.txt | 2 ++ lib/api/unittest/CMakeLists.txt | 2 ++ lib/core/CFlatPrefixTree.cc | 4 ++-- lib/core/CLogger.cc | 2 +- lib/core/CStringUtils.cc | 2 +- lib/core/CTimezone.cc | 2 +- lib/core/CTimezone_Windows.cc | 2 +- lib/maths/analytics/CMakeLists.txt | 2 ++ lib/maths/common/CMakeLists.txt | 5 +++++ lib/maths/common/unittest/CMakeLists.txt | 2 ++ lib/maths/time_series/CMakeLists.txt | 2 ++ lib/model/CMakeLists.txt | 2 ++ 17 files changed, 47 insertions(+), 14 deletions(-) diff --git a/.buildkite/pipelines/build_linux.json.py b/.buildkite/pipelines/build_linux.json.py index 5eac40192..6b8787572 100755 --- a/.buildkite/pipelines/build_linux.json.py +++ b/.buildkite/pipelines/build_linux.json.py @@ -93,7 +93,7 @@ def main(args): "key": build_key, "env": { **common_env, - "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake", + "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake -DCMAKE_UNITY_BUILD=ON", "RUN_TESTS": "false", }, "notify": [ @@ -118,7 +118,7 @@ def main(args): "env": { **common_env, "BUILD_STEP_KEY": build_key, - "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake", + "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake -DCMAKE_UNITY_BUILD=ON", "BOOST_TEST_OUTPUT_FORMAT_FLAGS": "--logger=JUNIT,error,boost_test_results.junit", }, "plugins": { @@ -151,7 +151,7 @@ def main(args): "key": build_key, "env": { **common_env, - "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake", + "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake -DCMAKE_UNITY_BUILD=ON", "RUN_TESTS": "false", }, "notify": [ @@ -176,7 +176,7 @@ def main(args): "env": { **common_env, "BUILD_STEP_KEY": build_key, - "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake", + "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake -DCMAKE_UNITY_BUILD=ON", "BOOST_TEST_OUTPUT_FORMAT_FLAGS": "--logger=JUNIT,error,boost_test_results.junit", }, "plugins": { @@ -212,7 +212,7 @@ def main(args): "env": { **common_env, "ML_DEBUG": "1", - "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/linux-x86_64.cmake", + "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/linux-x86_64.cmake -DCMAKE_UNITY_BUILD=ON", "RUN_TESTS": "false", "SKIP_ARTIFACT_UPLOAD": "true", }, @@ -239,7 +239,7 @@ def main(args): **common_env, "BUILD_STEP_KEY": debug_build_key, "ML_DEBUG": "1", - "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/linux-x86_64.cmake", + "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/linux-x86_64.cmake -DCMAKE_UNITY_BUILD=ON", "BOOST_TEST_OUTPUT_FORMAT_FLAGS": "--logger=JUNIT,error,boost_test_results.junit", }, "plugins": { diff --git a/.buildkite/pipelines/build_macos.json.py b/.buildkite/pipelines/build_macos.json.py index 638f38357..c52758e12 100755 --- a/.buildkite/pipelines/build_macos.json.py +++ b/.buildkite/pipelines/build_macos.json.py @@ -43,7 +43,7 @@ "PATH": "/opt/homebrew/bin:$PATH", "ML_DEBUG": "0", "CPP_CROSS_COMPILE": "", - "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/darwin-aarch64.cmake", + "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/darwin-aarch64.cmake -DCMAKE_UNITY_BUILD=ON", "RUN_TESTS": "true", "BOOST_TEST_OUTPUT_FORMAT_FLAGS": "--logger=JUNIT,error,boost_test_results.junit", } diff --git a/.buildkite/pipelines/build_windows.json.py b/.buildkite/pipelines/build_windows.json.py index 4f498028e..ac9039110 100755 --- a/.buildkite/pipelines/build_windows.json.py +++ b/.buildkite/pipelines/build_windows.json.py @@ -41,7 +41,7 @@ common_env = { "ML_DEBUG": "0", "CPP_CROSS_COMPILE": "", - "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/windows-x86_64.cmake", + "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/windows-x86_64.cmake -DCMAKE_UNITY_BUILD=ON", } def main(args): diff --git a/CMakeLists.txt b/CMakeLists.txt index 09ca5d546..b0a649db4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,13 @@ endif() project("ML") +if(CMAKE_UNITY_BUILD) + if(NOT DEFINED CMAKE_UNITY_BUILD_BATCH_SIZE) + set(CMAKE_UNITY_BUILD_BATCH_SIZE 16) + endif() + message(STATUS "Unity build enabled (batch size: ${CMAKE_UNITY_BUILD_BATCH_SIZE})") +endif() + include(CTest) include(CheckPIESupported) diff --git a/build.gradle b/build.gradle index 793124a79..843e8718d 100644 --- a/build.gradle +++ b/build.gradle @@ -86,6 +86,15 @@ if (isWindows) { // where the desired build type is specified at build time with the '--config' option. (It's safe to always specify the // '--config' option as it will simply be ignored by a single-config generated build system) String cmakeFlags = '--no-warn-unused-cli -D CMAKE_TOOLCHAIN_FILE=cmake/' + artifactClassifier + '.cmake' +// Append any extra CMake flags from the environment (e.g. -DCMAKE_UNITY_BUILD=ON) +String envCmakeFlags = System.env.CMAKE_FLAGS +if (envCmakeFlags != null && !envCmakeFlags.isEmpty()) { + // Strip any toolchain file flag to avoid duplication + envCmakeFlags = envCmakeFlags.replaceAll(/-DCMAKE_TOOLCHAIN_FILE=\\S+/, '').trim() + if (!envCmakeFlags.isEmpty()) { + cmakeFlags += ' ' + envCmakeFlags + } +} project.ext.cmakeBuildDir = "cmake-build-relwithdebinfo" project.ext.cmakeBuildType = "RelWithDebInfo" if (mlDebug.toBoolean()) { diff --git a/lib/api/CMakeLists.txt b/lib/api/CMakeLists.txt index 0eaf903fa..f3baa5fe1 100644 --- a/lib/api/CMakeLists.txt +++ b/lib/api/CMakeLists.txt @@ -21,6 +21,8 @@ set(ML_LINK_LIBRARIES $<$:ws2_32> ) +set(CMAKE_UNITY_BUILD OFF) + ml_add_library(MlApi SHARED CAnnotationJsonWriter.cc CAnomalyJob.cc diff --git a/lib/api/unittest/CMakeLists.txt b/lib/api/unittest/CMakeLists.txt index 3b54bad35..69d62a7ac 100644 --- a/lib/api/unittest/CMakeLists.txt +++ b/lib/api/unittest/CMakeLists.txt @@ -73,4 +73,6 @@ set(ML_LINK_LIBRARIES # Valijson must only be used in test cases, never in production code include_directories(SYSTEM ${CMAKE_SOURCE_DIR}/3rd_party/valijson/include) +set(CMAKE_UNITY_BUILD OFF) + ml_add_test_executable(api ${SRCS}) diff --git a/lib/core/CFlatPrefixTree.cc b/lib/core/CFlatPrefixTree.cc index f36489346..e454aa014 100644 --- a/lib/core/CFlatPrefixTree.cc +++ b/lib/core/CFlatPrefixTree.cc @@ -32,7 +32,7 @@ const char PADDING_NODE = '$'; const char LEAF_NODE = 'l'; const char BRANCH_NODE = 'b'; const char LEAF_AND_BRANCH_NODE = '*'; -const std::string EMPTY_STRING = ""; +const std::string EMPTY_PREFIX; struct SCharNotEqualTo { SCharNotEqualTo(char c, std::size_t pos) : s_Char(c), s_Pos(pos) {} @@ -79,7 +79,7 @@ bool CFlatPrefixTree::build(const TStrVec& prefixes) { if (prefixes.empty() == false) { // Ignore empty string if present - std::size_t startIndex = prefixes[0] == EMPTY_STRING ? 1 : 0; + std::size_t startIndex = prefixes[0] == EMPTY_PREFIX ? 1 : 0; this->buildRecursively(prefixes, startIndex, prefixes.size(), 0); } diff --git a/lib/core/CLogger.cc b/lib/core/CLogger.cc index f7f0269d6..13f762205 100644 --- a/lib/core/CLogger.cc +++ b/lib/core/CLogger.cc @@ -54,7 +54,7 @@ const std::string FATAL{"FATAL"}; // call instance() during the static initialisation phase of the program. Of // course, the instance may already be constructed before this if another static // object has used it. -const ml::core::CLogger& DO_NOT_USE_THIS_VARIABLE = ml::core::CLogger::instance(); +const ml::core::CLogger& DO_NOT_USE_THIS_LOGGER = ml::core::CLogger::instance(); // These must use boost::shared_ptr, not std, as that's what the Boost.Log interface // uses diff --git a/lib/core/CStringUtils.cc b/lib/core/CStringUtils.cc index 0ce3dc245..2c8ffde4a 100644 --- a/lib/core/CStringUtils.cc +++ b/lib/core/CStringUtils.cc @@ -52,7 +52,7 @@ const std::locale& locale() { // require it, call locale() during the static initialisation phase of the // program. Of course, the locale may already be constructed before this if // another static object has used it. -const std::locale& DO_NOT_USE_THIS_VARIABLE = locale(); +const std::locale& DO_NOT_USE_THIS_LOCALE = locale(); // Constants for parsing & converting memory size strings in standard ES format const std::string MEMORY_SIZE_FORMAT{"([\\d]+)(b|k|kb|m|mb|g|gb|t|tb|p|pb)"}; diff --git a/lib/core/CTimezone.cc b/lib/core/CTimezone.cc index 16afbc51f..f5dcce518 100644 --- a/lib/core/CTimezone.cc +++ b/lib/core/CTimezone.cc @@ -22,7 +22,7 @@ namespace { // call instance() during the static initialisation phase of the program. Of // course, the instance may already be constructed before this if another static // object has used it. -const ml::core::CTimezone& DO_NOT_USE_THIS_VARIABLE = ml::core::CTimezone::instance(); +const ml::core::CTimezone& DO_NOT_USE_THIS_TIMEZONE = ml::core::CTimezone::instance(); } namespace ml { diff --git a/lib/core/CTimezone_Windows.cc b/lib/core/CTimezone_Windows.cc index 41c5c9f4c..42e31ed4f 100644 --- a/lib/core/CTimezone_Windows.cc +++ b/lib/core/CTimezone_Windows.cc @@ -30,7 +30,7 @@ namespace { // call instance() during the static initialisation phase of the program. Of // course, the instance may already be constructed before this if another static // object has used it. -const ml::core::CTimezone& DO_NOT_USE_THIS_VARIABLE = ml::core::CTimezone::instance(); +const ml::core::CTimezone& DO_NOT_USE_THIS_TIMEZONE = ml::core::CTimezone::instance(); } namespace ml { diff --git a/lib/maths/analytics/CMakeLists.txt b/lib/maths/analytics/CMakeLists.txt index 03f3b0bff..422d68ca8 100644 --- a/lib/maths/analytics/CMakeLists.txt +++ b/lib/maths/analytics/CMakeLists.txt @@ -16,6 +16,8 @@ set(ML_LINK_LIBRARIES MlMathsCommon MlCore) +set(CMAKE_UNITY_BUILD OFF) + ml_add_library(MlMathsAnalytics SHARED CBoostedTree.cc CBoostedTreeFactory.cc diff --git a/lib/maths/common/CMakeLists.txt b/lib/maths/common/CMakeLists.txt index 6cfc8a0c6..163a98c0a 100644 --- a/lib/maths/common/CMakeLists.txt +++ b/lib/maths/common/CMakeLists.txt @@ -15,6 +15,11 @@ set(ML_LINK_LIBRARIES ${Boost_LIBRARIES} MlCore) +# Many source files in this library define identically-named constants +# (EMPTY_STRING, DECAY_RATE_TAG, etc.) in anonymous namespaces, which +# causes redefinition errors under unity builds. +set(CMAKE_UNITY_BUILD OFF) + ml_add_library(MlMathsCommon SHARED CAgglomerativeClusterer.cc CBasicStatistics.cc diff --git a/lib/maths/common/unittest/CMakeLists.txt b/lib/maths/common/unittest/CMakeLists.txt index a94806c9b..a1b7f208b 100644 --- a/lib/maths/common/unittest/CMakeLists.txt +++ b/lib/maths/common/unittest/CMakeLists.txt @@ -79,4 +79,6 @@ set(ML_LINK_LIBRARIES MlTest ) +set(CMAKE_UNITY_BUILD OFF) + ml_add_test_executable(maths_common ${SRCS}) diff --git a/lib/maths/time_series/CMakeLists.txt b/lib/maths/time_series/CMakeLists.txt index 83a4920c3..f7cd8b74a 100644 --- a/lib/maths/time_series/CMakeLists.txt +++ b/lib/maths/time_series/CMakeLists.txt @@ -16,6 +16,8 @@ set(ML_LINK_LIBRARIES MlMathsCommon MlCore) +set(CMAKE_UNITY_BUILD OFF) + ml_add_library(MlMathsTimeSeries SHARED CAdaptiveBucketing.cc CCalendarComponent.cc diff --git a/lib/model/CMakeLists.txt b/lib/model/CMakeLists.txt index 73dd74e4d..f7c5b0c8d 100644 --- a/lib/model/CMakeLists.txt +++ b/lib/model/CMakeLists.txt @@ -17,6 +17,8 @@ set(ML_LINK_LIBRARIES MlMathsCommon MlMathsTimeSeries) +set(CMAKE_UNITY_BUILD OFF) + ml_add_library(MlModel SHARED CAnnotatedProbability.cc CAnnotatedProbabilityBuilder.cc From 798caf4fc4aad0dac26b989dbce9b02f33d028f4 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 19 Feb 2026 17:04:07 +1300 Subject: [PATCH 09/19] [ML] Speed up Windows builds with Ninja, /Z7 and PCH support Switch Windows CI to Ninja Multi-Config generator and /Z7 debug info to eliminate the mspdbsrv.exe PDB serialization bottleneck that limited parallel compilation. Add precompiled headers (ML_PCH option) for common STL and Boost headers across all platforms. Fix CMakeLists.txt to respect an explicit CMAKE_CXX_COMPILER_LAUNCHER (e.g. sccache) instead of unconditionally forcing ccache. Re-enable unity builds for MlApi, MlModel, MlMathsCommon, and MlMathsAnalytics with per-file exclusions for symbol conflicts. Benchmarked: Windows build drops from 35m44s to 6m32s (-82%). Linux x86_64 PCH drops from 13m08s to 7m33s (-42%). Co-authored-by: Cursor --- .buildkite/pipelines/build_linux.json.py | 12 +++---- .buildkite/pipelines/build_macos.json.py | 2 +- .buildkite/pipelines/build_windows.json.py | 3 +- cmake/compiler/msvc.cmake | 1 - cmake/functions.cmake | 41 ++++++++++++++++++++++ cmake/variables.cmake | 6 ++-- lib/api/CMakeLists.txt | 13 ++++++- lib/maths/analytics/CMakeLists.txt | 2 -- lib/maths/common/CMakeLists.txt | 31 +++++++++++++--- lib/model/CMakeLists.txt | 17 ++++++++- 10 files changed, 108 insertions(+), 20 deletions(-) diff --git a/.buildkite/pipelines/build_linux.json.py b/.buildkite/pipelines/build_linux.json.py index 6b8787572..08ea19dba 100755 --- a/.buildkite/pipelines/build_linux.json.py +++ b/.buildkite/pipelines/build_linux.json.py @@ -93,7 +93,7 @@ def main(args): "key": build_key, "env": { **common_env, - "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake -DCMAKE_UNITY_BUILD=ON", + "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake -DCMAKE_UNITY_BUILD=ON -DML_PCH=ON", "RUN_TESTS": "false", }, "notify": [ @@ -118,7 +118,7 @@ def main(args): "env": { **common_env, "BUILD_STEP_KEY": build_key, - "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake -DCMAKE_UNITY_BUILD=ON", + "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake -DCMAKE_UNITY_BUILD=ON -DML_PCH=ON", "BOOST_TEST_OUTPUT_FORMAT_FLAGS": "--logger=JUNIT,error,boost_test_results.junit", }, "plugins": { @@ -151,7 +151,7 @@ def main(args): "key": build_key, "env": { **common_env, - "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake -DCMAKE_UNITY_BUILD=ON", + "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake -DCMAKE_UNITY_BUILD=ON -DML_PCH=ON", "RUN_TESTS": "false", }, "notify": [ @@ -176,7 +176,7 @@ def main(args): "env": { **common_env, "BUILD_STEP_KEY": build_key, - "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake -DCMAKE_UNITY_BUILD=ON", + "CMAKE_FLAGS": f"-DCMAKE_TOOLCHAIN_FILE=cmake/linux-{arch}.cmake -DCMAKE_UNITY_BUILD=ON -DML_PCH=ON", "BOOST_TEST_OUTPUT_FORMAT_FLAGS": "--logger=JUNIT,error,boost_test_results.junit", }, "plugins": { @@ -212,7 +212,7 @@ def main(args): "env": { **common_env, "ML_DEBUG": "1", - "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/linux-x86_64.cmake -DCMAKE_UNITY_BUILD=ON", + "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/linux-x86_64.cmake -DCMAKE_UNITY_BUILD=ON -DML_PCH=ON", "RUN_TESTS": "false", "SKIP_ARTIFACT_UPLOAD": "true", }, @@ -239,7 +239,7 @@ def main(args): **common_env, "BUILD_STEP_KEY": debug_build_key, "ML_DEBUG": "1", - "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/linux-x86_64.cmake -DCMAKE_UNITY_BUILD=ON", + "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/linux-x86_64.cmake -DCMAKE_UNITY_BUILD=ON -DML_PCH=ON", "BOOST_TEST_OUTPUT_FORMAT_FLAGS": "--logger=JUNIT,error,boost_test_results.junit", }, "plugins": { diff --git a/.buildkite/pipelines/build_macos.json.py b/.buildkite/pipelines/build_macos.json.py index c52758e12..a3ee6f0c3 100755 --- a/.buildkite/pipelines/build_macos.json.py +++ b/.buildkite/pipelines/build_macos.json.py @@ -43,7 +43,7 @@ "PATH": "/opt/homebrew/bin:$PATH", "ML_DEBUG": "0", "CPP_CROSS_COMPILE": "", - "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/darwin-aarch64.cmake -DCMAKE_UNITY_BUILD=ON", + "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/darwin-aarch64.cmake -DCMAKE_UNITY_BUILD=ON -DML_PCH=ON", "RUN_TESTS": "true", "BOOST_TEST_OUTPUT_FORMAT_FLAGS": "--logger=JUNIT,error,boost_test_results.junit", } diff --git a/.buildkite/pipelines/build_windows.json.py b/.buildkite/pipelines/build_windows.json.py index ac9039110..c0c0d1574 100755 --- a/.buildkite/pipelines/build_windows.json.py +++ b/.buildkite/pipelines/build_windows.json.py @@ -41,7 +41,8 @@ common_env = { "ML_DEBUG": "0", "CPP_CROSS_COMPILE": "", - "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/windows-x86_64.cmake -DCMAKE_UNITY_BUILD=ON", + "CMAKE_GENERATOR": "Ninja Multi-Config", + "CMAKE_FLAGS": "-DCMAKE_TOOLCHAIN_FILE=cmake/windows-x86_64.cmake -DCMAKE_UNITY_BUILD=ON -DML_PCH=ON", } def main(args): diff --git a/cmake/compiler/msvc.cmake b/cmake/compiler/msvc.cmake index 54988651d..d6385aef2 100644 --- a/cmake/compiler/msvc.cmake +++ b/cmake/compiler/msvc.cmake @@ -33,7 +33,6 @@ list(APPEND ML_C_FLAGS "/W4" "/EHsc" "/Gw" - "/FS" "/Zc:inline" "/diagnostics:caret" "/utf-8") diff --git a/cmake/functions.cmake b/cmake/functions.cmake index c4f1d985c..e8fe99fcc 100644 --- a/cmake/functions.cmake +++ b/cmake/functions.cmake @@ -190,6 +190,26 @@ function(ml_add_library _target _type) set_property(TARGET ${_target} PROPERTY POSITION_INDEPENDENT_CODE TRUE) + if(ML_PCH) + target_precompile_headers(${_target} PRIVATE + + + + + + + + + + + + + + + + ) + endif() + if(ML_LINK_LIBRARIES) target_link_libraries(${_target} PUBLIC ${ML_LINK_LIBRARIES}) endif() @@ -363,6 +383,27 @@ function(ml_add_test_executable _target) set_property(TARGET ml_test_${_target} PROPERTY POSITION_INDEPENDENT_CODE TRUE) + if(ML_PCH) + target_precompile_headers(ml_test_${_target} PRIVATE + + + + + + + + + + + + + + + + + ) + endif() + target_link_libraries(ml_test_${_target} ${ML_LINK_LIBRARIES}) add_test(NAME ml_test_${_target} COMMAND ml_test_${_target} diff --git a/cmake/variables.cmake b/cmake/variables.cmake index 3f42b3cd9..073c9d18f 100644 --- a/cmake/variables.cmake +++ b/cmake/variables.cmake @@ -160,9 +160,9 @@ endif() # Dictate which flags to use for "Release", "RelWithDebinfo", "Debug" and "Sanitizer" builds if(CMAKE_SYSTEM_NAME STREQUAL "Windows") set(CMAKE_CXX_FLAGS_RELEASE "/O2 /D NDEBUG /D EXCLUDE_TRACE_LOGGING /Qfast_transcendentals /Qvec-report:1") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/Zi /O2 /D NDEBUG /D EXCLUDE_TRACE_LOGGING /Qfast_transcendentals /Qvec-report:1") - set(CMAKE_CXX_FLAGS_DEBUG "/Zi /Od /RTC1") - set(CMAKE_CXX_FLAGS_SANITIZER "/fsanitize=address /O2 /Zi" CACHE STRING + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/Z7 /O2 /D NDEBUG /D EXCLUDE_TRACE_LOGGING /Qfast_transcendentals /Qvec-report:1") + set(CMAKE_CXX_FLAGS_DEBUG "/Z7 /Od /RTC1") + set(CMAKE_CXX_FLAGS_SANITIZER "/fsanitize=address /O2 /Z7" CACHE STRING "Flags used by the C++ compiler during sanitizer builds." FORCE) set(CMAKE_EXE_LINKER_FLAGS_SANITIZER "") diff --git a/lib/api/CMakeLists.txt b/lib/api/CMakeLists.txt index f3baa5fe1..afafd3e40 100644 --- a/lib/api/CMakeLists.txt +++ b/lib/api/CMakeLists.txt @@ -21,7 +21,18 @@ set(ML_LINK_LIBRARIES $<$:ws2_32> ) -set(CMAKE_UNITY_BUILD OFF) +set_source_files_properties( + CAnomalyJob.cc + CAnomalyJobConfig.cc + CFieldDataCategorizer.cc + CForecastRunner.cc + CGlobalCategoryId.cc + CHierarchicalResultsWriter.cc + CModelSizeStatsJsonWriter.cc + CModelSnapshotJsonWriter.cc + CSingleFieldDataCategorizer.cc + PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON +) ml_add_library(MlApi SHARED CAnnotationJsonWriter.cc diff --git a/lib/maths/analytics/CMakeLists.txt b/lib/maths/analytics/CMakeLists.txt index 422d68ca8..03f3b0bff 100644 --- a/lib/maths/analytics/CMakeLists.txt +++ b/lib/maths/analytics/CMakeLists.txt @@ -16,8 +16,6 @@ set(ML_LINK_LIBRARIES MlMathsCommon MlCore) -set(CMAKE_UNITY_BUILD OFF) - ml_add_library(MlMathsAnalytics SHARED CBoostedTree.cc CBoostedTreeFactory.cc diff --git a/lib/maths/common/CMakeLists.txt b/lib/maths/common/CMakeLists.txt index 163a98c0a..53ddaf41c 100644 --- a/lib/maths/common/CMakeLists.txt +++ b/lib/maths/common/CMakeLists.txt @@ -15,10 +15,33 @@ set(ML_LINK_LIBRARIES ${Boost_LIBRARIES} MlCore) -# Many source files in this library define identically-named constants -# (EMPTY_STRING, DECAY_RATE_TAG, etc.) in anonymous namespaces, which -# causes redefinition errors under unity builds. -set(CMAKE_UNITY_BUILD OFF) +set_source_files_properties( + CBayesianOptimisation.cc + CBjkstUniqueValues.cc + CClusterer.cc + CConstantPrior.cc + CGammaRateConjugate.cc + CKMeansOnline1d.cc + CKMostCorrelated.cc + CLogNormalMeanPrecConjugate.cc + CModel.cc + CMultimodalPrior.cc + CMultinomialConjugate.cc + CMultivariateConstantPrior.cc + CMultivariateOneOfNPrior.cc + CNaiveBayes.cc + CNaturalBreaksClassifier.cc + CNormalMeanPrecConjugate.cc + COneOfNPrior.cc + CPoissonMeanConjugate.cc + CPriorStateSerialiser.cc + CQuantileSketch.cc + CSampling.cc + CStatisticalTests.cc + CXMeansOnline1d.cc + ProbabilityAggregators.cc + PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON +) ml_add_library(MlMathsCommon SHARED CAgglomerativeClusterer.cc diff --git a/lib/model/CMakeLists.txt b/lib/model/CMakeLists.txt index f7c5b0c8d..14a0f3dac 100644 --- a/lib/model/CMakeLists.txt +++ b/lib/model/CMakeLists.txt @@ -17,7 +17,22 @@ set(ML_LINK_LIBRARIES MlMathsCommon MlMathsTimeSeries) -set(CMAKE_UNITY_BUILD OFF) +set_source_files_properties( + CAnnotatedProbability.cc + CAnomalyDetector.cc + CAnomalyDetectorModel.cc + CAnomalyScore.cc + CBucketGatherer.cc + CCountingModel.cc + CDataGatherer.cc + CEventRateBucketGatherer.cc + CForecastModelPersist.cc + CHierarchicalResultsAggregator.cc + CMetricBucketGatherer.cc + CModelDetailsView.cc + CSearchKey.cc + PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON +) ml_add_library(MlModel SHARED CAnnotatedProbability.cc From 620fc58830fde7c7f05d09711850b983173f5962 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 19 Feb 2026 17:04:23 +1300 Subject: [PATCH 10/19] [ML] Add sccache with GCS backend for persistent compiler caching in CI Integrate sccache to cache compiled objects in a GCS bucket across CI builds. When SCCACHE_GCS_BUCKET is set (injected from Vault by the post-checkout hook), build scripts download sccache at build time and configure it as the compiler launcher. Infrastructure: - GCS bucket: gs://elastic-ml-cpp-sccache (europe-west2, 30-day lifecycle) - Service account: ml-cpp-sccache@elastic-ml.iam.gserviceaccount.com - Vault: secret/ci/elastic-ml-cpp/sccache/gcs_service_account Warm cache benchmarks show 18-30s build times vs 11-13 min baseline. Typical PR builds (few files changed) should see 1-3 min compilation. Co-authored-by: Cursor --- .buildkite/hooks/post-checkout | 14 ++++++++++++++ .buildkite/scripts/steps/build_and_test.ps1 | 11 +++++++++++ dev-tools/docker/docker_entrypoint.sh | 12 ++++++++++++ 3 files changed, 37 insertions(+) diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout index 5998e8198..f5780f5c8 100644 --- a/.buildkite/hooks/post-checkout +++ b/.buildkite/hooks/post-checkout @@ -31,6 +31,20 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "") fi + # sccache with GCS backend — inject credentials for all build_test steps. + # The GCS service account key is stored in Vault. Build scripts detect + # SCCACHE_GCS_BUCKET and configure sccache automatically. + if [[ "$BUILDKITE_STEP_KEY" == build_test_* ]]; then + SCCACHE_GCS_KEY_JSON=$(vault read -field=key secret/ci/elastic-ml-cpp/sccache/gcs_service_account 2>/dev/null || echo "") + if [ -n "$SCCACHE_GCS_KEY_JSON" ]; then + export SCCACHE_GCS_BUCKET="elastic-ml-cpp-sccache" + export SCCACHE_GCS_KEY_FILE=$(mktemp) + echo "$SCCACHE_GCS_KEY_JSON" > "$SCCACHE_GCS_KEY_FILE" + export GOOGLE_APPLICATION_CREDENTIALS="$SCCACHE_GCS_KEY_FILE" + export SCCACHE_GCS_KEY_PATH="$SCCACHE_GCS_KEY_FILE" + fi + fi + if [[ "$BUILDKITE_STEP_KEY" == "build_pytorch_docker_image" ]]; then export DOCKER_REGISTRY_USERNAME=$(vault read --field=username secret/ci/elastic-ml-cpp/prod_docker_registry_credentials) export DOCKER_REGISTRY_PASSWORD=$(vault read --field=password secret/ci/elastic-ml-cpp/prod_docker_registry_credentials) diff --git a/.buildkite/scripts/steps/build_and_test.ps1 b/.buildkite/scripts/steps/build_and_test.ps1 index 10c6badc2..dd6771c83 100755 --- a/.buildkite/scripts/steps/build_and_test.ps1 +++ b/.buildkite/scripts/steps/build_and_test.ps1 @@ -53,6 +53,11 @@ if (Test-Path Env:ML_DEBUG) { $DebugOption="" } +# Set up sccache with GCS backend if the bucket env var has been injected +if (Test-Path Env:SCCACHE_GCS_BUCKET) { + . "$PSScriptRoot\..\..\..\dev-tools\setup_sccache.ps1" +} + # The exit code of the gradlew commands is checked explicitly, and their # stderr is treated as an error by PowerShell without this $ErrorActionPreference="Continue" @@ -69,4 +74,10 @@ if ($ExitCode -ne 0) { Exit $ExitCode } +# Print sccache stats if it was used +if (Test-Path Env:SCCACHE_PATH) { + & $Env:SCCACHE_PATH --show-stats 2>$null + & $Env:SCCACHE_PATH --stop-server 2>$null +} + buildkite-agent artifact upload "build/distributions/*" diff --git a/dev-tools/docker/docker_entrypoint.sh b/dev-tools/docker/docker_entrypoint.sh index 537b22293..9d55df75a 100755 --- a/dev-tools/docker/docker_entrypoint.sh +++ b/dev-tools/docker/docker_entrypoint.sh @@ -64,6 +64,12 @@ detect_cpus() { NCPUS=$(detect_cpus) echo "CPU detection: nproc=$(nproc), cgroup-aware=${NCPUS}" +# Set up sccache with GCS backend if credentials are available. +# SCCACHE_GCS_BUCKET is exported by the Buildkite post-checkout hook. +if [ -n "${SCCACHE_GCS_BUCKET:-}" ]; then + source ./dev-tools/setup_sccache.sh +fi + # Note: no need to clean due to the .dockerignore file # Configure the build @@ -116,3 +122,9 @@ if [ -n "${SCCACHE_PATH:-}" ]; then "$SCCACHE_PATH" --stop-server || true fi +# Print sccache stats if it was used +if [ -n "${SCCACHE_PATH:-}" ]; then + "$SCCACHE_PATH" --show-stats || true + "$SCCACHE_PATH" --stop-server || true +fi + From dcc74be0206c9f339fb9724c8cbbc4df93ac4aae Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 19 Feb 2026 17:16:52 +1300 Subject: [PATCH 11/19] [ML] Fix unity build conflicts and sccache setup issues in CI Add CMultivariateMultimodalPriorFactory.cc, CMultivariateNormalConjugateFactory.cc, and CMixtureDistribution.cc to SKIP_UNITY_BUILD_INCLUSION to fix CFactory template redefinition and TDoubleDoublePr alias conflicts in MlMathsCommon. Fix PowerShell sccache setup: --stop-server stderr output caused a terminating error due to $ErrorActionPreference="Stop". Temporarily switch to Continue. Remove Ninja Multi-Config generator from Windows CI pipeline until Ninja availability on CI agents is verified. Co-authored-by: Cursor --- lib/maths/common/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/maths/common/CMakeLists.txt b/lib/maths/common/CMakeLists.txt index 53ddaf41c..33df7d1d6 100644 --- a/lib/maths/common/CMakeLists.txt +++ b/lib/maths/common/CMakeLists.txt @@ -24,10 +24,13 @@ set_source_files_properties( CKMeansOnline1d.cc CKMostCorrelated.cc CLogNormalMeanPrecConjugate.cc + CMixtureDistribution.cc CModel.cc CMultimodalPrior.cc CMultinomialConjugate.cc CMultivariateConstantPrior.cc + CMultivariateMultimodalPriorFactory.cc + CMultivariateNormalConjugateFactory.cc CMultivariateOneOfNPrior.cc CNaiveBayes.cc CNaturalBreaksClassifier.cc From 0a7009c0d8162720a1dc24d5cdeb1febbaefd9e1 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 19 Feb 2026 17:28:20 +1300 Subject: [PATCH 12/19] [ML] Disable unity builds for MlMathsAnalytics due to symbol conflicts ASSIGN_MISSING_TO_LEFT/RIGHT and TMemoryMappedFloatVector are defined in anonymous namespaces across multiple files, causing redefinition errors when combined in unity translation units. Co-authored-by: Cursor --- lib/maths/analytics/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/maths/analytics/CMakeLists.txt b/lib/maths/analytics/CMakeLists.txt index 03f3b0bff..422d68ca8 100644 --- a/lib/maths/analytics/CMakeLists.txt +++ b/lib/maths/analytics/CMakeLists.txt @@ -16,6 +16,8 @@ set(ML_LINK_LIBRARIES MlMathsCommon MlCore) +set(CMAKE_UNITY_BUILD OFF) + ml_add_library(MlMathsAnalytics SHARED CBoostedTree.cc CBoostedTreeFactory.cc From 4bfa69cc3197f7bcd21cec54c51129cdf80a56a7 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Feb 2026 09:11:36 +1300 Subject: [PATCH 13/19] [ML] Fix remaining unity build conflicts and PCH Boost issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove from PCH — it pulls in Boost internals (boost/json, boost/container) that break on GCC when combined with unity translation units that also include boost/json.hpp. Add CMetricPopulationModel, CEventRatePopulationModel, CIndividualModel, CMetricModel, CEventRateModel, CPopulationModel, CSampleCounts, and CSimpleCountDetector to MlModel SKIP_UNITY_BUILD_INCLUSION list. Co-authored-by: Cursor --- cmake/functions.cmake | 2 -- lib/model/CMakeLists.txt | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cmake/functions.cmake b/cmake/functions.cmake index e8fe99fcc..f9801d453 100644 --- a/cmake/functions.cmake +++ b/cmake/functions.cmake @@ -206,7 +206,6 @@ function(ml_add_library _target _type) - ) endif() @@ -400,7 +399,6 @@ function(ml_add_test_executable _target) - ) endif() diff --git a/lib/model/CMakeLists.txt b/lib/model/CMakeLists.txt index 14a0f3dac..26c533ba9 100644 --- a/lib/model/CMakeLists.txt +++ b/lib/model/CMakeLists.txt @@ -26,11 +26,19 @@ set_source_files_properties( CCountingModel.cc CDataGatherer.cc CEventRateBucketGatherer.cc + CEventRateModel.cc + CEventRatePopulationModel.cc CForecastModelPersist.cc CHierarchicalResultsAggregator.cc + CIndividualModel.cc CMetricBucketGatherer.cc + CMetricModel.cc + CMetricPopulationModel.cc CModelDetailsView.cc + CPopulationModel.cc + CSampleCounts.cc CSearchKey.cc + CSimpleCountDetector.cc PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON ) From cfd1a034d3e50c08115cf3973571787c1cf77e77 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Feb 2026 09:17:50 +1300 Subject: [PATCH 14/19] [ML] Re-enable Ninja generator for Windows with diagnostic check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a ninja availability diagnostic to build_and_test.ps1 to verify whether ninja.exe is on PATH in the Windows CI agent image. Re-enable CMAKE_GENERATOR=Ninja Multi-Config in the pipeline — if ninja is not found, we'll see the diagnostic output and can fall back to installing it directly. Co-authored-by: Cursor --- .buildkite/scripts/steps/build_and_test.ps1 | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.buildkite/scripts/steps/build_and_test.ps1 b/.buildkite/scripts/steps/build_and_test.ps1 index dd6771c83..8d56ec3df 100755 --- a/.buildkite/scripts/steps/build_and_test.ps1 +++ b/.buildkite/scripts/steps/build_and_test.ps1 @@ -53,6 +53,17 @@ if (Test-Path Env:ML_DEBUG) { $DebugOption="" } +# Diagnostic: check if Ninja is available for the CMake generator +Write-Output "--- Ninja availability check ---" +$ninjaCmd = Get-Command ninja -ErrorAction SilentlyContinue +if ($ninjaCmd) { + Write-Output "ninja found: $($ninjaCmd.Source)" + & ninja --version +} else { + Write-Output "ninja NOT found on PATH" + Write-Output "PATH: $Env:PATH" +} + # Set up sccache with GCS backend if the bucket env var has been injected if (Test-Path Env:SCCACHE_GCS_BUCKET) { . "$PSScriptRoot\..\..\..\dev-tools\setup_sccache.ps1" From 540ae1c4a1f151e6ff8c5c73250e8ec071bd5d5f Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Feb 2026 09:30:09 +1300 Subject: [PATCH 15/19] [ML] Install Ninja on Windows CI and disable MlApi unity builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Download ninja.exe directly from GitHub releases when it's not already on PATH, matching the pattern used for sccache. This enables the Ninja Multi-Config generator on Windows CI for faster parallel compilation. Disable unity builds for MlApi entirely — too many anonymous-namespace symbol conflicts (toString, IS_TRAINING_FIELD_NAME, TYPICAL/ACTUAL, etc.) across numerous files make per-file exclusions impractical. Co-authored-by: Cursor --- .buildkite/scripts/steps/build_and_test.ps1 | 25 ++++++++++++++++----- lib/api/CMakeLists.txt | 13 +---------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/.buildkite/scripts/steps/build_and_test.ps1 b/.buildkite/scripts/steps/build_and_test.ps1 index 8d56ec3df..5fa5de029 100755 --- a/.buildkite/scripts/steps/build_and_test.ps1 +++ b/.buildkite/scripts/steps/build_and_test.ps1 @@ -53,16 +53,31 @@ if (Test-Path Env:ML_DEBUG) { $DebugOption="" } -# Diagnostic: check if Ninja is available for the CMake generator -Write-Output "--- Ninja availability check ---" +# Ensure Ninja is available (required for Ninja Multi-Config generator) $ninjaCmd = Get-Command ninja -ErrorAction SilentlyContinue if ($ninjaCmd) { Write-Output "ninja found: $($ninjaCmd.Source)" - & ninja --version } else { - Write-Output "ninja NOT found on PATH" - Write-Output "PATH: $Env:PATH" + $ninjaVersion = "1.12.1" + $ninjaDir = "$Env:LOCALAPPDATA\ninja" + $ninjaExe = "$ninjaDir\ninja.exe" + if (Test-Path $ninjaExe) { + Write-Output "ninja already downloaded: $ninjaExe" + } else { + Write-Output "Downloading ninja v${ninjaVersion}..." + $url = "https://github.com/ninja-build/ninja/releases/download/v${ninjaVersion}/ninja-win.zip" + $zipPath = "$Env:TEMP\ninja-win.zip" + if (-not (Test-Path $ninjaDir)) { New-Item -ItemType Directory -Path $ninjaDir | Out-Null } + (New-Object Net.WebClient).DownloadFile($url, $zipPath) + Expand-Archive -Path $zipPath -DestinationPath $ninjaDir -Force + Remove-Item $zipPath -ErrorAction SilentlyContinue + Write-Output "ninja installed: $ninjaExe" + } + if ($Env:PATH -notlike "*$ninjaDir*") { + $Env:PATH = "$ninjaDir;$Env:PATH" + } } +& ninja --version # Set up sccache with GCS backend if the bucket env var has been injected if (Test-Path Env:SCCACHE_GCS_BUCKET) { diff --git a/lib/api/CMakeLists.txt b/lib/api/CMakeLists.txt index afafd3e40..f3baa5fe1 100644 --- a/lib/api/CMakeLists.txt +++ b/lib/api/CMakeLists.txt @@ -21,18 +21,7 @@ set(ML_LINK_LIBRARIES $<$:ws2_32> ) -set_source_files_properties( - CAnomalyJob.cc - CAnomalyJobConfig.cc - CFieldDataCategorizer.cc - CForecastRunner.cc - CGlobalCategoryId.cc - CHierarchicalResultsWriter.cc - CModelSizeStatsJsonWriter.cc - CModelSnapshotJsonWriter.cc - CSingleFieldDataCategorizer.cc - PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON -) +set(CMAKE_UNITY_BUILD OFF) ml_add_library(MlApi SHARED CAnnotationJsonWriter.cc From 8c113d9171c2e50a96583924a93fdbc3bb62d5e5 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 20 Feb 2026 10:50:17 +1300 Subject: [PATCH 16/19] [ML] Add Gradle build cache for Java integration tests Enable Gradle's local build cache for the ES integration test builds and persist it to GCS between CI runs. On cache-warm builds (same ES commit), compilation drops from ~15-20 min to ~1-2 min, saving significant CI wall-clock time. - Add gradle-build-cache-init.gradle init script - Pass --build-cache --init-script to both Gradle invocations - Download/upload cache tarball from GCS bucket - Extend post-checkout hook to inject GCS credentials for Java IT steps - Install gsutil on ES test agents when needed Co-authored-by: Cursor --- .buildkite/hooks/post-checkout | 8 ++-- .buildkite/scripts/steps/run_es_tests.sh | 12 ++++++ dev-tools/gradle-build-cache-init.gradle | 17 ++++++++ dev-tools/run_es_tests.sh | 53 +++++++++++++++++++++++- 4 files changed, 84 insertions(+), 6 deletions(-) create mode 100644 dev-tools/gradle-build-cache-init.gradle diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout index f5780f5c8..b152d0d14 100644 --- a/.buildkite/hooks/post-checkout +++ b/.buildkite/hooks/post-checkout @@ -31,10 +31,10 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then export BUILDKITE_API_READ_TOKEN=$(vault read -field=token secret/ci/elastic-ml-cpp/buildkite/api_read_token 2>/dev/null || echo "") fi - # sccache with GCS backend — inject credentials for all build_test steps. - # The GCS service account key is stored in Vault. Build scripts detect - # SCCACHE_GCS_BUCKET and configure sccache automatically. - if [[ "$BUILDKITE_STEP_KEY" == build_test_* ]]; then + # GCS service account — inject credentials for build and Java IT steps. + # Build steps use it for sccache; Java IT steps use it for the Gradle + # build cache. The key is stored in Vault. + if [[ "$BUILDKITE_STEP_KEY" == build_test_* || "$BUILDKITE_STEP_KEY" == java_integration_tests_* ]]; then SCCACHE_GCS_KEY_JSON=$(vault read -field=key secret/ci/elastic-ml-cpp/sccache/gcs_service_account 2>/dev/null || echo "") if [ -n "$SCCACHE_GCS_KEY_JSON" ]; then export SCCACHE_GCS_BUCKET="elastic-ml-cpp-sccache" diff --git a/.buildkite/scripts/steps/run_es_tests.sh b/.buildkite/scripts/steps/run_es_tests.sh index d6ce2cbbe..e7d74d78a 100755 --- a/.buildkite/scripts/steps/run_es_tests.sh +++ b/.buildkite/scripts/steps/run_es_tests.sh @@ -24,6 +24,18 @@ export PR_AUTHOR=$(expr "$BUILDKITE_BRANCH" : '\(.*\):.*') export PR_SOURCE_BRANCH=$(expr "$BUILDKITE_BRANCH" : '.*:\(.*\)') export PR_TARGET_BRANCH=${BUILDKITE_PULL_REQUEST_BASE_BRANCH} +# Set up GCS credentials for Gradle build cache persistence (if available). +# The post-checkout hook writes the GCS service account key for sccache; +# reuse the same credentials for the Gradle cache bucket. +if [ -n "${SCCACHE_GCS_BUCKET:-}" ] && [ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]; then + export GRADLE_BUILD_CACHE_GCS_BUCKET="${SCCACHE_GCS_BUCKET}" + # Install gsutil if not already present + if ! command -v gsutil &>/dev/null; then + echo "--- Installing gsutil" + pip3 install --quiet gsutil 2>/dev/null || pip install --quiet gsutil 2>/dev/null || echo "Warning: failed to install gsutil" + fi +fi + mkdir -p "${IVY_REPO}/maven/org/elasticsearch/ml/ml-cpp/$VERSION" cp "build/distributions/ml-cpp-$VERSION-linux-$HARDWARE_ARCH.zip" "${IVY_REPO}/maven/org/elasticsearch/ml/ml-cpp/$VERSION/ml-cpp-$VERSION.zip" # Since this is all local, for simplicity, cheat with the dependencies/no-dependencies split diff --git a/dev-tools/gradle-build-cache-init.gradle b/dev-tools/gradle-build-cache-init.gradle new file mode 100644 index 000000000..9bf49bdf7 --- /dev/null +++ b/dev-tools/gradle-build-cache-init.gradle @@ -0,0 +1,17 @@ +/* + * Gradle init script to enable the local build cache for ES integration test + * builds. Injected via --init-script so that we don't need to modify the + * cloned Elasticsearch repository. + * + * The local build cache stores task outputs keyed on their inputs. When the + * cache directory is persisted between CI runs (e.g. via GCS), subsequent + * builds with the same ES commit get near-instant compilation. + */ + +settingsEvaluated { settings -> + settings.buildCache { + local { + enabled = true + } + } +} diff --git a/dev-tools/run_es_tests.sh b/dev-tools/run_es_tests.sh index 953408357..a9445269a 100755 --- a/dev-tools/run_es_tests.sh +++ b/dev-tools/run_es_tests.sh @@ -24,6 +24,9 @@ set -e +# Resolve the ml-cpp repo root before we cd away. +ML_CPP_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + function isCloneTargetValid { FORK_TO_CHECK="$1" BRANCH_TO_CHECK="$2" @@ -113,6 +116,52 @@ export GIT_COMMIT="$(git rev-parse HEAD)" export GIT_PREVIOUS_COMMIT="$GIT_COMMIT" IVY_REPO_URL="file://$2" -./gradlew $GRADLE_JVM_OPTS -Dbuild.ml_cpp.repo="$IVY_REPO_URL" :x-pack:plugin:ml:qa:native-multi-node-tests:javaRestTest $EXTRA_TEST_OPTS -./gradlew $GRADLE_JVM_OPTS -Dbuild.ml_cpp.repo="$IVY_REPO_URL" :x-pack:plugin:yamlRestTest --tests "org.elasticsearch.xpack.test.rest.XPackRestIT.test {p0=ml/*}" $EXTRA_TEST_OPTS + +INIT_SCRIPT="$ML_CPP_ROOT/dev-tools/gradle-build-cache-init.gradle" +GRADLE_CACHE_DIR="$HOME/.gradle/caches/build-cache-1" +CACHE_ARGS="" +if [ -f "$INIT_SCRIPT" ]; then + CACHE_ARGS="--build-cache --init-script $INIT_SCRIPT" +fi + +# Restore Gradle build cache from GCS if credentials are available. +# This lets ephemeral CI agents reuse compilation outputs from prior builds. +CACHE_KEY="gradle-build-cache-$(uname -m)" +GCS_CACHE_PATH="" +if [ -n "${GRADLE_BUILD_CACHE_GCS_BUCKET:-}" ] && [ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]; then + GCS_CACHE_PATH="gs://${GRADLE_BUILD_CACHE_GCS_BUCKET}/${CACHE_KEY}.tar.gz" + if command -v gsutil &>/dev/null; then + echo "--- Restoring Gradle build cache from $GCS_CACHE_PATH" + mkdir -p "$GRADLE_CACHE_DIR" + if gsutil -q stat "$GCS_CACHE_PATH" 2>/dev/null; then + gsutil cp "$GCS_CACHE_PATH" /tmp/gradle-cache.tar.gz \ + && tar xzf /tmp/gradle-cache.tar.gz -C "$HOME/.gradle/caches/" \ + && rm -f /tmp/gradle-cache.tar.gz \ + && echo "Gradle build cache restored ($(du -sh "$GRADLE_CACHE_DIR" 2>/dev/null | cut -f1))" \ + || echo "Warning: failed to restore Gradle build cache, continuing without it" + else + echo "No cached Gradle build cache found, will build from scratch" + fi + else + echo "gsutil not found, skipping Gradle build cache restore" + fi +fi + +./gradlew $GRADLE_JVM_OPTS $CACHE_ARGS -Dbuild.ml_cpp.repo="$IVY_REPO_URL" :x-pack:plugin:ml:qa:native-multi-node-tests:javaRestTest $EXTRA_TEST_OPTS +./gradlew $GRADLE_JVM_OPTS $CACHE_ARGS -Dbuild.ml_cpp.repo="$IVY_REPO_URL" :x-pack:plugin:yamlRestTest --tests "org.elasticsearch.xpack.test.rest.XPackRestIT.test {p0=ml/*}" $EXTRA_TEST_OPTS + +# Upload Gradle build cache to GCS for future builds. +if [ -n "$GCS_CACHE_PATH" ] && [ -d "$GRADLE_CACHE_DIR" ] && command -v gsutil &>/dev/null; then + echo "--- Uploading Gradle build cache to $GCS_CACHE_PATH" + CACHE_SIZE=$(du -sm "$GRADLE_CACHE_DIR" 2>/dev/null | cut -f1) + if [ "${CACHE_SIZE:-0}" -gt 0 ] && [ "${CACHE_SIZE:-0}" -lt 4096 ]; then + tar czf /tmp/gradle-cache.tar.gz -C "$HOME/.gradle/caches/" build-cache-1 \ + && gsutil -o "GSUtil:parallel_composite_upload_threshold=50M" cp /tmp/gradle-cache.tar.gz "$GCS_CACHE_PATH" \ + && rm -f /tmp/gradle-cache.tar.gz \ + && echo "Gradle build cache uploaded (${CACHE_SIZE}M)" \ + || echo "Warning: failed to upload Gradle build cache" + else + echo "Skipping cache upload (size=${CACHE_SIZE:-0}M, expected 1-4095M)" + fi +fi From e1471cc9937e65886bf395cd2b0d77e2cace5aef Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 13 Mar 2026 13:23:44 +1300 Subject: [PATCH 17/19] Restore .cursor/rules .mdc files Made-with: Cursor --- .cursor/rules/ml-cpp-build-system.mdc | 69 +++++++++++++++++++++ .cursor/rules/ml-cpp-coding-conventions.mdc | 54 ++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 .cursor/rules/ml-cpp-build-system.mdc create mode 100644 .cursor/rules/ml-cpp-coding-conventions.mdc diff --git a/.cursor/rules/ml-cpp-build-system.mdc b/.cursor/rules/ml-cpp-build-system.mdc new file mode 100644 index 000000000..f97b27823 --- /dev/null +++ b/.cursor/rules/ml-cpp-build-system.mdc @@ -0,0 +1,69 @@ +--- +description: ml-cpp build system knowledge — CMake, Gradle, Docker, CI +globs: CMakeLists.txt, cmake/**, build.gradle, dev-tools/**, .buildkite/** +--- + +# ml-cpp Build System + +## CMake Structure + +- Top-level `CMakeLists.txt` configures the project, includes `CTest`, detects compiler cache +- `cmake/functions.cmake` defines `ml_add_library`, `ml_add_test_executable`, `ml_add_test`, `ml_install` +- `cmake/variables.cmake` defines compiler flags per platform +- Toolchain files: `cmake/{darwin-aarch64,linux-x86_64,linux-aarch64,windows-x86_64}.cmake` +- `test/CMakeLists.txt` defines test targets: `ml_test`, `test_individually`, `build_tests`, `run_tests` + +## Important: CTest and Target Names + +- `include(CTest)` reserves the `test` target name — custom targets must not use it +- Our monolithic test target is named `ml_test` (not `test`) +- `test_individually` runs tests via CTest with parallel execution + +## Build Acceleration + +### Unity Builds (`-DCMAKE_UNITY_BUILD=ON`) +- Combines multiple source files into single translation units +- Effective on x86_64 (~41% faster), minimal on aarch64 +- Conflicts from anonymous-namespace symbols need `SKIP_UNITY_BUILD_INCLUSION` +- Disabled entirely for: `MlMathsTimeSeries`, `MlMathsAnalytics`, `MlApi`, `ml_test_maths_common`, `ml_test_api` + +### Precompiled Headers (`-DML_PCH=ON`) +- Custom option, applied in `cmake/functions.cmake` via `target_precompile_headers()` +- STL headers + `` for test targets +- Do NOT include `` — conflicts with `boost/json.hpp` on GCC + +### sccache (GCS Backend) +- `dev-tools/setup_sccache.sh` / `setup_sccache.ps1` — downloads, configures, starts +- GCS bucket: `gs://elastic-ml-cpp-sccache`, per-platform prefix (`linux-x86_64/`, etc.) +- Vault: `secret/ci/elastic-ml-cpp/sccache/gcs_service_account` +- Requires `-DCMAKE_CXX_COMPILER_LAUNCHER=sccache` — top-level CMakeLists.txt respects existing launcher + +### Compiler Launcher Precedence +If `CMAKE_CXX_COMPILER_LAUNCHER` is already set (e.g. sccache), the ccache auto-detection in `CMakeLists.txt` is skipped. + +### MSVC `/Z7` vs `/Zi` +- `/Zi`: Debug info via shared PDB (`mspdbsrv.exe`) — serializes parallel compilation +- `/Z7`: Debug info embedded in `.obj` — fully parallel, sccache-compatible +- We use `/Z7` for all Windows configurations; `/FS` flag removed as unnecessary + +## Gradle Integration + +- `build.gradle` invokes CMake for macOS and Windows builds +- `task test` calls `cmake --build ... -t ml_test` +- `task check` depends on `test` +- `testParallel` formula: `numCpus <= 4 ? 2 : Math.ceil(numCpus / 2.0)` (Unix), `2` (Windows) +- Environment `CMAKE_FLAGS` are appended to Gradle's internal cmake flags (stripping duplicate toolchain) + +## Docker Builds (Linux) + +- `dev-tools/docker/docker_entrypoint.sh` — main build/test script inside containers +- `dev-tools/docker_build.sh` / `docker_test.sh` — host orchestration +- Linux aarch64 builds run in Docker; x86_64 runs `docker_entrypoint.sh` directly +- cgroup-aware CPU detection: check `/sys/fs/cgroup/cpu.max` (cgroups v2) or `/sys/fs/cgroup/cpu/cpu.cfs_{quota,period}_us` (v1) +- `ZIP_COMPRESSION_LEVEL`: 1 for PR/debug builds, 9 for release branches + +## Test Parallelism + +- Test parallelism formula: `numCpus <= 4 ? 2 : ceil(numCpus / 2)` +- CKMostCorrelatedTest/testScale is CPU-time-sensitive — keep parallelism conservative on low-core machines +- Each test suite internally uses `ctest --parallel ` for individual test case parallelism diff --git a/.cursor/rules/ml-cpp-coding-conventions.mdc b/.cursor/rules/ml-cpp-coding-conventions.mdc new file mode 100644 index 000000000..2a56a83ac --- /dev/null +++ b/.cursor/rules/ml-cpp-coding-conventions.mdc @@ -0,0 +1,54 @@ +--- +description: ml-cpp coding conventions and cross-platform considerations +globs: "**/*.cc", "**/*.h" +--- + +# ml-cpp Coding Conventions + +## Naming + +- Classes: `CUpperCamelCase` (C prefix) +- Methods: `lowerCamelCase` +- Member variables: `m_UpperCamelCase` +- Static member variables: `ms_UpperCamelCase` +- Types: `TUpperCamelCase` (T prefix for typedefs) +- Test files: `CClassNameTest.cc` +- Namespaces: `ml::module::submodule` + +## Commit Messages + +Format: `[ML] Short description` — 1-2 sentences explaining the "why". + +## Boost Test Framework + +- Tests use `BOOST_AUTO_TEST_SUITE` / `BOOST_AUTO_TEST_CASE` +- `BOOST_TEST_DONT_PRINT_LOG_VALUE` for types without operator<< +- JUnit output: `boost_test_results.junit` per test suite +- Seeded RNG: `maths::common::CSampling::seed()` at test start + +## Cross-Platform Considerations + +### Stream/IO +- `std::istream::eof()` behaves differently across platforms +- Use `peek() == std::char_traits::eof()` for portable end-of-stream detection +- `CJsonStateRestoreTraverser::isEof()` uses both checks for portability + +### Timing in Tests +- **Never use wall-clock time** (`CStopWatch`, `CLOCK_MONOTONIC`) for performance assertions in unit tests — flaky under parallel execution +- Use `std::clock()` (CPU time) for scaling/benchmark assertions +- `std::clock()` measures process CPU time on all platforms (POSIX + Windows) + +### Temporary Files in Tests +- Use process ID (`ml::core::CProcess::instance().id()`) for unique temp names +- Do NOT use small random ranges (e.g. `1-100`) — causes collisions under parallel CTest + +### Unity Build Conflicts +- Anonymous-namespace constants (`EMPTY_STRING`, `*_TAG`) cause redefinition errors +- Fix: rename to be unique, or add file to `SKIP_UNITY_BUILD_INCLUSION` +- `BOOST_TEST_DONT_PRINT_LOG_VALUE` macros also conflict in unity builds + +## RAII Patterns + +- `std::unique_ptr` with custom deleters for resource cleanup +- Use `reset()` not `release()` + manual cleanup — avoids leaks on exception paths +- `CStateFileRemover` is the RAII helper for state file deletion From 88490ba659e1967ecfcf6c6ffdac1abe14c6ab06 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 13 Mar 2026 13:51:19 +1300 Subject: [PATCH 18/19] [ML] Clean up duplicates and fix build timing step dependencies - Remove duplicate sccache stats/stop block in docker_entrypoint.sh - Remove duplicate GCS credential injection in post-checkout hook - Remove duplicate setup_sccache.sh sourcing in docker_entrypoint.sh - Fix analyze_build_timings depends_on to use test step keys Made-with: Cursor --- .buildkite/hooks/post-checkout | 11 ----------- .buildkite/pipelines/analyze_build_timings.yml.sh | 8 ++++---- dev-tools/docker/docker_entrypoint.sh | 12 ------------ 3 files changed, 4 insertions(+), 27 deletions(-) diff --git a/.buildkite/hooks/post-checkout b/.buildkite/hooks/post-checkout index 798c22235..4f2343ae9 100644 --- a/.buildkite/hooks/post-checkout +++ b/.buildkite/hooks/post-checkout @@ -50,15 +50,4 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == ml-cpp* ]]; then export DOCKER_REGISTRY_PASSWORD=$(vault read --field=password secret/ci/elastic-ml-cpp/prod_docker_registry_credentials) fi - # Retrieve GCS service account key for sccache (compiler caching). - if [[ "$BUILDKITE_STEP_KEY" == build_test_* ]]; then - SCCACHE_GCS_KEY_JSON=$(vault read -field=key secret/ci/elastic-ml-cpp/sccache/gcs_service_account 2>/dev/null || echo "") - if [ -n "$SCCACHE_GCS_KEY_JSON" ]; then - export SCCACHE_GCS_BUCKET="elastic-ml-cpp-sccache" - export SCCACHE_GCS_KEY_FILE=$(mktemp) - echo "$SCCACHE_GCS_KEY_JSON" > "$SCCACHE_GCS_KEY_FILE" - export GOOGLE_APPLICATION_CREDENTIALS="$SCCACHE_GCS_KEY_FILE" - export SCCACHE_GCS_KEY_PATH="$SCCACHE_GCS_KEY_FILE" - fi - fi fi diff --git a/.buildkite/pipelines/analyze_build_timings.yml.sh b/.buildkite/pipelines/analyze_build_timings.yml.sh index f59f31fbb..27e188f58 100755 --- a/.buildkite/pipelines/analyze_build_timings.yml.sh +++ b/.buildkite/pipelines/analyze_build_timings.yml.sh @@ -15,10 +15,10 @@ steps: command: - "python3 .buildkite/scripts/steps/analyze_build_timings.py" depends_on: - - "build_test_linux-aarch64-RelWithDebInfo" - - "build_test_linux-x86_64-RelWithDebInfo" - - "build_test_macos-aarch64-RelWithDebInfo" - - "build_test_Windows-x86_64-RelWithDebInfo" + - "test_linux-aarch64-RelWithDebInfo" + - "test_linux-x86_64-RelWithDebInfo" + - "test_macos-aarch64-RelWithDebInfo" + - "test_Windows-x86_64-RelWithDebInfo" allow_dependency_failure: true soft_fail: true agents: diff --git a/dev-tools/docker/docker_entrypoint.sh b/dev-tools/docker/docker_entrypoint.sh index f8df03e08..8653f6742 100755 --- a/dev-tools/docker/docker_entrypoint.sh +++ b/dev-tools/docker/docker_entrypoint.sh @@ -64,12 +64,6 @@ detect_cpus() { NCPUS=$(detect_cpus) echo "CPU detection: nproc=$(nproc), cgroup-aware=${NCPUS}" -# Set up sccache with GCS backend if credentials are available. -# SCCACHE_GCS_BUCKET is exported by the Buildkite post-checkout hook. -if [ -n "${SCCACHE_GCS_BUCKET:-}" ]; then - source ./dev-tools/setup_sccache.sh -fi - # Note: no need to clean due to the .dockerignore file # Configure the build @@ -122,9 +116,3 @@ if [ -n "${SCCACHE_PATH:-}" ]; then "$SCCACHE_PATH" --stop-server || true fi -# Print sccache stats if it was used -if [ -n "${SCCACHE_PATH:-}" ]; then - "$SCCACHE_PATH" --show-stats || true - "$SCCACHE_PATH" --stop-server || true -fi - From a5bf57149acc50a825491dbc6c2a974d52f20b89 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 13 Mar 2026 14:08:13 +1300 Subject: [PATCH 19/19] [ML] Add gcloud service account activation for gsutil in ES tests The gcloud SDK's gsutil requires explicit service account activation; GOOGLE_APPLICATION_CREDENTIALS alone is not sufficient on CI agents with the gcloud SDK installed. Made-with: Cursor --- dev-tools/run_es_tests.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dev-tools/run_es_tests.sh b/dev-tools/run_es_tests.sh index a9445269a..b7f727792 100755 --- a/dev-tools/run_es_tests.sh +++ b/dev-tools/run_es_tests.sh @@ -131,6 +131,11 @@ GCS_CACHE_PATH="" if [ -n "${GRADLE_BUILD_CACHE_GCS_BUCKET:-}" ] && [ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]; then GCS_CACHE_PATH="gs://${GRADLE_BUILD_CACHE_GCS_BUCKET}/${CACHE_KEY}.tar.gz" if command -v gsutil &>/dev/null; then + # The gcloud SDK gsutil needs explicit service account activation; + # GOOGLE_APPLICATION_CREDENTIALS alone is not sufficient. + if command -v gcloud &>/dev/null; then + gcloud auth activate-service-account --key-file="$GOOGLE_APPLICATION_CREDENTIALS" 2>/dev/null || true + fi echo "--- Restoring Gradle build cache from $GCS_CACHE_PATH" mkdir -p "$GRADLE_CACHE_DIR" if gsutil -q stat "$GCS_CACHE_PATH" 2>/dev/null; then