diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2b18444..3af0123 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,8 +18,8 @@ jobs: compiler: - { name: gcc-11, cc: gcc-11, cxx: g++-11 } - { name: gcc-12, cc: gcc-12, cxx: g++-12 } - - { name: clang-14, cc: clang-14, cxx: clang++-14 } - - { name: clang-15, cc: clang-15, cxx: clang++-15 } + - { name: clang-16, cc: clang-16, cxx: clang++-16 } + - { name: clang-17, cc: clang-17, cxx: clang++-17 } - { name: msvc, cc: cl, cxx: cl } exclude: # MSVC only on Windows @@ -32,16 +32,16 @@ jobs: compiler: { name: gcc-11, cc: gcc-11, cxx: g++-11 } - os: windows-latest compiler: { name: gcc-12, cc: gcc-12, cxx: g++-12 } - # Older Clang not on Windows + # Clang not on Windows (use MSVC) - os: windows-latest - compiler: { name: clang-14, cc: clang-14, cxx: clang++-14 } + compiler: { name: clang-16, cc: clang-16, cxx: clang++-16 } - os: windows-latest - compiler: { name: clang-15, cc: clang-15, cxx: clang++-15 } + compiler: { name: clang-17, cc: clang-17, cxx: clang++-17 } # macOS uses AppleClang, exclude Linux clangs - os: macos-latest - compiler: { name: clang-14, cc: clang-14, cxx: clang++-14 } + compiler: { name: clang-16, cc: clang-16, cxx: clang++-16 } - os: macos-latest - compiler: { name: clang-15, cc: clang-15, cxx: clang++-15 } + compiler: { name: clang-17, cc: clang-17, cxx: clang++-17 } # macOS doesn't have these GCC versions easily - os: macos-latest compiler: { name: gcc-11, cc: gcc-11, cxx: g++-11 } @@ -134,7 +134,9 @@ jobs: # On Windows, ensure PATH includes DLL directories export PATH="${{ github.workspace }}/build/${{ matrix.build_type }}:${{ github.workspace }}/build/tests/${{ matrix.build_type }}:${{ github.workspace }}/build:$PATH" fi - ctest -C ${{ matrix.build_type }} --output-on-failure --parallel + # Exclude timing and benchmark tests: speedup assertions require a quiet + # dedicated machine and serial execution. Run them locally with -j1 -L timing. 
+ ctest -C ${{ matrix.build_type }} --output-on-failure --parallel -LE "timing|benchmark" - name: Benchmark (Release only) if: matrix.build_type == 'Release' @@ -238,12 +240,16 @@ jobs: run: | cd build # Run tests sequentially for coverage to avoid profile data corruption - ctest --output-on-failure + # Exclude timing tests for the same reason as the main CI job. + ctest --output-on-failure -LE "timing|benchmark" - name: Generate coverage report run: | # Explicitly use gcov-11 for coverage collection - lcov --gcov-tool gcov-11 --directory build --capture --output-file coverage.info + # --ignore-errors gcov suppresses missing .gcno errors for CMakeTmp probe + # artifacts generated during configure-time SIMD detection; those files + # have no corresponding .gcno because CMakeTmp is cleaned up after cmake. + lcov --gcov-tool gcov-11 --directory build --capture --output-file coverage.info --ignore-errors gcov,mismatch lcov --remove coverage.info '/usr/*' '*/tests/*' '*/examples/*' --output-file coverage.info --ignore-errors unused lcov --list coverage.info diff --git a/CMakeLists.txt b/CMakeLists.txt index 3992ed7..efe6549 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,17 @@ endif() # Verbose messaging control option(LIBSTATS_VERBOSE_BUILD "Enable verbose build messages for debugging" OFF) +# Compiler selection: opt in to Homebrew LLVM on macOS. WARNING: enabling this requires ALL +# consumers (shared libraries, Python extensions, test executables) to link the same Homebrew +# libc++. If any consumer uses Apple libc++ instead, C++ exceptions and std::type_info records will +# not match across the boundary, causing silent catch failures. The default (OFF) uses system +# AppleClang with Apple libc++ and GCD for parallelism, which is ABI-safe for all consumers. Only +# set this to ON if you own the entire toolchain and have verified ABI consistency end-to-end. 
+option( + LIBSTATS_USE_HOMEBREW_LLVM + "Use Homebrew LLVM instead of system AppleClang on macOS (requires matching libc++ in all consumers)" + OFF) + # Threading system preference control option(LIBSTATS_FORCE_TBB "Force TBB usage even on platforms with native threading (e.g., GCD on macOS)" OFF) @@ -38,9 +49,11 @@ if(CMAKE_GENERATOR STREQUAL "Unix Makefiles" OR CMAKE_GENERATOR STREQUAL "MinGW endif() # Set parallel options for CMake build command (for use with cmake --build) -set(CMAKE_BUILD_PARALLEL_LEVEL - ${CPU_COUNT} - CACHE STRING "Number of parallel build jobs" FORCE) +if(NOT DEFINED CMAKE_BUILD_PARALLEL_LEVEL) + set(CMAKE_BUILD_PARALLEL_LEVEL + ${CPU_COUNT} + CACHE STRING "Number of parallel build jobs") +endif() if(LIBSTATS_VERBOSE_BUILD) message(STATUS "Set CMAKE_BUILD_PARALLEL_LEVEL=${CPU_COUNT} for cmake --build") endif() @@ -120,18 +133,24 @@ function(detect_homebrew_llvm) endif() endfunction() -# Apply Homebrew LLVM detection (macOS/Linux only) -if(APPLE OR (UNIX AND NOT WIN32)) +# Apply Homebrew LLVM detection (macOS only — Homebrew paths are macOS-specific) +if(APPLE) detect_homebrew_llvm() - if(USING_HOMEBREW_LLVM) - # Configure Homebrew LLVM + if(USING_HOMEBREW_LLVM AND LIBSTATS_USE_HOMEBREW_LLVM) + # Opt-in: use Homebrew LLVM. All consumers must link the same Homebrew libc++. set(CMAKE_C_COMPILER "${LLVM_ROOT}/bin/clang") set(CMAKE_CXX_COMPILER "${LLVM_ROOT}/bin/clang++") set(CMAKE_PREFIX_PATH "${LLVM_ROOT}") - message(STATUS "Using Homebrew LLVM from ${LLVM_ROOT}") + message(STATUS "Using Homebrew LLVM from ${LLVM_ROOT} (LIBSTATS_USE_HOMEBREW_LLVM=ON)") else() - # Fallback to system compiler + # Default: system AppleClang with Apple libc++ - ABI-safe for all consumers. 
+ if(USING_HOMEBREW_LLVM) + message( + STATUS + "Homebrew LLVM found but not used (LIBSTATS_USE_HOMEBREW_LLVM=OFF) - using system AppleClang for ABI safety" + ) + endif() find_program(CMAKE_C_COMPILER clang) find_program(CMAKE_CXX_COMPILER clang++) if(LIBSTATS_VERBOSE_BUILD) @@ -224,9 +243,9 @@ if(APPLE) # macOS specific configuration set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") - if(USING_HOMEBREW_LLVM) - # Homebrew LLVM specific configuration - Force LLVM libc++ to get C++20 execution policies - # Note: rpath will be set per-target to avoid duplication warnings + if(USING_HOMEBREW_LLVM AND LIBSTATS_USE_HOMEBREW_LLVM) + # Opt-in Homebrew LLVM configuration: force LLVM libc++ headers for std::execution policies. + # Note: rpath will be set per-target to avoid duplication warnings. # CRITICAL: Include LLVM libc++ headers BEFORE system headers to get C++20 execution # policies @@ -241,8 +260,9 @@ if(APPLE) # Enable experimental PSTL support in LLVM libc++ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_LIBCPP_HAS_EXPERIMENTAL_PSTL=1") else() - # System compiler configuration - use system libc++ - message(STATUS "Using system libc++ with Apple Clang") + # Default: system AppleClang with Apple libc++. Parallelism uses GCD (already implemented in + # parallel_execution.h). ABI-safe for all consumers. 
+ message(STATUS "Using system libc++ with Apple Clang (GCD parallel path active)") endif() # Comprehensive Threading System Detection with Caching Cache results to avoid repeated @@ -274,7 +294,9 @@ if(APPLE) if(TBB_FOUND) include_directories(${TBB_INCLUDE_DIRS}) link_directories(${TBB_LIBRARY_DIRS}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TBB_CFLAGS_OTHER}") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} ${TBB_CFLAGS_OTHER}" + PARENT_SCOPE) set(LIBSTATS_HAS_TBB TRUE) message( STATUS " ✓ TBB found via pkg-config - parallel execution policies enhanced") @@ -457,7 +479,9 @@ elseif(UNIX AND NOT APPLE) if(TBB_FOUND) include_directories(${TBB_INCLUDE_DIRS}) link_directories(${TBB_LIBRARY_DIRS}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TBB_CFLAGS_OTHER}") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} ${TBB_CFLAGS_OTHER}" + PARENT_SCOPE) set(LIBSTATS_HAS_TBB TRUE CACHE BOOL "Intel TBB support available") @@ -690,15 +714,6 @@ elseif(WIN32) # Execute comprehensive threading detection for Windows detect_threading_systems_windows() - # Find GTest using find_package (no Homebrew or pkg-config on Windows) - find_package(GTest QUIET) - if(GTest_FOUND) - set(GTEST_FOUND TRUE) - message(STATUS "GTest found via find_package (Windows)") - else() - set(GTEST_FOUND FALSE) - message(STATUS "GTest not found - GTest-based tests will be skipped (Windows)") - endif() endif() # SIMD feature detection and compilation flags Use our comprehensive SIMD detection system @@ -706,9 +721,6 @@ include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/SIMDDetection.cmake") # Set options for SIMD detection behavior option(LIBSTATS_ENABLE_RUNTIME_CHECKS "Enable runtime CPU checks even when cross-compiling" OFF) -option(LIBSTATS_CONSERVATIVE_SIMD "Use conservative SIMD settings (disable newer instruction sets)" - OFF) - # Perform comprehensive SIMD detection detect_simd_features() @@ -744,7 +756,9 @@ function(detect_tbb_unified) if(TBB_FOUND) include_directories(${TBB_INCLUDE_DIRS}) 
link_directories(${TBB_LIBRARY_DIRS}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TBB_CFLAGS_OTHER}") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} ${TBB_CFLAGS_OTHER}" + PARENT_SCOPE) set(LIBSTATS_HAS_TBB TRUE) if(LIBSTATS_VERBOSE_BUILD) message(STATUS " ✓ TBB found via pkg-config") @@ -945,11 +959,6 @@ set(LIBSTATS_MSVC_ENHANCED_WARNINGS /we4239 # Nonstandard extension used (can catch duplicate definitions) ) -# MSVC-specific linker flags for ODR detection -set(LIBSTATS_MSVC_ODR_LINKER_FLAGS /FORCE:MULTIPLE # Force link even with multiply-defined symbols - # (to catch ODR issues) -) - # Optimization levels for different build types set(LIBSTATS_OPT_NONE_UNIX -O0) set(LIBSTATS_OPT_LIGHT_UNIX -O1) @@ -1470,9 +1479,11 @@ endif() # Create SIMD interface target for modern CMake approach create_simd_interface_target() +# Apply per-source-file SIMD compile flags once (file-global properties). +apply_simd_source_flags() -# Configure SIMD compilation for all object libraries and final targets This ensures SIMD-specific -# compile flags are applied correctly to each component +# Link the SIMD interface target to every object library and final library so all TUs receive the +# LIBSTATS_HAS_* compile definitions. 
configure_simd_target(libstats_foundation_obj) configure_simd_target(libstats_core_utilities_obj) configure_simd_target(libstats_platform_obj) @@ -1501,60 +1512,60 @@ if(LIBSTATS_BUILD_TESTS) # Create tests directory in build folder file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests) - # Find GTest (Homebrew installation) First, try to use Homebrew's GTest installation directly - # Check both Intel and ARM64 paths - set(GTEST_ROOT_INTEL "/usr/local/opt/googletest") - set(GTEST_ROOT_ARM "/opt/homebrew/opt/googletest") - - # Determine which Homebrew path to use - if(EXISTS "${GTEST_ROOT_ARM}/lib/cmake/GTest") - set(GTEST_ROOT "${GTEST_ROOT_ARM}") - message(STATUS "Using ARM64 Homebrew GTest path") - elseif(EXISTS "${GTEST_ROOT_INTEL}/lib/cmake/GTest") - set(GTEST_ROOT "${GTEST_ROOT_INTEL}") - message(STATUS "Using Intel Homebrew GTest path") - else() - set(GTEST_ROOT "${GTEST_ROOT_ARM}") - message(STATUS "Using default ARM64 Homebrew GTest path (may not exist)") - endif() - - set(CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH};${GTEST_ROOT}") - - # Try find_package first with explicit path - find_package(GTest QUIET HINTS ${GTEST_ROOT}/lib/cmake/GTest) + # GTest detection — three-step: generic find_package, then Apple/Homebrew probe, then + # FetchContent as a self-contained fallback (matches libhmm test pattern). + # Step 1: plain find_package — covers vcpkg, system installs, and previously cached results. 
+ if(NOT GTest_FOUND) + find_package(GTest QUIET) + endif() if(GTest_FOUND OR TARGET GTest::gtest) set(GTEST_FOUND TRUE) message(STATUS "GTest found via find_package") - else() - # Fallback to pkg-config - find_package(PkgConfig QUIET) - if(PkgConfig_FOUND) - # Set PKG_CONFIG_PATH to include Homebrew's pkgconfig directory - set(ENV{PKG_CONFIG_PATH} "${GTEST_ROOT}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}") - pkg_check_modules(GTEST QUIET gtest) - pkg_check_modules(GTEST_MAIN QUIET gtest_main) - if(GTEST_FOUND) - message(STATUS "GTest found via pkg-config") - endif() - endif() + endif() - # Manual fallback if both methods fail - if(NOT GTEST_FOUND) - # Check if the Homebrew installation exists manually - if(EXISTS "${GTEST_ROOT}/lib/libgtest.a" AND EXISTS - "${GTEST_ROOT}/include/gtest/gtest.h") + # Step 2: macOS only — probe architecture-appropriate Homebrew path. + if(NOT GTEST_FOUND AND APPLE) + if(EXISTS "/opt/homebrew/opt/googletest/lib/cmake/GTest") + set(GTEST_ROOT "/opt/homebrew/opt/googletest") + elseif(EXISTS "/usr/local/opt/googletest/lib/cmake/GTest") + set(GTEST_ROOT "/usr/local/opt/googletest") + endif() + if(DEFINED GTEST_ROOT) + find_package(GTest QUIET HINTS "${GTEST_ROOT}/lib/cmake/GTest") + if(GTest_FOUND OR TARGET GTest::gtest) + set(GTEST_FOUND TRUE) + message(STATUS "GTest found via Homebrew at ${GTEST_ROOT}") + elseif(EXISTS "${GTEST_ROOT}/lib/libgtest.a" AND EXISTS + "${GTEST_ROOT}/include/gtest/gtest.h") set(GTEST_FOUND TRUE) set(GTEST_INCLUDE_DIRS "${GTEST_ROOT}/include") set(GTEST_LIBRARIES "${GTEST_ROOT}/lib/libgtest.a") set(GTEST_MAIN_LIBRARIES "${GTEST_ROOT}/lib/libgtest_main.a") message(STATUS "GTest found manually at ${GTEST_ROOT}") - else() - message(STATUS "GTest not found - GTest-based tests will be skipped") endif() endif() endif() + # Step 3: FetchContent fallback — self-contained for CI and machines without a system GTest. 
+ if(NOT GTEST_FOUND + AND NOT TARGET GTest::gtest + AND NOT TARGET gtest) + message(STATUS "GTest not found locally - fetching via FetchContent") + include(FetchContent) + FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG v1.17.0 + GIT_SHALLOW TRUE) + # Prevent GTest from overriding the project's CRT choice on Windows. + set(gtest_force_shared_crt + ON + CACHE BOOL "" FORCE) + FetchContent_MakeAvailable(googletest) + set(GTEST_FOUND TRUE) + endif() + # Common test configuration function - reduces code duplication across test types function(configure_common_test_settings TEST_NAME) # Set output directory to build/tests @@ -1887,8 +1898,12 @@ if(LIBSTATS_BUILD_TESTS) # missing targets. if(GTEST_FOUND) set_tests_properties( - test_gaussian_enhanced test_chi_squared_enhanced test_student_t_enhanced - test_beta_enhanced test_performance_dispatcher + test_math_comprehensive # vectorized math speedup assertions + test_gaussian_enhanced + test_chi_squared_enhanced + test_student_t_enhanced + test_beta_enhanced + test_performance_dispatcher test_system_capabilities # runs live SIMD/threading/bandwidth benchmarks PROPERTIES LABELS "timing") if(LIBSTATS_HAS_REQUIRES_EXPRESSIONS) diff --git a/cmake/DetectAVX512.cmake b/cmake/DetectAVX512.cmake deleted file mode 100644 index 5e99ea4..0000000 --- a/cmake/DetectAVX512.cmake +++ /dev/null @@ -1,203 +0,0 @@ -# DetectAVX512.cmake - Advanced AVX-512 detection and configuration This module provides -# comprehensive AVX-512 support detection for libstats - -# Options for AVX-512 control -option(LIBSTATS_ENABLE_AVX512 "Enable AVX-512 support if available" ON) -option(LIBSTATS_FORCE_AVX512 "Force AVX-512 compilation even without runtime detection" OFF) -option(LIBSTATS_TEST_AVX512_COMPILATION "Test AVX-512 compilation during configuration" ON) - -# Function to test AVX-512 compilation -function(test_avx512_compilation) - if(NOT LIBSTATS_TEST_AVX512_COMPILATION) - return() - endif() - 
- message(STATUS "Testing AVX-512 compilation support...") - - # Create test source - set(AVX512_TEST_SOURCE - " - #include - int main() { - __m512d a = _mm512_setzero_pd(); - __m512d b = _mm512_set1_pd(1.0); - __m512d c = _mm512_add_pd(a, b); - __m512d d = _mm512_fmadd_pd(a, b, c); - double result[8]; - _mm512_storeu_pd(result, d); - return (int)result[0]; - } - ") - - # Test compilation with different flag combinations - set(AVX512_FLAGS_TO_TEST - "-mavx512f" "-mavx512f -mavx512dq" "-mavx512f -mavx512dq -mavx512bw -mavx512vl" - "-march=skylake-avx512" "-march=native") - - foreach(flags ${AVX512_FLAGS_TO_TEST}) - string(REPLACE " " "_" flag_name "${flags}") - string(REPLACE "-" "_" flag_name "${flag_name}") - - try_compile( - AVX512_COMPILE_${flag_name} ${CMAKE_BINARY_DIR}/cmake_temp - SOURCES ${CMAKE_BINARY_DIR}/cmake_temp/avx512_test.cpp - CMAKE_FLAGS "-DCMAKE_CXX_FLAGS=${flags}") - - if(AVX512_COMPILE_${flag_name}) - message(STATUS " ✅ AVX-512 compiles with flags: ${flags}") - set(LIBSTATS_AVX512_COMPILE_FLAGS - "${flags}" - PARENT_SCOPE) - set(LIBSTATS_AVX512_COMPILATION_SUPPORTED - TRUE - PARENT_SCOPE) - break() - else() - message(STATUS " ❌ AVX-512 failed with flags: ${flags}") - endif() - endforeach() - - # Write test source to file - file(WRITE ${CMAKE_BINARY_DIR}/cmake_temp/avx512_test.cpp "${AVX512_TEST_SOURCE}") -endfunction() - -# Function to detect AVX-512 runtime support -function(detect_avx512_runtime) - message(STATUS "Checking for AVX-512 runtime support...") - - # Create runtime detection program - set(AVX512_RUNTIME_TEST - " - #include - #include - - #ifdef _WIN32 - #include - #endif - - bool check_avx512_support() { - unsigned int eax, ebx, ecx, edx; - - // Check if CPUID is supported - #ifdef _WIN32 - int cpuinfo[4]; - __cpuid(cpuinfo, 0); - if (cpuinfo[0] < 7) return false; - - __cpuidex(cpuinfo, 7, 0); - return (cpuinfo[1] & (1 << 16)) != 0; // AVX-512F - #else - __asm__ (\"cpuid\" : \"=a\"(eax), \"=b\"(ebx), \"=c\"(ecx), \"=d\"(edx) : 
\"a\"(0)); - if (eax < 7) return false; - - __asm__ (\"cpuid\" : \"=a\"(eax), \"=b\"(ebx), \"=c\"(ecx), \"=d\"(edx) : \"a\"(7), \"c\"(0)); - return (ebx & (1 << 16)) != 0; // AVX-512F - #endif - } - - int main() { - if (check_avx512_support()) { - std::cout << \"AVX-512 runtime support: YES\" << std::endl; - return 0; - } else { - std::cout << \"AVX-512 runtime support: NO\" << std::endl; - return 1; - } - } - ") - - # Write and try to run the test - file(WRITE ${CMAKE_BINARY_DIR}/cmake_temp/avx512_runtime_test.cpp "${AVX512_RUNTIME_TEST}") - - try_compile( - AVX512_RUNTIME_COMPILE ${CMAKE_BINARY_DIR}/cmake_temp - SOURCES ${CMAKE_BINARY_DIR}/cmake_temp/avx512_runtime_test.cpp - CMAKE_FLAGS "-DCMAKE_CXX_FLAGS=${LIBSTATS_AVX512_COMPILE_FLAGS}" - COPY_FILE ${CMAKE_BINARY_DIR}/cmake_temp/avx512_runtime_test) - - if(AVX512_RUNTIME_COMPILE) - execute_process( - COMMAND ${CMAKE_BINARY_DIR}/cmake_temp/avx512_runtime_test - RESULT_VARIABLE AVX512_RUNTIME_RESULT - OUTPUT_VARIABLE AVX512_RUNTIME_OUTPUT - ERROR_QUIET) - - if(AVX512_RUNTIME_RESULT EQUAL 0) - message(STATUS " ✅ ${AVX512_RUNTIME_OUTPUT}") - set(LIBSTATS_AVX512_RUNTIME_SUPPORTED - TRUE - PARENT_SCOPE) - else() - message(STATUS " ❌ ${AVX512_RUNTIME_OUTPUT}") - set(LIBSTATS_AVX512_RUNTIME_SUPPORTED - FALSE - PARENT_SCOPE) - endif() - else() - message(STATUS " ❌ Failed to compile AVX-512 runtime test") - set(LIBSTATS_AVX512_RUNTIME_SUPPORTED - FALSE - PARENT_SCOPE) - endif() -endfunction() - -# Main AVX-512 detection logic -if(LIBSTATS_ENABLE_AVX512 OR LIBSTATS_FORCE_AVX512) - message(STATUS "=== AVX-512 Detection ===") - - # Ensure temp directory exists - file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/cmake_temp) - - # Test compilation support - test_avx512_compilation() - - if(LIBSTATS_AVX512_COMPILATION_SUPPORTED OR LIBSTATS_FORCE_AVX512) - # Test runtime support (only if we can compile) - if(NOT CMAKE_CROSSCOMPILING AND NOT LIBSTATS_FORCE_AVX512) - detect_avx512_runtime() - else() - message(STATUS "Skipping AVX-512 
runtime detection (cross-compiling or forced)") - set(LIBSTATS_AVX512_RUNTIME_SUPPORTED TRUE) - endif() - - # Set final configuration - if(LIBSTATS_AVX512_RUNTIME_SUPPORTED OR LIBSTATS_FORCE_AVX512) - set(LIBSTATS_HAS_AVX512 - TRUE - CACHE BOOL "AVX-512 support available" FORCE) - message(STATUS "✅ AVX-512 support enabled") - - if(LIBSTATS_AVX512_COMPILE_FLAGS) - message(STATUS "AVX-512 compile flags: ${LIBSTATS_AVX512_COMPILE_FLAGS}") - endif() - else() - set(LIBSTATS_HAS_AVX512 - FALSE - CACHE BOOL "AVX-512 support not available" FORCE) - message(STATUS "❌ AVX-512 runtime not supported") - endif() - else() - set(LIBSTATS_HAS_AVX512 - FALSE - CACHE BOOL "AVX-512 compilation not supported" FORCE) - message(STATUS "❌ AVX-512 compilation not supported") - endif() -else() - set(LIBSTATS_HAS_AVX512 - FALSE - CACHE BOOL "AVX-512 support disabled" FORCE) - message(STATUS "AVX-512 support disabled by configuration") -endif() - -# Export results for use in main CMakeLists.txt -if(LIBSTATS_HAS_AVX512) - add_compile_definitions(LIBSTATS_HAS_AVX512) - if(LIBSTATS_AVX512_COMPILE_FLAGS) - # These flags will be applied to AVX-512 specific files - set(CMAKE_CXX_FLAGS_AVX512 - "${LIBSTATS_AVX512_COMPILE_FLAGS}" - CACHE STRING "Flags for AVX-512 compilation" FORCE) - endif() -endif() - -message(STATUS "========================") diff --git a/cmake/SIMDDetection.cmake b/cmake/SIMDDetection.cmake index 0b48389..9c0b71c 100644 --- a/cmake/SIMDDetection.cmake +++ b/cmake/SIMDDetection.cmake @@ -506,11 +506,21 @@ bool test_avx2() { "SIMDDetection: check_cxx_compiler_flag('/arch:AVX512') result = ${COMPILER_SUPPORTS_AVX512}" ) if(COMPILER_SUPPORTS_AVX512) - test_runtime_cpu_feature( - "avx512" - "#include \\nbool test_avx512() {\\n __m512d a = _mm512_set1_pd(1.0);\\n __m512d b = _mm512_set1_pd(2.0);\\n __m512d c = _mm512_add_pd(a, b);\\n double result[8];\\n _mm512_storeu_pd(result, c);\\n for(int i=0; i<8; ++i) if(result[i] != 3.0) return false;\\n return true;\\n}" - 
RUNTIME_SUPPORTS_AVX512 - "/arch:AVX512") + # check_cxx_compiler_flag only tests that MSVC accepts the flag syntax, not + # that the build machine's CPU can execute the resulting instructions. + # Use check_cxx_source_runs (compile + execute) to verify actual CPU support. + include(CheckCXXSourceRuns) + set(_avx512_saved_req_flags "${CMAKE_REQUIRED_FLAGS}") + set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") + check_cxx_source_runs( + "#include <immintrin.h> +int main() { + __m512d x = _mm512_setzero_pd(); + (void)x; + return 0; +}" + RUNTIME_SUPPORTS_AVX512) + set(CMAKE_REQUIRED_FLAGS "${_avx512_saved_req_flags}") if(RUNTIME_SUPPORTS_AVX512) set(LIBSTATS_HAS_AVX512 TRUE @@ -531,9 +541,9 @@ bool test_avx2() { set(LIBSTATS_SIMD_DEFINITIONS "${_definitions}" CACHE INTERNAL "List of SIMD compile definitions" FORCE) - message(STATUS "SIMD: AVX-512 enabled (compiler + runtime)") + message(STATUS "SIMD: AVX-512 enabled (compiler + runtime, CPU verified)") else() - message(STATUS "SIMD: AVX-512 disabled (runtime check failed)") + message(STATUS "SIMD: AVX-512 disabled (CPU does not support it)") else() message(STATUS "SIMD: AVX-512 disabled (compiler not supported)") @@ -543,7 +553,16 @@ bool test_avx2() { if(COMPILER_SUPPORTS_AVX512) test_runtime_cpu_feature( "avx512" - "#include <immintrin.h>\\nbool test_avx512() {\\n __m512d a = _mm512_set1_pd(1.0);\\n __m512d b = _mm512_set1_pd(2.0);\\n __m512d c = _mm512_add_pd(a, b);\\n double result[8];\\n _mm512_storeu_pd(result, c);\\n for(int i=0; i<8; ++i) if(result[i] != 3.0) return false;\\n return true;\\n}" + "#include <immintrin.h> +bool test_avx512() { + __m512d a = _mm512_set1_pd(1.0); + __m512d b = _mm512_set1_pd(2.0); + __m512d c = _mm512_add_pd(a, b); + double result[8]; + _mm512_storeu_pd(result, c); + for(int i=0; i<8; ++i) if(result[i] != 3.0) return false; + return true; +}" RUNTIME_SUPPORTS_AVX512 "-mavx512f") if(RUNTIME_SUPPORTS_AVX512) @@ -722,8 +741,9 @@ function(create_simd_interface_target) message(STATUS "SIMD interface target created with 
definitions: ${_definitions}") endfunction() -# Function to configure SIMD compilation for a target using modern CMake approach -function(configure_simd_target TARGET_NAME) +# Apply per-source-file SIMD compile flags. These are file-global properties and need to be set +# only once; call this function once after create_simd_interface_target(). +function(apply_simd_source_flags) get_property( _sse2 CACHE LIBSTATS_HAS_SSE2 @@ -745,12 +765,6 @@ function(configure_simd_target TARGET_NAME) CACHE LIBSTATS_HAS_NEON PROPERTY VALUE) - # Link to SIMD interface target for definitions (modern CMake approach) - if(TARGET libstats_simd_interface) - target_link_libraries(${TARGET_NAME} PRIVATE libstats::simd) - endif() - - # Configure source-specific SIMD compilation flags if(_sse2) if(MSVC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND WIN32)) set_source_files_properties("${CMAKE_CURRENT_SOURCE_DIR}/src/simd_sse2.cpp" @@ -801,3 +815,11 @@ function(configure_simd_target TARGET_NAME) endif() endif() endfunction() + +# Link the SIMD interface target (with its compile definitions) to a specific library or object +# target. Call this for every target that compiles any source in the project. +function(configure_simd_target TARGET_NAME) + if(TARGET libstats_simd_interface) + target_link_libraries(${TARGET_NAME} PRIVATE libstats::simd) + endif() +endfunction() diff --git a/include/common/utility_common.h b/include/common/utility_common.h index 170a560..c4e1258 100644 --- a/include/common/utility_common.h +++ b/include/common/utility_common.h @@ -39,6 +39,19 @@ #else #define LIBSTATS_NEEDS_CATALINA_CONCEPT_SYNTAX_FALLBACK 0 #endif + +// C++20 branch-prediction attributes: [[likely]] and [[unlikely]]. +// AppleClang 12 (Catalina) does not implement these attributes and emits +// -Wunknown-attributes. Use __has_cpp_attribute for a standard, portable check +// rather than a compiler-version guard so that any future compiler that adds +// support gets the attributes automatically. 
+#if __has_cpp_attribute(likely) + #define LIBSTATS_LIKELY [[likely]] + #define LIBSTATS_UNLIKELY [[unlikely]] +#else + #define LIBSTATS_LIKELY + #define LIBSTATS_UNLIKELY +#endif #include #include #include // Exception types diff --git a/include/core/math_utils.h b/include/core/math_utils.h index 46801cf..e76007e 100644 --- a/include/core/math_utils.h +++ b/include/core/math_utils.h @@ -465,13 +465,16 @@ namespace detail { * @return log(1 + exp(x)) */ [[nodiscard]] inline double log1pexp(double x) noexcept { - if (x > detail::LOG1PEXP_LARGE_THRESHOLD) [[likely]] { - return x; // exp(x) dominates - } else if (x > detail::LOG1PEXP_SMALL_THRESHOLD) { + if (x > detail::LOG1PEXP_LARGE_THRESHOLD) + LIBSTATS_LIKELY { + return x; // exp(x) dominates + } + else if (x > detail::LOG1PEXP_SMALL_THRESHOLD) { return std::log1p(std::exp(x)); - } else [[unlikely]] { - return std::exp(x); // 1 + exp(x) ≈ 1 - } + } else + LIBSTATS_UNLIKELY { + return std::exp(x); // 1 + exp(x) ≈ 1 + } } /** @@ -480,9 +483,11 @@ namespace detail { * @return log(exp(x) - 1) */ [[nodiscard]] inline double logexpm1(double x) noexcept { - if (x > detail::LOG1PEXP_LARGE_THRESHOLD) [[likely]] { - return x; // exp(x) dominates - } else { + if (x > detail::LOG1PEXP_LARGE_THRESHOLD) + LIBSTATS_LIKELY { + return x; // exp(x) dominates + } + else { return std::log(std::expm1(x)); } } @@ -494,9 +499,11 @@ namespace detail { * @return log(x + y) */ [[nodiscard]] inline double log_sum_exp(double log_x, double log_y) noexcept { - if (log_x > log_y) [[likely]] { - return log_x + std::log1p(std::exp(log_y - log_x)); - } else { + if (log_x > log_y) + LIBSTATS_LIKELY { + return log_x + std::log1p(std::exp(log_y - log_x)); + } + else { return log_y + std::log1p(std::exp(log_x - log_y)); } } @@ -522,9 +529,10 @@ LIBSTATS_CONSTRAINED_NODISCARD constexpr bool is_safe_float(T x) noexcept { template requires FloatingPoint LIBSTATS_CONSTRAINED_NODISCARD constexpr T clamp_safe(T x, T min_val, T max_val) noexcept { - if 
(std::isnan(x)) [[unlikely]] { - return min_val; - } + if (std::isnan(x)) + LIBSTATS_UNLIKELY { + return min_val; + } return std::clamp(x, min_val, max_val); } diff --git a/src/cpu_detection.cpp b/src/cpu_detection.cpp index 4fe71e8..c07e7eb 100644 --- a/src/cpu_detection.cpp +++ b/src/cpu_detection.cpp @@ -1,14 +1,14 @@ // CRITICAL: Ensure CPU detection code uses NO advanced SIMD instructions // This file detects CPU features and must not use the features it's detecting! // Use compiler-specific approaches to disable SIMD in CPU detection code -#if (defined(__clang__) && \ +#if (defined(__clang__) && \ (defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86))) // Clang: use function attributes to disable SIMD per function #define CPU_DETECTION_NO_SIMD \ __attribute__(( \ target("no-avx512f,no-avx512cd,no-avx512bw,no-avx512dq,no-avx512vl,no-avx2,no-avx,no-" \ "sse4.2,no-sse4.1,no-ssse3,no-sse3"))) -#elif (defined(__GNUC__) && \ +#elif (defined(__GNUC__) && \ (defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86))) // GCC: use pragma to disable SIMD globally for this file #pragma GCC push_options @@ -451,16 +451,18 @@ Features detect_x86_features() { features.avx512f = (ebx & (1 << 16)) != 0; } - // Get brand string if available + // Get brand string if available. Each CPUID call returns 4 × 4-byte registers + // (eax, ebx, ecx, edx) = 16 bytes of brand string per call, three calls total. 
safe_cpuid(0x80000000, 0, eax, ebx, ecx, edx); if (eax >= 0x80000004) { + uint32_t regs[4]; char brand[49] = {0}; - safe_cpuid(0x80000002, 0, eax, ebx, ecx, edx); - memcpy(brand, &eax, 16); - safe_cpuid(0x80000003, 0, eax, ebx, ecx, edx); - memcpy(brand + 16, &eax, 16); - safe_cpuid(0x80000004, 0, eax, ebx, ecx, edx); - memcpy(brand + 32, &eax, 16); + safe_cpuid(0x80000002, 0, regs[0], regs[1], regs[2], regs[3]); + memcpy(brand, regs, 16); + safe_cpuid(0x80000003, 0, regs[0], regs[1], regs[2], regs[3]); + memcpy(brand + 16, regs, 16); + safe_cpuid(0x80000004, 0, regs[0], regs[1], regs[2], regs[3]); + memcpy(brand + 32, regs, 16); features.brand = brand; } @@ -1021,7 +1023,7 @@ bool is_modern_intel() { } // namespace stats // Restore original compiler SIMD settings (only needed for GCC) -#if (defined(__GNUC__) && !defined(__clang__) && \ +#if (defined(__GNUC__) && !defined(__clang__) && \ (defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86))) #pragma GCC pop_options #endif diff --git a/src/discrete.cpp b/src/discrete.cpp index b8002fa..f8df0eb 100644 --- a/src/discrete.cpp +++ b/src/discrete.cpp @@ -2266,11 +2266,6 @@ void DiscreteDistribution::getCumulativeProbability(std::span valu void DiscreteDistribution::getProbabilityWithStrategy(std::span values, std::span results, detail::Strategy strategy) const { - // GPU acceleration fallback - GPU implementation not yet available, use optimal CPU strategy - if (strategy == detail::Strategy::WORK_STEALING) { - strategy = detail::Strategy::WORK_STEALING; - } - detail::DispatchUtils::executeWithStrategy( *this, values, results, strategy, [](const DiscreteDistribution& dist, double value) { return dist.getProbability(value); }, @@ -2418,11 +2413,6 @@ void DiscreteDistribution::getProbabilityWithStrategy(std::span va void DiscreteDistribution::getLogProbabilityWithStrategy(std::span values, std::span results, detail::Strategy strategy) const { - // GPU acceleration fallback - GPU implementation not 
yet available, use optimal CPU strategy - if (strategy == detail::Strategy::WORK_STEALING) { - strategy = detail::Strategy::WORK_STEALING; - } - detail::DispatchUtils::executeWithStrategy( *this, values, results, strategy, [](const DiscreteDistribution& dist, double value) { @@ -2588,11 +2578,6 @@ void DiscreteDistribution::getLogProbabilityWithStrategy(std::span void DiscreteDistribution::getCumulativeProbabilityWithStrategy(std::span values, std::span results, detail::Strategy strategy) const { - // GPU acceleration fallback - GPU implementation not yet available, use optimal CPU strategy - if (strategy == detail::Strategy::WORK_STEALING) { - strategy = detail::Strategy::WORK_STEALING; - } - detail::DispatchUtils::executeWithStrategy( *this, values, results, strategy, [](const DiscreteDistribution& dist, double value) { diff --git a/src/exponential.cpp b/src/exponential.cpp index 9de8664..81b9405 100644 --- a/src/exponential.cpp +++ b/src/exponential.cpp @@ -1843,11 +1843,6 @@ void ExponentialDistribution::getCumulativeProbability(std::span v void ExponentialDistribution::getProbabilityWithStrategy(std::span values, std::span results, detail::Strategy strategy) const { - // GPU acceleration fallback - GPU implementation not yet available, use optimal CPU strategy - if (strategy == detail::Strategy::WORK_STEALING) { - strategy = detail::Strategy::WORK_STEALING; - } - detail::DispatchUtils::executeWithStrategy( *this, values, results, strategy, [](const ExponentialDistribution& dist, double value) { @@ -2004,11 +1999,6 @@ void ExponentialDistribution::getProbabilityWithStrategy(std::span void ExponentialDistribution::getLogProbabilityWithStrategy(std::span values, std::span results, detail::Strategy strategy) const { - // GPU acceleration fallback - GPU implementation not yet available, use optimal CPU strategy - if (strategy == detail::Strategy::WORK_STEALING) { - strategy = detail::Strategy::WORK_STEALING; - } - detail::DispatchUtils::executeWithStrategy( *this, 
values, results, strategy, [](const ExponentialDistribution& dist, double value) { @@ -2165,11 +2155,6 @@ void ExponentialDistribution::getLogProbabilityWithStrategy(std::span values, std::span results, detail::Strategy strategy) const { - // GPU acceleration fallback - GPU implementation not yet available, use optimal CPU strategy - if (strategy == detail::Strategy::WORK_STEALING) { - strategy = detail::Strategy::WORK_STEALING; - } - detail::DispatchUtils::executeWithStrategy( *this, values, results, strategy, [](const ExponentialDistribution& dist, double value) { diff --git a/src/gamma.cpp b/src/gamma.cpp index ade644f..ec6b749 100644 --- a/src/gamma.cpp +++ b/src/gamma.cpp @@ -1959,11 +1959,6 @@ void GammaDistribution::getCumulativeProbability(std::span values, void GammaDistribution::getProbabilityWithStrategy(std::span values, std::span results, detail::Strategy strategy) const { - // GPU acceleration fallback - GPU implementation not yet available, use optimal CPU strategy - if (strategy == detail::Strategy::WORK_STEALING) { - strategy = detail::Strategy::WORK_STEALING; - } - detail::DispatchUtils::executeWithStrategy( *this, values, results, strategy, [](const GammaDistribution& dist, double value) { return dist.getProbability(value); }, @@ -2123,11 +2118,6 @@ void GammaDistribution::getProbabilityWithStrategy(std::span value void GammaDistribution::getLogProbabilityWithStrategy(std::span values, std::span results, detail::Strategy strategy) const { - // GPU acceleration fallback - GPU implementation not yet available, use optimal CPU strategy - if (strategy == detail::Strategy::WORK_STEALING) { - strategy = detail::Strategy::WORK_STEALING; - } - detail::DispatchUtils::executeWithStrategy( *this, values, results, strategy, [](const GammaDistribution& dist, double value) { return dist.getLogProbability(value); }, @@ -2287,11 +2277,6 @@ void GammaDistribution::getLogProbabilityWithStrategy(std::span va void 
GammaDistribution::getCumulativeProbabilityWithStrategy(std::span values, std::span results, detail::Strategy strategy) const { - // GPU acceleration fallback - GPU implementation not yet available, use optimal CPU strategy - if (strategy == detail::Strategy::WORK_STEALING) { - strategy = detail::Strategy::WORK_STEALING; - } - detail::DispatchUtils::executeWithStrategy( *this, values, results, strategy, [](const GammaDistribution& dist, double value) { diff --git a/src/libstats_init.cpp b/src/libstats_init.cpp index d2ad085..009391c 100644 --- a/src/libstats_init.cpp +++ b/src/libstats_init.cpp @@ -3,25 +3,27 @@ #include "libstats/platform/simd_policy.h" #include "libstats/platform/thread_pool.h" +#include #include namespace stats { void initialize_performance_systems() { - // Thread-safe one-time initialization using static local variable - static bool initialized = false; + // Thread-safe one-time initialization using double-checked locking. + // The atomic load on the fast path avoids a data race under the C++ memory model. + static std::atomic initialized{false}; static std::mutex init_mutex; // Fast path: if already initialized, return immediately - if (initialized) { + if (initialized.load(std::memory_order_acquire)) { return; } // Slow path: acquire mutex and initialize std::lock_guard lock(init_mutex); - // Double-check pattern: another thread might have initialized while we waited - if (initialized) { + // Double-check: another thread might have initialized while we waited + if (initialized.load(std::memory_order_acquire)) { return; } @@ -48,7 +50,7 @@ void initialize_performance_systems() { [[maybe_unused]] auto optimal_threads = ThreadPool::getOptimalThreadCount(); // Mark as initialized - initialized = true; + initialized.store(true, std::memory_order_release); } catch (...) 
{ // If initialization fails, don't mark as initialized diff --git a/src/performance_history.cpp b/src/performance_history.cpp index d92cd0e..79cf127 100644 --- a/src/performance_history.cpp +++ b/src/performance_history.cpp @@ -70,8 +70,8 @@ PerformanceHistory::StrategyRecommendation PerformanceHistory::getBestStrategy( std::unique_lock lock(data_mutex_); // Check all available strategies - for (auto strategy : {Strategy::SCALAR, Strategy::VECTORIZED, Strategy::PARALLEL, - Strategy::WORK_STEALING, Strategy::WORK_STEALING}) { + for (auto strategy : + {Strategy::SCALAR, Strategy::VECTORIZED, Strategy::PARALLEL, Strategy::WORK_STEALING}) { std::string key = generateKey(strategy, distribution_type, batch_category); auto it = performance_data_.find(key); @@ -130,12 +130,10 @@ std::optional> PerformanceHistory::learnOpti if (last_underscore != std::string::npos) { std::size_t batch_category = std::stoull(key.substr(last_underscore + 1)); - // Determine strategy from key - improved parsing + // Determine strategy from key Strategy strategy = Strategy::SCALAR; if (key.find("WORK_STEALING") != std::string::npos) strategy = Strategy::WORK_STEALING; - else if (key.find("WORK_STEALING") != std::string::npos) - strategy = Strategy::WORK_STEALING; else if (key.find("PARALLEL") != std::string::npos) strategy = Strategy::PARALLEL; else if (key.find("VECTORIZED") != std::string::npos) diff --git a/src/uniform.cpp b/src/uniform.cpp index fca8b77..51efa1a 100644 --- a/src/uniform.cpp +++ b/src/uniform.cpp @@ -1826,11 +1826,6 @@ void UniformDistribution::getCumulativeProbability(std::span value void UniformDistribution::getProbabilityWithStrategy(std::span values, std::span results, detail::Strategy strategy) const { - // GPU acceleration fallback - GPU implementation not yet available, use optimal CPU strategy - if (strategy == detail::Strategy::WORK_STEALING) { - strategy = detail::Strategy::WORK_STEALING; - } - detail::DispatchUtils::executeWithStrategy( *this, values, results, 
strategy, [](const UniformDistribution& dist, double value) { return dist.getProbability(value); }, @@ -1960,11 +1955,6 @@ void UniformDistribution::getProbabilityWithStrategy(std::span val void UniformDistribution::getLogProbabilityWithStrategy(std::span values, std::span results, detail::Strategy strategy) const { - // GPU acceleration fallback - GPU implementation not yet available, use optimal CPU strategy - if (strategy == detail::Strategy::WORK_STEALING) { - strategy = detail::Strategy::WORK_STEALING; - } - detail::DispatchUtils::executeWithStrategy( *this, values, results, strategy, [](const UniformDistribution& dist, double value) { return dist.getLogProbability(value); }, @@ -2112,11 +2102,6 @@ void UniformDistribution::getLogProbabilityWithStrategy(std::span void UniformDistribution::getCumulativeProbabilityWithStrategy(std::span values, std::span results, detail::Strategy strategy) const { - // GPU acceleration fallback - GPU implementation not yet available, use optimal CPU strategy - if (strategy == detail::Strategy::WORK_STEALING) { - strategy = detail::Strategy::WORK_STEALING; - } - detail::DispatchUtils::executeWithStrategy( *this, values, results, strategy, [](const UniformDistribution& dist, double value) { diff --git a/tests/test_simd_policy.cpp b/tests/test_simd_policy.cpp index 7a132d8..00703e6 100644 --- a/tests/test_simd_policy.cpp +++ b/tests/test_simd_policy.cpp @@ -234,7 +234,7 @@ void test_simd_level_detection() { // Test 2: Level consistency with runtime CPU detection { - SIMDPolicy::Level policy_level = SIMDPolicy::getBestLevel(); + [[maybe_unused]] SIMDPolicy::Level policy_level = SIMDPolicy::getBestLevel(); // Compare with direct CPU detection bool cpu_supports_avx512 = stats::arch::supports_avx512();