Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
fe7a660
Fix AVX-512 dispatch: restrict WORK_STEALING, raise parallel thresholds
OldCrow Apr 12, 2026
e453cae
Fix three test failures on AVX2: t-quantile, dispatch thresholds, timing
OldCrow Apr 12, 2026
ea57b00
Fix 6 pre-existing test failures on NEON/arm64
OldCrow Apr 12, 2026
0e4e9f1
Add canonical strategy profiler, remove superseded tools
OldCrow Apr 12, 2026
6aef918
data: add AVX2 (Kaby Lake) dispatcher profiling bundle
OldCrow Apr 12, 2026
d31a9e2
Replace NEON Dev profile with Release profile
OldCrow Apr 12, 2026
e75c6e3
Add AVX (Ivy Bridge i7-3820QM) strategy profile results
OldCrow Apr 12, 2026
32c0819
Add AVX (Ivy Bridge) dispatcher profile bundle
OldCrow Apr 12, 2026
36642f5
Add AVX-512 dispatcher profile bundle (Zen 4 Ryzen 7 7445HS, Windows)
OldCrow Apr 12, 2026
ddb526e
Fix Beta batch paths: hoist lgamma prefix, use SIMD in parallel lambdas
OldCrow Apr 12, 2026
c9e640a
Replace dispatch thresholds with profiling-derived constexpr lookup t…
OldCrow Apr 12, 2026
247008e
Migrate all callers from deprecated selectOptimalStrategy to selectSt…
OldCrow Apr 12, 2026
1c86f90
Remove old threshold systems and dead code
OldCrow Apr 12, 2026
d8e31ea
Fix unused parameter warning in createForSIMDLevel
OldCrow Apr 12, 2026
7a68b94
Remove stale strategy_profile_results.csv from project root
OldCrow Apr 12, 2026
041dc53
Fix AVX-512/MSVC build, test thresholds, and Student-T MLE robustness
OldCrow Apr 12, 2026
575a826
Update BUILD_SYSTEM_GUIDE: AVX-512 not server-only, MSVC flag behavio…
OldCrow Apr 12, 2026
9089f2c
Remove unused MAX_COMPLEXITY_DEMOS constant from system_inspector
OldCrow Apr 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 31 additions & 31 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1092,19 +1092,28 @@ endif()
# source-file-specific flags (cmake/SIMDDetection.cmake) - All platforms: Definitions are set by
# SIMDDetection.cmake based on detection

# Windows compilers: Use global SIMD flags for compatibility
# Windows compilers: Use highest detected SIMD level as global flag.
# SIMDDetection.cmake has already run by this point and set LIBSTATS_HAS_AVX512 etc.
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND CMAKE_SIZEOF_VOID_P EQUAL 8)
# MSVC x64 has comprehensive SIMD support
add_compile_options(/arch:AVX2)
message(STATUS "Applied MSVC x64 SIMD flags: /arch:AVX2")
if(LIBSTATS_HAS_AVX512)
add_compile_options(/arch:AVX512)
message(STATUS "Applied MSVC x64 SIMD flags: /arch:AVX512")
else()
add_compile_options(/arch:AVX2)
message(STATUS "Applied MSVC x64 SIMD flags: /arch:AVX2")
endif()

elseif(
CMAKE_CXX_COMPILER_ID MATCHES "Clang"
AND WIN32
AND CMAKE_SIZEOF_VOID_P EQUAL 8)
# Clang-cl on Windows x64
add_compile_options(-mavx2)
message(STATUS "Applied Clang-cl x64 SIMD flags: -mavx2")
if(LIBSTATS_HAS_AVX512)
add_compile_options(-mavx512f)
message(STATUS "Applied Clang-cl x64 SIMD flags: -mavx512f")
else()
add_compile_options(-mavx2)
message(STATUS "Applied Clang-cl x64 SIMD flags: -mavx2")
endif()
endif()

# IMPORTANT: SIMD compile definitions are handled by cmake/SIMDDetection.cmake. That system detects
Expand Down Expand Up @@ -1169,12 +1178,20 @@ endif()

# SIMD Status Messages (compiler-specific)
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND CMAKE_SIZEOF_VOID_P EQUAL 8)
message(STATUS "SIMD: AVX2/AVX/SSE2 enabled (MSVC x64)")
if(LIBSTATS_HAS_AVX512)
message(STATUS "SIMD: AVX-512/AVX2/AVX/SSE2 enabled (MSVC x64)")
else()
message(STATUS "SIMD: AVX2/AVX/SSE2 enabled (MSVC x64)")
endif()
elseif(
CMAKE_CXX_COMPILER_ID MATCHES "Clang"
AND WIN32
AND CMAKE_SIZEOF_VOID_P EQUAL 8)
message(STATUS "SIMD: AVX2/AVX/SSE2 enabled (ClangCL x64)")
if(LIBSTATS_HAS_AVX512)
message(STATUS "SIMD: AVX-512/AVX2/AVX/SSE2 enabled (ClangCL x64)")
else()
message(STATUS "SIMD: AVX2/AVX/SSE2 enabled (ClangCL x64)")
endif()
elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
# GCC SIMD status messages
set(SIMD_FEATURES "")
Expand Down Expand Up @@ -1225,10 +1242,8 @@ set(LIBSTATS_CORE_UTILITIES_SOURCES
)

# Level 2: Platform Capabilities (Depends on Level 0-1)
set(LIBSTATS_PLATFORM_SOURCES
src/parallel_thresholds.cpp # Architecture-specific parallel thresholds
src/thread_pool.cpp # Thread pool implementation
src/work_stealing_pool.cpp # Advanced work-stealing thread pool
set(LIBSTATS_PLATFORM_SOURCES src/thread_pool.cpp # Thread pool implementation
src/work_stealing_pool.cpp # Advanced work-stealing thread pool
)

# Level 3: Advanced Infrastructure (Depends on Level 0-2)
Expand Down Expand Up @@ -1799,7 +1814,7 @@ if(LIBSTATS_BUILD_TESTS)
test_student_t_enhanced
test_beta_enhanced
test_performance_dispatcher
test_system_capabilities # runs live SIMD/threading/bandwidth benchmarks
test_system_capabilities # runs live SIMD/threading/bandwidth benchmarks
PROPERTIES LABELS "timing")
endif()
set_tests_properties(benchmark_simd_all PROPERTIES LABELS "benchmark")
Expand Down Expand Up @@ -1933,11 +1948,8 @@ if(LIBSTATS_BUILD_TOOLS)
add_standalone_tool(cpp20_features_inspector cpp20_features_inspector.cpp)

# Performance & Benchmarking Tools
add_libstats_tool(parallel_threshold_benchmark parallel_threshold_benchmark.cpp)
add_libstats_tool(parallel_batch_fitting_benchmark parallel_batch_fitting_benchmark.cpp)
add_libstats_tool(performance_dispatcher_tool performance_dispatcher_tool.cpp)
add_libstats_tool(learning_analyzer learning_analyzer.cpp)
add_libstats_tool(empirical_characteristics_demo empirical_characteristics_demo.cpp)
add_libstats_tool(strategy_profile strategy_profile.cpp)
add_libstats_tool(simd_verification simd_verification.cpp)
add_libstats_tool(parallel_correctness_verification parallel_correctness_verification.cpp)

Expand All @@ -1950,25 +1962,13 @@ if(LIBSTATS_BUILD_TOOLS)
STATUS
" - cpp20_features_inspector: Comprehensive C++20 compiler and standard library feature detection with detailed functionality tests"
)
message(
STATUS
" - parallel_threshold_benchmark: Enhanced distribution-specific threshold optimization with adaptive learning"
)
message(
STATUS
" - parallel_batch_fitting_benchmark: Comprehensive parallel batch fitting performance analysis across all distributions with scalability testing"
)
message(
STATUS
" - performance_dispatcher_tool: Interactive Phase 3 performance framework demonstration"
)
message(
STATUS
" - learning_analyzer: Unified adaptive learning analysis with both educational simulation and real execution data (consolidates threshold_learning_demo and adaptive_learning_analyzer)"
)
message(
STATUS
" - empirical_characteristics_demo: Demonstration of empirical distribution characteristics system replacing assumption-based performance models"
" - strategy_profile: Canonical forced-strategy profiler for dispatcher threshold tuning across distributions, operations, and batch sizes"
)
message(
STATUS
Expand Down
5 changes: 2 additions & 3 deletions PROJECT_CONCEPT.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,8 @@ These help validate correctness, SIMD behavior, thresholds, and runtime capabili
Examples:
- `system_inspector`
- `simd_verification`
- `parallel_threshold_benchmark`
- `performance_dispatcher_tool`
- `learning_analyzer`
- `strategy_profile`
- `parallel_batch_fitting_benchmark`

### Historical or specialized analysis tools
These support specific refactors or investigations and should be documented as such when retained.
Expand Down
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,8 @@ libstats/
### 🔧 **Analysis Tools** (`tools/` directory)
- `system_inspector` - CPU capabilities and system information
- `simd_verification` - SIMD correctness and speedup verification
- `parallel_threshold_benchmark` - Architecture-aware parallel threshold analysis
- `performance_dispatcher_tool` - Dispatch strategy inspection and comparison
- `learning_analyzer` - Performance-learning and threshold-analysis support
- `strategy_profile` - Canonical forced-strategy profiler for dispatcher threshold tuning
- `parallel_batch_fitting_benchmark` - Parallel batch fitting performance analysis


## Testing
Expand Down
17 changes: 10 additions & 7 deletions WARP.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ The active SIMD level changes fundamentally between machines:
SIMD code paths, performance thresholds, and test results are architecture-dependent. If the machine has changed since the last session:
- Note the change explicitly
- Verify the build directory is current for this architecture (`cmake ..` may be needed)
- Threshold values in `src/parallel_thresholds.cpp` may need review
- Dispatch thresholds in `include/core/dispatch_thresholds.h` are architecture-specific
- Benchmark results are not comparable across architectures

## Essential Build Commands
Expand Down Expand Up @@ -233,9 +233,11 @@ cmake -DCMAKE_BUILD_TYPE=MSVCStrict ..
./build/tools/cpp20_features_inspector

# Performance analysis
./build/tools/parallel_threshold_benchmark
./build/tools/strategy_profile
./build/tools/simd_verification
./build/tools/performance_dispatcher_tool

# Dispatcher profiling bundle capture
./scripts/capture_dispatcher_profile.sh

# Cross-compiler compatibility testing
./scripts/test-cross-compiler.sh --clean
Expand Down Expand Up @@ -429,7 +431,7 @@ include/
```
src/
├── [Level 0-1] Foundation and utilities (cpu_detection.cpp, safety.cpp)
├── [Level 2] Platform capabilities (thread_pool.cpp, parallel_thresholds.cpp)
├── [Level 2] Platform capabilities (thread_pool.cpp, work_stealing_pool.cpp)
├── [Level 3] Infrastructure (benchmark.cpp, performance_dispatcher.cpp)
├── [Level 4] Framework (distribution_base.cpp)
└── [Level 5] Distributions (gaussian.cpp, exponential.cpp, etc.)
Expand Down Expand Up @@ -464,7 +466,8 @@ The CMake system uses dependency-aware object libraries for parallel compilation
#### Parallel Processing
- Auto-dispatch API: `getProbability(std::span<const double>, std::span<double>, hint)`
- Explicit control: `getProbabilityWithStrategy(spans, Strategy::PARALLEL)`
- Performance thresholds: <8 elements (scalar), 8-1000 (SIMD), >1000 (parallel)
- Dispatch thresholds are per-(architecture, distribution, operation) in `dispatch_thresholds.h`
- Thresholds derived from four-architecture profiling data in `data/profiles/dispatcher/`

### Build System Customization

Expand Down Expand Up @@ -545,8 +548,8 @@ when the machine is loaded. This is a measurement problem, not a correctness pro
# Verify SIMD operations and performance
./build/tools/simd_verification

# Analyze parallel thresholds
./build/tools/parallel_threshold_benchmark
# Profile forced strategies for threshold tuning
./build/tools/strategy_profile

# System capability analysis
./build/tools/system_inspector --performance
Expand Down
Loading
Loading