diff --git a/CMakeLists.txt b/CMakeLists.txt index d7d3f40..d67c3ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1092,19 +1092,28 @@ endif() # source-file-specific flags (cmake/SIMDDetection.cmake) - All platforms: Definitions are set by # SIMDDetection.cmake based on detection -# Windows compilers: Use global SIMD flags for compatibility +# Windows compilers: Use highest detected SIMD level as global flag. +# SIMDDetection.cmake has already run by this point and set LIBSTATS_HAS_AVX512 etc. if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND CMAKE_SIZEOF_VOID_P EQUAL 8) - # MSVC x64 has comprehensive SIMD support - add_compile_options(/arch:AVX2) - message(STATUS "Applied MSVC x64 SIMD flags: /arch:AVX2") + if(LIBSTATS_HAS_AVX512) + add_compile_options(/arch:AVX512) + message(STATUS "Applied MSVC x64 SIMD flags: /arch:AVX512") + else() + add_compile_options(/arch:AVX2) + message(STATUS "Applied MSVC x64 SIMD flags: /arch:AVX2") + endif() elseif( CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND WIN32 AND CMAKE_SIZEOF_VOID_P EQUAL 8) - # Clang-cl on Windows x64 - add_compile_options(-mavx2) - message(STATUS "Applied Clang-cl x64 SIMD flags: -mavx2") + if(LIBSTATS_HAS_AVX512) + add_compile_options(-mavx512f) + message(STATUS "Applied Clang-cl x64 SIMD flags: -mavx512f") + else() + add_compile_options(-mavx2) + message(STATUS "Applied Clang-cl x64 SIMD flags: -mavx2") + endif() endif() # IMPORTANT: SIMD compile definitions are handled by cmake/SIMDDetection.cmake That system detects @@ -1169,12 +1178,20 @@ endif() # SIMD Status Messages (compiler-specific) if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND CMAKE_SIZEOF_VOID_P EQUAL 8) - message(STATUS "SIMD: AVX2/AVX/SSE2 enabled (MSVC x64)") + if(LIBSTATS_HAS_AVX512) + message(STATUS "SIMD: AVX-512/AVX2/AVX/SSE2 enabled (MSVC x64)") + else() + message(STATUS "SIMD: AVX2/AVX/SSE2 enabled (MSVC x64)") + endif() elseif( CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND WIN32 AND CMAKE_SIZEOF_VOID_P EQUAL 8) - message(STATUS "SIMD: 
AVX2/AVX/SSE2 enabled (ClangCL x64)") + if(LIBSTATS_HAS_AVX512) + message(STATUS "SIMD: AVX-512/AVX2/AVX/SSE2 enabled (ClangCL x64)") + else() + message(STATUS "SIMD: AVX2/AVX/SSE2 enabled (ClangCL x64)") + endif() elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU") # GCC SIMD status messages set(SIMD_FEATURES "") @@ -1225,10 +1242,8 @@ set(LIBSTATS_CORE_UTILITIES_SOURCES ) # Level 2: Platform Capabilities (Depends on Level 0-1) -set(LIBSTATS_PLATFORM_SOURCES - src/parallel_thresholds.cpp # Architecture-specific parallel thresholds - src/thread_pool.cpp # Thread pool implementation - src/work_stealing_pool.cpp # Advanced work-stealing thread pool +set(LIBSTATS_PLATFORM_SOURCES src/thread_pool.cpp # Thread pool implementation + src/work_stealing_pool.cpp # Advanced work-stealing thread pool ) # Level 3: Advanced Infrastructure (Depends on Level 0-2) @@ -1799,7 +1814,7 @@ if(LIBSTATS_BUILD_TESTS) test_student_t_enhanced test_beta_enhanced test_performance_dispatcher - test_system_capabilities # runs live SIMD/threading/bandwidth benchmarks + test_system_capabilities # runs live SIMD/threading/bandwidth benchmarks PROPERTIES LABELS "timing") endif() set_tests_properties(benchmark_simd_all PROPERTIES LABELS "benchmark") @@ -1933,11 +1948,8 @@ if(LIBSTATS_BUILD_TOOLS) add_standalone_tool(cpp20_features_inspector cpp20_features_inspector.cpp) # Performance & Benchmarking Tools - add_libstats_tool(parallel_threshold_benchmark parallel_threshold_benchmark.cpp) add_libstats_tool(parallel_batch_fitting_benchmark parallel_batch_fitting_benchmark.cpp) - add_libstats_tool(performance_dispatcher_tool performance_dispatcher_tool.cpp) - add_libstats_tool(learning_analyzer learning_analyzer.cpp) - add_libstats_tool(empirical_characteristics_demo empirical_characteristics_demo.cpp) + add_libstats_tool(strategy_profile strategy_profile.cpp) add_libstats_tool(simd_verification simd_verification.cpp) add_libstats_tool(parallel_correctness_verification parallel_correctness_verification.cpp) @@ 
-1950,25 +1962,13 @@ if(LIBSTATS_BUILD_TOOLS) STATUS " - cpp20_features_inspector: Comprehensive C++20 compiler and standard library feature detection with detailed functionality tests" ) - message( - STATUS - " - parallel_threshold_benchmark: Enhanced distribution-specific threshold optimization with adaptive learning" - ) message( STATUS " - parallel_batch_fitting_benchmark: Comprehensive parallel batch fitting performance analysis across all distributions with scalability testing" ) message( STATUS - " - performance_dispatcher_tool: Interactive Phase 3 performance framework demonstration" - ) - message( - STATUS - " - learning_analyzer: Unified adaptive learning analysis with both educational simulation and real execution data (consolidates threshold_learning_demo and adaptive_learning_analyzer)" - ) - message( - STATUS - " - empirical_characteristics_demo: Demonstration of empirical distribution characteristics system replacing assumption-based performance models" + " - strategy_profile: Canonical forced-strategy profiler for dispatcher threshold tuning across distributions, operations, and batch sizes" ) message( STATUS diff --git a/PROJECT_CONCEPT.md b/PROJECT_CONCEPT.md index 487dba9..fb10f16 100644 --- a/PROJECT_CONCEPT.md +++ b/PROJECT_CONCEPT.md @@ -139,9 +139,8 @@ These help validate correctness, SIMD behavior, thresholds, and runtime capabili Examples: - `system_inspector` - `simd_verification` -- `parallel_threshold_benchmark` -- `performance_dispatcher_tool` -- `learning_analyzer` +- `strategy_profile` +- `parallel_batch_fitting_benchmark` ### Historical or specialized analysis tools These support specific refactors or investigations and should be documented as such when retained. 
diff --git a/README.md b/README.md index 9f331ea..c0574e2 100644 --- a/README.md +++ b/README.md @@ -183,9 +183,8 @@ libstats/ ### πŸ”§ **Analysis Tools** (`tools/` directory) - `system_inspector` - CPU capabilities and system information - `simd_verification` - SIMD correctness and speedup verification -- `parallel_threshold_benchmark` - Architecture-aware parallel threshold analysis -- `performance_dispatcher_tool` - Dispatch strategy inspection and comparison -- `learning_analyzer` - Performance-learning and threshold-analysis support +- `strategy_profile` - Canonical forced-strategy profiler for dispatcher threshold tuning +- `parallel_batch_fitting_benchmark` - Parallel batch fitting performance analysis ## Testing diff --git a/WARP.md b/WARP.md index 506fbda..a84c30e 100644 --- a/WARP.md +++ b/WARP.md @@ -184,7 +184,7 @@ The active SIMD level changes fundamentally between machines: SIMD code paths, performance thresholds, and test results are architecture-dependent. If the machine has changed since the last session: - Note the change explicitly - Verify the build directory is current for this architecture (`cmake ..` may be needed) -- Threshold values in `src/parallel_thresholds.cpp` may need review +- Dispatch thresholds in `include/core/dispatch_thresholds.h` are architecture-specific - Benchmark results are not comparable across architectures ## Essential Build Commands @@ -233,9 +233,11 @@ cmake -DCMAKE_BUILD_TYPE=MSVCStrict .. 
./build/tools/cpp20_features_inspector # Performance analysis -./build/tools/parallel_threshold_benchmark +./build/tools/strategy_profile ./build/tools/simd_verification -./build/tools/performance_dispatcher_tool + +# Dispatcher profiling bundle capture +./scripts/capture_dispatcher_profile.sh # Cross-compiler compatibility testing ./scripts/test-cross-compiler.sh --clean @@ -429,7 +431,7 @@ include/ ``` src/ β”œβ”€β”€ [Level 0-1] Foundation and utilities (cpu_detection.cpp, safety.cpp) -β”œβ”€β”€ [Level 2] Platform capabilities (thread_pool.cpp, parallel_thresholds.cpp) +β”œβ”€β”€ [Level 2] Platform capabilities (thread_pool.cpp, work_stealing_pool.cpp) β”œβ”€β”€ [Level 3] Infrastructure (benchmark.cpp, performance_dispatcher.cpp) β”œβ”€β”€ [Level 4] Framework (distribution_base.cpp) └── [Level 5] Distributions (gaussian.cpp, exponential.cpp, etc.) @@ -464,7 +466,8 @@ The CMake system uses dependency-aware object libraries for parallel compilation #### Parallel Processing - Auto-dispatch API: `getProbability(std::span, std::span, hint)` - Explicit control: `getProbabilityWithStrategy(spans, Strategy::PARALLEL)` -- Performance thresholds: <8 elements (scalar), 8-1000 (SIMD), >1000 (parallel) +- Dispatch thresholds are per-(architecture, distribution, operation) in `dispatch_thresholds.h` +- Thresholds derived from four-architecture profiling data in `data/profiles/dispatcher/` ### Build System Customization @@ -545,8 +548,8 @@ when the machine is loaded. 
This is a measurement problem, not a correctness pro # Verify SIMD operations and performance ./build/tools/simd_verification -# Analyze parallel thresholds -./build/tools/parallel_threshold_benchmark +# Profile forced strategies for threshold tuning +./build/tools/strategy_profile # System capability analysis ./build/tools/system_inspector --performance diff --git a/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/best_strategies.csv b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/best_strategies.csv new file mode 100644 index 0000000..c498f93 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/best_strategies.csv @@ -0,0 +1,433 @@ +distribution,operation,batch_size,best_strategy,best_time_us,scalar_time_us,speedup_vs_scalar +Beta,CDF,8,VECTORIZED,1.252,1.658,1.324 +Beta,CDF,16,VECTORIZED,2.437,3.297,1.353 +Beta,CDF,32,VECTORIZED,4.524,6.302,1.393 +Beta,CDF,64,VECTORIZED,8.263,11.549,1.398 +Beta,CDF,128,VECTORIZED,18.766,25.949,1.383 +Beta,CDF,256,VECTORIZED,39.654,53.871,1.359 +Beta,CDF,512,VECTORIZED,73.947,107.63,1.456 +Beta,CDF,1000,VECTORIZED,150.541,211.037,1.402 +Beta,CDF,2000,VECTORIZED,293.511,421.688,1.437 +Beta,CDF,5000,VECTORIZED,733.541,1101.81,1.502 +Beta,CDF,10000,VECTORIZED,1497.362,2076.387,1.387 +Beta,CDF,20000,VECTORIZED,3058.145,4261.475,1.393 +Beta,CDF,50000,VECTORIZED,7394.72,10196.959,1.379 +Beta,CDF,100000,VECTORIZED,14874.081,20411.823,1.372 +Beta,CDF,250000,VECTORIZED,39713.597,54558.686,1.374 +Beta,CDF,500000,VECTORIZED,76353.624,108133.486,1.416 +Beta,LogPDF,8,WORK_STEALING,0.58,0.776,1.338 +Beta,LogPDF,16,WORK_STEALING,0.821,1.521,1.853 +Beta,LogPDF,32,WORK_STEALING,1.434,2.82,1.967 +Beta,LogPDF,64,WORK_STEALING,2.676,5.717,2.136 +Beta,LogPDF,128,VECTORIZED,3.803,12.151,3.195 +Beta,LogPDF,256,VECTORIZED,6.28,22.399,3.567 
+Beta,LogPDF,512,VECTORIZED,14.928,48.628,3.258 +Beta,LogPDF,1000,VECTORIZED,28.851,96.709,3.352 +Beta,LogPDF,2000,VECTORIZED,58.787,193.631,3.294 +Beta,LogPDF,5000,VECTORIZED,139.933,447.077,3.195 +Beta,LogPDF,10000,VECTORIZED,322.386,904.276,2.805 +Beta,LogPDF,20000,VECTORIZED,604.568,2149.963,3.556 +Beta,LogPDF,50000,VECTORIZED,1404.607,4916.122,3.5 +Beta,LogPDF,100000,VECTORIZED,2926.962,9512.36,3.25 +Beta,LogPDF,250000,VECTORIZED,8077.815,23750.288,2.94 +Beta,LogPDF,500000,VECTORIZED,16603.17,47104.603,2.837 +Beta,PDF,8,WORK_STEALING,0.678,0.942,1.389 +Beta,PDF,16,WORK_STEALING,1.081,1.751,1.62 +Beta,PDF,32,VECTORIZED,1.902,3.43,1.803 +Beta,PDF,64,VECTORIZED,3.118,6.588,2.113 +Beta,PDF,128,VECTORIZED,4.085,13.785,3.375 +Beta,PDF,256,VECTORIZED,7.537,26.973,3.579 +Beta,PDF,512,VECTORIZED,16.861,56.362,3.343 +Beta,PDF,1000,VECTORIZED,33.569,114.508,3.411 +Beta,PDF,2000,VECTORIZED,61.234,221.998,3.625 +Beta,PDF,5000,VECTORIZED,159.332,565.452,3.549 +Beta,PDF,10000,VECTORIZED,343.525,1104.677,3.216 +Beta,PDF,20000,VECTORIZED,653.35,2208.022,3.38 +Beta,PDF,50000,VECTORIZED,1666.887,5515.97,3.309 +Beta,PDF,100000,VECTORIZED,3421.353,10872.491,3.178 +Beta,PDF,250000,VECTORIZED,9261.105,28862.668,3.117 +Beta,PDF,500000,VECTORIZED,18759.322,55909.674,2.98 +ChiSquared,CDF,8,WORK_STEALING,0.761,1.247,1.639 +ChiSquared,CDF,16,PARALLEL,1.451,2.448,1.687 +ChiSquared,CDF,32,WORK_STEALING,2.688,4.956,1.844 +ChiSquared,CDF,64,WORK_STEALING,5.254,10.072,1.917 +ChiSquared,CDF,128,WORK_STEALING,10.489,20.093,1.916 +ChiSquared,CDF,256,WORK_STEALING,22.251,41.893,1.883 +ChiSquared,CDF,512,WORK_STEALING,45.382,82.09,1.809 +ChiSquared,CDF,1000,WORK_STEALING,89.524,164.5,1.837 +ChiSquared,CDF,2000,WORK_STEALING,189.098,341.462,1.806 +ChiSquared,CDF,5000,WORK_STEALING,400.891,802.052,2.001 +ChiSquared,CDF,10000,WORK_STEALING,475.294,1653.611,3.479 +ChiSquared,CDF,20000,WORK_STEALING,598.151,3380.129,5.651 +ChiSquared,CDF,50000,WORK_STEALING,1275.405,8866.315,6.952 
+ChiSquared,CDF,100000,WORK_STEALING,2206.945,17804.283,8.067 +ChiSquared,CDF,250000,WORK_STEALING,5526.219,45146.338,8.169 +ChiSquared,CDF,500000,WORK_STEALING,12424.388,90282.907,7.267 +ChiSquared,LogPDF,8,WORK_STEALING,0.273,0.738,2.703 +ChiSquared,LogPDF,16,WORK_STEALING,0.358,1.422,3.972 +ChiSquared,LogPDF,32,WORK_STEALING,0.582,2.69,4.622 +ChiSquared,LogPDF,64,WORK_STEALING,0.978,5.113,5.228 +ChiSquared,LogPDF,128,VECTORIZED,1.405,10.15,7.224 +ChiSquared,LogPDF,256,VECTORIZED,2.415,21.487,8.897 +ChiSquared,LogPDF,512,VECTORIZED,4.509,41.406,9.183 +ChiSquared,LogPDF,1000,VECTORIZED,8.362,80.936,9.679 +ChiSquared,LogPDF,2000,VECTORIZED,17.427,164.18,9.421 +ChiSquared,LogPDF,5000,VECTORIZED,41.743,399.46,9.57 +ChiSquared,LogPDF,10000,VECTORIZED,82.764,785.411,9.49 +ChiSquared,LogPDF,20000,VECTORIZED,169.288,1592.501,9.407 +ChiSquared,LogPDF,50000,VECTORIZED,446.941,4303.628,9.629 +ChiSquared,LogPDF,100000,WORK_STEALING,644.719,8606.59,13.349 +ChiSquared,LogPDF,250000,WORK_STEALING,1284.253,22938.344,17.861 +ChiSquared,LogPDF,500000,WORK_STEALING,1483.282,43946.748,29.628 +ChiSquared,PDF,8,WORK_STEALING,0.404,1.343,3.324 +ChiSquared,PDF,16,PARALLEL,0.624,2.564,4.109 +ChiSquared,PDF,32,PARALLEL,1.16,5.213,4.494 +ChiSquared,PDF,64,VECTORIZED,1.511,10.082,6.672 +ChiSquared,PDF,128,VECTORIZED,2.063,20.479,9.927 +ChiSquared,PDF,256,VECTORIZED,3.668,41.856,11.411 +ChiSquared,PDF,512,VECTORIZED,6.987,82.192,11.764 +ChiSquared,PDF,1000,VECTORIZED,13.799,164.702,11.936 +ChiSquared,PDF,2000,VECTORIZED,27.287,337.78,12.379 +ChiSquared,PDF,5000,VECTORIZED,67.21,807.542,12.015 +ChiSquared,PDF,10000,VECTORIZED,129.504,1661.799,12.832 +ChiSquared,PDF,20000,VECTORIZED,295.997,3559.359,12.025 +ChiSquared,PDF,50000,WORK_STEALING,553.27,8598.857,15.542 +ChiSquared,PDF,100000,WORK_STEALING,843.277,19289.136,22.874 +ChiSquared,PDF,250000,WORK_STEALING,1655.998,44184.881,26.682 +ChiSquared,PDF,500000,WORK_STEALING,2664.156,87561.25,32.866 +Discrete,CDF,8,VECTORIZED,0.195,0.585,3.0 
+Discrete,CDF,16,PARALLEL,0.242,1.09,4.504 +Discrete,CDF,32,WORK_STEALING,0.338,2.584,7.645 +Discrete,CDF,64,WORK_STEALING,0.488,4.413,9.043 +Discrete,CDF,128,WORK_STEALING,0.709,9.56,13.484 +Discrete,CDF,256,WORK_STEALING,1.381,16.974,12.291 +Discrete,CDF,512,VECTORIZED,1.996,32.375,16.22 +Discrete,CDF,1000,WORK_STEALING,4.292,66.075,15.395 +Discrete,CDF,2000,VECTORIZED,7.78,129.29,16.618 +Discrete,CDF,5000,VECTORIZED,24.219,333.603,13.774 +Discrete,CDF,10000,VECTORIZED,45.172,653.913,14.476 +Discrete,CDF,20000,VECTORIZED,124.182,1427.214,11.493 +Discrete,CDF,50000,WORK_STEALING,215.904,3357.277,15.55 +Discrete,CDF,100000,WORK_STEALING,278.062,6910.17,24.851 +Discrete,CDF,250000,WORK_STEALING,471.137,17315.039,36.752 +Discrete,CDF,500000,WORK_STEALING,609.68,33393.506,54.772 +Discrete,LogPDF,8,VECTORIZED,0.203,0.757,3.729 +Discrete,LogPDF,16,WORK_STEALING,0.281,1.263,4.495 +Discrete,LogPDF,32,VECTORIZED,0.366,2.701,7.38 +Discrete,LogPDF,64,VECTORIZED,0.637,5.345,8.391 +Discrete,LogPDF,128,WORK_STEALING,0.673,10.468,15.554 +Discrete,LogPDF,256,WORK_STEALING,1.405,18.0,12.811 +Discrete,LogPDF,512,WORK_STEALING,2.724,35.529,13.043 +Discrete,LogPDF,1000,WORK_STEALING,5.161,70.474,13.655 +Discrete,LogPDF,2000,WORK_STEALING,9.114,143.781,15.776 +Discrete,LogPDF,5000,VECTORIZED,24.633,345.96,14.045 +Discrete,LogPDF,10000,VECTORIZED,47.205,699.132,14.811 +Discrete,LogPDF,20000,VECTORIZED,106.854,1457.63,13.641 +Discrete,LogPDF,50000,WORK_STEALING,208.233,3688.981,17.716 +Discrete,LogPDF,100000,WORK_STEALING,294.133,7140.568,24.277 +Discrete,LogPDF,250000,WORK_STEALING,515.026,18409.459,35.745 +Discrete,LogPDF,500000,WORK_STEALING,674.444,35205.579,52.199 +Discrete,PDF,8,VECTORIZED,0.189,0.659,3.487 +Discrete,PDF,16,PARALLEL,0.26,1.386,5.331 +Discrete,PDF,32,VECTORIZED,0.32,2.673,8.353 +Discrete,PDF,64,VECTORIZED,0.43,4.594,10.684 +Discrete,PDF,128,VECTORIZED,0.683,9.654,14.135 +Discrete,PDF,256,VECTORIZED,1.059,21.218,20.036 
+Discrete,PDF,512,VECTORIZED,2.035,35.556,17.472 +Discrete,PDF,1000,VECTORIZED,3.782,67.817,17.932 +Discrete,PDF,2000,VECTORIZED,7.311,135.357,18.514 +Discrete,PDF,5000,VECTORIZED,20.843,359.055,17.227 +Discrete,PDF,10000,VECTORIZED,33.743,676.962,20.062 +Discrete,PDF,20000,VECTORIZED,74.684,1469.176,19.672 +Discrete,PDF,50000,VECTORIZED,184.545,3791.954,20.548 +Discrete,PDF,100000,WORK_STEALING,246.9,7012.905,28.404 +Discrete,PDF,250000,WORK_STEALING,423.83,18585.281,43.851 +Discrete,PDF,500000,WORK_STEALING,661.421,36504.644,55.191 +Exponential,CDF,8,WORK_STEALING,0.25,0.71,2.84 +Exponential,CDF,16,WORK_STEALING,0.339,1.399,4.127 +Exponential,CDF,32,VECTORIZED,0.495,2.595,5.242 +Exponential,CDF,64,VECTORIZED,0.663,5.119,7.721 +Exponential,CDF,128,VECTORIZED,1.025,10.315,10.063 +Exponential,CDF,256,VECTORIZED,1.853,20.14,10.869 +Exponential,CDF,512,VECTORIZED,3.306,40.633,12.291 +Exponential,CDF,1000,VECTORIZED,6.327,78.052,12.336 +Exponential,CDF,2000,VECTORIZED,12.292,156.47,12.729 +Exponential,CDF,5000,VECTORIZED,49.228,410.463,8.338 +Exponential,CDF,10000,VECTORIZED,61.375,833.162,13.575 +Exponential,CDF,20000,VECTORIZED,124.014,1631.022,13.152 +Exponential,CDF,50000,WORK_STEALING,240.325,4066.975,16.923 +Exponential,CDF,100000,WORK_STEALING,365.802,7974.139,21.799 +Exponential,CDF,250000,WORK_STEALING,871.128,20058.659,23.026 +Exponential,CDF,500000,WORK_STEALING,1359.51,41850.468,30.783 +Exponential,LogPDF,8,WORK_STEALING,0.17,0.573,3.371 +Exponential,LogPDF,16,WORK_STEALING,0.189,1.219,6.45 +Exponential,LogPDF,32,WORK_STEALING,0.168,2.286,13.607 +Exponential,LogPDF,64,WORK_STEALING,0.192,4.315,22.474 +Exponential,LogPDF,128,WORK_STEALING,0.201,8.531,42.443 +Exponential,LogPDF,256,WORK_STEALING,0.311,17.091,54.955 +Exponential,LogPDF,512,WORK_STEALING,0.433,34.446,79.552 +Exponential,LogPDF,1000,WORK_STEALING,0.8,62.306,77.882 +Exponential,LogPDF,2000,WORK_STEALING,1.232,140.58,114.107 +Exponential,LogPDF,5000,VECTORIZED,6.479,347.104,53.574 
+Exponential,LogPDF,10000,VECTORIZED,13.145,658.541,50.098 +Exponential,LogPDF,20000,VECTORIZED,31.447,1391.531,44.25 +Exponential,LogPDF,50000,VECTORIZED,78.644,3479.425,44.243 +Exponential,LogPDF,100000,VECTORIZED,153.344,7038.834,45.902 +Exponential,LogPDF,250000,WORK_STEALING,240.194,17279.27,71.939 +Exponential,LogPDF,500000,WORK_STEALING,478.612,37426.234,78.197 +Exponential,PDF,8,PARALLEL,0.245,0.736,3.004 +Exponential,PDF,16,VECTORIZED,0.348,1.36,3.908 +Exponential,PDF,32,VECTORIZED,0.42,2.734,6.51 +Exponential,PDF,64,VECTORIZED,0.64,5.115,7.992 +Exponential,PDF,128,VECTORIZED,1.019,9.97,9.784 +Exponential,PDF,256,VECTORIZED,1.776,20.059,11.294 +Exponential,PDF,512,VECTORIZED,3.321,42.323,12.744 +Exponential,PDF,1000,VECTORIZED,6.016,82.407,13.698 +Exponential,PDF,2000,WORK_STEALING,23.077,156.308,6.773 +Exponential,PDF,5000,VECTORIZED,29.165,388.021,13.304 +Exponential,PDF,10000,VECTORIZED,66.421,850.01,12.797 +Exponential,PDF,20000,VECTORIZED,118.992,1597.88,13.428 +Exponential,PDF,50000,WORK_STEALING,283.562,4130.232,14.566 +Exponential,PDF,100000,WORK_STEALING,318.483,7979.458,25.055 +Exponential,PDF,250000,WORK_STEALING,574.188,19971.144,34.782 +Exponential,PDF,500000,PARALLEL,2107.483,40598.687,19.264 +Gamma,CDF,8,WORK_STEALING,0.777,1.355,1.744 +Gamma,CDF,16,WORK_STEALING,1.372,2.37,1.727 +Gamma,CDF,32,WORK_STEALING,2.646,10.864,4.106 +Gamma,CDF,64,WORK_STEALING,5.034,9.586,1.904 +Gamma,CDF,128,WORK_STEALING,9.898,19.056,1.925 +Gamma,CDF,256,WORK_STEALING,20.492,39.001,1.903 +Gamma,CDF,512,WORK_STEALING,41.557,77.66,1.869 +Gamma,CDF,1000,VECTORIZED,82.877,148.716,1.794 +Gamma,CDF,2000,VECTORIZED,175.314,327.059,1.866 +Gamma,CDF,5000,WORK_STEALING,392.814,775.302,1.974 +Gamma,CDF,10000,WORK_STEALING,412.103,1559.241,3.784 +Gamma,CDF,20000,WORK_STEALING,564.148,3205.865,5.683 +Gamma,CDF,50000,WORK_STEALING,1159.759,8099.143,6.983 +Gamma,CDF,100000,WORK_STEALING,2190.159,17986.594,8.212 +Gamma,CDF,250000,WORK_STEALING,4971.674,42628.901,8.574 
+Gamma,CDF,500000,WORK_STEALING,9718.016,80414.95,8.275 +Gamma,LogPDF,8,PARALLEL,0.294,0.802,2.728 +Gamma,LogPDF,16,WORK_STEALING,0.37,1.412,3.816 +Gamma,LogPDF,32,WORK_STEALING,0.602,2.582,4.289 +Gamma,LogPDF,64,WORK_STEALING,1.012,5.159,5.098 +Gamma,LogPDF,128,VECTORIZED,1.374,10.061,7.322 +Gamma,LogPDF,256,VECTORIZED,2.303,20.508,8.905 +Gamma,LogPDF,512,VECTORIZED,4.402,40.036,9.095 +Gamma,LogPDF,1000,VECTORIZED,7.748,75.912,9.798 +Gamma,LogPDF,2000,VECTORIZED,15.602,149.125,9.558 +Gamma,LogPDF,5000,VECTORIZED,39.782,379.808,9.547 +Gamma,LogPDF,10000,VECTORIZED,82.93,799.487,9.641 +Gamma,LogPDF,20000,VECTORIZED,184.304,1679.974,9.115 +Gamma,LogPDF,50000,WORK_STEALING,424.276,4237.671,9.988 +Gamma,LogPDF,100000,WORK_STEALING,631.08,8144.265,12.905 +Gamma,LogPDF,250000,WORK_STEALING,1310.03,22220.252,16.962 +Gamma,LogPDF,500000,WORK_STEALING,2037.304,41248.908,20.247 +Gamma,PDF,8,WORK_STEALING,0.405,1.465,3.617 +Gamma,PDF,16,PARALLEL,0.639,2.693,4.214 +Gamma,PDF,32,VECTORIZED,1.116,5.236,4.692 +Gamma,PDF,64,VECTORIZED,1.45,10.126,6.983 +Gamma,PDF,128,VECTORIZED,2.031,19.928,9.812 +Gamma,PDF,256,VECTORIZED,3.551,40.134,11.302 +Gamma,PDF,512,VECTORIZED,6.728,78.921,11.73 +Gamma,PDF,1000,VECTORIZED,12.746,155.275,12.182 +Gamma,PDF,2000,VECTORIZED,24.651,305.433,12.39 +Gamma,PDF,5000,VECTORIZED,62.136,784.078,12.619 +Gamma,PDF,10000,VECTORIZED,142.016,1663.879,11.716 +Gamma,PDF,20000,VECTORIZED,252.097,3164.196,12.552 +Gamma,PDF,50000,WORK_STEALING,550.224,8338.875,15.155 +Gamma,PDF,100000,WORK_STEALING,815.179,16362.225,20.072 +Gamma,PDF,250000,WORK_STEALING,1887.759,43199.275,22.884 +Gamma,PDF,500000,WORK_STEALING,2690.037,83968.083,31.214 +Gaussian,CDF,8,VECTORIZED,0.437,1.158,2.65 +Gaussian,CDF,16,VECTORIZED,0.53,2.178,4.109 +Gaussian,CDF,32,VECTORIZED,0.708,4.201,5.934 +Gaussian,CDF,64,VECTORIZED,0.971,7.788,8.021 +Gaussian,CDF,128,VECTORIZED,1.759,16.207,9.214 +Gaussian,CDF,256,VECTORIZED,3.078,30.76,9.994 +Gaussian,CDF,512,VECTORIZED,5.875,61.673,10.498 
+Gaussian,CDF,1000,VECTORIZED,11.177,120.232,10.757 +Gaussian,CDF,2000,VECTORIZED,20.928,222.519,10.633 +Gaussian,CDF,5000,VECTORIZED,51.718,577.889,11.174 +Gaussian,CDF,10000,VECTORIZED,107.983,1179.511,10.923 +Gaussian,CDF,20000,VECTORIZED,217.735,2380.148,10.931 +Gaussian,CDF,50000,WORK_STEALING,505.507,5834.679,11.542 +Gaussian,CDF,100000,WORK_STEALING,966.26,11530.301,11.933 +Gaussian,CDF,250000,WORK_STEALING,2009.54,29237.648,14.549 +Gaussian,CDF,500000,WORK_STEALING,4203.526,56470.298,13.434 +Gaussian,LogPDF,8,PARALLEL,0.186,0.76,4.086 +Gaussian,LogPDF,16,WORK_STEALING,0.209,1.425,6.818 +Gaussian,LogPDF,32,WORK_STEALING,0.193,2.65,13.731 +Gaussian,LogPDF,64,PARALLEL,0.192,4.634,24.135 +Gaussian,LogPDF,128,PARALLEL,0.218,10.848,49.761 +Gaussian,LogPDF,256,WORK_STEALING,0.276,18.148,65.754 +Gaussian,LogPDF,512,WORK_STEALING,0.476,39.604,83.202 +Gaussian,LogPDF,1000,WORK_STEALING,0.648,102.293,157.86 +Gaussian,LogPDF,2000,WORK_STEALING,1.144,135.64,118.566 +Gaussian,LogPDF,5000,VECTORIZED,3.462,346.132,99.98 +Gaussian,LogPDF,10000,VECTORIZED,7.148,714.087,99.9 +Gaussian,LogPDF,20000,VECTORIZED,18.392,1398.58,76.043 +Gaussian,LogPDF,50000,VECTORIZED,56.415,3310.173,58.675 +Gaussian,LogPDF,100000,VECTORIZED,110.79,6854.059,61.865 +Gaussian,LogPDF,250000,WORK_STEALING,139.309,17031.612,122.258 +Gaussian,LogPDF,500000,WORK_STEALING,266.186,34141.993,128.264 +Gaussian,PDF,8,PARALLEL,0.258,0.742,2.876 +Gaussian,PDF,16,PARALLEL,0.404,1.594,3.946 +Gaussian,PDF,32,VECTORIZED,0.523,3.149,6.021 +Gaussian,PDF,64,VECTORIZED,0.69,5.975,8.659 +Gaussian,PDF,128,VECTORIZED,1.039,10.972,10.56 +Gaussian,PDF,256,VECTORIZED,1.786,23.298,13.045 +Gaussian,PDF,512,VECTORIZED,3.234,44.467,13.75 +Gaussian,PDF,1000,VECTORIZED,5.857,91.826,15.678 +Gaussian,PDF,2000,VECTORIZED,11.238,183.337,16.314 +Gaussian,PDF,5000,VECTORIZED,25.877,429.949,16.615 +Gaussian,PDF,10000,VECTORIZED,56.08,839.592,14.971 +Gaussian,PDF,20000,VECTORIZED,113.836,1704.838,14.976 
+Gaussian,PDF,50000,WORK_STEALING,212.752,4177.111,19.634 +Gaussian,PDF,100000,PARALLEL,410.351,8144.983,19.849 +Gaussian,PDF,250000,WORK_STEALING,609.792,20566.641,33.727 +Gaussian,PDF,500000,WORK_STEALING,1388.958,41996.946,30.236 +Poisson,CDF,8,SCALAR,0.891,0.891,1.0 +Poisson,CDF,16,SCALAR,1.814,1.814,1.0 +Poisson,CDF,32,SCALAR,3.449,3.449,1.0 +Poisson,CDF,64,WORK_STEALING,7.391,7.595,1.028 +Poisson,CDF,128,SCALAR,13.714,13.714,1.0 +Poisson,CDF,256,WORK_STEALING,26.893,27.897,1.037 +Poisson,CDF,512,VECTORIZED,53.907,54.403,1.009 +Poisson,CDF,1000,VECTORIZED,105.254,106.764,1.014 +Poisson,CDF,2000,WORK_STEALING,215.93,226.508,1.049 +Poisson,CDF,5000,WORK_STEALING,331.473,532.894,1.608 +Poisson,CDF,10000,WORK_STEALING,550.599,1172.436,2.129 +Poisson,CDF,20000,WORK_STEALING,632.566,2275.264,3.597 +Poisson,CDF,50000,WORK_STEALING,1092.691,5784.786,5.294 +Poisson,CDF,100000,WORK_STEALING,2312.967,11401.173,4.929 +Poisson,CDF,250000,WORK_STEALING,5621.224,30257.435,5.383 +Poisson,CDF,500000,WORK_STEALING,9753.425,57261.99,5.871 +Poisson,LogPDF,8,VECTORIZED,0.289,0.834,2.886 +Poisson,LogPDF,16,VECTORIZED,0.419,1.462,3.489 +Poisson,LogPDF,32,WORK_STEALING,0.669,2.67,3.991 +Poisson,LogPDF,64,WORK_STEALING,1.418,5.47,3.858 +Poisson,LogPDF,128,WORK_STEALING,2.269,10.382,4.576 +Poisson,LogPDF,256,WORK_STEALING,4.65,20.78,4.469 +Poisson,LogPDF,512,VECTORIZED,8.958,39.894,4.453 +Poisson,LogPDF,1000,VECTORIZED,17.076,78.797,4.614 +Poisson,LogPDF,2000,WORK_STEALING,36.51,164.182,4.497 +Poisson,LogPDF,5000,VECTORIZED,96.038,394.762,4.11 +Poisson,LogPDF,10000,VECTORIZED,188.054,796.73,4.237 +Poisson,LogPDF,20000,WORK_STEALING,295.354,1806.553,6.117 +Poisson,LogPDF,50000,WORK_STEALING,390.385,4102.138,10.508 +Poisson,LogPDF,100000,WORK_STEALING,575.865,8438.857,14.654 +Poisson,LogPDF,250000,WORK_STEALING,1726.23,23842.146,13.812 +Poisson,LogPDF,500000,WORK_STEALING,2434.694,44515.97,18.284 +Poisson,PDF,8,VECTORIZED,0.511,1.019,1.994 +Poisson,PDF,16,VECTORIZED,0.854,1.87,2.19 
+Poisson,PDF,32,VECTORIZED,1.457,3.645,2.502 +Poisson,PDF,64,VECTORIZED,2.806,7.233,2.578 +Poisson,PDF,128,VECTORIZED,5.174,13.763,2.66 +Poisson,PDF,256,VECTORIZED,10.411,27.34,2.626 +Poisson,PDF,512,VECTORIZED,20.283,52.993,2.613 +Poisson,PDF,1000,VECTORIZED,38.864,104.857,2.698 +Poisson,PDF,2000,WORK_STEALING,81.978,238.144,2.905 +Poisson,PDF,5000,VECTORIZED,192.563,544.057,2.825 +Poisson,PDF,10000,WORK_STEALING,283.063,1090.275,3.852 +Poisson,PDF,20000,WORK_STEALING,386.54,2326.088,6.018 +Poisson,PDF,50000,WORK_STEALING,534.823,5405.866,10.108 +Poisson,PDF,100000,WORK_STEALING,991.194,11228.159,11.328 +Poisson,PDF,250000,WORK_STEALING,1768.673,28455.89,16.089 +Poisson,PDF,500000,WORK_STEALING,3935.375,58799.509,14.941 +StudentT,CDF,8,WORK_STEALING,2.032,2.582,1.271 +StudentT,CDF,16,WORK_STEALING,3.39,4.528,1.336 +StudentT,CDF,32,VECTORIZED,7.239,9.516,1.315 +StudentT,CDF,64,VECTORIZED,14.776,19.536,1.322 +StudentT,CDF,128,PARALLEL,27.967,36.727,1.313 +StudentT,CDF,256,PARALLEL,56.307,74.288,1.319 +StudentT,CDF,512,VECTORIZED,114.5,155.491,1.358 +StudentT,CDF,1000,PARALLEL,228.912,305.896,1.336 +StudentT,CDF,2000,VECTORIZED,456.684,582.619,1.276 +StudentT,CDF,5000,WORK_STEALING,1158.973,1628.848,1.405 +StudentT,CDF,10000,VECTORIZED,2282.506,3342.026,1.464 +StudentT,CDF,20000,VECTORIZED,4681.422,6682.219,1.427 +StudentT,CDF,50000,WORK_STEALING,11964.562,15676.87,1.31 +StudentT,CDF,100000,VECTORIZED,23224.341,32025.274,1.379 +StudentT,CDF,250000,VECTORIZED,57617.375,75867.797,1.317 +StudentT,CDF,500000,VECTORIZED,115474.922,153916.958,1.333 +StudentT,LogPDF,8,VECTORIZED,0.443,0.82,1.851 +StudentT,LogPDF,16,VECTORIZED,0.471,1.43,3.036 +StudentT,LogPDF,32,VECTORIZED,0.546,2.834,5.19 +StudentT,LogPDF,64,VECTORIZED,0.756,5.222,6.907 +StudentT,LogPDF,128,VECTORIZED,1.197,9.859,8.236 +StudentT,LogPDF,256,VECTORIZED,2.037,20.825,10.223 +StudentT,LogPDF,512,VECTORIZED,3.799,42.045,11.067 +StudentT,LogPDF,1000,VECTORIZED,7.082,80.64,11.387 
+StudentT,LogPDF,2000,VECTORIZED,13.895,162.475,11.693 +StudentT,LogPDF,5000,VECTORIZED,37.082,499.626,13.474 +StudentT,LogPDF,10000,VECTORIZED,73.491,899.599,12.241 +StudentT,LogPDF,20000,VECTORIZED,166.474,1797.985,10.8 +StudentT,LogPDF,50000,VECTORIZED,389.383,4421.506,11.355 +StudentT,LogPDF,100000,PARALLEL,661.982,8920.513,13.475 +StudentT,LogPDF,250000,PARALLEL,1163.776,20780.342,17.856 +StudentT,LogPDF,500000,WORK_STEALING,2254.997,43528.122,19.303 +StudentT,PDF,8,VECTORIZED,0.48,0.958,1.996 +StudentT,PDF,16,VECTORIZED,0.537,1.706,3.177 +StudentT,PDF,32,VECTORIZED,0.694,3.253,4.687 +StudentT,PDF,64,VECTORIZED,1.076,6.052,5.625 +StudentT,PDF,128,VECTORIZED,1.845,12.818,6.947 +StudentT,PDF,256,VECTORIZED,3.257,64.3,19.742 +StudentT,PDF,512,VECTORIZED,6.341,50.848,8.019 +StudentT,PDF,1000,VECTORIZED,11.914,98.173,8.24 +StudentT,PDF,2000,VECTORIZED,24.393,205.468,8.423 +StudentT,PDF,5000,VECTORIZED,64.74,629.826,9.729 +StudentT,PDF,10000,VECTORIZED,121.889,1162.008,9.533 +StudentT,PDF,20000,VECTORIZED,240.398,2101.707,8.743 +StudentT,PDF,50000,VECTORIZED,670.287,5394.263,8.048 +StudentT,PDF,100000,PARALLEL,907.325,10983.479,12.105 +StudentT,PDF,250000,WORK_STEALING,2071.162,25744.895,12.43 +StudentT,PDF,500000,PARALLEL,4299.085,53410.095,12.424 +Uniform,CDF,8,WORK_STEALING,0.194,0.762,3.928 +Uniform,CDF,16,WORK_STEALING,0.195,1.216,6.236 +Uniform,CDF,32,WORK_STEALING,0.205,2.383,11.624 +Uniform,CDF,64,WORK_STEALING,0.253,5.045,19.941 +Uniform,CDF,128,WORK_STEALING,0.321,10.13,31.558 +Uniform,CDF,256,WORK_STEALING,0.333,18.843,56.586 +Uniform,CDF,512,WORK_STEALING,0.836,38.476,46.024 +Uniform,CDF,1000,VECTORIZED,1.36,70.979,52.19 +Uniform,CDF,2000,WORK_STEALING,2.065,132.978,64.396 +Uniform,CDF,5000,VECTORIZED,11.519,267.07,23.185 +Uniform,CDF,10000,VECTORIZED,35.327,565.762,16.015 +Uniform,CDF,20000,WORK_STEALING,47.649,1444.327,30.312 +Uniform,CDF,50000,WORK_STEALING,95.622,3404.27,35.601 +Uniform,CDF,100000,WORK_STEALING,184.461,7417.152,40.21 
+Uniform,CDF,250000,WORK_STEALING,415.733,18132.131,43.615 +Uniform,CDF,500000,PARALLEL,1256.224,40287.186,32.07 +Uniform,LogPDF,8,VECTORIZED,0.22,0.759,3.45 +Uniform,LogPDF,16,VECTORIZED,0.171,1.204,7.041 +Uniform,LogPDF,32,WORK_STEALING,0.189,2.394,12.667 +Uniform,LogPDF,64,WORK_STEALING,0.238,4.906,20.613 +Uniform,LogPDF,128,VECTORIZED,0.273,9.95,36.447 +Uniform,LogPDF,256,WORK_STEALING,0.309,18.952,61.333 +Uniform,LogPDF,512,VECTORIZED,0.391,37.011,94.657 +Uniform,LogPDF,1000,WORK_STEALING,0.482,69.663,144.529 +Uniform,LogPDF,2000,WORK_STEALING,0.831,143.565,172.762 +Uniform,LogPDF,5000,VECTORIZED,2.971,342.25,115.197 +Uniform,LogPDF,10000,VECTORIZED,5.925,542.895,91.628 +Uniform,LogPDF,20000,VECTORIZED,8.467,1298.033,153.305 +Uniform,LogPDF,50000,VECTORIZED,34.774,3669.92,105.536 +Uniform,LogPDF,100000,VECTORIZED,69.231,6691.124,96.649 +Uniform,LogPDF,250000,VECTORIZED,182.872,17324.342,94.735 +Uniform,LogPDF,500000,VECTORIZED,486.83,35235.903,72.378 +Uniform,PDF,8,VECTORIZED,0.142,0.592,4.169 +Uniform,PDF,16,VECTORIZED,0.151,1.215,8.046 +Uniform,PDF,32,VECTORIZED,0.153,2.359,15.418 +Uniform,PDF,64,WORK_STEALING,0.2,5.058,25.29 +Uniform,PDF,128,VECTORIZED,0.241,9.892,41.046 +Uniform,PDF,256,WORK_STEALING,0.211,19.183,90.915 +Uniform,PDF,512,VECTORIZED,0.302,37.051,122.685 +Uniform,PDF,1000,VECTORIZED,0.716,73.276,102.341 +Uniform,PDF,2000,VECTORIZED,1.028,136.502,132.784 +Uniform,PDF,5000,VECTORIZED,1.885,310.142,164.532 +Uniform,PDF,10000,VECTORIZED,3.427,524.805,153.138 +Uniform,PDF,20000,VECTORIZED,10.867,1175.601,108.181 +Uniform,PDF,50000,VECTORIZED,22.121,3442.679,155.629 +Uniform,PDF,100000,VECTORIZED,67.679,6797.237,100.433 +Uniform,PDF,250000,VECTORIZED,209.306,17564.354,83.917 +Uniform,PDF,500000,VECTORIZED,402.518,35403.341,87.955 diff --git a/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/crossovers.csv 
b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/crossovers.csv new file mode 100644 index 0000000..2c8b062 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/crossovers.csv @@ -0,0 +1,28 @@ +distribution,operation,scalar_to_vectorized,vectorized_to_parallel,parallel_to_work_stealing,best_strategy_at_max_size,best_time_us_at_max_size,max_batch_size +Beta,CDF,8,,8,VECTORIZED,76353.624,500000 +Beta,LogPDF,16,8,8,VECTORIZED,16603.17,500000 +Beta,PDF,16,8,8,VECTORIZED,18759.322,500000 +ChiSquared,CDF,8,8,8,WORK_STEALING,12424.388,500000 +ChiSquared,LogPDF,16,8,8,WORK_STEALING,1483.282,500000 +ChiSquared,PDF,8,8,8,WORK_STEALING,2664.156,500000 +Discrete,CDF,8,16,32,WORK_STEALING,609.68,500000 +Discrete,LogPDF,8,16,16,WORK_STEALING,674.444,500000 +Discrete,PDF,8,16,5000,WORK_STEALING,661.421,500000 +Exponential,CDF,8,8,8,WORK_STEALING,1359.51,500000 +Exponential,LogPDF,8,8,8,WORK_STEALING,478.612,500000 +Exponential,PDF,8,8,16,PARALLEL,2107.483,500000 +Gamma,CDF,16,8,8,WORK_STEALING,9718.016,500000 +Gamma,LogPDF,16,8,16,WORK_STEALING,2037.304,500000 +Gamma,PDF,8,8,8,WORK_STEALING,2690.037,500000 +Gaussian,CDF,8,,16,WORK_STEALING,4203.526,500000 +Gaussian,LogPDF,8,8,16,WORK_STEALING,266.186,500000 +Gaussian,PDF,8,8,64,WORK_STEALING,1388.958,500000 +Poisson,CDF,64,16,64,WORK_STEALING,9753.425,500000 +Poisson,LogPDF,8,50000,8,WORK_STEALING,2434.694,500000 +Poisson,PDF,8,2000,16,WORK_STEALING,3935.375,500000 +StudentT,CDF,8,128,8,VECTORIZED,115474.922,500000 +StudentT,LogPDF,8,100000,32,WORK_STEALING,2254.997,500000 +StudentT,PDF,8,100000,8,PARALLEL,4299.085,500000 +Uniform,CDF,8,8,8,PARALLEL,1256.224,500000 +Uniform,LogPDF,8,,32,VECTORIZED,486.83,500000 +Uniform,PDF,8,,16,VECTORIZED,402.518,500000 diff --git a/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/logs/strategy_profile.txt 
b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/logs/strategy_profile.txt new file mode 100644 index 0000000..da8a356 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/logs/strategy_profile.txt @@ -0,0 +1,658 @@ + +==================== + Strategy Profile +==================== + +Forced-strategy timing profiler for dispatcher threshold tuning + +System: 8 logical cores, AVX2 SIMD, 8192 KB L3 cache + +Batch sizes: 8 16 32 64 128 256 512 1000 2000 5000 10000 20000 50000 100000 250000 500000 + + +--- Uniform Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Gaussian Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Exponential Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... 
βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Discrete Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Poisson Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Gamma Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... 
βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- StudentT Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Beta Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- ChiSquared Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... 
βœ“ + Profiling batch size 500000... βœ“ + + +========================= + Best Strategy Summary +========================= + +Distribution Operation Size Best Strategy Time (ΞΌs) +---------------------------------------------------------------- +Beta CDF 8 Vectorized 1.25 +Beta CDF 16 Vectorized 2.44 +Beta CDF 32 Vectorized 4.52 +Beta CDF 64 Vectorized 8.26 +Beta CDF 128 Vectorized 18.77 +Beta CDF 256 Vectorized 39.65 +Beta CDF 512 Vectorized 73.95 +Beta CDF 1000 Vectorized 150.54 +Beta CDF 2000 Vectorized 293.51 +Beta CDF 5000 Vectorized 733.54 +Beta CDF 10000 Vectorized 1497.36 +Beta CDF 20000 Vectorized 3058.14 +Beta CDF 50000 Vectorized 7394.72 +Beta CDF 100000 Vectorized 14874.08 +Beta CDF 250000 Vectorized 39713.60 +Beta CDF 500000 Vectorized 76353.62 +Beta LogPDF 8 Work-Stealing 0.58 +Beta LogPDF 16 Work-Stealing 0.82 +Beta LogPDF 32 Work-Stealing 1.43 +Beta LogPDF 64 Work-Stealing 2.68 +Beta LogPDF 128 Vectorized 3.80 +Beta LogPDF 256 Vectorized 6.28 +Beta LogPDF 512 Vectorized 14.93 +Beta LogPDF 1000 Vectorized 28.85 +Beta LogPDF 2000 Vectorized 58.79 +Beta LogPDF 5000 Vectorized 139.93 +Beta LogPDF 10000 Vectorized 322.39 +Beta LogPDF 20000 Vectorized 604.57 +Beta LogPDF 50000 Vectorized 1404.61 +Beta LogPDF 100000 Vectorized 2926.96 +Beta LogPDF 250000 Vectorized 8077.81 +Beta LogPDF 500000 Vectorized 16603.17 +Beta PDF 8 Work-Stealing 0.68 +Beta PDF 16 Work-Stealing 1.08 +Beta PDF 32 Vectorized 1.90 +Beta PDF 64 Vectorized 3.12 +Beta PDF 128 Vectorized 4.08 +Beta PDF 256 Vectorized 7.54 +Beta PDF 512 Vectorized 16.86 +Beta PDF 1000 Vectorized 33.57 +Beta PDF 2000 Vectorized 61.23 +Beta PDF 5000 Vectorized 159.33 +Beta PDF 10000 Vectorized 343.52 +Beta PDF 20000 Vectorized 653.35 +Beta PDF 50000 Vectorized 1666.89 +Beta PDF 100000 Vectorized 3421.35 +Beta PDF 250000 Vectorized 9261.10 +Beta PDF 500000 Vectorized 18759.32 +ChiSquared CDF 8 Work-Stealing 0.76 +ChiSquared CDF 16 Parallel 1.45 +ChiSquared CDF 32 Work-Stealing 2.69 +ChiSquared CDF 64 
Work-Stealing 5.25 +ChiSquared CDF 128 Work-Stealing 10.49 +ChiSquared CDF 256 Work-Stealing 22.25 +ChiSquared CDF 512 Work-Stealing 45.38 +ChiSquared CDF 1000 Work-Stealing 89.52 +ChiSquared CDF 2000 Work-Stealing 189.10 +ChiSquared CDF 5000 Work-Stealing 400.89 +ChiSquared CDF 10000 Work-Stealing 475.29 +ChiSquared CDF 20000 Work-Stealing 598.15 +ChiSquared CDF 50000 Work-Stealing 1275.40 +ChiSquared CDF 100000 Work-Stealing 2206.95 +ChiSquared CDF 250000 Work-Stealing 5526.22 +ChiSquared CDF 500000 Work-Stealing 12424.39 +ChiSquared LogPDF 8 Work-Stealing 0.27 +ChiSquared LogPDF 16 Work-Stealing 0.36 +ChiSquared LogPDF 32 Work-Stealing 0.58 +ChiSquared LogPDF 64 Work-Stealing 0.98 +ChiSquared LogPDF 128 Vectorized 1.41 +ChiSquared LogPDF 256 Vectorized 2.42 +ChiSquared LogPDF 512 Vectorized 4.51 +ChiSquared LogPDF 1000 Vectorized 8.36 +ChiSquared LogPDF 2000 Vectorized 17.43 +ChiSquared LogPDF 5000 Vectorized 41.74 +ChiSquared LogPDF 10000 Vectorized 82.76 +ChiSquared LogPDF 20000 Vectorized 169.29 +ChiSquared LogPDF 50000 Vectorized 446.94 +ChiSquared LogPDF 100000 Work-Stealing 644.72 +ChiSquared LogPDF 250000 Work-Stealing 1284.25 +ChiSquared LogPDF 500000 Work-Stealing 1483.28 +ChiSquared PDF 8 Work-Stealing 0.40 +ChiSquared PDF 16 Parallel 0.62 +ChiSquared PDF 32 Parallel 1.16 +ChiSquared PDF 64 Vectorized 1.51 +ChiSquared PDF 128 Vectorized 2.06 +ChiSquared PDF 256 Vectorized 3.67 +ChiSquared PDF 512 Vectorized 6.99 +ChiSquared PDF 1000 Vectorized 13.80 +ChiSquared PDF 2000 Vectorized 27.29 +ChiSquared PDF 5000 Vectorized 67.21 +ChiSquared PDF 10000 Vectorized 129.50 +ChiSquared PDF 20000 Vectorized 296.00 +ChiSquared PDF 50000 Work-Stealing 553.27 +ChiSquared PDF 100000 Work-Stealing 843.28 +ChiSquared PDF 250000 Work-Stealing 1656.00 +ChiSquared PDF 500000 Work-Stealing 2664.16 +Discrete CDF 8 Vectorized 0.20 +Discrete CDF 16 Parallel 0.24 +Discrete CDF 32 Work-Stealing 0.34 +Discrete CDF 64 Work-Stealing 0.49 +Discrete CDF 128 Work-Stealing 0.71 
+Discrete CDF 256 Work-Stealing 1.38 +Discrete CDF 512 Vectorized 2.00 +Discrete CDF 1000 Work-Stealing 4.29 +Discrete CDF 2000 Vectorized 7.78 +Discrete CDF 5000 Vectorized 24.22 +Discrete CDF 10000 Vectorized 45.17 +Discrete CDF 20000 Vectorized 124.18 +Discrete CDF 50000 Work-Stealing 215.90 +Discrete CDF 100000 Work-Stealing 278.06 +Discrete CDF 250000 Work-Stealing 471.14 +Discrete CDF 500000 Work-Stealing 609.68 +Discrete LogPDF 8 Vectorized 0.20 +Discrete LogPDF 16 Work-Stealing 0.28 +Discrete LogPDF 32 Vectorized 0.37 +Discrete LogPDF 64 Vectorized 0.64 +Discrete LogPDF 128 Work-Stealing 0.67 +Discrete LogPDF 256 Work-Stealing 1.41 +Discrete LogPDF 512 Work-Stealing 2.72 +Discrete LogPDF 1000 Work-Stealing 5.16 +Discrete LogPDF 2000 Work-Stealing 9.11 +Discrete LogPDF 5000 Vectorized 24.63 +Discrete LogPDF 10000 Vectorized 47.20 +Discrete LogPDF 20000 Vectorized 106.85 +Discrete LogPDF 50000 Work-Stealing 208.23 +Discrete LogPDF 100000 Work-Stealing 294.13 +Discrete LogPDF 250000 Work-Stealing 515.03 +Discrete LogPDF 500000 Work-Stealing 674.44 +Discrete PDF 8 Vectorized 0.19 +Discrete PDF 16 Parallel 0.26 +Discrete PDF 32 Vectorized 0.32 +Discrete PDF 64 Vectorized 0.43 +Discrete PDF 128 Vectorized 0.68 +Discrete PDF 256 Vectorized 1.06 +Discrete PDF 512 Vectorized 2.04 +Discrete PDF 1000 Vectorized 3.78 +Discrete PDF 2000 Vectorized 7.31 +Discrete PDF 5000 Vectorized 20.84 +Discrete PDF 10000 Vectorized 33.74 +Discrete PDF 20000 Vectorized 74.68 +Discrete PDF 50000 Vectorized 184.54 +Discrete PDF 100000 Work-Stealing 246.90 +Discrete PDF 250000 Work-Stealing 423.83 +Discrete PDF 500000 Work-Stealing 661.42 +Exponential CDF 8 Work-Stealing 0.25 +Exponential CDF 16 Work-Stealing 0.34 +Exponential CDF 32 Vectorized 0.49 +Exponential CDF 64 Vectorized 0.66 +Exponential CDF 128 Vectorized 1.02 +Exponential CDF 256 Vectorized 1.85 +Exponential CDF 512 Vectorized 3.31 +Exponential CDF 1000 Vectorized 6.33 +Exponential CDF 2000 Vectorized 12.29 +Exponential CDF 
5000 Vectorized 49.23 +Exponential CDF 10000 Vectorized 61.38 +Exponential CDF 20000 Vectorized 124.01 +Exponential CDF 50000 Work-Stealing 240.32 +Exponential CDF 100000 Work-Stealing 365.80 +Exponential CDF 250000 Work-Stealing 871.13 +Exponential CDF 500000 Work-Stealing 1359.51 +Exponential LogPDF 8 Work-Stealing 0.17 +Exponential LogPDF 16 Work-Stealing 0.19 +Exponential LogPDF 32 Work-Stealing 0.17 +Exponential LogPDF 64 Work-Stealing 0.19 +Exponential LogPDF 128 Work-Stealing 0.20 +Exponential LogPDF 256 Work-Stealing 0.31 +Exponential LogPDF 512 Work-Stealing 0.43 +Exponential LogPDF 1000 Work-Stealing 0.80 +Exponential LogPDF 2000 Work-Stealing 1.23 +Exponential LogPDF 5000 Vectorized 6.48 +Exponential LogPDF 10000 Vectorized 13.14 +Exponential LogPDF 20000 Vectorized 31.45 +Exponential LogPDF 50000 Vectorized 78.64 +Exponential LogPDF 100000 Vectorized 153.34 +Exponential LogPDF 250000 Work-Stealing 240.19 +Exponential LogPDF 500000 Work-Stealing 478.61 +Exponential PDF 8 Parallel 0.24 +Exponential PDF 16 Vectorized 0.35 +Exponential PDF 32 Vectorized 0.42 +Exponential PDF 64 Vectorized 0.64 +Exponential PDF 128 Vectorized 1.02 +Exponential PDF 256 Vectorized 1.78 +Exponential PDF 512 Vectorized 3.32 +Exponential PDF 1000 Vectorized 6.02 +Exponential PDF 2000 Work-Stealing 23.08 +Exponential PDF 5000 Vectorized 29.16 +Exponential PDF 10000 Vectorized 66.42 +Exponential PDF 20000 Vectorized 118.99 +Exponential PDF 50000 Work-Stealing 283.56 +Exponential PDF 100000 Work-Stealing 318.48 +Exponential PDF 250000 Work-Stealing 574.19 +Exponential PDF 500000 Parallel 2107.48 +Gamma CDF 8 Work-Stealing 0.78 +Gamma CDF 16 Work-Stealing 1.37 +Gamma CDF 32 Work-Stealing 2.65 +Gamma CDF 64 Work-Stealing 5.03 +Gamma CDF 128 Work-Stealing 9.90 +Gamma CDF 256 Work-Stealing 20.49 +Gamma CDF 512 Work-Stealing 41.56 +Gamma CDF 1000 Vectorized 82.88 +Gamma CDF 2000 Vectorized 175.31 +Gamma CDF 5000 Work-Stealing 392.81 +Gamma CDF 10000 Work-Stealing 412.10 +Gamma CDF 20000 
Work-Stealing 564.15 +Gamma CDF 50000 Work-Stealing 1159.76 +Gamma CDF 100000 Work-Stealing 2190.16 +Gamma CDF 250000 Work-Stealing 4971.67 +Gamma CDF 500000 Work-Stealing 9718.02 +Gamma LogPDF 8 Parallel 0.29 +Gamma LogPDF 16 Work-Stealing 0.37 +Gamma LogPDF 32 Work-Stealing 0.60 +Gamma LogPDF 64 Work-Stealing 1.01 +Gamma LogPDF 128 Vectorized 1.37 +Gamma LogPDF 256 Vectorized 2.30 +Gamma LogPDF 512 Vectorized 4.40 +Gamma LogPDF 1000 Vectorized 7.75 +Gamma LogPDF 2000 Vectorized 15.60 +Gamma LogPDF 5000 Vectorized 39.78 +Gamma LogPDF 10000 Vectorized 82.93 +Gamma LogPDF 20000 Vectorized 184.30 +Gamma LogPDF 50000 Work-Stealing 424.28 +Gamma LogPDF 100000 Work-Stealing 631.08 +Gamma LogPDF 250000 Work-Stealing 1310.03 +Gamma LogPDF 500000 Work-Stealing 2037.30 +Gamma PDF 8 Work-Stealing 0.41 +Gamma PDF 16 Parallel 0.64 +Gamma PDF 32 Vectorized 1.12 +Gamma PDF 64 Vectorized 1.45 +Gamma PDF 128 Vectorized 2.03 +Gamma PDF 256 Vectorized 3.55 +Gamma PDF 512 Vectorized 6.73 +Gamma PDF 1000 Vectorized 12.75 +Gamma PDF 2000 Vectorized 24.65 +Gamma PDF 5000 Vectorized 62.14 +Gamma PDF 10000 Vectorized 142.02 +Gamma PDF 20000 Vectorized 252.10 +Gamma PDF 50000 Work-Stealing 550.22 +Gamma PDF 100000 Work-Stealing 815.18 +Gamma PDF 250000 Work-Stealing 1887.76 +Gamma PDF 500000 Work-Stealing 2690.04 +Gaussian CDF 8 Vectorized 0.44 +Gaussian CDF 16 Vectorized 0.53 +Gaussian CDF 32 Vectorized 0.71 +Gaussian CDF 64 Vectorized 0.97 +Gaussian CDF 128 Vectorized 1.76 +Gaussian CDF 256 Vectorized 3.08 +Gaussian CDF 512 Vectorized 5.88 +Gaussian CDF 1000 Vectorized 11.18 +Gaussian CDF 2000 Vectorized 20.93 +Gaussian CDF 5000 Vectorized 51.72 +Gaussian CDF 10000 Vectorized 107.98 +Gaussian CDF 20000 Vectorized 217.74 +Gaussian CDF 50000 Work-Stealing 505.51 +Gaussian CDF 100000 Work-Stealing 966.26 +Gaussian CDF 250000 Work-Stealing 2009.54 +Gaussian CDF 500000 Work-Stealing 4203.53 +Gaussian LogPDF 8 Parallel 0.19 +Gaussian LogPDF 16 Work-Stealing 0.21 +Gaussian LogPDF 32 
Work-Stealing 0.19 +Gaussian LogPDF 64 Parallel 0.19 +Gaussian LogPDF 128 Parallel 0.22 +Gaussian LogPDF 256 Work-Stealing 0.28 +Gaussian LogPDF 512 Work-Stealing 0.48 +Gaussian LogPDF 1000 Work-Stealing 0.65 +Gaussian LogPDF 2000 Work-Stealing 1.14 +Gaussian LogPDF 5000 Vectorized 3.46 +Gaussian LogPDF 10000 Vectorized 7.15 +Gaussian LogPDF 20000 Vectorized 18.39 +Gaussian LogPDF 50000 Vectorized 56.41 +Gaussian LogPDF 100000 Vectorized 110.79 +Gaussian LogPDF 250000 Work-Stealing 139.31 +Gaussian LogPDF 500000 Work-Stealing 266.19 +Gaussian PDF 8 Parallel 0.26 +Gaussian PDF 16 Parallel 0.40 +Gaussian PDF 32 Vectorized 0.52 +Gaussian PDF 64 Vectorized 0.69 +Gaussian PDF 128 Vectorized 1.04 +Gaussian PDF 256 Vectorized 1.79 +Gaussian PDF 512 Vectorized 3.23 +Gaussian PDF 1000 Vectorized 5.86 +Gaussian PDF 2000 Vectorized 11.24 +Gaussian PDF 5000 Vectorized 25.88 +Gaussian PDF 10000 Vectorized 56.08 +Gaussian PDF 20000 Vectorized 113.84 +Gaussian PDF 50000 Work-Stealing 212.75 +Gaussian PDF 100000 Parallel 410.35 +Gaussian PDF 250000 Work-Stealing 609.79 +Gaussian PDF 500000 Work-Stealing 1388.96 +Poisson CDF 8 Scalar 0.89 +Poisson CDF 16 Scalar 1.81 +Poisson CDF 32 Scalar 3.45 +Poisson CDF 64 Work-Stealing 7.39 +Poisson CDF 128 Scalar 13.71 +Poisson CDF 256 Work-Stealing 26.89 +Poisson CDF 512 Vectorized 53.91 +Poisson CDF 1000 Vectorized 105.25 +Poisson CDF 2000 Work-Stealing 215.93 +Poisson CDF 5000 Work-Stealing 331.47 +Poisson CDF 10000 Work-Stealing 550.60 +Poisson CDF 20000 Work-Stealing 632.57 +Poisson CDF 50000 Work-Stealing 1092.69 +Poisson CDF 100000 Work-Stealing 2312.97 +Poisson CDF 250000 Work-Stealing 5621.22 +Poisson CDF 500000 Work-Stealing 9753.42 +Poisson LogPDF 8 Vectorized 0.29 +Poisson LogPDF 16 Vectorized 0.42 +Poisson LogPDF 32 Work-Stealing 0.67 +Poisson LogPDF 64 Work-Stealing 1.42 +Poisson LogPDF 128 Work-Stealing 2.27 +Poisson LogPDF 256 Work-Stealing 4.65 +Poisson LogPDF 512 Vectorized 8.96 +Poisson LogPDF 1000 Vectorized 17.08 +Poisson 
LogPDF 2000 Work-Stealing 36.51 +Poisson LogPDF 5000 Vectorized 96.04 +Poisson LogPDF 10000 Vectorized 188.05 +Poisson LogPDF 20000 Work-Stealing 295.35 +Poisson LogPDF 50000 Work-Stealing 390.38 +Poisson LogPDF 100000 Work-Stealing 575.87 +Poisson LogPDF 250000 Work-Stealing 1726.23 +Poisson LogPDF 500000 Work-Stealing 2434.69 +Poisson PDF 8 Vectorized 0.51 +Poisson PDF 16 Vectorized 0.85 +Poisson PDF 32 Vectorized 1.46 +Poisson PDF 64 Vectorized 2.81 +Poisson PDF 128 Vectorized 5.17 +Poisson PDF 256 Vectorized 10.41 +Poisson PDF 512 Vectorized 20.28 +Poisson PDF 1000 Vectorized 38.86 +Poisson PDF 2000 Work-Stealing 81.98 +Poisson PDF 5000 Vectorized 192.56 +Poisson PDF 10000 Work-Stealing 283.06 +Poisson PDF 20000 Work-Stealing 386.54 +Poisson PDF 50000 Work-Stealing 534.82 +Poisson PDF 100000 Work-Stealing 991.19 +Poisson PDF 250000 Work-Stealing 1768.67 +Poisson PDF 500000 Work-Stealing 3935.38 +StudentT CDF 8 Work-Stealing 2.03 +StudentT CDF 16 Work-Stealing 3.39 +StudentT CDF 32 Vectorized 7.24 +StudentT CDF 64 Vectorized 14.78 +StudentT CDF 128 Parallel 27.97 +StudentT CDF 256 Parallel 56.31 +StudentT CDF 512 Vectorized 114.50 +StudentT CDF 1000 Parallel 228.91 +StudentT CDF 2000 Vectorized 456.68 +StudentT CDF 5000 Work-Stealing 1158.97 +StudentT CDF 10000 Vectorized 2282.51 +StudentT CDF 20000 Vectorized 4681.42 +StudentT CDF 50000 Work-Stealing 11964.56 +StudentT CDF 100000 Vectorized 23224.34 +StudentT CDF 250000 Vectorized 57617.38 +StudentT CDF 500000 Vectorized 115474.92 +StudentT LogPDF 8 Vectorized 0.44 +StudentT LogPDF 16 Vectorized 0.47 +StudentT LogPDF 32 Vectorized 0.55 +StudentT LogPDF 64 Vectorized 0.76 +StudentT LogPDF 128 Vectorized 1.20 +StudentT LogPDF 256 Vectorized 2.04 +StudentT LogPDF 512 Vectorized 3.80 +StudentT LogPDF 1000 Vectorized 7.08 +StudentT LogPDF 2000 Vectorized 13.89 +StudentT LogPDF 5000 Vectorized 37.08 +StudentT LogPDF 10000 Vectorized 73.49 +StudentT LogPDF 20000 Vectorized 166.47 +StudentT LogPDF 50000 Vectorized 
389.38 +StudentT LogPDF 100000 Parallel 661.98 +StudentT LogPDF 250000 Parallel 1163.78 +StudentT LogPDF 500000 Work-Stealing 2255.00 +StudentT PDF 8 Vectorized 0.48 +StudentT PDF 16 Vectorized 0.54 +StudentT PDF 32 Vectorized 0.69 +StudentT PDF 64 Vectorized 1.08 +StudentT PDF 128 Vectorized 1.84 +StudentT PDF 256 Vectorized 3.26 +StudentT PDF 512 Vectorized 6.34 +StudentT PDF 1000 Vectorized 11.91 +StudentT PDF 2000 Vectorized 24.39 +StudentT PDF 5000 Vectorized 64.74 +StudentT PDF 10000 Vectorized 121.89 +StudentT PDF 20000 Vectorized 240.40 +StudentT PDF 50000 Vectorized 670.29 +StudentT PDF 100000 Parallel 907.33 +StudentT PDF 250000 Work-Stealing 2071.16 +StudentT PDF 500000 Parallel 4299.09 +Uniform CDF 8 Work-Stealing 0.19 +Uniform CDF 16 Work-Stealing 0.20 +Uniform CDF 32 Work-Stealing 0.20 +Uniform CDF 64 Work-Stealing 0.25 +Uniform CDF 128 Work-Stealing 0.32 +Uniform CDF 256 Work-Stealing 0.33 +Uniform CDF 512 Work-Stealing 0.84 +Uniform CDF 1000 Vectorized 1.36 +Uniform CDF 2000 Work-Stealing 2.06 +Uniform CDF 5000 Vectorized 11.52 +Uniform CDF 10000 Vectorized 35.33 +Uniform CDF 20000 Work-Stealing 47.65 +Uniform CDF 50000 Work-Stealing 95.62 +Uniform CDF 100000 Work-Stealing 184.46 +Uniform CDF 250000 Work-Stealing 415.73 +Uniform CDF 500000 Parallel 1256.22 +Uniform LogPDF 8 Vectorized 0.22 +Uniform LogPDF 16 Vectorized 0.17 +Uniform LogPDF 32 Work-Stealing 0.19 +Uniform LogPDF 64 Work-Stealing 0.24 +Uniform LogPDF 128 Vectorized 0.27 +Uniform LogPDF 256 Work-Stealing 0.31 +Uniform LogPDF 512 Vectorized 0.39 +Uniform LogPDF 1000 Work-Stealing 0.48 +Uniform LogPDF 2000 Work-Stealing 0.83 +Uniform LogPDF 5000 Vectorized 2.97 +Uniform LogPDF 10000 Vectorized 5.92 +Uniform LogPDF 20000 Vectorized 8.47 +Uniform LogPDF 50000 Vectorized 34.77 +Uniform LogPDF 100000 Vectorized 69.23 +Uniform LogPDF 250000 Vectorized 182.87 +Uniform LogPDF 500000 Vectorized 486.83 +Uniform PDF 8 Vectorized 0.14 +Uniform PDF 16 Vectorized 0.15 +Uniform PDF 32 Vectorized 0.15 
+Uniform PDF 64 Work-Stealing 0.20 +Uniform PDF 128 Vectorized 0.24 +Uniform PDF 256 Work-Stealing 0.21 +Uniform PDF 512 Vectorized 0.30 +Uniform PDF 1000 Vectorized 0.72 +Uniform PDF 2000 Vectorized 1.03 +Uniform PDF 5000 Vectorized 1.89 +Uniform PDF 10000 Vectorized 3.43 +Uniform PDF 20000 Vectorized 10.87 +Uniform PDF 50000 Vectorized 22.12 +Uniform PDF 100000 Vectorized 67.68 +Uniform PDF 250000 Vectorized 209.31 +Uniform PDF 500000 Vectorized 402.52 + + +===================== + Crossover Summary +===================== + +Distribution Operation Sβ†’V Vβ†’P Pβ†’Work-Steal +-------------------------------------------------------------------------- +Beta CDF 8 never 8 +Beta LogPDF 16 8 8 +Beta PDF 16 8 8 +ChiSquared CDF 8 8 8 +ChiSquared LogPDF 16 8 8 +ChiSquared PDF 8 8 8 +Discrete CDF 8 16 32 +Discrete LogPDF 8 16 16 +Discrete PDF 8 16 5000 +Exponential CDF 8 8 8 +Exponential LogPDF 8 8 8 +Exponential PDF 8 8 16 +Gamma CDF 16 8 8 +Gamma LogPDF 16 8 16 +Gamma PDF 8 8 8 +Gaussian CDF 8 never 16 +Gaussian LogPDF 8 8 16 +Gaussian PDF 8 8 64 +Poisson CDF 64 16 64 +Poisson LogPDF 8 50000 8 +Poisson PDF 8 2000 16 +StudentT CDF 8 128 8 +StudentT LogPDF 8 100000 32 +StudentT PDF 8 100000 8 +Uniform CDF 8 8 8 +Uniform LogPDF 8 never 32 +Uniform PDF 8 never 16 + +Results saved to /Users/wolfman/Development/libstats/build/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/strategy_profile_results.csv diff --git a/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/logs/system_inspector_performance.txt b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/logs/system_inspector_performance.txt new file mode 100644 index 0000000..d958300 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/logs/system_inspector_performance.txt @@ -0,0 +1,102 @@ + 
+======================================= + System Inspector - Performance Mode +======================================= + +System capabilities analysis with performance measurements + +System: 8 logical cores, AVX2 SIMD, 8192 KB L3 cache + + +--- CPU Features --- +Feature Support Description +------------------------------------------------------------ +AVX-512 No Foundation instructions +AVX2 Yes Advanced Vector Ext 2 +AVX Yes Advanced Vector Ext +SSE2 Yes Streaming SIMD Ext 2 +NEON No ARM SIMD instructions +FMA Yes Fused Multiply-Add + + +--- Cache Information --- +Cache Level Size (KB) Line Size +------------------------------------------ +L1 32 64 bytes +L2 256 64 bytes +L3 8192 64 bytes + + +--- CPU Topology --- +Hardware Threads: 8 +Logical Cores: 8 +Physical Cores: 4 +Hyperthreading: Enabled + + +--- SIMD Capabilities --- +Instruction Support Vector Width Description +-------------------------------------------------------------- +SSE2 Yes 128-bit Basic SIMD operations +AVX Yes 256-bit Advanced vector ext +AVX2 Yes 256-bit Integer AVX operations +AVX-512 No 512-bit Foundation instructions +NEON No 128-bit ARM SIMD instructions + +Active SIMD Level: AVX2 + + +--- Performance Baselines --- +Operation Type Time (ΞΌs) Throughput (MOps/s) +------------------------------------------------------------ +SIMD Multiply 932 1072 +Scalar Multiply 937 1066 + +SIMD Speedup: 1.01x + + +--- Performance Dispatcher Configuration --- +Example Strategy Selections: +Batch Size Distribution Complexity Strategy +---------------------------------------------------------------------- +100 Uniform Simple Vectorized +100 Gaussian Simple Vectorized +100 Exponential Simple Vectorized +100 Poisson Simple Vectorized +100 Discrete Simple Vectorized +1000 Uniform Simple Vectorized +1000 Gaussian Simple Parallel +1000 Exponential Simple Vectorized +1000 Poisson Simple Parallel +1000 Discrete Simple Vectorized +10000 Uniform Simple Parallel +10000 Gaussian Simple Parallel +10000 Exponential 
Simple Parallel +10000 Poisson Simple Work-Stealing +10000 Discrete Simple Parallel +100000 Uniform Simple Parallel +100000 Gaussian Simple Parallel +100000 Exponential Simple Parallel +100000 Poisson Simple Work-Stealing +100000 Discrete Simple Parallel + + +--- Platform Constants --- +Constant Value +-------------------------------------------------- +SIMD Block Size 4 doubles +Memory Alignment 32 bytes +Min SIMD Size 8 elements +Optimal Grain Size 48 elements +Fast Transcendental Support Yes + + +--- Adaptive Constants --- +Constant Value +-------------------------------------------------- +Min Elements for Parallel 4096 +Default Grain Size 512 +Simple Operation Grain Size 256 +Complex Operation Grain Size 1024 + +System inspection completed successfully. diff --git a/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/manifest.txt b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/manifest.txt new file mode 100644 index 0000000..3dac69a --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/manifest.txt @@ -0,0 +1,14 @@ +Dispatcher profile bundle +========================= + +Run ID: 2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1 +Captured at (UTC): 2026-04-12T05-27-04Z + +Files: +- metadata.json +- summary.json +- crossovers.csv +- best_strategies.csv +- strategy_profile_results.csv +- logs/system_inspector_performance.txt +- logs/strategy_profile.txt diff --git a/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/metadata.json b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/metadata.json new file mode 100644 index 0000000..c11dff7 --- /dev/null +++ 
b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/metadata.json @@ -0,0 +1,15 @@ +{ + "captured_at_utc": "2026-04-12T05-27-04Z", + "run_id": "2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1", + "git_branch": "investigate-gaussian-avx512-perf", + "git_sha": "0e4e9f1", + "project_root": "/Users/wolfman/Development/libstats", + "build_dir": "/Users/wolfman/Development/libstats/build", + "build_type": "Release", + "cxx_compiler": "", + "os": "darwin", + "arch": "x86_64", + "cpu_brand": "Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz", + "physical_cores": "4", + "logical_cores": "8" +} diff --git a/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/strategy_profile_results.csv b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/strategy_profile_results.csv new file mode 100644 index 0000000..086f0f9 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/strategy_profile_results.csv @@ -0,0 +1,1729 @@ +Distribution,Operation,BatchSize,Strategy,MedianTime_us +Uniform,PDF,8,SCALAR,0.592000 +Uniform,PDF,8,VECTORIZED,0.142000 +Uniform,PDF,8,PARALLEL,0.151000 +Uniform,PDF,8,WORK_STEALING,0.201000 +Uniform,LogPDF,8,SCALAR,0.759000 +Uniform,LogPDF,8,VECTORIZED,0.220000 +Uniform,LogPDF,8,PARALLEL,0.222000 +Uniform,LogPDF,8,WORK_STEALING,0.223000 +Uniform,CDF,8,SCALAR,0.762000 +Uniform,CDF,8,VECTORIZED,0.257000 +Uniform,CDF,8,PARALLEL,0.231000 +Uniform,CDF,8,WORK_STEALING,0.194000 +Uniform,PDF,16,SCALAR,1.215000 +Uniform,PDF,16,VECTORIZED,0.151000 +Uniform,PDF,16,PARALLEL,0.188000 +Uniform,PDF,16,WORK_STEALING,0.171000 +Uniform,LogPDF,16,SCALAR,1.204000 +Uniform,LogPDF,16,VECTORIZED,0.171000 +Uniform,LogPDF,16,PARALLEL,0.186000 +Uniform,LogPDF,16,WORK_STEALING,0.202000 +Uniform,CDF,16,SCALAR,1.216000 
+Uniform,CDF,16,VECTORIZED,0.255000 +Uniform,CDF,16,PARALLEL,0.196000 +Uniform,CDF,16,WORK_STEALING,0.195000 +Uniform,PDF,32,SCALAR,2.359000 +Uniform,PDF,32,VECTORIZED,0.153000 +Uniform,PDF,32,PARALLEL,0.246000 +Uniform,PDF,32,WORK_STEALING,0.174000 +Uniform,LogPDF,32,SCALAR,2.394000 +Uniform,LogPDF,32,VECTORIZED,0.194000 +Uniform,LogPDF,32,PARALLEL,0.224000 +Uniform,LogPDF,32,WORK_STEALING,0.189000 +Uniform,CDF,32,SCALAR,2.383000 +Uniform,CDF,32,VECTORIZED,0.241000 +Uniform,CDF,32,PARALLEL,0.246000 +Uniform,CDF,32,WORK_STEALING,0.205000 +Uniform,PDF,64,SCALAR,5.058000 +Uniform,PDF,64,VECTORIZED,0.212000 +Uniform,PDF,64,PARALLEL,0.332000 +Uniform,PDF,64,WORK_STEALING,0.200000 +Uniform,LogPDF,64,SCALAR,4.906000 +Uniform,LogPDF,64,VECTORIZED,0.241000 +Uniform,LogPDF,64,PARALLEL,0.298000 +Uniform,LogPDF,64,WORK_STEALING,0.238000 +Uniform,CDF,64,SCALAR,5.045000 +Uniform,CDF,64,VECTORIZED,0.347000 +Uniform,CDF,64,PARALLEL,0.342000 +Uniform,CDF,64,WORK_STEALING,0.253000 +Uniform,PDF,128,SCALAR,9.892000 +Uniform,PDF,128,VECTORIZED,0.241000 +Uniform,PDF,128,PARALLEL,0.481000 +Uniform,PDF,128,WORK_STEALING,0.292000 +Uniform,LogPDF,128,SCALAR,9.950000 +Uniform,LogPDF,128,VECTORIZED,0.273000 +Uniform,LogPDF,128,PARALLEL,0.425000 +Uniform,LogPDF,128,WORK_STEALING,0.292000 +Uniform,CDF,128,SCALAR,10.130000 +Uniform,CDF,128,VECTORIZED,0.420000 +Uniform,CDF,128,PARALLEL,0.452000 +Uniform,CDF,128,WORK_STEALING,0.321000 +Uniform,PDF,256,SCALAR,19.183000 +Uniform,PDF,256,VECTORIZED,0.298000 +Uniform,PDF,256,PARALLEL,0.728000 +Uniform,PDF,256,WORK_STEALING,0.211000 +Uniform,LogPDF,256,SCALAR,18.952000 +Uniform,LogPDF,256,VECTORIZED,0.340000 +Uniform,LogPDF,256,PARALLEL,0.597000 +Uniform,LogPDF,256,WORK_STEALING,0.309000 +Uniform,CDF,256,SCALAR,18.843000 +Uniform,CDF,256,VECTORIZED,0.464000 +Uniform,CDF,256,PARALLEL,0.499000 +Uniform,CDF,256,WORK_STEALING,0.333000 +Uniform,PDF,512,SCALAR,37.051000 +Uniform,PDF,512,VECTORIZED,0.302000 +Uniform,PDF,512,PARALLEL,0.825000 
+Uniform,PDF,512,WORK_STEALING,0.486000 +Uniform,LogPDF,512,SCALAR,37.011000 +Uniform,LogPDF,512,VECTORIZED,0.391000 +Uniform,LogPDF,512,PARALLEL,0.992000 +Uniform,LogPDF,512,WORK_STEALING,0.477000 +Uniform,CDF,512,SCALAR,38.476000 +Uniform,CDF,512,VECTORIZED,0.959000 +Uniform,CDF,512,PARALLEL,1.286000 +Uniform,CDF,512,WORK_STEALING,0.836000 +Uniform,PDF,1000,SCALAR,73.276000 +Uniform,PDF,1000,VECTORIZED,0.716000 +Uniform,PDF,1000,PARALLEL,2.726000 +Uniform,PDF,1000,WORK_STEALING,0.819000 +Uniform,LogPDF,1000,SCALAR,69.663000 +Uniform,LogPDF,1000,VECTORIZED,0.532000 +Uniform,LogPDF,1000,PARALLEL,2.366000 +Uniform,LogPDF,1000,WORK_STEALING,0.482000 +Uniform,CDF,1000,SCALAR,70.979000 +Uniform,CDF,1000,VECTORIZED,1.360000 +Uniform,CDF,1000,PARALLEL,1.709000 +Uniform,CDF,1000,WORK_STEALING,1.407000 +Uniform,PDF,2000,SCALAR,136.502000 +Uniform,PDF,2000,VECTORIZED,1.028000 +Uniform,PDF,2000,PARALLEL,4.703000 +Uniform,PDF,2000,WORK_STEALING,1.404000 +Uniform,LogPDF,2000,SCALAR,143.565000 +Uniform,LogPDF,2000,VECTORIZED,1.435000 +Uniform,LogPDF,2000,PARALLEL,7.281000 +Uniform,LogPDF,2000,WORK_STEALING,0.831000 +Uniform,CDF,2000,SCALAR,132.978000 +Uniform,CDF,2000,VECTORIZED,4.410000 +Uniform,CDF,2000,PARALLEL,5.136000 +Uniform,CDF,2000,WORK_STEALING,2.065000 +Uniform,PDF,5000,SCALAR,310.142000 +Uniform,PDF,5000,VECTORIZED,1.885000 +Uniform,PDF,5000,PARALLEL,86.464000 +Uniform,PDF,5000,WORK_STEALING,40.897000 +Uniform,LogPDF,5000,SCALAR,342.250000 +Uniform,LogPDF,5000,VECTORIZED,2.971000 +Uniform,LogPDF,5000,PARALLEL,65.507000 +Uniform,LogPDF,5000,WORK_STEALING,29.348000 +Uniform,CDF,5000,SCALAR,267.070000 +Uniform,CDF,5000,VECTORIZED,11.519000 +Uniform,CDF,5000,PARALLEL,55.612000 +Uniform,CDF,5000,WORK_STEALING,25.634000 +Uniform,PDF,10000,SCALAR,524.805000 +Uniform,PDF,10000,VECTORIZED,3.427000 +Uniform,PDF,10000,PARALLEL,102.317000 +Uniform,PDF,10000,WORK_STEALING,27.833000 +Uniform,LogPDF,10000,SCALAR,542.895000 +Uniform,LogPDF,10000,VECTORIZED,5.925000 
+Uniform,LogPDF,10000,PARALLEL,101.508000 +Uniform,LogPDF,10000,WORK_STEALING,34.562000 +Uniform,CDF,10000,SCALAR,565.762000 +Uniform,CDF,10000,VECTORIZED,35.327000 +Uniform,CDF,10000,PARALLEL,99.462000 +Uniform,CDF,10000,WORK_STEALING,37.128000 +Uniform,PDF,20000,SCALAR,1175.601000 +Uniform,PDF,20000,VECTORIZED,10.867000 +Uniform,PDF,20000,PARALLEL,97.441000 +Uniform,PDF,20000,WORK_STEALING,37.857000 +Uniform,LogPDF,20000,SCALAR,1298.033000 +Uniform,LogPDF,20000,VECTORIZED,8.467000 +Uniform,LogPDF,20000,PARALLEL,127.296000 +Uniform,LogPDF,20000,WORK_STEALING,46.498000 +Uniform,CDF,20000,SCALAR,1444.327000 +Uniform,CDF,20000,VECTORIZED,125.525000 +Uniform,CDF,20000,PARALLEL,94.793000 +Uniform,CDF,20000,WORK_STEALING,47.649000 +Uniform,PDF,50000,SCALAR,3442.679000 +Uniform,PDF,50000,VECTORIZED,22.121000 +Uniform,PDF,50000,PARALLEL,162.659000 +Uniform,PDF,50000,WORK_STEALING,91.225000 +Uniform,LogPDF,50000,SCALAR,3669.920000 +Uniform,LogPDF,50000,VECTORIZED,34.774000 +Uniform,LogPDF,50000,PARALLEL,169.816000 +Uniform,LogPDF,50000,WORK_STEALING,88.386000 +Uniform,CDF,50000,SCALAR,3404.270000 +Uniform,CDF,50000,VECTORIZED,318.808000 +Uniform,CDF,50000,PARALLEL,172.610000 +Uniform,CDF,50000,WORK_STEALING,95.622000 +Uniform,PDF,100000,SCALAR,6797.237000 +Uniform,PDF,100000,VECTORIZED,67.679000 +Uniform,PDF,100000,PARALLEL,316.057000 +Uniform,PDF,100000,WORK_STEALING,254.240000 +Uniform,LogPDF,100000,SCALAR,6691.124000 +Uniform,LogPDF,100000,VECTORIZED,69.231000 +Uniform,LogPDF,100000,PARALLEL,285.360000 +Uniform,LogPDF,100000,WORK_STEALING,178.265000 +Uniform,CDF,100000,SCALAR,7417.152000 +Uniform,CDF,100000,VECTORIZED,674.531000 +Uniform,CDF,100000,PARALLEL,303.839000 +Uniform,CDF,100000,WORK_STEALING,184.461000 +Uniform,PDF,250000,SCALAR,17564.354000 +Uniform,PDF,250000,VECTORIZED,209.306000 +Uniform,PDF,250000,PARALLEL,665.071000 +Uniform,PDF,250000,WORK_STEALING,399.494000 +Uniform,LogPDF,250000,SCALAR,17324.342000 +Uniform,LogPDF,250000,VECTORIZED,182.872000 
+Uniform,LogPDF,250000,PARALLEL,650.693000 +Uniform,LogPDF,250000,WORK_STEALING,390.194000 +Uniform,CDF,250000,SCALAR,18132.131000 +Uniform,CDF,250000,VECTORIZED,1700.726000 +Uniform,CDF,250000,PARALLEL,643.514000 +Uniform,CDF,250000,WORK_STEALING,415.733000 +Uniform,PDF,500000,SCALAR,35403.341000 +Uniform,PDF,500000,VECTORIZED,402.518000 +Uniform,PDF,500000,PARALLEL,1242.904000 +Uniform,PDF,500000,WORK_STEALING,726.928000 +Uniform,LogPDF,500000,SCALAR,35235.903000 +Uniform,LogPDF,500000,VECTORIZED,486.830000 +Uniform,LogPDF,500000,PARALLEL,1198.774000 +Uniform,LogPDF,500000,WORK_STEALING,869.824000 +Uniform,CDF,500000,SCALAR,40287.186000 +Uniform,CDF,500000,VECTORIZED,4205.631000 +Uniform,CDF,500000,PARALLEL,1256.224000 +Uniform,CDF,500000,WORK_STEALING,1301.832000 +Gaussian,PDF,8,SCALAR,0.742000 +Gaussian,PDF,8,VECTORIZED,0.320000 +Gaussian,PDF,8,PARALLEL,0.258000 +Gaussian,PDF,8,WORK_STEALING,0.289000 +Gaussian,LogPDF,8,SCALAR,0.760000 +Gaussian,LogPDF,8,VECTORIZED,0.296000 +Gaussian,LogPDF,8,PARALLEL,0.186000 +Gaussian,LogPDF,8,WORK_STEALING,0.199000 +Gaussian,CDF,8,SCALAR,1.158000 +Gaussian,CDF,8,VECTORIZED,0.437000 +Gaussian,CDF,8,PARALLEL,0.539000 +Gaussian,CDF,8,WORK_STEALING,0.565000 +Gaussian,PDF,16,SCALAR,1.594000 +Gaussian,PDF,16,VECTORIZED,0.446000 +Gaussian,PDF,16,PARALLEL,0.404000 +Gaussian,PDF,16,WORK_STEALING,0.407000 +Gaussian,LogPDF,16,SCALAR,1.425000 +Gaussian,LogPDF,16,VECTORIZED,0.302000 +Gaussian,LogPDF,16,PARALLEL,0.210000 +Gaussian,LogPDF,16,WORK_STEALING,0.209000 +Gaussian,CDF,16,SCALAR,2.178000 +Gaussian,CDF,16,VECTORIZED,0.530000 +Gaussian,CDF,16,PARALLEL,1.029000 +Gaussian,CDF,16,WORK_STEALING,1.024000 +Gaussian,PDF,32,SCALAR,3.149000 +Gaussian,PDF,32,VECTORIZED,0.523000 +Gaussian,PDF,32,PARALLEL,0.620000 +Gaussian,PDF,32,WORK_STEALING,0.624000 +Gaussian,LogPDF,32,SCALAR,2.650000 +Gaussian,LogPDF,32,VECTORIZED,0.325000 +Gaussian,LogPDF,32,PARALLEL,0.208000 +Gaussian,LogPDF,32,WORK_STEALING,0.193000 +Gaussian,CDF,32,SCALAR,4.201000 
+Gaussian,CDF,32,VECTORIZED,0.708000 +Gaussian,CDF,32,PARALLEL,1.789000 +Gaussian,CDF,32,WORK_STEALING,1.746000 +Gaussian,PDF,64,SCALAR,5.975000 +Gaussian,PDF,64,VECTORIZED,0.690000 +Gaussian,PDF,64,PARALLEL,1.013000 +Gaussian,PDF,64,WORK_STEALING,0.984000 +Gaussian,LogPDF,64,SCALAR,4.634000 +Gaussian,LogPDF,64,VECTORIZED,0.288000 +Gaussian,LogPDF,64,PARALLEL,0.192000 +Gaussian,LogPDF,64,WORK_STEALING,0.206000 +Gaussian,CDF,64,SCALAR,7.788000 +Gaussian,CDF,64,VECTORIZED,0.971000 +Gaussian,CDF,64,PARALLEL,3.365000 +Gaussian,CDF,64,WORK_STEALING,3.365000 +Gaussian,PDF,128,SCALAR,10.972000 +Gaussian,PDF,128,VECTORIZED,1.039000 +Gaussian,PDF,128,PARALLEL,1.806000 +Gaussian,PDF,128,WORK_STEALING,1.776000 +Gaussian,LogPDF,128,SCALAR,10.848000 +Gaussian,LogPDF,128,VECTORIZED,0.366000 +Gaussian,LogPDF,128,PARALLEL,0.218000 +Gaussian,LogPDF,128,WORK_STEALING,0.227000 +Gaussian,CDF,128,SCALAR,16.207000 +Gaussian,CDF,128,VECTORIZED,1.759000 +Gaussian,CDF,128,PARALLEL,6.510000 +Gaussian,CDF,128,WORK_STEALING,6.483000 +Gaussian,PDF,256,SCALAR,23.298000 +Gaussian,PDF,256,VECTORIZED,1.786000 +Gaussian,PDF,256,PARALLEL,3.439000 +Gaussian,PDF,256,WORK_STEALING,3.465000 +Gaussian,LogPDF,256,SCALAR,18.148000 +Gaussian,LogPDF,256,VECTORIZED,0.363000 +Gaussian,LogPDF,256,PARALLEL,0.293000 +Gaussian,LogPDF,256,WORK_STEALING,0.276000 +Gaussian,CDF,256,SCALAR,30.760000 +Gaussian,CDF,256,VECTORIZED,3.078000 +Gaussian,CDF,256,PARALLEL,12.791000 +Gaussian,CDF,256,WORK_STEALING,12.795000 +Gaussian,PDF,512,SCALAR,44.467000 +Gaussian,PDF,512,VECTORIZED,3.234000 +Gaussian,PDF,512,PARALLEL,6.717000 +Gaussian,PDF,512,WORK_STEALING,6.615000 +Gaussian,LogPDF,512,SCALAR,39.604000 +Gaussian,LogPDF,512,VECTORIZED,0.611000 +Gaussian,LogPDF,512,PARALLEL,0.504000 +Gaussian,LogPDF,512,WORK_STEALING,0.476000 +Gaussian,CDF,512,SCALAR,61.673000 +Gaussian,CDF,512,VECTORIZED,5.875000 +Gaussian,CDF,512,PARALLEL,25.486000 +Gaussian,CDF,512,WORK_STEALING,26.234000 +Gaussian,PDF,1000,SCALAR,91.826000 
+Gaussian,PDF,1000,VECTORIZED,5.857000 +Gaussian,PDF,1000,PARALLEL,12.700000 +Gaussian,PDF,1000,WORK_STEALING,12.708000 +Gaussian,LogPDF,1000,SCALAR,102.293000 +Gaussian,LogPDF,1000,VECTORIZED,0.768000 +Gaussian,LogPDF,1000,PARALLEL,0.705000 +Gaussian,LogPDF,1000,WORK_STEALING,0.648000 +Gaussian,CDF,1000,SCALAR,120.232000 +Gaussian,CDF,1000,VECTORIZED,11.177000 +Gaussian,CDF,1000,PARALLEL,52.796000 +Gaussian,CDF,1000,WORK_STEALING,46.074000 +Gaussian,PDF,2000,SCALAR,183.337000 +Gaussian,PDF,2000,VECTORIZED,11.238000 +Gaussian,PDF,2000,PARALLEL,24.677000 +Gaussian,PDF,2000,WORK_STEALING,24.435000 +Gaussian,LogPDF,2000,SCALAR,135.640000 +Gaussian,LogPDF,2000,VECTORIZED,1.403000 +Gaussian,LogPDF,2000,PARALLEL,1.155000 +Gaussian,LogPDF,2000,WORK_STEALING,1.144000 +Gaussian,CDF,2000,SCALAR,222.519000 +Gaussian,CDF,2000,VECTORIZED,20.928000 +Gaussian,CDF,2000,PARALLEL,91.518000 +Gaussian,CDF,2000,WORK_STEALING,91.398000 +Gaussian,PDF,5000,SCALAR,429.949000 +Gaussian,PDF,5000,VECTORIZED,25.877000 +Gaussian,PDF,5000,PARALLEL,245.721000 +Gaussian,PDF,5000,WORK_STEALING,84.992000 +Gaussian,LogPDF,5000,SCALAR,346.132000 +Gaussian,LogPDF,5000,VECTORIZED,3.462000 +Gaussian,LogPDF,5000,PARALLEL,148.527000 +Gaussian,LogPDF,5000,WORK_STEALING,74.093000 +Gaussian,CDF,5000,SCALAR,577.889000 +Gaussian,CDF,5000,VECTORIZED,51.718000 +Gaussian,CDF,5000,PARALLEL,169.644000 +Gaussian,CDF,5000,WORK_STEALING,157.374000 +Gaussian,PDF,10000,SCALAR,839.592000 +Gaussian,PDF,10000,VECTORIZED,56.080000 +Gaussian,PDF,10000,PARALLEL,282.972000 +Gaussian,PDF,10000,WORK_STEALING,143.620000 +Gaussian,LogPDF,10000,SCALAR,714.087000 +Gaussian,LogPDF,10000,VECTORIZED,7.148000 +Gaussian,LogPDF,10000,PARALLEL,244.197000 +Gaussian,LogPDF,10000,WORK_STEALING,93.418000 +Gaussian,CDF,10000,SCALAR,1179.511000 +Gaussian,CDF,10000,VECTORIZED,107.983000 +Gaussian,CDF,10000,PARALLEL,260.382000 +Gaussian,CDF,10000,WORK_STEALING,216.371000 +Gaussian,PDF,20000,SCALAR,1704.838000 
+Gaussian,PDF,20000,VECTORIZED,113.836000 +Gaussian,PDF,20000,PARALLEL,266.594000 +Gaussian,PDF,20000,WORK_STEALING,176.778000 +Gaussian,LogPDF,20000,SCALAR,1398.580000 +Gaussian,LogPDF,20000,VECTORIZED,18.392000 +Gaussian,LogPDF,20000,PARALLEL,328.101000 +Gaussian,LogPDF,20000,WORK_STEALING,115.777000 +Gaussian,CDF,20000,SCALAR,2380.148000 +Gaussian,CDF,20000,VECTORIZED,217.735000 +Gaussian,CDF,20000,PARALLEL,347.242000 +Gaussian,CDF,20000,WORK_STEALING,255.036000 +Gaussian,PDF,50000,SCALAR,4177.111000 +Gaussian,PDF,50000,VECTORIZED,300.454000 +Gaussian,PDF,50000,PARALLEL,312.087000 +Gaussian,PDF,50000,WORK_STEALING,212.752000 +Gaussian,LogPDF,50000,SCALAR,3310.173000 +Gaussian,LogPDF,50000,VECTORIZED,56.415000 +Gaussian,LogPDF,50000,PARALLEL,264.130000 +Gaussian,LogPDF,50000,WORK_STEALING,92.318000 +Gaussian,CDF,50000,SCALAR,5834.679000 +Gaussian,CDF,50000,VECTORIZED,557.444000 +Gaussian,CDF,50000,PARALLEL,805.217000 +Gaussian,CDF,50000,WORK_STEALING,505.507000 +Gaussian,PDF,100000,SCALAR,8144.983000 +Gaussian,PDF,100000,VECTORIZED,561.352000 +Gaussian,PDF,100000,PARALLEL,410.351000 +Gaussian,PDF,100000,WORK_STEALING,482.314000 +Gaussian,LogPDF,100000,SCALAR,6854.059000 +Gaussian,LogPDF,100000,VECTORIZED,110.790000 +Gaussian,LogPDF,100000,PARALLEL,278.376000 +Gaussian,LogPDF,100000,WORK_STEALING,115.489000 +Gaussian,CDF,100000,SCALAR,11530.301000 +Gaussian,CDF,100000,VECTORIZED,1111.757000 +Gaussian,CDF,100000,PARALLEL,1494.950000 +Gaussian,CDF,100000,WORK_STEALING,966.260000 +Gaussian,PDF,250000,SCALAR,20566.641000 +Gaussian,PDF,250000,VECTORIZED,1598.335000 +Gaussian,PDF,250000,PARALLEL,918.029000 +Gaussian,PDF,250000,WORK_STEALING,609.792000 +Gaussian,LogPDF,250000,SCALAR,17031.612000 +Gaussian,LogPDF,250000,VECTORIZED,280.485000 +Gaussian,LogPDF,250000,PARALLEL,170.543000 +Gaussian,LogPDF,250000,WORK_STEALING,139.309000 +Gaussian,CDF,250000,SCALAR,29237.648000 +Gaussian,CDF,250000,VECTORIZED,2751.360000 +Gaussian,CDF,250000,PARALLEL,3564.810000 
+Gaussian,CDF,250000,WORK_STEALING,2009.540000 +Gaussian,PDF,500000,SCALAR,41996.946000 +Gaussian,PDF,500000,VECTORIZED,3183.894000 +Gaussian,PDF,500000,PARALLEL,1849.569000 +Gaussian,PDF,500000,WORK_STEALING,1388.958000 +Gaussian,LogPDF,500000,SCALAR,34141.993000 +Gaussian,LogPDF,500000,VECTORIZED,738.922000 +Gaussian,LogPDF,500000,PARALLEL,288.414000 +Gaussian,LogPDF,500000,WORK_STEALING,266.186000 +Gaussian,CDF,500000,SCALAR,56470.298000 +Gaussian,CDF,500000,VECTORIZED,5773.979000 +Gaussian,CDF,500000,PARALLEL,6899.204000 +Gaussian,CDF,500000,WORK_STEALING,4203.526000 +Exponential,PDF,8,SCALAR,0.736000 +Exponential,PDF,8,VECTORIZED,0.301000 +Exponential,PDF,8,PARALLEL,0.245000 +Exponential,PDF,8,WORK_STEALING,0.275000 +Exponential,LogPDF,8,SCALAR,0.573000 +Exponential,LogPDF,8,VECTORIZED,0.197000 +Exponential,LogPDF,8,PARALLEL,0.175000 +Exponential,LogPDF,8,WORK_STEALING,0.170000 +Exponential,CDF,8,SCALAR,0.710000 +Exponential,CDF,8,VECTORIZED,0.338000 +Exponential,CDF,8,PARALLEL,0.257000 +Exponential,CDF,8,WORK_STEALING,0.250000 +Exponential,PDF,16,SCALAR,1.360000 +Exponential,PDF,16,VECTORIZED,0.348000 +Exponential,PDF,16,PARALLEL,0.356000 +Exponential,PDF,16,WORK_STEALING,0.353000 +Exponential,LogPDF,16,SCALAR,1.219000 +Exponential,LogPDF,16,VECTORIZED,0.216000 +Exponential,LogPDF,16,PARALLEL,0.192000 +Exponential,LogPDF,16,WORK_STEALING,0.189000 +Exponential,CDF,16,SCALAR,1.399000 +Exponential,CDF,16,VECTORIZED,0.374000 +Exponential,CDF,16,PARALLEL,0.351000 +Exponential,CDF,16,WORK_STEALING,0.339000 +Exponential,PDF,32,SCALAR,2.734000 +Exponential,PDF,32,VECTORIZED,0.420000 +Exponential,PDF,32,PARALLEL,0.565000 +Exponential,PDF,32,WORK_STEALING,0.537000 +Exponential,LogPDF,32,SCALAR,2.286000 +Exponential,LogPDF,32,VECTORIZED,0.244000 +Exponential,LogPDF,32,PARALLEL,0.207000 +Exponential,LogPDF,32,WORK_STEALING,0.168000 +Exponential,CDF,32,SCALAR,2.595000 +Exponential,CDF,32,VECTORIZED,0.495000 +Exponential,CDF,32,PARALLEL,0.585000 
+Exponential,CDF,32,WORK_STEALING,0.554000 +Exponential,PDF,64,SCALAR,5.115000 +Exponential,PDF,64,VECTORIZED,0.640000 +Exponential,PDF,64,PARALLEL,0.944000 +Exponential,PDF,64,WORK_STEALING,0.896000 +Exponential,LogPDF,64,SCALAR,4.315000 +Exponential,LogPDF,64,VECTORIZED,0.300000 +Exponential,LogPDF,64,PARALLEL,0.272000 +Exponential,LogPDF,64,WORK_STEALING,0.192000 +Exponential,CDF,64,SCALAR,5.119000 +Exponential,CDF,64,VECTORIZED,0.663000 +Exponential,CDF,64,PARALLEL,0.983000 +Exponential,CDF,64,WORK_STEALING,0.955000 +Exponential,PDF,128,SCALAR,9.970000 +Exponential,PDF,128,VECTORIZED,1.019000 +Exponential,PDF,128,PARALLEL,1.713000 +Exponential,PDF,128,WORK_STEALING,1.659000 +Exponential,LogPDF,128,SCALAR,8.531000 +Exponential,LogPDF,128,VECTORIZED,0.369000 +Exponential,LogPDF,128,PARALLEL,0.377000 +Exponential,LogPDF,128,WORK_STEALING,0.201000 +Exponential,CDF,128,SCALAR,10.315000 +Exponential,CDF,128,VECTORIZED,1.025000 +Exponential,CDF,128,PARALLEL,1.814000 +Exponential,CDF,128,WORK_STEALING,1.722000 +Exponential,PDF,256,SCALAR,20.059000 +Exponential,PDF,256,VECTORIZED,1.776000 +Exponential,PDF,256,PARALLEL,3.228000 +Exponential,PDF,256,WORK_STEALING,3.089000 +Exponential,LogPDF,256,SCALAR,17.091000 +Exponential,LogPDF,256,VECTORIZED,0.431000 +Exponential,LogPDF,256,PARALLEL,0.619000 +Exponential,LogPDF,256,WORK_STEALING,0.311000 +Exponential,CDF,256,SCALAR,20.140000 +Exponential,CDF,256,VECTORIZED,1.853000 +Exponential,CDF,256,PARALLEL,3.471000 +Exponential,CDF,256,WORK_STEALING,3.291000 +Exponential,PDF,512,SCALAR,42.323000 +Exponential,PDF,512,VECTORIZED,3.321000 +Exponential,PDF,512,PARALLEL,6.378000 +Exponential,PDF,512,WORK_STEALING,6.081000 +Exponential,LogPDF,512,SCALAR,34.446000 +Exponential,LogPDF,512,VECTORIZED,0.759000 +Exponential,LogPDF,512,PARALLEL,0.967000 +Exponential,LogPDF,512,WORK_STEALING,0.433000 +Exponential,CDF,512,SCALAR,40.633000 +Exponential,CDF,512,VECTORIZED,3.306000 +Exponential,CDF,512,PARALLEL,7.042000 
+Exponential,CDF,512,WORK_STEALING,6.556000 +Exponential,PDF,1000,SCALAR,82.407000 +Exponential,PDF,1000,VECTORIZED,6.016000 +Exponential,PDF,1000,PARALLEL,12.475000 +Exponential,PDF,1000,WORK_STEALING,11.623000 +Exponential,LogPDF,1000,SCALAR,62.306000 +Exponential,LogPDF,1000,VECTORIZED,1.385000 +Exponential,LogPDF,1000,PARALLEL,2.043000 +Exponential,LogPDF,1000,WORK_STEALING,0.800000 +Exponential,CDF,1000,SCALAR,78.052000 +Exponential,CDF,1000,VECTORIZED,6.327000 +Exponential,CDF,1000,PARALLEL,12.891000 +Exponential,CDF,1000,WORK_STEALING,12.205000 +Exponential,PDF,2000,SCALAR,156.308000 +Exponential,PDF,2000,VECTORIZED,27.686000 +Exponential,PDF,2000,PARALLEL,24.043000 +Exponential,PDF,2000,WORK_STEALING,23.077000 +Exponential,LogPDF,2000,SCALAR,140.580000 +Exponential,LogPDF,2000,VECTORIZED,3.779000 +Exponential,LogPDF,2000,PARALLEL,3.197000 +Exponential,LogPDF,2000,WORK_STEALING,1.232000 +Exponential,CDF,2000,SCALAR,156.470000 +Exponential,CDF,2000,VECTORIZED,12.292000 +Exponential,CDF,2000,PARALLEL,25.690000 +Exponential,CDF,2000,WORK_STEALING,24.200000 +Exponential,PDF,5000,SCALAR,388.021000 +Exponential,PDF,5000,VECTORIZED,29.165000 +Exponential,PDF,5000,PARALLEL,259.995000 +Exponential,PDF,5000,WORK_STEALING,125.122000 +Exponential,LogPDF,5000,SCALAR,347.104000 +Exponential,LogPDF,5000,VECTORIZED,6.479000 +Exponential,LogPDF,5000,PARALLEL,202.112000 +Exponential,LogPDF,5000,WORK_STEALING,80.077000 +Exponential,CDF,5000,SCALAR,410.463000 +Exponential,CDF,5000,VECTORIZED,49.228000 +Exponential,CDF,5000,PARALLEL,182.316000 +Exponential,CDF,5000,WORK_STEALING,142.411000 +Exponential,PDF,10000,SCALAR,850.010000 +Exponential,PDF,10000,VECTORIZED,66.421000 +Exponential,PDF,10000,PARALLEL,383.466000 +Exponential,PDF,10000,WORK_STEALING,167.617000 +Exponential,LogPDF,10000,SCALAR,658.541000 +Exponential,LogPDF,10000,VECTORIZED,13.145000 +Exponential,LogPDF,10000,PARALLEL,289.022000 +Exponential,LogPDF,10000,WORK_STEALING,81.599000 
+Exponential,CDF,10000,SCALAR,833.162000 +Exponential,CDF,10000,VECTORIZED,61.375000 +Exponential,CDF,10000,PARALLEL,281.587000 +Exponential,CDF,10000,WORK_STEALING,135.943000 +Exponential,PDF,20000,SCALAR,1597.880000 +Exponential,PDF,20000,VECTORIZED,118.992000 +Exponential,PDF,20000,PARALLEL,238.638000 +Exponential,PDF,20000,WORK_STEALING,159.238000 +Exponential,LogPDF,20000,SCALAR,1391.531000 +Exponential,LogPDF,20000,VECTORIZED,31.447000 +Exponential,LogPDF,20000,PARALLEL,356.709000 +Exponential,LogPDF,20000,WORK_STEALING,108.806000 +Exponential,CDF,20000,SCALAR,1631.022000 +Exponential,CDF,20000,VECTORIZED,124.014000 +Exponential,CDF,20000,PARALLEL,328.021000 +Exponential,CDF,20000,WORK_STEALING,257.273000 +Exponential,PDF,50000,SCALAR,4130.232000 +Exponential,PDF,50000,VECTORIZED,314.749000 +Exponential,PDF,50000,PARALLEL,337.457000 +Exponential,PDF,50000,WORK_STEALING,283.562000 +Exponential,LogPDF,50000,SCALAR,3479.425000 +Exponential,LogPDF,50000,VECTORIZED,78.644000 +Exponential,LogPDF,50000,PARALLEL,211.857000 +Exponential,LogPDF,50000,WORK_STEALING,128.338000 +Exponential,CDF,50000,SCALAR,4066.975000 +Exponential,CDF,50000,VECTORIZED,324.066000 +Exponential,CDF,50000,PARALLEL,299.953000 +Exponential,CDF,50000,WORK_STEALING,240.325000 +Exponential,PDF,100000,SCALAR,7979.458000 +Exponential,PDF,100000,VECTORIZED,638.386000 +Exponential,PDF,100000,PARALLEL,476.964000 +Exponential,PDF,100000,WORK_STEALING,318.483000 +Exponential,LogPDF,100000,SCALAR,7038.834000 +Exponential,LogPDF,100000,VECTORIZED,153.344000 +Exponential,LogPDF,100000,PARALLEL,324.280000 +Exponential,LogPDF,100000,WORK_STEALING,183.315000 +Exponential,CDF,100000,SCALAR,7974.139000 +Exponential,CDF,100000,VECTORIZED,650.800000 +Exponential,CDF,100000,PARALLEL,480.839000 +Exponential,CDF,100000,WORK_STEALING,365.802000 +Exponential,PDF,250000,SCALAR,19971.144000 +Exponential,PDF,250000,VECTORIZED,1686.018000 +Exponential,PDF,250000,PARALLEL,927.285000 
+Exponential,PDF,250000,WORK_STEALING,574.188000 +Exponential,LogPDF,250000,SCALAR,17279.270000 +Exponential,LogPDF,250000,VECTORIZED,471.816000 +Exponential,LogPDF,250000,PARALLEL,331.513000 +Exponential,LogPDF,250000,WORK_STEALING,240.194000 +Exponential,CDF,250000,SCALAR,20058.659000 +Exponential,CDF,250000,VECTORIZED,1723.075000 +Exponential,CDF,250000,PARALLEL,999.024000 +Exponential,CDF,250000,WORK_STEALING,871.128000 +Exponential,PDF,500000,SCALAR,40598.687000 +Exponential,PDF,500000,VECTORIZED,4467.374000 +Exponential,PDF,500000,PARALLEL,2107.483000 +Exponential,PDF,500000,WORK_STEALING,2223.893000 +Exponential,LogPDF,500000,SCALAR,37426.234000 +Exponential,LogPDF,500000,VECTORIZED,1267.269000 +Exponential,LogPDF,500000,PARALLEL,615.873000 +Exponential,LogPDF,500000,WORK_STEALING,478.612000 +Exponential,CDF,500000,SCALAR,41850.468000 +Exponential,CDF,500000,VECTORIZED,3870.036000 +Exponential,CDF,500000,PARALLEL,2043.357000 +Exponential,CDF,500000,WORK_STEALING,1359.510000 +Discrete,PDF,8,SCALAR,0.659000 +Discrete,PDF,8,VECTORIZED,0.189000 +Discrete,PDF,8,PARALLEL,0.222000 +Discrete,PDF,8,WORK_STEALING,0.231000 +Discrete,LogPDF,8,SCALAR,0.757000 +Discrete,LogPDF,8,VECTORIZED,0.203000 +Discrete,LogPDF,8,PARALLEL,0.217000 +Discrete,LogPDF,8,WORK_STEALING,0.218000 +Discrete,CDF,8,SCALAR,0.585000 +Discrete,CDF,8,VECTORIZED,0.195000 +Discrete,CDF,8,PARALLEL,0.204000 +Discrete,CDF,8,WORK_STEALING,0.227000 +Discrete,PDF,16,SCALAR,1.386000 +Discrete,PDF,16,VECTORIZED,0.263000 +Discrete,PDF,16,PARALLEL,0.260000 +Discrete,PDF,16,WORK_STEALING,0.292000 +Discrete,LogPDF,16,SCALAR,1.263000 +Discrete,LogPDF,16,VECTORIZED,0.296000 +Discrete,LogPDF,16,PARALLEL,0.294000 +Discrete,LogPDF,16,WORK_STEALING,0.281000 +Discrete,CDF,16,SCALAR,1.090000 +Discrete,CDF,16,VECTORIZED,0.244000 +Discrete,CDF,16,PARALLEL,0.242000 +Discrete,CDF,16,WORK_STEALING,0.264000 +Discrete,PDF,32,SCALAR,2.673000 +Discrete,PDF,32,VECTORIZED,0.320000 +Discrete,PDF,32,PARALLEL,0.368000 
+Discrete,PDF,32,WORK_STEALING,0.415000 +Discrete,LogPDF,32,SCALAR,2.701000 +Discrete,LogPDF,32,VECTORIZED,0.366000 +Discrete,LogPDF,32,PARALLEL,0.434000 +Discrete,LogPDF,32,WORK_STEALING,0.388000 +Discrete,CDF,32,SCALAR,2.584000 +Discrete,CDF,32,VECTORIZED,0.357000 +Discrete,CDF,32,PARALLEL,0.369000 +Discrete,CDF,32,WORK_STEALING,0.338000 +Discrete,PDF,64,SCALAR,4.594000 +Discrete,PDF,64,VECTORIZED,0.430000 +Discrete,PDF,64,PARALLEL,0.521000 +Discrete,PDF,64,WORK_STEALING,0.746000 +Discrete,LogPDF,64,SCALAR,5.345000 +Discrete,LogPDF,64,VECTORIZED,0.637000 +Discrete,LogPDF,64,PARALLEL,0.687000 +Discrete,LogPDF,64,WORK_STEALING,0.650000 +Discrete,CDF,64,SCALAR,4.413000 +Discrete,CDF,64,VECTORIZED,0.519000 +Discrete,CDF,64,PARALLEL,0.538000 +Discrete,CDF,64,WORK_STEALING,0.488000 +Discrete,PDF,128,SCALAR,9.654000 +Discrete,PDF,128,VECTORIZED,0.683000 +Discrete,PDF,128,PARALLEL,0.830000 +Discrete,PDF,128,WORK_STEALING,1.117000 +Discrete,LogPDF,128,SCALAR,10.468000 +Discrete,LogPDF,128,VECTORIZED,0.858000 +Discrete,LogPDF,128,PARALLEL,1.005000 +Discrete,LogPDF,128,WORK_STEALING,0.673000 +Discrete,CDF,128,SCALAR,9.560000 +Discrete,CDF,128,VECTORIZED,0.832000 +Discrete,CDF,128,PARALLEL,0.844000 +Discrete,CDF,128,WORK_STEALING,0.709000 +Discrete,PDF,256,SCALAR,21.218000 +Discrete,PDF,256,VECTORIZED,1.059000 +Discrete,PDF,256,PARALLEL,1.354000 +Discrete,PDF,256,WORK_STEALING,1.828000 +Discrete,LogPDF,256,SCALAR,18.000000 +Discrete,LogPDF,256,VECTORIZED,1.491000 +Discrete,LogPDF,256,PARALLEL,1.587000 +Discrete,LogPDF,256,WORK_STEALING,1.405000 +Discrete,CDF,256,SCALAR,16.974000 +Discrete,CDF,256,VECTORIZED,1.389000 +Discrete,CDF,256,PARALLEL,1.579000 +Discrete,CDF,256,WORK_STEALING,1.381000 +Discrete,PDF,512,SCALAR,35.556000 +Discrete,PDF,512,VECTORIZED,2.035000 +Discrete,PDF,512,PARALLEL,2.591000 +Discrete,PDF,512,WORK_STEALING,4.195000 +Discrete,LogPDF,512,SCALAR,35.529000 +Discrete,LogPDF,512,VECTORIZED,2.884000 +Discrete,LogPDF,512,PARALLEL,3.041000 
+Discrete,LogPDF,512,WORK_STEALING,2.724000 +Discrete,CDF,512,SCALAR,32.375000 +Discrete,CDF,512,VECTORIZED,1.996000 +Discrete,CDF,512,PARALLEL,2.582000 +Discrete,CDF,512,WORK_STEALING,2.174000 +Discrete,PDF,1000,SCALAR,67.817000 +Discrete,PDF,1000,VECTORIZED,3.782000 +Discrete,PDF,1000,PARALLEL,4.925000 +Discrete,PDF,1000,WORK_STEALING,6.728000 +Discrete,LogPDF,1000,SCALAR,70.474000 +Discrete,LogPDF,1000,VECTORIZED,5.445000 +Discrete,LogPDF,1000,PARALLEL,5.642000 +Discrete,LogPDF,1000,WORK_STEALING,5.161000 +Discrete,CDF,1000,SCALAR,66.075000 +Discrete,CDF,1000,VECTORIZED,5.158000 +Discrete,CDF,1000,PARALLEL,5.294000 +Discrete,CDF,1000,WORK_STEALING,4.292000 +Discrete,PDF,2000,SCALAR,135.357000 +Discrete,PDF,2000,VECTORIZED,7.311000 +Discrete,PDF,2000,PARALLEL,9.412000 +Discrete,PDF,2000,WORK_STEALING,12.873000 +Discrete,LogPDF,2000,SCALAR,143.781000 +Discrete,LogPDF,2000,VECTORIZED,10.096000 +Discrete,LogPDF,2000,PARALLEL,10.253000 +Discrete,LogPDF,2000,WORK_STEALING,9.114000 +Discrete,CDF,2000,SCALAR,129.290000 +Discrete,CDF,2000,VECTORIZED,7.780000 +Discrete,CDF,2000,PARALLEL,9.834000 +Discrete,CDF,2000,WORK_STEALING,8.195000 +Discrete,PDF,5000,SCALAR,359.055000 +Discrete,PDF,5000,VECTORIZED,20.843000 +Discrete,PDF,5000,PARALLEL,371.977000 +Discrete,PDF,5000,WORK_STEALING,156.547000 +Discrete,LogPDF,5000,SCALAR,345.960000 +Discrete,LogPDF,5000,VECTORIZED,24.633000 +Discrete,LogPDF,5000,PARALLEL,327.003000 +Discrete,LogPDF,5000,WORK_STEALING,140.590000 +Discrete,CDF,5000,SCALAR,333.603000 +Discrete,CDF,5000,VECTORIZED,24.219000 +Discrete,CDF,5000,PARALLEL,307.828000 +Discrete,CDF,5000,WORK_STEALING,167.318000 +Discrete,PDF,10000,SCALAR,676.962000 +Discrete,PDF,10000,VECTORIZED,33.743000 +Discrete,PDF,10000,PARALLEL,407.223000 +Discrete,PDF,10000,WORK_STEALING,147.914000 +Discrete,LogPDF,10000,SCALAR,699.132000 +Discrete,LogPDF,10000,VECTORIZED,47.205000 +Discrete,LogPDF,10000,PARALLEL,409.351000 +Discrete,LogPDF,10000,WORK_STEALING,221.061000 
+Discrete,CDF,10000,SCALAR,653.913000 +Discrete,CDF,10000,VECTORIZED,45.172000 +Discrete,CDF,10000,PARALLEL,651.350000 +Discrete,CDF,10000,WORK_STEALING,204.256000 +Discrete,PDF,20000,SCALAR,1469.176000 +Discrete,PDF,20000,VECTORIZED,74.684000 +Discrete,PDF,20000,PARALLEL,341.859000 +Discrete,PDF,20000,WORK_STEALING,291.496000 +Discrete,LogPDF,20000,SCALAR,1457.630000 +Discrete,LogPDF,20000,VECTORIZED,106.854000 +Discrete,LogPDF,20000,PARALLEL,564.355000 +Discrete,LogPDF,20000,WORK_STEALING,183.501000 +Discrete,CDF,20000,SCALAR,1427.214000 +Discrete,CDF,20000,VECTORIZED,124.182000 +Discrete,CDF,20000,PARALLEL,445.971000 +Discrete,CDF,20000,WORK_STEALING,239.156000 +Discrete,PDF,50000,SCALAR,3791.954000 +Discrete,PDF,50000,VECTORIZED,184.545000 +Discrete,PDF,50000,PARALLEL,552.617000 +Discrete,PDF,50000,WORK_STEALING,249.306000 +Discrete,LogPDF,50000,SCALAR,3688.981000 +Discrete,LogPDF,50000,VECTORIZED,259.493000 +Discrete,LogPDF,50000,PARALLEL,434.681000 +Discrete,LogPDF,50000,WORK_STEALING,208.233000 +Discrete,CDF,50000,SCALAR,3357.277000 +Discrete,CDF,50000,VECTORIZED,242.607000 +Discrete,CDF,50000,PARALLEL,288.694000 +Discrete,CDF,50000,WORK_STEALING,215.904000 +Discrete,PDF,100000,SCALAR,7012.905000 +Discrete,PDF,100000,VECTORIZED,359.541000 +Discrete,PDF,100000,PARALLEL,463.882000 +Discrete,PDF,100000,WORK_STEALING,246.900000 +Discrete,LogPDF,100000,SCALAR,7140.568000 +Discrete,LogPDF,100000,VECTORIZED,507.401000 +Discrete,LogPDF,100000,PARALLEL,609.394000 +Discrete,LogPDF,100000,WORK_STEALING,294.133000 +Discrete,CDF,100000,SCALAR,6910.170000 +Discrete,CDF,100000,VECTORIZED,515.374000 +Discrete,CDF,100000,PARALLEL,296.040000 +Discrete,CDF,100000,WORK_STEALING,278.062000 +Discrete,PDF,250000,SCALAR,18585.281000 +Discrete,PDF,250000,VECTORIZED,898.687000 +Discrete,PDF,250000,PARALLEL,572.416000 +Discrete,PDF,250000,WORK_STEALING,423.830000 +Discrete,LogPDF,250000,SCALAR,18409.459000 +Discrete,LogPDF,250000,VECTORIZED,1293.568000 
+Discrete,LogPDF,250000,PARALLEL,773.332000 +Discrete,LogPDF,250000,WORK_STEALING,515.026000 +Discrete,CDF,250000,SCALAR,17315.039000 +Discrete,CDF,250000,VECTORIZED,1238.242000 +Discrete,CDF,250000,PARALLEL,548.859000 +Discrete,CDF,250000,WORK_STEALING,471.137000 +Discrete,PDF,500000,SCALAR,36504.644000 +Discrete,PDF,500000,VECTORIZED,1838.664000 +Discrete,PDF,500000,PARALLEL,935.196000 +Discrete,PDF,500000,WORK_STEALING,661.421000 +Discrete,LogPDF,500000,SCALAR,35205.579000 +Discrete,LogPDF,500000,VECTORIZED,2532.812000 +Discrete,LogPDF,500000,PARALLEL,882.650000 +Discrete,LogPDF,500000,WORK_STEALING,674.444000 +Discrete,CDF,500000,SCALAR,33393.506000 +Discrete,CDF,500000,VECTORIZED,2493.183000 +Discrete,CDF,500000,PARALLEL,1128.378000 +Discrete,CDF,500000,WORK_STEALING,609.680000 +Poisson,PDF,8,SCALAR,1.019000 +Poisson,PDF,8,VECTORIZED,0.511000 +Poisson,PDF,8,PARALLEL,0.524000 +Poisson,PDF,8,WORK_STEALING,0.563000 +Poisson,LogPDF,8,SCALAR,0.834000 +Poisson,LogPDF,8,VECTORIZED,0.289000 +Poisson,LogPDF,8,PARALLEL,0.327000 +Poisson,LogPDF,8,WORK_STEALING,0.303000 +Poisson,CDF,8,SCALAR,0.891000 +Poisson,CDF,8,VECTORIZED,0.944000 +Poisson,CDF,8,PARALLEL,0.968000 +Poisson,CDF,8,WORK_STEALING,0.970000 +Poisson,PDF,16,SCALAR,1.870000 +Poisson,PDF,16,VECTORIZED,0.854000 +Poisson,PDF,16,PARALLEL,0.893000 +Poisson,PDF,16,WORK_STEALING,0.876000 +Poisson,LogPDF,16,SCALAR,1.462000 +Poisson,LogPDF,16,VECTORIZED,0.419000 +Poisson,LogPDF,16,PARALLEL,0.465000 +Poisson,LogPDF,16,WORK_STEALING,0.427000 +Poisson,CDF,16,SCALAR,1.814000 +Poisson,CDF,16,VECTORIZED,1.852000 +Poisson,CDF,16,PARALLEL,1.845000 +Poisson,CDF,16,WORK_STEALING,1.893000 +Poisson,PDF,32,SCALAR,3.645000 +Poisson,PDF,32,VECTORIZED,1.457000 +Poisson,PDF,32,PARALLEL,1.511000 +Poisson,PDF,32,WORK_STEALING,1.485000 +Poisson,LogPDF,32,SCALAR,2.670000 +Poisson,LogPDF,32,VECTORIZED,0.697000 +Poisson,LogPDF,32,PARALLEL,0.732000 +Poisson,LogPDF,32,WORK_STEALING,0.669000 +Poisson,CDF,32,SCALAR,3.449000 
+Poisson,CDF,32,VECTORIZED,3.470000 +Poisson,CDF,32,PARALLEL,3.490000 +Poisson,CDF,32,WORK_STEALING,3.541000 +Poisson,PDF,64,SCALAR,7.233000 +Poisson,PDF,64,VECTORIZED,2.806000 +Poisson,PDF,64,PARALLEL,2.932000 +Poisson,PDF,64,WORK_STEALING,2.863000 +Poisson,LogPDF,64,SCALAR,5.470000 +Poisson,LogPDF,64,VECTORIZED,1.440000 +Poisson,LogPDF,64,PARALLEL,1.552000 +Poisson,LogPDF,64,WORK_STEALING,1.418000 +Poisson,CDF,64,SCALAR,7.595000 +Poisson,CDF,64,VECTORIZED,7.525000 +Poisson,CDF,64,PARALLEL,7.485000 +Poisson,CDF,64,WORK_STEALING,7.391000 +Poisson,PDF,128,SCALAR,13.763000 +Poisson,PDF,128,VECTORIZED,5.174000 +Poisson,PDF,128,PARALLEL,5.375000 +Poisson,PDF,128,WORK_STEALING,5.225000 +Poisson,LogPDF,128,SCALAR,10.382000 +Poisson,LogPDF,128,VECTORIZED,2.341000 +Poisson,LogPDF,128,PARALLEL,2.598000 +Poisson,LogPDF,128,WORK_STEALING,2.269000 +Poisson,CDF,128,SCALAR,13.714000 +Poisson,CDF,128,VECTORIZED,13.767000 +Poisson,CDF,128,PARALLEL,13.733000 +Poisson,CDF,128,WORK_STEALING,13.749000 +Poisson,PDF,256,SCALAR,27.340000 +Poisson,PDF,256,VECTORIZED,10.411000 +Poisson,PDF,256,PARALLEL,10.827000 +Poisson,PDF,256,WORK_STEALING,10.561000 +Poisson,LogPDF,256,SCALAR,20.780000 +Poisson,LogPDF,256,VECTORIZED,4.652000 +Poisson,LogPDF,256,PARALLEL,5.253000 +Poisson,LogPDF,256,WORK_STEALING,4.650000 +Poisson,CDF,256,SCALAR,27.897000 +Poisson,CDF,256,VECTORIZED,27.807000 +Poisson,CDF,256,PARALLEL,27.029000 +Poisson,CDF,256,WORK_STEALING,26.893000 +Poisson,PDF,512,SCALAR,52.993000 +Poisson,PDF,512,VECTORIZED,20.283000 +Poisson,PDF,512,PARALLEL,21.424000 +Poisson,PDF,512,WORK_STEALING,20.364000 +Poisson,LogPDF,512,SCALAR,39.894000 +Poisson,LogPDF,512,VECTORIZED,8.958000 +Poisson,LogPDF,512,PARALLEL,10.249000 +Poisson,LogPDF,512,WORK_STEALING,8.986000 +Poisson,CDF,512,SCALAR,54.403000 +Poisson,CDF,512,VECTORIZED,53.907000 +Poisson,CDF,512,PARALLEL,54.834000 +Poisson,CDF,512,WORK_STEALING,56.357000 +Poisson,PDF,1000,SCALAR,104.857000 +Poisson,PDF,1000,VECTORIZED,38.864000 
+Poisson,PDF,1000,PARALLEL,40.796000 +Poisson,PDF,1000,WORK_STEALING,39.450000 +Poisson,LogPDF,1000,SCALAR,78.797000 +Poisson,LogPDF,1000,VECTORIZED,17.076000 +Poisson,LogPDF,1000,PARALLEL,19.980000 +Poisson,LogPDF,1000,WORK_STEALING,17.205000 +Poisson,CDF,1000,SCALAR,106.764000 +Poisson,CDF,1000,VECTORIZED,105.254000 +Poisson,CDF,1000,PARALLEL,106.174000 +Poisson,CDF,1000,WORK_STEALING,107.567000 +Poisson,PDF,2000,SCALAR,238.144000 +Poisson,PDF,2000,VECTORIZED,88.121000 +Poisson,PDF,2000,PARALLEL,85.154000 +Poisson,PDF,2000,WORK_STEALING,81.978000 +Poisson,LogPDF,2000,SCALAR,164.182000 +Poisson,LogPDF,2000,VECTORIZED,39.096000 +Poisson,LogPDF,2000,PARALLEL,41.924000 +Poisson,LogPDF,2000,WORK_STEALING,36.510000 +Poisson,CDF,2000,SCALAR,226.508000 +Poisson,CDF,2000,VECTORIZED,224.208000 +Poisson,CDF,2000,PARALLEL,225.598000 +Poisson,CDF,2000,WORK_STEALING,215.930000 +Poisson,PDF,5000,SCALAR,544.057000 +Poisson,PDF,5000,VECTORIZED,192.563000 +Poisson,PDF,5000,PARALLEL,314.692000 +Poisson,PDF,5000,WORK_STEALING,214.941000 +Poisson,LogPDF,5000,SCALAR,394.762000 +Poisson,LogPDF,5000,VECTORIZED,96.038000 +Poisson,LogPDF,5000,PARALLEL,192.996000 +Poisson,LogPDF,5000,WORK_STEALING,327.839000 +Poisson,CDF,5000,SCALAR,532.894000 +Poisson,CDF,5000,VECTORIZED,563.748000 +Poisson,CDF,5000,PARALLEL,446.840000 +Poisson,CDF,5000,WORK_STEALING,331.473000 +Poisson,PDF,10000,SCALAR,1090.275000 +Poisson,PDF,10000,VECTORIZED,607.814000 +Poisson,PDF,10000,PARALLEL,464.502000 +Poisson,PDF,10000,WORK_STEALING,283.063000 +Poisson,LogPDF,10000,SCALAR,796.730000 +Poisson,LogPDF,10000,VECTORIZED,188.054000 +Poisson,LogPDF,10000,PARALLEL,578.954000 +Poisson,LogPDF,10000,WORK_STEALING,246.162000 +Poisson,CDF,10000,SCALAR,1172.436000 +Poisson,CDF,10000,VECTORIZED,1069.479000 +Poisson,CDF,10000,PARALLEL,568.011000 +Poisson,CDF,10000,WORK_STEALING,550.599000 +Poisson,PDF,20000,SCALAR,2326.088000 +Poisson,PDF,20000,VECTORIZED,826.196000 +Poisson,PDF,20000,PARALLEL,517.448000 
+Poisson,PDF,20000,WORK_STEALING,386.540000 +Poisson,LogPDF,20000,SCALAR,1806.553000 +Poisson,LogPDF,20000,VECTORIZED,392.421000 +Poisson,LogPDF,20000,PARALLEL,730.291000 +Poisson,LogPDF,20000,WORK_STEALING,295.354000 +Poisson,CDF,20000,SCALAR,2275.264000 +Poisson,CDF,20000,VECTORIZED,2302.332000 +Poisson,CDF,20000,PARALLEL,840.736000 +Poisson,CDF,20000,WORK_STEALING,632.566000 +Poisson,PDF,50000,SCALAR,5405.866000 +Poisson,PDF,50000,VECTORIZED,2024.645000 +Poisson,PDF,50000,PARALLEL,1020.924000 +Poisson,PDF,50000,WORK_STEALING,534.823000 +Poisson,LogPDF,50000,SCALAR,4102.138000 +Poisson,LogPDF,50000,VECTORIZED,947.190000 +Poisson,LogPDF,50000,PARALLEL,519.819000 +Poisson,LogPDF,50000,WORK_STEALING,390.385000 +Poisson,CDF,50000,SCALAR,5784.786000 +Poisson,CDF,50000,VECTORIZED,5457.024000 +Poisson,CDF,50000,PARALLEL,1834.000000 +Poisson,CDF,50000,WORK_STEALING,1092.691000 +Poisson,PDF,100000,SCALAR,11228.159000 +Poisson,PDF,100000,VECTORIZED,4032.259000 +Poisson,PDF,100000,PARALLEL,1335.839000 +Poisson,PDF,100000,WORK_STEALING,991.194000 +Poisson,LogPDF,100000,SCALAR,8438.857000 +Poisson,LogPDF,100000,VECTORIZED,2051.050000 +Poisson,LogPDF,100000,PARALLEL,930.499000 +Poisson,LogPDF,100000,WORK_STEALING,575.865000 +Poisson,CDF,100000,SCALAR,11401.173000 +Poisson,CDF,100000,VECTORIZED,11412.084000 +Poisson,CDF,100000,PARALLEL,3442.095000 +Poisson,CDF,100000,WORK_STEALING,2312.967000 +Poisson,PDF,250000,SCALAR,28455.890000 +Poisson,PDF,250000,VECTORIZED,10617.817000 +Poisson,PDF,250000,PARALLEL,3287.673000 +Poisson,PDF,250000,WORK_STEALING,1768.673000 +Poisson,LogPDF,250000,SCALAR,23842.146000 +Poisson,LogPDF,250000,VECTORIZED,5306.169000 +Poisson,LogPDF,250000,PARALLEL,2056.609000 +Poisson,LogPDF,250000,WORK_STEALING,1726.230000 +Poisson,CDF,250000,SCALAR,30257.435000 +Poisson,CDF,250000,VECTORIZED,29530.456000 +Poisson,CDF,250000,PARALLEL,8505.744000 +Poisson,CDF,250000,WORK_STEALING,5621.224000 +Poisson,PDF,500000,SCALAR,58799.509000 
+Poisson,PDF,500000,VECTORIZED,21133.860000 +Poisson,PDF,500000,PARALLEL,6188.470000 +Poisson,PDF,500000,WORK_STEALING,3935.375000 +Poisson,LogPDF,500000,SCALAR,44515.970000 +Poisson,LogPDF,500000,VECTORIZED,10320.376000 +Poisson,LogPDF,500000,PARALLEL,3902.895000 +Poisson,LogPDF,500000,WORK_STEALING,2434.694000 +Poisson,CDF,500000,SCALAR,57261.990000 +Poisson,CDF,500000,VECTORIZED,57836.995000 +Poisson,CDF,500000,PARALLEL,16731.127000 +Poisson,CDF,500000,WORK_STEALING,9753.425000 +Gamma,PDF,8,SCALAR,1.465000 +Gamma,PDF,8,VECTORIZED,1.029000 +Gamma,PDF,8,PARALLEL,0.441000 +Gamma,PDF,8,WORK_STEALING,0.405000 +Gamma,LogPDF,8,SCALAR,0.802000 +Gamma,LogPDF,8,VECTORIZED,0.863000 +Gamma,LogPDF,8,PARALLEL,0.294000 +Gamma,LogPDF,8,WORK_STEALING,0.300000 +Gamma,CDF,8,SCALAR,1.355000 +Gamma,CDF,8,VECTORIZED,1.377000 +Gamma,CDF,8,PARALLEL,0.842000 +Gamma,CDF,8,WORK_STEALING,0.777000 +Gamma,PDF,16,SCALAR,2.693000 +Gamma,PDF,16,VECTORIZED,1.017000 +Gamma,PDF,16,PARALLEL,0.639000 +Gamma,PDF,16,WORK_STEALING,0.652000 +Gamma,LogPDF,16,SCALAR,1.412000 +Gamma,LogPDF,16,VECTORIZED,0.843000 +Gamma,LogPDF,16,PARALLEL,0.402000 +Gamma,LogPDF,16,WORK_STEALING,0.370000 +Gamma,CDF,16,SCALAR,2.370000 +Gamma,CDF,16,VECTORIZED,1.838000 +Gamma,CDF,16,PARALLEL,1.379000 +Gamma,CDF,16,WORK_STEALING,1.372000 +Gamma,PDF,32,SCALAR,5.236000 +Gamma,PDF,32,VECTORIZED,1.116000 +Gamma,PDF,32,PARALLEL,1.128000 +Gamma,PDF,32,WORK_STEALING,1.125000 +Gamma,LogPDF,32,SCALAR,2.582000 +Gamma,LogPDF,32,VECTORIZED,0.964000 +Gamma,LogPDF,32,PARALLEL,0.625000 +Gamma,LogPDF,32,WORK_STEALING,0.602000 +Gamma,CDF,32,SCALAR,10.864000 +Gamma,CDF,32,VECTORIZED,3.290000 +Gamma,CDF,32,PARALLEL,2.693000 +Gamma,CDF,32,WORK_STEALING,2.646000 +Gamma,PDF,64,SCALAR,10.126000 +Gamma,PDF,64,VECTORIZED,1.450000 +Gamma,PDF,64,PARALLEL,2.083000 +Gamma,PDF,64,WORK_STEALING,2.044000 +Gamma,LogPDF,64,SCALAR,5.159000 +Gamma,LogPDF,64,VECTORIZED,1.252000 +Gamma,LogPDF,64,PARALLEL,1.121000 +Gamma,LogPDF,64,WORK_STEALING,1.012000 
+Gamma,CDF,64,SCALAR,9.586000 +Gamma,CDF,64,VECTORIZED,5.536000 +Gamma,CDF,64,PARALLEL,5.042000 +Gamma,CDF,64,WORK_STEALING,5.034000 +Gamma,PDF,128,SCALAR,19.928000 +Gamma,PDF,128,VECTORIZED,2.031000 +Gamma,PDF,128,PARALLEL,4.003000 +Gamma,PDF,128,WORK_STEALING,3.991000 +Gamma,LogPDF,128,SCALAR,10.061000 +Gamma,LogPDF,128,VECTORIZED,1.374000 +Gamma,LogPDF,128,PARALLEL,1.996000 +Gamma,LogPDF,128,WORK_STEALING,1.809000 +Gamma,CDF,128,SCALAR,19.056000 +Gamma,CDF,128,VECTORIZED,10.211000 +Gamma,CDF,128,PARALLEL,10.013000 +Gamma,CDF,128,WORK_STEALING,9.898000 +Gamma,PDF,256,SCALAR,40.134000 +Gamma,PDF,256,VECTORIZED,3.551000 +Gamma,PDF,256,PARALLEL,7.938000 +Gamma,PDF,256,WORK_STEALING,7.981000 +Gamma,LogPDF,256,SCALAR,20.508000 +Gamma,LogPDF,256,VECTORIZED,2.303000 +Gamma,LogPDF,256,PARALLEL,3.899000 +Gamma,LogPDF,256,WORK_STEALING,3.452000 +Gamma,CDF,256,SCALAR,39.001000 +Gamma,CDF,256,VECTORIZED,20.813000 +Gamma,CDF,256,PARALLEL,20.774000 +Gamma,CDF,256,WORK_STEALING,20.492000 +Gamma,PDF,512,SCALAR,78.921000 +Gamma,PDF,512,VECTORIZED,6.728000 +Gamma,PDF,512,PARALLEL,15.603000 +Gamma,PDF,512,WORK_STEALING,15.628000 +Gamma,LogPDF,512,SCALAR,40.036000 +Gamma,LogPDF,512,VECTORIZED,4.402000 +Gamma,LogPDF,512,PARALLEL,7.826000 +Gamma,LogPDF,512,WORK_STEALING,6.798000 +Gamma,CDF,512,SCALAR,77.660000 +Gamma,CDF,512,VECTORIZED,42.501000 +Gamma,CDF,512,PARALLEL,42.531000 +Gamma,CDF,512,WORK_STEALING,41.557000 +Gamma,PDF,1000,SCALAR,155.275000 +Gamma,PDF,1000,VECTORIZED,12.746000 +Gamma,PDF,1000,PARALLEL,29.488000 +Gamma,PDF,1000,WORK_STEALING,29.508000 +Gamma,LogPDF,1000,SCALAR,75.912000 +Gamma,LogPDF,1000,VECTORIZED,7.748000 +Gamma,LogPDF,1000,PARALLEL,14.417000 +Gamma,LogPDF,1000,WORK_STEALING,12.926000 +Gamma,CDF,1000,SCALAR,148.716000 +Gamma,CDF,1000,VECTORIZED,82.877000 +Gamma,CDF,1000,PARALLEL,82.984000 +Gamma,CDF,1000,WORK_STEALING,84.882000 +Gamma,PDF,2000,SCALAR,305.433000 +Gamma,PDF,2000,VECTORIZED,24.651000 +Gamma,PDF,2000,PARALLEL,59.165000 
+Gamma,PDF,2000,WORK_STEALING,58.950000 +Gamma,LogPDF,2000,SCALAR,149.125000 +Gamma,LogPDF,2000,VECTORIZED,15.602000 +Gamma,LogPDF,2000,PARALLEL,29.443000 +Gamma,LogPDF,2000,WORK_STEALING,26.092000 +Gamma,CDF,2000,SCALAR,327.059000 +Gamma,CDF,2000,VECTORIZED,175.314000 +Gamma,CDF,2000,PARALLEL,177.092000 +Gamma,CDF,2000,WORK_STEALING,179.018000 +Gamma,PDF,5000,SCALAR,784.078000 +Gamma,PDF,5000,VECTORIZED,62.136000 +Gamma,PDF,5000,PARALLEL,305.624000 +Gamma,PDF,5000,WORK_STEALING,222.364000 +Gamma,LogPDF,5000,SCALAR,379.808000 +Gamma,LogPDF,5000,VECTORIZED,39.782000 +Gamma,LogPDF,5000,PARALLEL,294.384000 +Gamma,LogPDF,5000,WORK_STEALING,196.202000 +Gamma,CDF,5000,SCALAR,775.302000 +Gamma,CDF,5000,VECTORIZED,502.503000 +Gamma,CDF,5000,PARALLEL,460.110000 +Gamma,CDF,5000,WORK_STEALING,392.814000 +Gamma,PDF,10000,SCALAR,1663.879000 +Gamma,PDF,10000,VECTORIZED,142.016000 +Gamma,PDF,10000,PARALLEL,343.288000 +Gamma,PDF,10000,WORK_STEALING,337.361000 +Gamma,LogPDF,10000,SCALAR,799.487000 +Gamma,LogPDF,10000,VECTORIZED,82.930000 +Gamma,LogPDF,10000,PARALLEL,315.408000 +Gamma,LogPDF,10000,WORK_STEALING,282.396000 +Gamma,CDF,10000,SCALAR,1559.241000 +Gamma,CDF,10000,VECTORIZED,912.499000 +Gamma,CDF,10000,PARALLEL,576.206000 +Gamma,CDF,10000,WORK_STEALING,412.103000 +Gamma,PDF,20000,SCALAR,3164.196000 +Gamma,PDF,20000,VECTORIZED,252.097000 +Gamma,PDF,20000,PARALLEL,465.641000 +Gamma,PDF,20000,WORK_STEALING,387.550000 +Gamma,LogPDF,20000,SCALAR,1679.974000 +Gamma,LogPDF,20000,VECTORIZED,184.304000 +Gamma,LogPDF,20000,PARALLEL,461.632000 +Gamma,LogPDF,20000,WORK_STEALING,349.018000 +Gamma,CDF,20000,SCALAR,3205.865000 +Gamma,CDF,20000,VECTORIZED,1762.643000 +Gamma,CDF,20000,PARALLEL,845.109000 +Gamma,CDF,20000,WORK_STEALING,564.148000 +Gamma,PDF,50000,SCALAR,8338.875000 +Gamma,PDF,50000,VECTORIZED,661.143000 +Gamma,PDF,50000,PARALLEL,656.430000 +Gamma,PDF,50000,WORK_STEALING,550.224000 +Gamma,LogPDF,50000,SCALAR,4237.671000 +Gamma,LogPDF,50000,VECTORIZED,436.299000 
+Gamma,LogPDF,50000,PARALLEL,463.029000 +Gamma,LogPDF,50000,WORK_STEALING,424.276000 +Gamma,CDF,50000,SCALAR,8099.143000 +Gamma,CDF,50000,VECTORIZED,4618.882000 +Gamma,CDF,50000,PARALLEL,1578.213000 +Gamma,CDF,50000,WORK_STEALING,1159.759000 +Gamma,PDF,100000,SCALAR,16362.225000 +Gamma,PDF,100000,VECTORIZED,1394.856000 +Gamma,PDF,100000,PARALLEL,1301.229000 +Gamma,PDF,100000,WORK_STEALING,815.179000 +Gamma,LogPDF,100000,SCALAR,8144.265000 +Gamma,LogPDF,100000,VECTORIZED,893.855000 +Gamma,LogPDF,100000,PARALLEL,662.274000 +Gamma,LogPDF,100000,WORK_STEALING,631.080000 +Gamma,CDF,100000,SCALAR,17986.594000 +Gamma,CDF,100000,VECTORIZED,10241.864000 +Gamma,CDF,100000,PARALLEL,3033.652000 +Gamma,CDF,100000,WORK_STEALING,2190.159000 +Gamma,PDF,250000,SCALAR,43199.275000 +Gamma,PDF,250000,VECTORIZED,3890.429000 +Gamma,PDF,250000,PARALLEL,2456.631000 +Gamma,PDF,250000,WORK_STEALING,1887.759000 +Gamma,LogPDF,250000,SCALAR,22220.252000 +Gamma,LogPDF,250000,VECTORIZED,2422.180000 +Gamma,LogPDF,250000,PARALLEL,1754.839000 +Gamma,LogPDF,250000,WORK_STEALING,1310.030000 +Gamma,CDF,250000,SCALAR,42628.901000 +Gamma,CDF,250000,VECTORIZED,23748.739000 +Gamma,CDF,250000,PARALLEL,7458.434000 +Gamma,CDF,250000,WORK_STEALING,4971.674000 +Gamma,PDF,500000,SCALAR,83968.083000 +Gamma,PDF,500000,VECTORIZED,8045.496000 +Gamma,PDF,500000,PARALLEL,4698.337000 +Gamma,PDF,500000,WORK_STEALING,2690.037000 +Gamma,LogPDF,500000,SCALAR,41248.908000 +Gamma,LogPDF,500000,VECTORIZED,5607.463000 +Gamma,LogPDF,500000,PARALLEL,2445.029000 +Gamma,LogPDF,500000,WORK_STEALING,2037.304000 +Gamma,CDF,500000,SCALAR,80414.950000 +Gamma,CDF,500000,VECTORIZED,46931.096000 +Gamma,CDF,500000,PARALLEL,14246.839000 +Gamma,CDF,500000,WORK_STEALING,9718.016000 +StudentT,PDF,8,SCALAR,0.958000 +StudentT,PDF,8,VECTORIZED,0.480000 +StudentT,PDF,8,PARALLEL,0.626000 +StudentT,PDF,8,WORK_STEALING,0.606000 +StudentT,LogPDF,8,SCALAR,0.820000 +StudentT,LogPDF,8,VECTORIZED,0.443000 +StudentT,LogPDF,8,PARALLEL,0.477000 
+StudentT,LogPDF,8,WORK_STEALING,0.499000 +StudentT,CDF,8,SCALAR,2.582000 +StudentT,CDF,8,VECTORIZED,2.034000 +StudentT,CDF,8,PARALLEL,2.058000 +StudentT,CDF,8,WORK_STEALING,2.032000 +StudentT,PDF,16,SCALAR,1.706000 +StudentT,PDF,16,VECTORIZED,0.537000 +StudentT,PDF,16,PARALLEL,0.792000 +StudentT,PDF,16,WORK_STEALING,0.794000 +StudentT,LogPDF,16,SCALAR,1.430000 +StudentT,LogPDF,16,VECTORIZED,0.471000 +StudentT,LogPDF,16,PARALLEL,0.541000 +StudentT,LogPDF,16,WORK_STEALING,0.543000 +StudentT,CDF,16,SCALAR,4.528000 +StudentT,CDF,16,VECTORIZED,3.439000 +StudentT,CDF,16,PARALLEL,3.447000 +StudentT,CDF,16,WORK_STEALING,3.390000 +StudentT,PDF,32,SCALAR,3.253000 +StudentT,PDF,32,VECTORIZED,0.694000 +StudentT,PDF,32,PARALLEL,1.249000 +StudentT,PDF,32,WORK_STEALING,1.230000 +StudentT,LogPDF,32,SCALAR,2.834000 +StudentT,LogPDF,32,VECTORIZED,0.546000 +StudentT,LogPDF,32,PARALLEL,0.789000 +StudentT,LogPDF,32,WORK_STEALING,0.772000 +StudentT,CDF,32,SCALAR,9.516000 +StudentT,CDF,32,VECTORIZED,7.239000 +StudentT,CDF,32,PARALLEL,7.354000 +StudentT,CDF,32,WORK_STEALING,7.239000 +StudentT,PDF,64,SCALAR,6.052000 +StudentT,PDF,64,VECTORIZED,1.076000 +StudentT,PDF,64,PARALLEL,2.016000 +StudentT,PDF,64,WORK_STEALING,2.062000 +StudentT,LogPDF,64,SCALAR,5.222000 +StudentT,LogPDF,64,VECTORIZED,0.756000 +StudentT,LogPDF,64,PARALLEL,1.169000 +StudentT,LogPDF,64,WORK_STEALING,1.190000 +StudentT,CDF,64,SCALAR,19.536000 +StudentT,CDF,64,VECTORIZED,14.776000 +StudentT,CDF,64,PARALLEL,14.871000 +StudentT,CDF,64,WORK_STEALING,14.815000 +StudentT,PDF,128,SCALAR,12.818000 +StudentT,PDF,128,VECTORIZED,1.845000 +StudentT,PDF,128,PARALLEL,3.978000 +StudentT,PDF,128,WORK_STEALING,3.925000 +StudentT,LogPDF,128,SCALAR,9.859000 +StudentT,LogPDF,128,VECTORIZED,1.197000 +StudentT,LogPDF,128,PARALLEL,1.945000 +StudentT,LogPDF,128,WORK_STEALING,1.960000 +StudentT,CDF,128,SCALAR,36.727000 +StudentT,CDF,128,VECTORIZED,28.118000 +StudentT,CDF,128,PARALLEL,27.967000 +StudentT,CDF,128,WORK_STEALING,28.040000 
+StudentT,PDF,256,SCALAR,64.300000 +StudentT,PDF,256,VECTORIZED,3.257000 +StudentT,PDF,256,PARALLEL,7.376000 +StudentT,PDF,256,WORK_STEALING,7.426000 +StudentT,LogPDF,256,SCALAR,20.825000 +StudentT,LogPDF,256,VECTORIZED,2.037000 +StudentT,LogPDF,256,PARALLEL,3.624000 +StudentT,LogPDF,256,WORK_STEALING,3.510000 +StudentT,CDF,256,SCALAR,74.288000 +StudentT,CDF,256,VECTORIZED,56.546000 +StudentT,CDF,256,PARALLEL,56.307000 +StudentT,CDF,256,WORK_STEALING,59.200000 +StudentT,PDF,512,SCALAR,50.848000 +StudentT,PDF,512,VECTORIZED,6.341000 +StudentT,PDF,512,PARALLEL,14.401000 +StudentT,PDF,512,WORK_STEALING,14.580000 +StudentT,LogPDF,512,SCALAR,42.045000 +StudentT,LogPDF,512,VECTORIZED,3.799000 +StudentT,LogPDF,512,PARALLEL,6.839000 +StudentT,LogPDF,512,WORK_STEALING,6.820000 +StudentT,CDF,512,SCALAR,155.491000 +StudentT,CDF,512,VECTORIZED,114.500000 +StudentT,CDF,512,PARALLEL,115.214000 +StudentT,CDF,512,WORK_STEALING,122.201000 +StudentT,PDF,1000,SCALAR,98.173000 +StudentT,PDF,1000,VECTORIZED,11.914000 +StudentT,PDF,1000,PARALLEL,28.686000 +StudentT,PDF,1000,WORK_STEALING,28.632000 +StudentT,LogPDF,1000,SCALAR,80.640000 +StudentT,LogPDF,1000,VECTORIZED,7.082000 +StudentT,LogPDF,1000,PARALLEL,13.377000 +StudentT,LogPDF,1000,WORK_STEALING,13.051000 +StudentT,CDF,1000,SCALAR,305.896000 +StudentT,CDF,1000,VECTORIZED,271.172000 +StudentT,CDF,1000,PARALLEL,228.912000 +StudentT,CDF,1000,WORK_STEALING,236.350000 +StudentT,PDF,2000,SCALAR,205.468000 +StudentT,PDF,2000,VECTORIZED,24.393000 +StudentT,PDF,2000,PARALLEL,58.371000 +StudentT,PDF,2000,WORK_STEALING,58.589000 +StudentT,LogPDF,2000,SCALAR,162.475000 +StudentT,LogPDF,2000,VECTORIZED,13.895000 +StudentT,LogPDF,2000,PARALLEL,26.097000 +StudentT,LogPDF,2000,WORK_STEALING,26.119000 +StudentT,CDF,2000,SCALAR,582.619000 +StudentT,CDF,2000,VECTORIZED,456.684000 +StudentT,CDF,2000,PARALLEL,533.553000 +StudentT,CDF,2000,WORK_STEALING,1245.194000 +StudentT,PDF,5000,SCALAR,629.826000 +StudentT,PDF,5000,VECTORIZED,64.740000 
+StudentT,PDF,5000,PARALLEL,180.042000 +StudentT,PDF,5000,WORK_STEALING,157.748000 +StudentT,LogPDF,5000,SCALAR,499.626000 +StudentT,LogPDF,5000,VECTORIZED,37.082000 +StudentT,LogPDF,5000,PARALLEL,69.581000 +StudentT,LogPDF,5000,WORK_STEALING,69.319000 +StudentT,CDF,5000,SCALAR,1628.848000 +StudentT,CDF,5000,VECTORIZED,1202.050000 +StudentT,CDF,5000,PARALLEL,1187.711000 +StudentT,CDF,5000,WORK_STEALING,1158.973000 +StudentT,PDF,10000,SCALAR,1162.008000 +StudentT,PDF,10000,VECTORIZED,121.889000 +StudentT,PDF,10000,PARALLEL,632.676000 +StudentT,PDF,10000,WORK_STEALING,624.997000 +StudentT,LogPDF,10000,SCALAR,899.599000 +StudentT,LogPDF,10000,VECTORIZED,73.491000 +StudentT,LogPDF,10000,PARALLEL,630.018000 +StudentT,LogPDF,10000,WORK_STEALING,378.860000 +StudentT,CDF,10000,SCALAR,3342.026000 +StudentT,CDF,10000,VECTORIZED,2282.506000 +StudentT,CDF,10000,PARALLEL,2329.639000 +StudentT,CDF,10000,WORK_STEALING,2376.198000 +StudentT,PDF,20000,SCALAR,2101.707000 +StudentT,PDF,20000,VECTORIZED,240.398000 +StudentT,PDF,20000,PARALLEL,697.491000 +StudentT,PDF,20000,WORK_STEALING,552.403000 +StudentT,LogPDF,20000,SCALAR,1797.985000 +StudentT,LogPDF,20000,VECTORIZED,166.474000 +StudentT,LogPDF,20000,PARALLEL,1126.516000 +StudentT,LogPDF,20000,WORK_STEALING,475.552000 +StudentT,CDF,20000,SCALAR,6682.219000 +StudentT,CDF,20000,VECTORIZED,4681.422000 +StudentT,CDF,20000,PARALLEL,4841.769000 +StudentT,CDF,20000,WORK_STEALING,4775.221000 +StudentT,PDF,50000,SCALAR,5394.263000 +StudentT,PDF,50000,VECTORIZED,670.287000 +StudentT,PDF,50000,PARALLEL,791.752000 +StudentT,PDF,50000,WORK_STEALING,883.200000 +StudentT,LogPDF,50000,SCALAR,4421.506000 +StudentT,LogPDF,50000,VECTORIZED,389.383000 +StudentT,LogPDF,50000,PARALLEL,468.194000 +StudentT,LogPDF,50000,WORK_STEALING,392.177000 +StudentT,CDF,50000,SCALAR,15676.870000 +StudentT,CDF,50000,VECTORIZED,12061.435000 +StudentT,CDF,50000,PARALLEL,12005.050000 +StudentT,CDF,50000,WORK_STEALING,11964.562000 
+StudentT,PDF,100000,SCALAR,10983.479000 +StudentT,PDF,100000,VECTORIZED,1330.286000 +StudentT,PDF,100000,PARALLEL,907.325000 +StudentT,PDF,100000,WORK_STEALING,1134.172000 +StudentT,LogPDF,100000,SCALAR,8920.513000 +StudentT,LogPDF,100000,VECTORIZED,812.802000 +StudentT,LogPDF,100000,PARALLEL,661.982000 +StudentT,LogPDF,100000,WORK_STEALING,850.467000 +StudentT,CDF,100000,SCALAR,32025.274000 +StudentT,CDF,100000,VECTORIZED,23224.341000 +StudentT,CDF,100000,PARALLEL,24053.166000 +StudentT,CDF,100000,WORK_STEALING,23864.158000 +StudentT,PDF,250000,SCALAR,25744.895000 +StudentT,PDF,250000,VECTORIZED,3173.367000 +StudentT,PDF,250000,PARALLEL,2117.561000 +StudentT,PDF,250000,WORK_STEALING,2071.162000 +StudentT,LogPDF,250000,SCALAR,20780.342000 +StudentT,LogPDF,250000,VECTORIZED,2076.578000 +StudentT,LogPDF,250000,PARALLEL,1163.776000 +StudentT,LogPDF,250000,WORK_STEALING,1184.104000 +StudentT,CDF,250000,SCALAR,75867.797000 +StudentT,CDF,250000,VECTORIZED,57617.375000 +StudentT,CDF,250000,PARALLEL,58952.586000 +StudentT,CDF,250000,WORK_STEALING,60256.292000 +StudentT,PDF,500000,SCALAR,53410.095000 +StudentT,PDF,500000,VECTORIZED,7743.481000 +StudentT,PDF,500000,PARALLEL,4299.085000 +StudentT,PDF,500000,WORK_STEALING,4677.026000 +StudentT,LogPDF,500000,SCALAR,43528.122000 +StudentT,LogPDF,500000,VECTORIZED,4582.641000 +StudentT,LogPDF,500000,PARALLEL,2343.243000 +StudentT,LogPDF,500000,WORK_STEALING,2254.997000 +StudentT,CDF,500000,SCALAR,153916.958000 +StudentT,CDF,500000,VECTORIZED,115474.922000 +StudentT,CDF,500000,PARALLEL,120756.854000 +StudentT,CDF,500000,WORK_STEALING,119764.019000 +Beta,PDF,8,SCALAR,0.942000 +Beta,PDF,8,VECTORIZED,1.037000 +Beta,PDF,8,PARALLEL,0.730000 +Beta,PDF,8,WORK_STEALING,0.678000 +Beta,LogPDF,8,SCALAR,0.776000 +Beta,LogPDF,8,VECTORIZED,0.966000 +Beta,LogPDF,8,PARALLEL,0.633000 +Beta,LogPDF,8,WORK_STEALING,0.580000 +Beta,CDF,8,SCALAR,1.658000 +Beta,CDF,8,VECTORIZED,1.252000 +Beta,CDF,8,PARALLEL,1.743000 +Beta,CDF,8,WORK_STEALING,1.711000 
+Beta,PDF,16,SCALAR,1.751000 +Beta,PDF,16,VECTORIZED,1.272000 +Beta,PDF,16,PARALLEL,1.098000 +Beta,PDF,16,WORK_STEALING,1.081000 +Beta,LogPDF,16,SCALAR,1.521000 +Beta,LogPDF,16,VECTORIZED,1.120000 +Beta,LogPDF,16,PARALLEL,0.830000 +Beta,LogPDF,16,WORK_STEALING,0.821000 +Beta,CDF,16,SCALAR,3.297000 +Beta,CDF,16,VECTORIZED,2.437000 +Beta,CDF,16,PARALLEL,3.346000 +Beta,CDF,16,WORK_STEALING,3.327000 +Beta,PDF,32,SCALAR,3.430000 +Beta,PDF,32,VECTORIZED,1.902000 +Beta,PDF,32,PARALLEL,1.930000 +Beta,PDF,32,WORK_STEALING,1.933000 +Beta,LogPDF,32,SCALAR,2.820000 +Beta,LogPDF,32,VECTORIZED,1.718000 +Beta,LogPDF,32,PARALLEL,1.453000 +Beta,LogPDF,32,WORK_STEALING,1.434000 +Beta,CDF,32,SCALAR,6.302000 +Beta,CDF,32,VECTORIZED,4.524000 +Beta,CDF,32,PARALLEL,6.350000 +Beta,CDF,32,WORK_STEALING,6.220000 +Beta,PDF,64,SCALAR,6.588000 +Beta,PDF,64,VECTORIZED,3.118000 +Beta,PDF,64,PARALLEL,3.612000 +Beta,PDF,64,WORK_STEALING,3.598000 +Beta,LogPDF,64,SCALAR,5.717000 +Beta,LogPDF,64,VECTORIZED,2.772000 +Beta,LogPDF,64,PARALLEL,2.687000 +Beta,LogPDF,64,WORK_STEALING,2.676000 +Beta,CDF,64,SCALAR,11.549000 +Beta,CDF,64,VECTORIZED,8.263000 +Beta,CDF,64,PARALLEL,12.042000 +Beta,CDF,64,WORK_STEALING,12.108000 +Beta,PDF,128,SCALAR,13.785000 +Beta,PDF,128,VECTORIZED,4.085000 +Beta,PDF,128,PARALLEL,6.806000 +Beta,PDF,128,WORK_STEALING,6.894000 +Beta,LogPDF,128,SCALAR,12.151000 +Beta,LogPDF,128,VECTORIZED,3.803000 +Beta,LogPDF,128,PARALLEL,4.569000 +Beta,LogPDF,128,WORK_STEALING,4.519000 +Beta,CDF,128,SCALAR,25.949000 +Beta,CDF,128,VECTORIZED,18.766000 +Beta,CDF,128,PARALLEL,25.622000 +Beta,CDF,128,WORK_STEALING,25.645000 +Beta,PDF,256,SCALAR,26.973000 +Beta,PDF,256,VECTORIZED,7.537000 +Beta,PDF,256,PARALLEL,12.887000 +Beta,PDF,256,WORK_STEALING,12.843000 +Beta,LogPDF,256,SCALAR,22.399000 +Beta,LogPDF,256,VECTORIZED,6.280000 +Beta,LogPDF,256,PARALLEL,8.504000 +Beta,LogPDF,256,WORK_STEALING,8.422000 +Beta,CDF,256,SCALAR,53.871000 +Beta,CDF,256,VECTORIZED,39.654000 +Beta,CDF,256,PARALLEL,54.397000 
+Beta,CDF,256,WORK_STEALING,54.373000 +Beta,PDF,512,SCALAR,56.362000 +Beta,PDF,512,VECTORIZED,16.861000 +Beta,PDF,512,PARALLEL,28.237000 +Beta,PDF,512,WORK_STEALING,27.969000 +Beta,LogPDF,512,SCALAR,48.628000 +Beta,LogPDF,512,VECTORIZED,14.928000 +Beta,LogPDF,512,PARALLEL,19.007000 +Beta,LogPDF,512,WORK_STEALING,18.758000 +Beta,CDF,512,SCALAR,107.630000 +Beta,CDF,512,VECTORIZED,73.947000 +Beta,CDF,512,PARALLEL,107.731000 +Beta,CDF,512,WORK_STEALING,107.534000 +Beta,PDF,1000,SCALAR,114.508000 +Beta,PDF,1000,VECTORIZED,33.569000 +Beta,PDF,1000,PARALLEL,55.837000 +Beta,PDF,1000,WORK_STEALING,53.587000 +Beta,LogPDF,1000,SCALAR,96.709000 +Beta,LogPDF,1000,VECTORIZED,28.851000 +Beta,LogPDF,1000,PARALLEL,37.778000 +Beta,LogPDF,1000,WORK_STEALING,37.405000 +Beta,CDF,1000,SCALAR,211.037000 +Beta,CDF,1000,VECTORIZED,150.541000 +Beta,CDF,1000,PARALLEL,195.743000 +Beta,CDF,1000,WORK_STEALING,202.510000 +Beta,PDF,2000,SCALAR,221.998000 +Beta,PDF,2000,VECTORIZED,61.234000 +Beta,PDF,2000,PARALLEL,107.159000 +Beta,PDF,2000,WORK_STEALING,111.062000 +Beta,LogPDF,2000,SCALAR,193.631000 +Beta,LogPDF,2000,VECTORIZED,58.787000 +Beta,LogPDF,2000,PARALLEL,84.305000 +Beta,LogPDF,2000,WORK_STEALING,78.146000 +Beta,CDF,2000,SCALAR,421.688000 +Beta,CDF,2000,VECTORIZED,293.511000 +Beta,CDF,2000,PARALLEL,427.414000 +Beta,CDF,2000,WORK_STEALING,410.311000 +Beta,PDF,5000,SCALAR,565.452000 +Beta,PDF,5000,VECTORIZED,159.332000 +Beta,PDF,5000,PARALLEL,280.514000 +Beta,PDF,5000,WORK_STEALING,271.294000 +Beta,LogPDF,5000,SCALAR,447.077000 +Beta,LogPDF,5000,VECTORIZED,139.933000 +Beta,LogPDF,5000,PARALLEL,191.247000 +Beta,LogPDF,5000,WORK_STEALING,198.140000 +Beta,CDF,5000,SCALAR,1101.810000 +Beta,CDF,5000,VECTORIZED,733.541000 +Beta,CDF,5000,PARALLEL,1021.361000 +Beta,CDF,5000,WORK_STEALING,1084.597000 +Beta,PDF,10000,SCALAR,1104.677000 +Beta,PDF,10000,VECTORIZED,343.525000 +Beta,PDF,10000,PARALLEL,992.900000 +Beta,PDF,10000,WORK_STEALING,1035.336000 +Beta,LogPDF,10000,SCALAR,904.276000 
+Beta,LogPDF,10000,VECTORIZED,322.386000 +Beta,LogPDF,10000,PARALLEL,926.092000 +Beta,LogPDF,10000,WORK_STEALING,1028.538000 +Beta,CDF,10000,SCALAR,2076.387000 +Beta,CDF,10000,VECTORIZED,1497.362000 +Beta,CDF,10000,PARALLEL,2075.227000 +Beta,CDF,10000,WORK_STEALING,1969.129000 +Beta,PDF,20000,SCALAR,2208.022000 +Beta,PDF,20000,VECTORIZED,653.350000 +Beta,PDF,20000,PARALLEL,1913.629000 +Beta,PDF,20000,WORK_STEALING,1999.534000 +Beta,LogPDF,20000,SCALAR,2149.963000 +Beta,LogPDF,20000,VECTORIZED,604.568000 +Beta,LogPDF,20000,PARALLEL,1705.584000 +Beta,LogPDF,20000,WORK_STEALING,1688.624000 +Beta,CDF,20000,SCALAR,4261.475000 +Beta,CDF,20000,VECTORIZED,3058.145000 +Beta,CDF,20000,PARALLEL,4056.192000 +Beta,CDF,20000,WORK_STEALING,4039.362000 +Beta,PDF,50000,SCALAR,5515.970000 +Beta,PDF,50000,VECTORIZED,1666.887000 +Beta,PDF,50000,PARALLEL,3557.233000 +Beta,PDF,50000,WORK_STEALING,3642.062000 +Beta,LogPDF,50000,SCALAR,4916.122000 +Beta,LogPDF,50000,VECTORIZED,1404.607000 +Beta,LogPDF,50000,PARALLEL,2854.939000 +Beta,LogPDF,50000,WORK_STEALING,2856.408000 +Beta,CDF,50000,SCALAR,10196.959000 +Beta,CDF,50000,VECTORIZED,7394.720000 +Beta,CDF,50000,PARALLEL,10369.104000 +Beta,CDF,50000,WORK_STEALING,10349.841000 +Beta,PDF,100000,SCALAR,10872.491000 +Beta,PDF,100000,VECTORIZED,3421.353000 +Beta,PDF,100000,PARALLEL,8522.222000 +Beta,PDF,100000,WORK_STEALING,6690.390000 +Beta,LogPDF,100000,SCALAR,9512.360000 +Beta,LogPDF,100000,VECTORIZED,2926.962000 +Beta,LogPDF,100000,PARALLEL,4928.826000 +Beta,LogPDF,100000,WORK_STEALING,5049.905000 +Beta,CDF,100000,SCALAR,20411.823000 +Beta,CDF,100000,VECTORIZED,14874.081000 +Beta,CDF,100000,PARALLEL,20422.756000 +Beta,CDF,100000,WORK_STEALING,20777.464000 +Beta,PDF,250000,SCALAR,28862.668000 +Beta,PDF,250000,VECTORIZED,9261.105000 +Beta,PDF,250000,PARALLEL,16337.822000 +Beta,PDF,250000,WORK_STEALING,17184.326000 +Beta,LogPDF,250000,SCALAR,23750.288000 +Beta,LogPDF,250000,VECTORIZED,8077.815000 +Beta,LogPDF,250000,PARALLEL,12657.385000 
+Beta,LogPDF,250000,WORK_STEALING,12484.529000 +Beta,CDF,250000,SCALAR,54558.686000 +Beta,CDF,250000,VECTORIZED,39713.597000 +Beta,CDF,250000,PARALLEL,54729.834000 +Beta,CDF,250000,WORK_STEALING,51906.487000 +Beta,PDF,500000,SCALAR,55909.674000 +Beta,PDF,500000,VECTORIZED,18759.322000 +Beta,PDF,500000,PARALLEL,30201.645000 +Beta,PDF,500000,WORK_STEALING,29897.268000 +Beta,LogPDF,500000,SCALAR,47104.603000 +Beta,LogPDF,500000,VECTORIZED,16603.170000 +Beta,LogPDF,500000,PARALLEL,26288.007000 +Beta,LogPDF,500000,WORK_STEALING,23641.672000 +Beta,CDF,500000,SCALAR,108133.486000 +Beta,CDF,500000,VECTORIZED,76353.624000 +Beta,CDF,500000,PARALLEL,103562.284000 +Beta,CDF,500000,WORK_STEALING,102371.057000 +ChiSquared,PDF,8,SCALAR,1.343000 +ChiSquared,PDF,8,VECTORIZED,0.923000 +ChiSquared,PDF,8,PARALLEL,0.405000 +ChiSquared,PDF,8,WORK_STEALING,0.404000 +ChiSquared,LogPDF,8,SCALAR,0.738000 +ChiSquared,LogPDF,8,VECTORIZED,0.814000 +ChiSquared,LogPDF,8,PARALLEL,0.286000 +ChiSquared,LogPDF,8,WORK_STEALING,0.273000 +ChiSquared,CDF,8,SCALAR,1.247000 +ChiSquared,CDF,8,VECTORIZED,1.204000 +ChiSquared,CDF,8,PARALLEL,0.782000 +ChiSquared,CDF,8,WORK_STEALING,0.761000 +ChiSquared,PDF,16,SCALAR,2.564000 +ChiSquared,PDF,16,VECTORIZED,0.954000 +ChiSquared,PDF,16,PARALLEL,0.624000 +ChiSquared,PDF,16,WORK_STEALING,0.628000 +ChiSquared,LogPDF,16,SCALAR,1.422000 +ChiSquared,LogPDF,16,VECTORIZED,0.913000 +ChiSquared,LogPDF,16,PARALLEL,0.397000 +ChiSquared,LogPDF,16,WORK_STEALING,0.358000 +ChiSquared,CDF,16,SCALAR,2.448000 +ChiSquared,CDF,16,VECTORIZED,1.953000 +ChiSquared,CDF,16,PARALLEL,1.451000 +ChiSquared,CDF,16,WORK_STEALING,1.482000 +ChiSquared,PDF,32,SCALAR,5.213000 +ChiSquared,PDF,32,VECTORIZED,1.167000 +ChiSquared,PDF,32,PARALLEL,1.160000 +ChiSquared,PDF,32,WORK_STEALING,1.160000 +ChiSquared,LogPDF,32,SCALAR,2.690000 +ChiSquared,LogPDF,32,VECTORIZED,0.994000 +ChiSquared,LogPDF,32,PARALLEL,0.594000 +ChiSquared,LogPDF,32,WORK_STEALING,0.582000 +ChiSquared,CDF,32,SCALAR,4.956000 
+ChiSquared,CDF,32,VECTORIZED,3.145000 +ChiSquared,CDF,32,PARALLEL,2.753000 +ChiSquared,CDF,32,WORK_STEALING,2.688000 +ChiSquared,PDF,64,SCALAR,10.082000 +ChiSquared,PDF,64,VECTORIZED,1.511000 +ChiSquared,PDF,64,PARALLEL,2.117000 +ChiSquared,PDF,64,WORK_STEALING,2.102000 +ChiSquared,LogPDF,64,SCALAR,5.113000 +ChiSquared,LogPDF,64,VECTORIZED,1.165000 +ChiSquared,LogPDF,64,PARALLEL,1.141000 +ChiSquared,LogPDF,64,WORK_STEALING,0.978000 +ChiSquared,CDF,64,SCALAR,10.072000 +ChiSquared,CDF,64,VECTORIZED,5.696000 +ChiSquared,CDF,64,PARALLEL,5.387000 +ChiSquared,CDF,64,WORK_STEALING,5.254000 +ChiSquared,PDF,128,SCALAR,20.479000 +ChiSquared,PDF,128,VECTORIZED,2.063000 +ChiSquared,PDF,128,PARALLEL,4.093000 +ChiSquared,PDF,128,WORK_STEALING,4.056000 +ChiSquared,LogPDF,128,SCALAR,10.150000 +ChiSquared,LogPDF,128,VECTORIZED,1.405000 +ChiSquared,LogPDF,128,PARALLEL,2.082000 +ChiSquared,LogPDF,128,WORK_STEALING,1.831000 +ChiSquared,CDF,128,SCALAR,20.093000 +ChiSquared,CDF,128,VECTORIZED,10.839000 +ChiSquared,CDF,128,PARALLEL,10.665000 +ChiSquared,CDF,128,WORK_STEALING,10.489000 +ChiSquared,PDF,256,SCALAR,41.856000 +ChiSquared,PDF,256,VECTORIZED,3.668000 +ChiSquared,PDF,256,PARALLEL,8.187000 +ChiSquared,PDF,256,WORK_STEALING,8.151000 +ChiSquared,LogPDF,256,SCALAR,21.487000 +ChiSquared,LogPDF,256,VECTORIZED,2.415000 +ChiSquared,LogPDF,256,PARALLEL,4.093000 +ChiSquared,LogPDF,256,WORK_STEALING,3.650000 +ChiSquared,CDF,256,SCALAR,41.893000 +ChiSquared,CDF,256,VECTORIZED,22.678000 +ChiSquared,CDF,256,PARALLEL,22.618000 +ChiSquared,CDF,256,WORK_STEALING,22.251000 +ChiSquared,PDF,512,SCALAR,82.192000 +ChiSquared,PDF,512,VECTORIZED,6.987000 +ChiSquared,PDF,512,PARALLEL,16.239000 +ChiSquared,PDF,512,WORK_STEALING,16.153000 +ChiSquared,LogPDF,512,SCALAR,41.406000 +ChiSquared,LogPDF,512,VECTORIZED,4.509000 +ChiSquared,LogPDF,512,PARALLEL,8.016000 +ChiSquared,LogPDF,512,WORK_STEALING,7.037000 +ChiSquared,CDF,512,SCALAR,82.090000 +ChiSquared,CDF,512,VECTORIZED,45.546000 
+ChiSquared,CDF,512,PARALLEL,46.409000 +ChiSquared,CDF,512,WORK_STEALING,45.382000 +ChiSquared,PDF,1000,SCALAR,164.702000 +ChiSquared,PDF,1000,VECTORIZED,13.799000 +ChiSquared,PDF,1000,PARALLEL,32.548000 +ChiSquared,PDF,1000,WORK_STEALING,32.531000 +ChiSquared,LogPDF,1000,SCALAR,80.936000 +ChiSquared,LogPDF,1000,VECTORIZED,8.362000 +ChiSquared,LogPDF,1000,PARALLEL,15.322000 +ChiSquared,LogPDF,1000,WORK_STEALING,13.737000 +ChiSquared,CDF,1000,SCALAR,164.500000 +ChiSquared,CDF,1000,VECTORIZED,94.744000 +ChiSquared,CDF,1000,PARALLEL,121.025000 +ChiSquared,CDF,1000,WORK_STEALING,89.524000 +ChiSquared,PDF,2000,SCALAR,337.780000 +ChiSquared,PDF,2000,VECTORIZED,27.287000 +ChiSquared,PDF,2000,PARALLEL,63.595000 +ChiSquared,PDF,2000,WORK_STEALING,62.586000 +ChiSquared,LogPDF,2000,SCALAR,164.180000 +ChiSquared,LogPDF,2000,VECTORIZED,17.427000 +ChiSquared,LogPDF,2000,PARALLEL,32.200000 +ChiSquared,LogPDF,2000,WORK_STEALING,28.967000 +ChiSquared,CDF,2000,SCALAR,341.462000 +ChiSquared,CDF,2000,VECTORIZED,193.267000 +ChiSquared,CDF,2000,PARALLEL,202.126000 +ChiSquared,CDF,2000,WORK_STEALING,189.098000 +ChiSquared,PDF,5000,SCALAR,807.542000 +ChiSquared,PDF,5000,VECTORIZED,67.210000 +ChiSquared,PDF,5000,PARALLEL,424.606000 +ChiSquared,PDF,5000,WORK_STEALING,294.679000 +ChiSquared,LogPDF,5000,SCALAR,399.460000 +ChiSquared,LogPDF,5000,VECTORIZED,41.743000 +ChiSquared,LogPDF,5000,PARALLEL,221.123000 +ChiSquared,LogPDF,5000,WORK_STEALING,220.354000 +ChiSquared,CDF,5000,SCALAR,802.052000 +ChiSquared,CDF,5000,VECTORIZED,469.391000 +ChiSquared,CDF,5000,PARALLEL,426.302000 +ChiSquared,CDF,5000,WORK_STEALING,400.891000 +ChiSquared,PDF,10000,SCALAR,1661.799000 +ChiSquared,PDF,10000,VECTORIZED,129.504000 +ChiSquared,PDF,10000,PARALLEL,378.710000 +ChiSquared,PDF,10000,WORK_STEALING,274.857000 +ChiSquared,LogPDF,10000,SCALAR,785.411000 +ChiSquared,LogPDF,10000,VECTORIZED,82.764000 +ChiSquared,LogPDF,10000,PARALLEL,542.558000 +ChiSquared,LogPDF,10000,WORK_STEALING,242.743000 
+ChiSquared,CDF,10000,SCALAR,1653.611000 +ChiSquared,CDF,10000,VECTORIZED,948.999000 +ChiSquared,CDF,10000,PARALLEL,499.617000 +ChiSquared,CDF,10000,WORK_STEALING,475.294000 +ChiSquared,PDF,20000,SCALAR,3559.359000 +ChiSquared,PDF,20000,VECTORIZED,295.997000 +ChiSquared,PDF,20000,PARALLEL,727.534000 +ChiSquared,PDF,20000,WORK_STEALING,360.970000 +ChiSquared,LogPDF,20000,SCALAR,1592.501000 +ChiSquared,LogPDF,20000,VECTORIZED,169.288000 +ChiSquared,LogPDF,20000,PARALLEL,606.308000 +ChiSquared,LogPDF,20000,WORK_STEALING,292.180000 +ChiSquared,CDF,20000,SCALAR,3380.129000 +ChiSquared,CDF,20000,VECTORIZED,1934.701000 +ChiSquared,CDF,20000,PARALLEL,1256.116000 +ChiSquared,CDF,20000,WORK_STEALING,598.151000 +ChiSquared,PDF,50000,SCALAR,8598.857000 +ChiSquared,PDF,50000,VECTORIZED,678.740000 +ChiSquared,PDF,50000,PARALLEL,757.780000 +ChiSquared,PDF,50000,WORK_STEALING,553.270000 +ChiSquared,LogPDF,50000,SCALAR,4303.628000 +ChiSquared,LogPDF,50000,VECTORIZED,446.941000 +ChiSquared,LogPDF,50000,PARALLEL,546.774000 +ChiSquared,LogPDF,50000,WORK_STEALING,542.003000 +ChiSquared,CDF,50000,SCALAR,8866.315000 +ChiSquared,CDF,50000,VECTORIZED,5011.863000 +ChiSquared,CDF,50000,PARALLEL,1712.056000 +ChiSquared,CDF,50000,WORK_STEALING,1275.405000 +ChiSquared,PDF,100000,SCALAR,19289.136000 +ChiSquared,PDF,100000,VECTORIZED,1420.630000 +ChiSquared,PDF,100000,PARALLEL,1187.866000 +ChiSquared,PDF,100000,WORK_STEALING,843.277000 +ChiSquared,LogPDF,100000,SCALAR,8606.590000 +ChiSquared,LogPDF,100000,VECTORIZED,894.854000 +ChiSquared,LogPDF,100000,PARALLEL,696.861000 +ChiSquared,LogPDF,100000,WORK_STEALING,644.719000 +ChiSquared,CDF,100000,SCALAR,17804.283000 +ChiSquared,CDF,100000,VECTORIZED,10235.941000 +ChiSquared,CDF,100000,PARALLEL,2999.134000 +ChiSquared,CDF,100000,WORK_STEALING,2206.945000 +ChiSquared,PDF,250000,SCALAR,44184.881000 +ChiSquared,PDF,250000,VECTORIZED,3999.355000 +ChiSquared,PDF,250000,PARALLEL,2626.943000 +ChiSquared,PDF,250000,WORK_STEALING,1655.998000 
+ChiSquared,LogPDF,250000,SCALAR,22938.344000 +ChiSquared,LogPDF,250000,VECTORIZED,2890.097000 +ChiSquared,LogPDF,250000,PARALLEL,1511.402000 +ChiSquared,LogPDF,250000,WORK_STEALING,1284.253000 +ChiSquared,CDF,250000,SCALAR,45146.338000 +ChiSquared,CDF,250000,VECTORIZED,26670.349000 +ChiSquared,CDF,250000,PARALLEL,7488.741000 +ChiSquared,CDF,250000,WORK_STEALING,5526.219000 +ChiSquared,PDF,500000,SCALAR,87561.250000 +ChiSquared,PDF,500000,VECTORIZED,8251.868000 +ChiSquared,PDF,500000,PARALLEL,4862.540000 +ChiSquared,PDF,500000,WORK_STEALING,2664.156000 +ChiSquared,LogPDF,500000,SCALAR,43946.748000 +ChiSquared,LogPDF,500000,VECTORIZED,6010.522000 +ChiSquared,LogPDF,500000,PARALLEL,2389.656000 +ChiSquared,LogPDF,500000,WORK_STEALING,1483.282000 +ChiSquared,CDF,500000,SCALAR,90282.907000 +ChiSquared,CDF,500000,VECTORIZED,54050.248000 +ChiSquared,CDF,500000,PARALLEL,15632.033000 +ChiSquared,CDF,500000,WORK_STEALING,12424.388000 diff --git a/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/summary.json b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/summary.json new file mode 100644 index 0000000..b37bb3a --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1/summary.json @@ -0,0 +1,183 @@ +{ + "run_id": "2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1", + "data_source": "strategy_profile_results.csv", + "metadata": { + "captured_at_utc": "2026-04-12T05-27-04Z", + "run_id": "2026-04-12T05-27-04Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-0e4e9f1", + "git_branch": "investigate-gaussian-avx512-perf", + "git_sha": "0e4e9f1", + "project_root": "/Users/wolfman/Development/libstats", + "build_dir": "/Users/wolfman/Development/libstats/build", + "build_type": "Release", + "cxx_compiler": "", + "os": "darwin", + "arch": "x86_64", + "cpu_brand": "Intel(R) 
Core(TM) i7-7820HQ CPU @ 2.90GHz", + "physical_cores": "4", + "logical_cores": "8" + }, + "coverage": { + "distributions": [ + "Beta", + "ChiSquared", + "Discrete", + "Exponential", + "Gamma", + "Gaussian", + "Poisson", + "StudentT", + "Uniform" + ], + "operations": [ + "CDF", + "LogPDF", + "PDF" + ], + "batch_sizes": [ + 8, + 16, + 32, + 64, + 128, + 256, + 512, + 1000, + 2000, + 5000, + 10000, + 20000, + 50000, + 100000, + 250000, + 500000 + ], + "total_measurements": 1728 + }, + "strategy_win_counts": { + "VECTORIZED": 236, + "WORK_STEALING": 169, + "PARALLEL": 23, + "SCALAR": 4 + }, + "crossover_summary": { + "groups": 27, + "vectorized_never_wins": [], + "parallel_crossover_sizes": [ + { + "distribution": "Beta", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Beta", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "ChiSquared", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "ChiSquared", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "ChiSquared", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Discrete", + "operation": "CDF", + "vectorized_to_parallel": 16 + }, + { + "distribution": "Discrete", + "operation": "LogPDF", + "vectorized_to_parallel": 16 + }, + { + "distribution": "Discrete", + "operation": "PDF", + "vectorized_to_parallel": 16 + }, + { + "distribution": "Exponential", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Exponential", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Exponential", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gamma", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gamma", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gamma", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + 
"distribution": "Gaussian", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gaussian", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Poisson", + "operation": "CDF", + "vectorized_to_parallel": 16 + }, + { + "distribution": "Poisson", + "operation": "LogPDF", + "vectorized_to_parallel": 50000 + }, + { + "distribution": "Poisson", + "operation": "PDF", + "vectorized_to_parallel": 2000 + }, + { + "distribution": "StudentT", + "operation": "CDF", + "vectorized_to_parallel": 128 + }, + { + "distribution": "StudentT", + "operation": "LogPDF", + "vectorized_to_parallel": 100000 + }, + { + "distribution": "StudentT", + "operation": "PDF", + "vectorized_to_parallel": 100000 + }, + { + "distribution": "Uniform", + "operation": "CDF", + "vectorized_to_parallel": 8 + } + ] + } +} diff --git a/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/best_strategies.csv b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/best_strategies.csv new file mode 100644 index 0000000..f3879b0 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/best_strategies.csv @@ -0,0 +1,433 @@ +distribution,operation,batch_size,best_strategy,best_time_us,scalar_time_us,speedup_vs_scalar +Beta,CDF,8,VECTORIZED,0.375,0.5,1.333 +Beta,CDF,16,VECTORIZED,0.75,1.0,1.333 +Beta,CDF,32,VECTORIZED,1.416,2.0,1.412 +Beta,CDF,64,VECTORIZED,2.625,3.5,1.333 +Beta,CDF,128,VECTORIZED,5.625,7.667,1.363 +Beta,CDF,256,VECTORIZED,11.959,16.25,1.359 +Beta,CDF,512,VECTORIZED,22.666,30.291,1.336 +Beta,CDF,1000,VECTORIZED,44.833,60.334,1.346 +Beta,CDF,2000,VECTORIZED,92.5,123.0,1.33 +Beta,CDF,5000,VECTORIZED,229.541,307.375,1.339 +Beta,CDF,10000,VECTORIZED,458.042,610.75,1.333 +Beta,CDF,20000,VECTORIZED,912.083,1214.25,1.331 +Beta,CDF,50000,VECTORIZED,2372.0,3097.417,1.306 
+Beta,CDF,100000,VECTORIZED,5903.666,7648.958,1.296 +Beta,CDF,250000,VECTORIZED,11486.542,15244.875,1.327 +Beta,CDF,500000,VECTORIZED,22979.416,30511.208,1.328 +Beta,LogPDF,8,PARALLEL,0.125,0.167,1.336 +Beta,LogPDF,16,PARALLEL,0.208,0.334,1.606 +Beta,LogPDF,32,PARALLEL,0.334,0.709,2.123 +Beta,LogPDF,64,WORK_STEALING,0.625,1.416,2.266 +Beta,LogPDF,128,PARALLEL,1.083,2.916,2.693 +Beta,LogPDF,256,WORK_STEALING,1.958,5.75,2.937 +Beta,LogPDF,512,WORK_STEALING,4.291,11.584,2.7 +Beta,LogPDF,1000,WORK_STEALING,8.75,22.416,2.562 +Beta,LogPDF,2000,WORK_STEALING,20.583,45.208,2.196 +Beta,LogPDF,5000,WORK_STEALING,54.416,112.375,2.065 +Beta,LogPDF,10000,VECTORIZED,204.083,226.042,1.108 +Beta,LogPDF,20000,VECTORIZED,326.166,448.375,1.375 +Beta,LogPDF,50000,VECTORIZED,832.625,1125.75,1.352 +Beta,LogPDF,100000,VECTORIZED,1660.291,2244.708,1.352 +Beta,LogPDF,250000,VECTORIZED,4218.208,6379.333,1.512 +Beta,LogPDF,500000,VECTORIZED,8464.75,11310.375,1.336 +Beta,PDF,8,PARALLEL,0.167,0.208,1.246 +Beta,PDF,16,PARALLEL,0.25,0.375,1.5 +Beta,PDF,32,PARALLEL,0.458,0.791,1.727 +Beta,PDF,64,PARALLEL,0.875,1.583,1.809 +Beta,PDF,128,PARALLEL,1.5,3.459,2.306 +Beta,PDF,256,WORK_STEALING,2.834,7.25,2.558 +Beta,PDF,512,WORK_STEALING,5.959,14.625,2.454 +Beta,PDF,1000,WORK_STEALING,12.25,29.208,2.384 +Beta,PDF,2000,PARALLEL,28.708,60.833,2.119 +Beta,PDF,5000,WORK_STEALING,86.417,151.208,1.75 +Beta,PDF,10000,VECTORIZED,226.625,304.042,1.342 +Beta,PDF,20000,VECTORIZED,462.5,611.167,1.321 +Beta,PDF,50000,VECTORIZED,1192.542,1520.875,1.275 +Beta,PDF,100000,VECTORIZED,2425.792,3046.25,1.256 +Beta,PDF,250000,VECTORIZED,5993.75,7630.375,1.273 +Beta,PDF,500000,SCALAR,15195.792,15195.792,1.0 +ChiSquared,CDF,8,VECTORIZED,0.208,0.333,1.601 +ChiSquared,CDF,16,WORK_STEALING,0.416,0.709,1.704 +ChiSquared,CDF,32,PARALLEL,0.75,1.458,1.944 +ChiSquared,CDF,64,WORK_STEALING,1.417,3.25,2.294 +ChiSquared,CDF,128,VECTORIZED,3.25,6.792,2.09 +ChiSquared,CDF,256,VECTORIZED,6.125,14.25,2.327 
+ChiSquared,CDF,512,VECTORIZED,14.208,28.25,1.988 +ChiSquared,CDF,1000,VECTORIZED,32.5,56.208,1.729 +ChiSquared,CDF,2000,PARALLEL,63.625,113.791,1.788 +ChiSquared,CDF,5000,PARALLEL,105.084,286.916,2.73 +ChiSquared,CDF,10000,PARALLEL,193.666,1348.375,6.962 +ChiSquared,CDF,20000,PARALLEL,287.75,1146.208,3.983 +ChiSquared,CDF,50000,PARALLEL,534.5,2883.291,5.394 +ChiSquared,CDF,100000,PARALLEL,1124.417,5748.333,5.112 +ChiSquared,CDF,250000,PARALLEL,2544.417,14364.125,5.645 +ChiSquared,CDF,500000,PARALLEL,5311.209,34489.792,6.494 +ChiSquared,LogPDF,8,PARALLEL,0.042,0.167,3.976 +ChiSquared,LogPDF,16,WORK_STEALING,0.083,0.334,4.024 +ChiSquared,LogPDF,32,PARALLEL,0.166,0.667,4.018 +ChiSquared,LogPDF,64,WORK_STEALING,0.209,1.25,5.981 +ChiSquared,LogPDF,128,WORK_STEALING,0.458,2.458,5.367 +ChiSquared,LogPDF,256,WORK_STEALING,0.875,4.875,5.571 +ChiSquared,LogPDF,512,WORK_STEALING,1.709,9.708,5.681 +ChiSquared,LogPDF,1000,WORK_STEALING,3.333,18.959,5.688 +ChiSquared,LogPDF,2000,VECTORIZED,8.792,37.875,4.308 +ChiSquared,LogPDF,5000,VECTORIZED,23.0,94.541,4.11 +ChiSquared,LogPDF,10000,VECTORIZED,48.041,189.208,3.938 +ChiSquared,LogPDF,20000,VECTORIZED,95.542,378.292,3.959 +ChiSquared,LogPDF,50000,WORK_STEALING,179.958,945.542,5.254 +ChiSquared,LogPDF,100000,PARALLEL,157.042,1892.875,12.053 +ChiSquared,LogPDF,250000,PARALLEL,303.5,4736.375,15.606 +ChiSquared,LogPDF,500000,PARALLEL,536.334,11370.666,21.201 +ChiSquared,PDF,8,PARALLEL,0.084,0.333,3.964 +ChiSquared,PDF,16,PARALLEL,0.166,0.625,3.765 +ChiSquared,PDF,32,WORK_STEALING,0.25,1.25,5.0 +ChiSquared,PDF,64,PARALLEL,0.5,2.417,4.834 +ChiSquared,PDF,128,PARALLEL,0.958,4.833,5.045 +ChiSquared,PDF,256,PARALLEL,1.875,9.625,5.133 +ChiSquared,PDF,512,WORK_STEALING,3.625,19.167,5.287 +ChiSquared,PDF,1000,WORK_STEALING,7.083,37.625,5.312 +ChiSquared,PDF,2000,VECTORIZED,14.25,74.833,5.251 +ChiSquared,PDF,5000,VECTORIZED,35.875,187.375,5.223 +ChiSquared,PDF,10000,VECTORIZED,75.542,374.708,4.96 
+ChiSquared,PDF,20000,WORK_STEALING,128.583,747.958,5.817 +ChiSquared,PDF,50000,PARALLEL,242.959,1874.167,7.714 +ChiSquared,PDF,100000,PARALLEL,266.417,3748.834,14.071 +ChiSquared,PDF,250000,PARALLEL,543.292,9369.291,17.245 +ChiSquared,PDF,500000,PARALLEL,1522.083,18767.625,12.33 +Discrete,CDF,8,VECTORIZED,0.042,0.166,3.952 +Discrete,CDF,16,VECTORIZED,0.042,0.292,6.952 +Discrete,CDF,32,VECTORIZED,0.042,0.625,14.881 +Discrete,CDF,64,VECTORIZED,0.083,1.208,14.554 +Discrete,CDF,128,VECTORIZED,0.125,2.209,17.672 +Discrete,CDF,256,VECTORIZED,0.25,4.542,18.168 +Discrete,CDF,512,PARALLEL,0.583,8.833,15.151 +Discrete,CDF,1000,VECTORIZED,1.166,17.334,14.866 +Discrete,CDF,2000,VECTORIZED,2.333,35.167,15.074 +Discrete,CDF,5000,VECTORIZED,6.375,86.417,13.556 +Discrete,CDF,10000,VECTORIZED,13.375,174.334,13.034 +Discrete,CDF,20000,VECTORIZED,27.125,348.167,12.836 +Discrete,CDF,50000,VECTORIZED,70.209,868.875,12.376 +Discrete,CDF,100000,WORK_STEALING,114.708,1741.541,15.182 +Discrete,CDF,250000,PARALLEL,174.833,4341.166,24.83 +Discrete,CDF,500000,PARALLEL,311.125,8669.417,27.865 +Discrete,LogPDF,8,VECTORIZED,0.042,0.167,3.976 +Discrete,LogPDF,16,VECTORIZED,0.041,0.333,8.122 +Discrete,LogPDF,32,VECTORIZED,0.042,0.666,15.857 +Discrete,LogPDF,64,VECTORIZED,0.083,1.291,15.554 +Discrete,LogPDF,128,VECTORIZED,0.167,2.5,14.97 +Discrete,LogPDF,256,VECTORIZED,0.292,4.917,16.839 +Discrete,LogPDF,512,VECTORIZED,0.542,9.792,18.066 +Discrete,LogPDF,1000,VECTORIZED,1.042,19.0,18.234 +Discrete,LogPDF,2000,VECTORIZED,2.125,38.125,17.941 +Discrete,LogPDF,5000,VECTORIZED,5.125,95.292,18.594 +Discrete,LogPDF,10000,VECTORIZED,10.125,190.166,18.782 +Discrete,LogPDF,20000,VECTORIZED,20.125,380.084,18.886 +Discrete,LogPDF,50000,VECTORIZED,50.25,952.125,18.948 +Discrete,LogPDF,100000,VECTORIZED,100.5,1893.541,18.841 +Discrete,LogPDF,250000,PARALLEL,145.833,4753.917,32.598 +Discrete,LogPDF,500000,PARALLEL,221.25,9531.417,43.08 +Discrete,PDF,8,VECTORIZED,0.042,0.167,3.976 
+Discrete,PDF,16,VECTORIZED,0.041,0.333,8.122 +Discrete,PDF,32,VECTORIZED,0.042,0.625,14.881 +Discrete,PDF,64,VECTORIZED,0.083,1.25,15.06 +Discrete,PDF,128,PARALLEL,0.166,2.459,14.813 +Discrete,PDF,256,VECTORIZED,0.292,4.917,16.839 +Discrete,PDF,512,VECTORIZED,0.542,9.75,17.989 +Discrete,PDF,1000,VECTORIZED,1.042,19.0,18.234 +Discrete,PDF,2000,VECTORIZED,2.125,37.959,17.863 +Discrete,PDF,5000,VECTORIZED,5.125,94.834,18.504 +Discrete,PDF,10000,VECTORIZED,10.125,189.666,18.732 +Discrete,PDF,20000,VECTORIZED,20.208,381.083,18.858 +Discrete,PDF,50000,VECTORIZED,50.25,950.458,18.915 +Discrete,PDF,100000,VECTORIZED,100.833,1897.708,18.82 +Discrete,PDF,250000,PARALLEL,127.5,4743.958,37.208 +Discrete,PDF,500000,PARALLEL,173.625,9496.709,54.697 +Exponential,CDF,8,PARALLEL,0.042,0.167,3.976 +Exponential,CDF,16,PARALLEL,0.083,0.333,4.012 +Exponential,CDF,32,PARALLEL,0.125,0.625,5.0 +Exponential,CDF,64,PARALLEL,0.208,1.25,6.01 +Exponential,CDF,128,WORK_STEALING,0.375,2.458,6.555 +Exponential,CDF,256,WORK_STEALING,0.75,4.792,6.389 +Exponential,CDF,512,WORK_STEALING,1.417,9.625,6.793 +Exponential,CDF,1000,WORK_STEALING,2.791,18.791,6.733 +Exponential,CDF,2000,VECTORIZED,7.208,37.5,5.203 +Exponential,CDF,5000,VECTORIZED,17.75,93.584,5.272 +Exponential,CDF,10000,VECTORIZED,35.5,187.459,5.281 +Exponential,CDF,20000,WORK_STEALING,74.583,376.375,5.046 +Exponential,CDF,50000,WORK_STEALING,76.333,941.708,12.337 +Exponential,CDF,100000,PARALLEL,144.25,1870.542,12.967 +Exponential,CDF,250000,PARALLEL,242.459,4703.75,19.4 +Exponential,CDF,500000,PARALLEL,483.708,9367.833,19.367 +Exponential,LogPDF,8,WORK_STEALING,0.041,0.167,4.073 +Exponential,LogPDF,16,WORK_STEALING,0.041,0.333,8.122 +Exponential,LogPDF,32,WORK_STEALING,0.041,0.625,15.244 +Exponential,LogPDF,64,WORK_STEALING,0.042,1.291,30.738 +Exponential,LogPDF,128,WORK_STEALING,0.042,2.458,58.524 +Exponential,LogPDF,256,WORK_STEALING,0.083,4.792,57.735 +Exponential,LogPDF,512,WORK_STEALING,0.084,9.75,116.071 
+Exponential,LogPDF,1000,WORK_STEALING,0.167,19.0,113.772 +Exponential,LogPDF,2000,VECTORIZED,1.5,37.417,24.945 +Exponential,LogPDF,5000,VECTORIZED,3.5,94.833,27.095 +Exponential,LogPDF,10000,VECTORIZED,7.292,190.125,26.073 +Exponential,LogPDF,20000,VECTORIZED,13.542,378.125,27.922 +Exponential,LogPDF,50000,VECTORIZED,33.959,949.625,27.964 +Exponential,LogPDF,100000,PARALLEL,46.625,1892.917,40.599 +Exponential,LogPDF,250000,WORK_STEALING,101.625,4740.75,46.649 +Exponential,LogPDF,500000,PARALLEL,101.25,9493.625,93.764 +Exponential,PDF,8,PARALLEL,0.042,0.167,3.976 +Exponential,PDF,16,PARALLEL,0.083,0.333,4.012 +Exponential,PDF,32,PARALLEL,0.125,0.625,5.0 +Exponential,PDF,64,WORK_STEALING,0.208,1.209,5.813 +Exponential,PDF,128,WORK_STEALING,0.416,2.417,5.81 +Exponential,PDF,256,WORK_STEALING,0.709,4.792,6.759 +Exponential,PDF,512,WORK_STEALING,1.417,9.5,6.704 +Exponential,PDF,1000,WORK_STEALING,2.791,18.542,6.643 +Exponential,PDF,2000,VECTORIZED,6.833,36.917,5.403 +Exponential,PDF,5000,VECTORIZED,16.875,92.708,5.494 +Exponential,PDF,10000,VECTORIZED,33.916,184.875,5.451 +Exponential,PDF,20000,VECTORIZED,72.583,369.959,5.097 +Exponential,PDF,50000,WORK_STEALING,118.583,926.584,7.814 +Exponential,PDF,100000,PARALLEL,135.75,1859.166,13.696 +Exponential,PDF,250000,WORK_STEALING,248.459,4647.375,18.705 +Exponential,PDF,500000,PARALLEL,441.292,9270.166,21.007 +Gamma,CDF,8,WORK_STEALING,0.208,0.333,1.601 +Gamma,CDF,16,PARALLEL,0.333,0.584,1.754 +Gamma,CDF,32,PARALLEL,0.667,1.375,2.061 +Gamma,CDF,64,VECTORIZED,1.542,3.083,1.999 +Gamma,CDF,128,WORK_STEALING,2.791,6.25,2.239 +Gamma,CDF,256,VECTORIZED,6.5,13.0,2.0 +Gamma,CDF,512,WORK_STEALING,14.666,26.458,1.804 +Gamma,CDF,1000,VECTORIZED,30.917,53.375,1.726 +Gamma,CDF,2000,PARALLEL,65.167,104.5,1.604 +Gamma,CDF,5000,PARALLEL,93.625,264.5,2.825 +Gamma,CDF,10000,PARALLEL,151.625,529.167,3.49 +Gamma,CDF,20000,PARALLEL,236.375,1060.958,4.488 +Gamma,CDF,50000,PARALLEL,497.125,2648.666,5.328 
+Gamma,CDF,100000,PARALLEL,1035.417,6832.584,6.599 +Gamma,CDF,250000,WORK_STEALING,2327.291,13286.084,5.709 +Gamma,CDF,500000,WORK_STEALING,4233.333,26626.75,6.29 +Gamma,LogPDF,8,WORK_STEALING,0.042,0.167,3.976 +Gamma,LogPDF,16,PARALLEL,0.083,0.333,4.012 +Gamma,LogPDF,32,WORK_STEALING,0.125,0.625,5.0 +Gamma,LogPDF,64,WORK_STEALING,0.208,1.25,6.01 +Gamma,LogPDF,128,WORK_STEALING,0.458,2.458,5.367 +Gamma,LogPDF,256,WORK_STEALING,0.834,4.875,5.845 +Gamma,LogPDF,512,WORK_STEALING,1.708,9.708,5.684 +Gamma,LogPDF,1000,WORK_STEALING,3.375,18.958,5.617 +Gamma,LogPDF,2000,VECTORIZED,8.667,37.875,4.37 +Gamma,LogPDF,5000,VECTORIZED,22.375,94.625,4.229 +Gamma,LogPDF,10000,VECTORIZED,50.833,190.0,3.738 +Gamma,LogPDF,20000,WORK_STEALING,95.916,378.75,3.949 +Gamma,LogPDF,50000,PARALLEL,133.208,946.459,7.105 +Gamma,LogPDF,100000,PARALLEL,164.375,1896.708,11.539 +Gamma,LogPDF,250000,PARALLEL,301.792,4738.875,15.702 +Gamma,LogPDF,500000,PARALLEL,586.208,9478.75,16.17 +Gamma,PDF,8,PARALLEL,0.083,0.333,4.012 +Gamma,PDF,16,WORK_STEALING,0.166,0.625,3.765 +Gamma,PDF,32,WORK_STEALING,0.25,1.209,4.836 +Gamma,PDF,64,PARALLEL,0.5,2.417,4.834 +Gamma,PDF,128,VECTORIZED,0.958,4.833,5.045 +Gamma,PDF,256,VECTORIZED,1.834,9.625,5.248 +Gamma,PDF,512,VECTORIZED,3.584,19.167,5.348 +Gamma,PDF,1000,VECTORIZED,7.042,37.417,5.313 +Gamma,PDF,2000,VECTORIZED,13.75,74.917,5.449 +Gamma,PDF,5000,VECTORIZED,35.834,187.292,5.227 +Gamma,PDF,10000,VECTORIZED,77.083,374.542,4.859 +Gamma,PDF,20000,PARALLEL,140.541,749.25,5.331 +Gamma,PDF,50000,PARALLEL,161.625,1873.375,11.591 +Gamma,PDF,100000,PARALLEL,236.5,3745.041,15.835 +Gamma,PDF,250000,PARALLEL,542.25,9364.375,17.269 +Gamma,PDF,500000,PARALLEL,1034.167,19928.333,19.27 +Gaussian,CDF,8,PARALLEL,0.125,0.291,2.328 +Gaussian,CDF,16,PARALLEL,0.208,0.458,2.202 +Gaussian,CDF,32,PARALLEL,0.416,0.875,2.103 +Gaussian,CDF,64,PARALLEL,0.75,1.708,2.277 +Gaussian,CDF,128,VECTORIZED,1.458,3.292,2.258 +Gaussian,CDF,256,VECTORIZED,2.833,6.625,2.339 
+Gaussian,CDF,512,VECTORIZED,5.542,13.167,2.376 +Gaussian,CDF,1000,WORK_STEALING,10.708,25.75,2.405 +Gaussian,CDF,2000,VECTORIZED,21.292,51.5,2.419 +Gaussian,CDF,5000,VECTORIZED,52.958,128.667,2.43 +Gaussian,CDF,10000,WORK_STEALING,67.584,257.458,3.809 +Gaussian,CDF,20000,WORK_STEALING,96.458,516.542,5.355 +Gaussian,CDF,50000,WORK_STEALING,188.625,1285.5,6.815 +Gaussian,CDF,100000,WORK_STEALING,342.333,2574.542,7.521 +Gaussian,CDF,250000,WORK_STEALING,743.042,6439.125,8.666 +Gaussian,CDF,500000,WORK_STEALING,1259.875,12872.916,10.218 +Gaussian,LogPDF,8,WORK_STEALING,0.041,0.167,4.073 +Gaussian,LogPDF,16,PARALLEL,0.042,0.334,7.952 +Gaussian,LogPDF,32,PARALLEL,0.042,0.666,15.857 +Gaussian,LogPDF,64,PARALLEL,0.042,1.25,29.762 +Gaussian,LogPDF,128,PARALLEL,0.042,2.458,58.524 +Gaussian,LogPDF,256,WORK_STEALING,0.042,4.958,118.048 +Gaussian,LogPDF,512,PARALLEL,0.083,9.916,119.47 +Gaussian,LogPDF,1000,PARALLEL,0.166,19.375,116.717 +Gaussian,LogPDF,2000,VECTORIZED,1.083,38.584,35.627 +Gaussian,LogPDF,5000,VECTORIZED,2.666,95.125,35.681 +Gaussian,LogPDF,10000,VECTORIZED,6.208,192.042,30.935 +Gaussian,LogPDF,20000,VECTORIZED,11.583,385.584,33.289 +Gaussian,LogPDF,50000,VECTORIZED,27.5,958.125,34.841 +Gaussian,LogPDF,100000,WORK_STEALING,53.208,1917.875,36.045 +Gaussian,LogPDF,250000,WORK_STEALING,101.959,4797.375,47.052 +Gaussian,LogPDF,500000,WORK_STEALING,158.584,9544.5,60.186 +Gaussian,PDF,8,PARALLEL,0.083,0.167,2.012 +Gaussian,PDF,16,PARALLEL,0.083,0.333,4.012 +Gaussian,PDF,32,PARALLEL,0.125,0.666,5.328 +Gaussian,PDF,64,PARALLEL,0.208,1.208,5.808 +Gaussian,PDF,128,PARALLEL,0.375,2.417,6.445 +Gaussian,PDF,256,WORK_STEALING,0.708,4.833,6.826 +Gaussian,PDF,512,WORK_STEALING,1.375,9.625,7.0 +Gaussian,PDF,1000,WORK_STEALING,2.667,18.75,7.03 +Gaussian,PDF,2000,VECTORIZED,6.5,37.209,5.724 +Gaussian,PDF,5000,VECTORIZED,16.125,93.541,5.801 +Gaussian,PDF,10000,VECTORIZED,33.125,186.083,5.618 +Gaussian,PDF,20000,VECTORIZED,69.334,380.292,5.485 
+Gaussian,PDF,50000,WORK_STEALING,90.75,936.0,10.314 +Gaussian,PDF,100000,PARALLEL,129.833,1861.459,14.337 +Gaussian,PDF,250000,PARALLEL,229.375,4671.958,20.368 +Gaussian,PDF,500000,WORK_STEALING,368.75,10121.959,27.449 +Poisson,CDF,8,SCALAR,0.208,0.208,1.0 +Poisson,CDF,16,SCALAR,0.5,0.5,1.0 +Poisson,CDF,32,SCALAR,1.0,1.0,1.0 +Poisson,CDF,64,SCALAR,2.375,2.375,1.0 +Poisson,CDF,128,VECTORIZED,4.458,4.5,1.009 +Poisson,CDF,256,VECTORIZED,9.333,9.458,1.013 +Poisson,CDF,512,WORK_STEALING,19.583,19.667,1.004 +Poisson,CDF,1000,WORK_STEALING,38.583,38.875,1.008 +Poisson,CDF,2000,PARALLEL,73.458,78.292,1.066 +Poisson,CDF,5000,PARALLEL,107.791,197.791,1.835 +Poisson,CDF,10000,PARALLEL,158.75,398.667,2.511 +Poisson,CDF,20000,PARALLEL,255.666,794.5,3.108 +Poisson,CDF,50000,WORK_STEALING,615.292,1989.916,3.234 +Poisson,CDF,100000,PARALLEL,1184.875,4007.167,3.382 +Poisson,CDF,250000,PARALLEL,2731.833,10009.167,3.664 +Poisson,CDF,500000,WORK_STEALING,4841.25,19976.834,4.126 +Poisson,LogPDF,8,WORK_STEALING,0.042,0.459,10.929 +Poisson,LogPDF,16,VECTORIZED,0.083,0.292,3.518 +Poisson,LogPDF,32,VECTORIZED,0.125,0.625,5.0 +Poisson,LogPDF,64,VECTORIZED,0.292,1.208,4.137 +Poisson,LogPDF,128,WORK_STEALING,0.458,2.458,5.367 +Poisson,LogPDF,256,WORK_STEALING,0.958,4.875,5.089 +Poisson,LogPDF,512,VECTORIZED,1.875,9.625,5.133 +Poisson,LogPDF,1000,VECTORIZED,3.5,18.792,5.369 +Poisson,LogPDF,2000,VECTORIZED,7.5,37.5,5.0 +Poisson,LogPDF,5000,VECTORIZED,20.959,93.666,4.469 +Poisson,LogPDF,10000,VECTORIZED,44.291,187.292,4.229 +Poisson,LogPDF,20000,VECTORIZED,94.458,374.625,3.966 +Poisson,LogPDF,50000,PARALLEL,198.458,936.375,4.718 +Poisson,LogPDF,100000,PARALLEL,189.667,1873.916,9.88 +Poisson,LogPDF,250000,WORK_STEALING,448.25,4690.542,10.464 +Poisson,LogPDF,500000,WORK_STEALING,772.291,9371.958,12.135 +Poisson,PDF,8,VECTORIZED,0.125,0.208,1.664 +Poisson,PDF,16,VECTORIZED,0.208,0.417,2.005 +Poisson,PDF,32,VECTORIZED,0.292,0.792,2.712 +Poisson,PDF,64,VECTORIZED,0.625,1.542,2.467 
+Poisson,PDF,128,VECTORIZED,1.166,3.042,2.609 +Poisson,PDF,256,VECTORIZED,2.416,6.125,2.535 +Poisson,PDF,512,VECTORIZED,4.75,12.167,2.561 +Poisson,PDF,1000,VECTORIZED,9.166,23.833,2.6 +Poisson,PDF,2000,VECTORIZED,18.333,47.667,2.6 +Poisson,PDF,5000,VECTORIZED,45.458,149.791,3.295 +Poisson,PDF,10000,VECTORIZED,90.959,238.0,2.617 +Poisson,PDF,20000,WORK_STEALING,132.958,476.041,3.58 +Poisson,PDF,50000,PARALLEL,193.792,1190.0,6.141 +Poisson,PDF,100000,PARALLEL,293.917,2380.041,8.098 +Poisson,PDF,250000,PARALLEL,685.833,5961.75,8.693 +Poisson,PDF,500000,WORK_STEALING,1274.417,11908.416,9.344 +StudentT,CDF,8,PARALLEL,0.666,0.833,1.251 +StudentT,CDF,16,VECTORIZED,1.125,1.417,1.26 +StudentT,CDF,32,PARALLEL,2.666,3.417,1.282 +StudentT,CDF,64,VECTORIZED,5.334,6.666,1.25 +StudentT,CDF,128,VECTORIZED,10.625,13.167,1.239 +StudentT,CDF,256,VECTORIZED,22.292,26.708,1.198 +StudentT,CDF,512,VECTORIZED,43.75,52.417,1.198 +StudentT,CDF,1000,VECTORIZED,88.125,104.833,1.19 +StudentT,CDF,2000,PARALLEL,176.459,212.25,1.203 +StudentT,CDF,5000,WORK_STEALING,447.542,525.791,1.175 +StudentT,CDF,10000,VECTORIZED,889.958,1059.917,1.191 +StudentT,CDF,20000,WORK_STEALING,1781.25,2104.041,1.181 +StudentT,CDF,50000,PARALLEL,4434.417,5279.375,1.191 +StudentT,CDF,100000,VECTORIZED,8873.583,10490.792,1.182 +StudentT,CDF,250000,WORK_STEALING,22232.292,26765.708,1.204 +StudentT,CDF,500000,PARALLEL,44347.791,52478.042,1.183 +StudentT,LogPDF,8,PARALLEL,0.125,0.167,1.336 +StudentT,LogPDF,16,WORK_STEALING,0.125,0.334,2.672 +StudentT,LogPDF,32,PARALLEL,0.167,0.625,3.743 +StudentT,LogPDF,64,WORK_STEALING,0.291,1.25,4.296 +StudentT,LogPDF,128,PARALLEL,0.5,2.5,5.0 +StudentT,LogPDF,256,WORK_STEALING,0.917,4.917,5.362 +StudentT,LogPDF,512,WORK_STEALING,1.834,9.75,5.316 +StudentT,LogPDF,1000,WORK_STEALING,3.542,19.083,5.388 +StudentT,LogPDF,2000,PARALLEL,7.542,38.083,5.049 +StudentT,LogPDF,5000,WORK_STEALING,21.125,94.916,4.493 +StudentT,LogPDF,10000,VECTORIZED,51.625,192.166,3.722 
+StudentT,LogPDF,20000,VECTORIZED,101.583,380.583,3.747 +StudentT,LogPDF,50000,PARALLEL,121.958,950.292,7.792 +StudentT,LogPDF,100000,WORK_STEALING,162.333,1899.917,11.704 +StudentT,LogPDF,250000,WORK_STEALING,353.084,4753.75,13.464 +StudentT,LogPDF,500000,PARALLEL,680.625,9505.459,13.966 +StudentT,PDF,8,VECTORIZED,0.167,0.209,1.251 +StudentT,PDF,16,PARALLEL,0.208,0.416,2.0 +StudentT,PDF,32,WORK_STEALING,0.292,0.75,2.568 +StudentT,PDF,64,VECTORIZED,0.542,1.5,2.768 +StudentT,PDF,128,VECTORIZED,0.958,2.833,2.957 +StudentT,PDF,256,VECTORIZED,1.917,5.625,2.934 +StudentT,PDF,512,VECTORIZED,3.792,11.291,2.978 +StudentT,PDF,1000,VECTORIZED,7.167,21.958,3.064 +StudentT,PDF,2000,VECTORIZED,14.209,43.875,3.088 +StudentT,PDF,5000,VECTORIZED,36.375,109.667,3.015 +StudentT,PDF,10000,VECTORIZED,76.416,222.75,2.915 +StudentT,PDF,20000,WORK_STEALING,104.042,439.416,4.223 +StudentT,PDF,50000,WORK_STEALING,144.875,1096.792,7.571 +StudentT,PDF,100000,PARALLEL,219.0,2194.708,10.021 +StudentT,PDF,250000,PARALLEL,482.042,5491.125,11.391 +StudentT,PDF,500000,WORK_STEALING,949.042,10971.917,11.561 +Uniform,CDF,8,PARALLEL,0.041,0.167,4.073 +Uniform,CDF,16,PARALLEL,0.042,0.333,7.929 +Uniform,CDF,32,PARALLEL,0.042,0.625,14.881 +Uniform,CDF,64,WORK_STEALING,0.042,1.208,28.762 +Uniform,CDF,128,WORK_STEALING,0.042,2.459,58.548 +Uniform,CDF,256,WORK_STEALING,0.083,4.917,59.241 +Uniform,CDF,512,WORK_STEALING,0.125,9.958,79.664 +Uniform,CDF,1000,WORK_STEALING,0.25,19.375,77.5 +Uniform,CDF,2000,VECTORIZED,2.208,38.459,17.418 +Uniform,CDF,5000,VECTORIZED,5.417,97.125,17.93 +Uniform,CDF,10000,VECTORIZED,19.541,194.583,9.958 +Uniform,CDF,20000,WORK_STEALING,49.0,391.875,7.997 +Uniform,CDF,50000,WORK_STEALING,73.292,981.25,13.388 +Uniform,CDF,100000,WORK_STEALING,120.334,1972.792,16.394 +Uniform,CDF,250000,PARALLEL,253.417,4875.333,19.238 +Uniform,CDF,500000,WORK_STEALING,453.166,9690.0,21.383 +Uniform,LogPDF,8,VECTORIZED,0.042,0.167,3.976 +Uniform,LogPDF,16,VECTORIZED,0.042,0.333,7.929 
+Uniform,LogPDF,32,VECTORIZED,0.041,0.625,15.244 +Uniform,LogPDF,64,VECTORIZED,0.042,1.208,28.762 +Uniform,LogPDF,128,VECTORIZED,0.042,2.417,57.548 +Uniform,LogPDF,256,VECTORIZED,0.083,4.917,59.241 +Uniform,LogPDF,512,VECTORIZED,0.125,9.959,79.672 +Uniform,LogPDF,1000,VECTORIZED,0.208,19.583,94.149 +Uniform,LogPDF,2000,VECTORIZED,0.375,43.833,116.888 +Uniform,LogPDF,5000,VECTORIZED,0.875,96.667,110.477 +Uniform,LogPDF,10000,VECTORIZED,1.708,201.167,117.779 +Uniform,LogPDF,20000,VECTORIZED,3.458,390.917,113.047 +Uniform,LogPDF,50000,VECTORIZED,8.584,979.334,114.088 +Uniform,LogPDF,100000,VECTORIZED,17.125,1957.25,114.292 +Uniform,LogPDF,250000,VECTORIZED,43.542,4904.166,112.631 +Uniform,LogPDF,500000,VECTORIZED,89.375,9815.458,109.823 +Uniform,PDF,8,VECTORIZED,0.041,0.167,4.073 +Uniform,PDF,16,VECTORIZED,0.042,0.333,7.929 +Uniform,PDF,32,VECTORIZED,0.041,0.625,15.244 +Uniform,PDF,64,VECTORIZED,0.041,1.209,29.488 +Uniform,PDF,128,VECTORIZED,0.042,2.5,59.524 +Uniform,PDF,256,VECTORIZED,0.083,5.042,60.747 +Uniform,PDF,512,VECTORIZED,0.125,9.958,79.664 +Uniform,PDF,1000,VECTORIZED,0.208,19.583,94.149 +Uniform,PDF,2000,VECTORIZED,0.375,39.458,105.221 +Uniform,PDF,5000,VECTORIZED,0.875,96.792,110.619 +Uniform,PDF,10000,VECTORIZED,1.709,197.875,115.784 +Uniform,PDF,20000,VECTORIZED,3.708,393.166,106.032 +Uniform,PDF,50000,VECTORIZED,9.25,985.292,106.518 +Uniform,PDF,100000,VECTORIZED,17.916,1962.458,109.537 +Uniform,PDF,250000,VECTORIZED,44.875,4931.583,109.896 +Uniform,PDF,500000,VECTORIZED,89.75,9842.333,109.664 diff --git a/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/crossovers.csv b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/crossovers.csv new file mode 100644 index 0000000..4c2ceb7 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/crossovers.csv @@ -0,0 +1,28 @@ 
+distribution,operation,scalar_to_vectorized,vectorized_to_parallel,parallel_to_work_stealing,best_strategy_at_max_size,best_time_us_at_max_size,max_batch_size +Beta,CDF,8,,128,VECTORIZED,22979.416,500000 +Beta,LogPDF,16,8,64,VECTORIZED,8464.75,500000 +Beta,PDF,16,8,256,SCALAR,15195.792,500000 +ChiSquared,CDF,8,32,16,PARALLEL,5311.209,500000 +ChiSquared,LogPDF,8,8,16,PARALLEL,536.334,500000 +ChiSquared,PDF,8,8,32,PARALLEL,1522.083,500000 +Discrete,CDF,8,512,64,PARALLEL,311.125,500000 +Discrete,LogPDF,8,250000,32,PARALLEL,221.25,500000 +Discrete,PDF,8,128,16,PARALLEL,173.625,500000 +Exponential,CDF,8,8,128,PARALLEL,483.708,500000 +Exponential,LogPDF,8,8,8,PARALLEL,101.25,500000 +Exponential,PDF,8,8,64,PARALLEL,441.292,500000 +Gamma,CDF,8,8,8,WORK_STEALING,4233.333,500000 +Gamma,LogPDF,8,8,8,PARALLEL,586.208,500000 +Gamma,PDF,8,8,16,PARALLEL,1034.167,500000 +Gaussian,CDF,8,8,128,WORK_STEALING,1259.875,500000 +Gaussian,LogPDF,8,8,8,WORK_STEALING,158.584,500000 +Gaussian,PDF,8,8,256,WORK_STEALING,368.75,500000 +Poisson,CDF,128,2000,32,WORK_STEALING,4841.25,500000 +Poisson,LogPDF,8,50000,8,WORK_STEALING,772.291,500000 +Poisson,PDF,8,50000,64,WORK_STEALING,1274.417,500000 +StudentT,CDF,8,8,64,PARALLEL,44347.791,500000 +StudentT,LogPDF,16,8,16,PARALLEL,680.625,500000 +StudentT,PDF,8,16,32,WORK_STEALING,949.042,500000 +Uniform,CDF,8,8,64,WORK_STEALING,453.166,500000 +Uniform,LogPDF,8,,32,VECTORIZED,89.375,500000 +Uniform,PDF,8,,128,VECTORIZED,89.75,500000 diff --git a/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/logs/strategy_profile.txt b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/logs/strategy_profile.txt new file mode 100644 index 0000000..3761bbc --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/logs/strategy_profile.txt @@ -0,0 +1,658 @@ + +==================== + Strategy 
Profile +==================== + +Forced-strategy timing profiler for dispatcher threshold tuning + +System: 8 logical cores, NEON SIMD, 0 KB L3 cache + +Batch sizes: 8 16 32 64 128 256 512 1000 2000 5000 10000 20000 50000 100000 250000 500000 + + +--- Uniform Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Gaussian Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Exponential Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... 
βœ“ + Profiling batch size 500000... βœ“ + + +--- Discrete Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Poisson Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Gamma Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- StudentT Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... 
βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Beta Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- ChiSquared Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... 
βœ“ + + +========================= + Best Strategy Summary +========================= + +Distribution Operation Size Best Strategy Time (ΞΌs) +---------------------------------------------------------------- +Beta CDF 8 Vectorized 0.38 +Beta CDF 16 Vectorized 0.75 +Beta CDF 32 Vectorized 1.42 +Beta CDF 64 Vectorized 2.62 +Beta CDF 128 Vectorized 5.62 +Beta CDF 256 Vectorized 11.96 +Beta CDF 512 Vectorized 22.67 +Beta CDF 1000 Vectorized 44.83 +Beta CDF 2000 Vectorized 92.50 +Beta CDF 5000 Vectorized 229.54 +Beta CDF 10000 Vectorized 458.04 +Beta CDF 20000 Vectorized 912.08 +Beta CDF 50000 Vectorized 2372.00 +Beta CDF 100000 Vectorized 5903.67 +Beta CDF 250000 Vectorized 11486.54 +Beta CDF 500000 Vectorized 22979.42 +Beta LogPDF 8 Parallel 0.12 +Beta LogPDF 16 Parallel 0.21 +Beta LogPDF 32 Parallel 0.33 +Beta LogPDF 64 Work-Stealing 0.62 +Beta LogPDF 128 Parallel 1.08 +Beta LogPDF 256 Work-Stealing 1.96 +Beta LogPDF 512 Work-Stealing 4.29 +Beta LogPDF 1000 Work-Stealing 8.75 +Beta LogPDF 2000 Work-Stealing 20.58 +Beta LogPDF 5000 Work-Stealing 54.42 +Beta LogPDF 10000 Vectorized 204.08 +Beta LogPDF 20000 Vectorized 326.17 +Beta LogPDF 50000 Vectorized 832.62 +Beta LogPDF 100000 Vectorized 1660.29 +Beta LogPDF 250000 Vectorized 4218.21 +Beta LogPDF 500000 Vectorized 8464.75 +Beta PDF 8 Parallel 0.17 +Beta PDF 16 Parallel 0.25 +Beta PDF 32 Parallel 0.46 +Beta PDF 64 Parallel 0.88 +Beta PDF 128 Parallel 1.50 +Beta PDF 256 Work-Stealing 2.83 +Beta PDF 512 Work-Stealing 5.96 +Beta PDF 1000 Work-Stealing 12.25 +Beta PDF 2000 Parallel 28.71 +Beta PDF 5000 Work-Stealing 86.42 +Beta PDF 10000 Vectorized 226.62 +Beta PDF 20000 Vectorized 462.50 +Beta PDF 50000 Vectorized 1192.54 +Beta PDF 100000 Vectorized 2425.79 +Beta PDF 250000 Vectorized 5993.75 +Beta PDF 500000 Scalar 15195.79 +ChiSquared CDF 8 Vectorized 0.21 +ChiSquared CDF 16 Work-Stealing 0.42 +ChiSquared CDF 32 Parallel 0.75 +ChiSquared CDF 64 Work-Stealing 1.42 +ChiSquared CDF 128 Vectorized 3.25 +ChiSquared CDF 
256 Vectorized 6.12 +ChiSquared CDF 512 Vectorized 14.21 +ChiSquared CDF 1000 Vectorized 32.50 +ChiSquared CDF 2000 Parallel 63.62 +ChiSquared CDF 5000 Parallel 105.08 +ChiSquared CDF 10000 Parallel 193.67 +ChiSquared CDF 20000 Parallel 287.75 +ChiSquared CDF 50000 Parallel 534.50 +ChiSquared CDF 100000 Parallel 1124.42 +ChiSquared CDF 250000 Parallel 2544.42 +ChiSquared CDF 500000 Parallel 5311.21 +ChiSquared LogPDF 8 Parallel 0.04 +ChiSquared LogPDF 16 Work-Stealing 0.08 +ChiSquared LogPDF 32 Parallel 0.17 +ChiSquared LogPDF 64 Work-Stealing 0.21 +ChiSquared LogPDF 128 Work-Stealing 0.46 +ChiSquared LogPDF 256 Work-Stealing 0.88 +ChiSquared LogPDF 512 Work-Stealing 1.71 +ChiSquared LogPDF 1000 Work-Stealing 3.33 +ChiSquared LogPDF 2000 Vectorized 8.79 +ChiSquared LogPDF 5000 Vectorized 23.00 +ChiSquared LogPDF 10000 Vectorized 48.04 +ChiSquared LogPDF 20000 Vectorized 95.54 +ChiSquared LogPDF 50000 Work-Stealing 179.96 +ChiSquared LogPDF 100000 Parallel 157.04 +ChiSquared LogPDF 250000 Parallel 303.50 +ChiSquared LogPDF 500000 Parallel 536.33 +ChiSquared PDF 8 Parallel 0.08 +ChiSquared PDF 16 Parallel 0.17 +ChiSquared PDF 32 Work-Stealing 0.25 +ChiSquared PDF 64 Parallel 0.50 +ChiSquared PDF 128 Parallel 0.96 +ChiSquared PDF 256 Parallel 1.88 +ChiSquared PDF 512 Work-Stealing 3.62 +ChiSquared PDF 1000 Work-Stealing 7.08 +ChiSquared PDF 2000 Vectorized 14.25 +ChiSquared PDF 5000 Vectorized 35.88 +ChiSquared PDF 10000 Vectorized 75.54 +ChiSquared PDF 20000 Work-Stealing 128.58 +ChiSquared PDF 50000 Parallel 242.96 +ChiSquared PDF 100000 Parallel 266.42 +ChiSquared PDF 250000 Parallel 543.29 +ChiSquared PDF 500000 Parallel 1522.08 +Discrete CDF 8 Vectorized 0.04 +Discrete CDF 16 Vectorized 0.04 +Discrete CDF 32 Vectorized 0.04 +Discrete CDF 64 Vectorized 0.08 +Discrete CDF 128 Vectorized 0.12 +Discrete CDF 256 Vectorized 0.25 +Discrete CDF 512 Parallel 0.58 +Discrete CDF 1000 Vectorized 1.17 +Discrete CDF 2000 Vectorized 2.33 +Discrete CDF 5000 Vectorized 6.38 
+Discrete CDF 10000 Vectorized 13.38 +Discrete CDF 20000 Vectorized 27.12 +Discrete CDF 50000 Vectorized 70.21 +Discrete CDF 100000 Work-Stealing 114.71 +Discrete CDF 250000 Parallel 174.83 +Discrete CDF 500000 Parallel 311.12 +Discrete LogPDF 8 Vectorized 0.04 +Discrete LogPDF 16 Vectorized 0.04 +Discrete LogPDF 32 Vectorized 0.04 +Discrete LogPDF 64 Vectorized 0.08 +Discrete LogPDF 128 Vectorized 0.17 +Discrete LogPDF 256 Vectorized 0.29 +Discrete LogPDF 512 Vectorized 0.54 +Discrete LogPDF 1000 Vectorized 1.04 +Discrete LogPDF 2000 Vectorized 2.12 +Discrete LogPDF 5000 Vectorized 5.12 +Discrete LogPDF 10000 Vectorized 10.12 +Discrete LogPDF 20000 Vectorized 20.12 +Discrete LogPDF 50000 Vectorized 50.25 +Discrete LogPDF 100000 Vectorized 100.50 +Discrete LogPDF 250000 Parallel 145.83 +Discrete LogPDF 500000 Parallel 221.25 +Discrete PDF 8 Vectorized 0.04 +Discrete PDF 16 Vectorized 0.04 +Discrete PDF 32 Vectorized 0.04 +Discrete PDF 64 Vectorized 0.08 +Discrete PDF 128 Parallel 0.17 +Discrete PDF 256 Vectorized 0.29 +Discrete PDF 512 Vectorized 0.54 +Discrete PDF 1000 Vectorized 1.04 +Discrete PDF 2000 Vectorized 2.12 +Discrete PDF 5000 Vectorized 5.12 +Discrete PDF 10000 Vectorized 10.12 +Discrete PDF 20000 Vectorized 20.21 +Discrete PDF 50000 Vectorized 50.25 +Discrete PDF 100000 Vectorized 100.83 +Discrete PDF 250000 Parallel 127.50 +Discrete PDF 500000 Parallel 173.62 +Exponential CDF 8 Parallel 0.04 +Exponential CDF 16 Parallel 0.08 +Exponential CDF 32 Parallel 0.12 +Exponential CDF 64 Parallel 0.21 +Exponential CDF 128 Work-Stealing 0.38 +Exponential CDF 256 Work-Stealing 0.75 +Exponential CDF 512 Work-Stealing 1.42 +Exponential CDF 1000 Work-Stealing 2.79 +Exponential CDF 2000 Vectorized 7.21 +Exponential CDF 5000 Vectorized 17.75 +Exponential CDF 10000 Vectorized 35.50 +Exponential CDF 20000 Work-Stealing 74.58 +Exponential CDF 50000 Work-Stealing 76.33 +Exponential CDF 100000 Parallel 144.25 +Exponential CDF 250000 Parallel 242.46 +Exponential CDF 500000 
Parallel 483.71 +Exponential LogPDF 8 Work-Stealing 0.04 +Exponential LogPDF 16 Work-Stealing 0.04 +Exponential LogPDF 32 Work-Stealing 0.04 +Exponential LogPDF 64 Work-Stealing 0.04 +Exponential LogPDF 128 Work-Stealing 0.04 +Exponential LogPDF 256 Work-Stealing 0.08 +Exponential LogPDF 512 Work-Stealing 0.08 +Exponential LogPDF 1000 Work-Stealing 0.17 +Exponential LogPDF 2000 Vectorized 1.50 +Exponential LogPDF 5000 Vectorized 3.50 +Exponential LogPDF 10000 Vectorized 7.29 +Exponential LogPDF 20000 Vectorized 13.54 +Exponential LogPDF 50000 Vectorized 33.96 +Exponential LogPDF 100000 Parallel 46.62 +Exponential LogPDF 250000 Work-Stealing 101.62 +Exponential LogPDF 500000 Parallel 101.25 +Exponential PDF 8 Parallel 0.04 +Exponential PDF 16 Parallel 0.08 +Exponential PDF 32 Parallel 0.12 +Exponential PDF 64 Work-Stealing 0.21 +Exponential PDF 128 Work-Stealing 0.42 +Exponential PDF 256 Work-Stealing 0.71 +Exponential PDF 512 Work-Stealing 1.42 +Exponential PDF 1000 Work-Stealing 2.79 +Exponential PDF 2000 Vectorized 6.83 +Exponential PDF 5000 Vectorized 16.88 +Exponential PDF 10000 Vectorized 33.92 +Exponential PDF 20000 Vectorized 72.58 +Exponential PDF 50000 Work-Stealing 118.58 +Exponential PDF 100000 Parallel 135.75 +Exponential PDF 250000 Work-Stealing 248.46 +Exponential PDF 500000 Parallel 441.29 +Gamma CDF 8 Work-Stealing 0.21 +Gamma CDF 16 Parallel 0.33 +Gamma CDF 32 Parallel 0.67 +Gamma CDF 64 Vectorized 1.54 +Gamma CDF 128 Work-Stealing 2.79 +Gamma CDF 256 Vectorized 6.50 +Gamma CDF 512 Work-Stealing 14.67 +Gamma CDF 1000 Vectorized 30.92 +Gamma CDF 2000 Parallel 65.17 +Gamma CDF 5000 Parallel 93.62 +Gamma CDF 10000 Parallel 151.62 +Gamma CDF 20000 Parallel 236.38 +Gamma CDF 50000 Parallel 497.12 +Gamma CDF 100000 Parallel 1035.42 +Gamma CDF 250000 Work-Stealing 2327.29 +Gamma CDF 500000 Work-Stealing 4233.33 +Gamma LogPDF 8 Work-Stealing 0.04 +Gamma LogPDF 16 Parallel 0.08 +Gamma LogPDF 32 Work-Stealing 0.12 +Gamma LogPDF 64 Work-Stealing 0.21 +Gamma 
LogPDF 128 Work-Stealing 0.46 +Gamma LogPDF 256 Work-Stealing 0.83 +Gamma LogPDF 512 Work-Stealing 1.71 +Gamma LogPDF 1000 Work-Stealing 3.38 +Gamma LogPDF 2000 Vectorized 8.67 +Gamma LogPDF 5000 Vectorized 22.38 +Gamma LogPDF 10000 Vectorized 50.83 +Gamma LogPDF 20000 Work-Stealing 95.92 +Gamma LogPDF 50000 Parallel 133.21 +Gamma LogPDF 100000 Parallel 164.38 +Gamma LogPDF 250000 Parallel 301.79 +Gamma LogPDF 500000 Parallel 586.21 +Gamma PDF 8 Parallel 0.08 +Gamma PDF 16 Work-Stealing 0.17 +Gamma PDF 32 Work-Stealing 0.25 +Gamma PDF 64 Parallel 0.50 +Gamma PDF 128 Vectorized 0.96 +Gamma PDF 256 Vectorized 1.83 +Gamma PDF 512 Vectorized 3.58 +Gamma PDF 1000 Vectorized 7.04 +Gamma PDF 2000 Vectorized 13.75 +Gamma PDF 5000 Vectorized 35.83 +Gamma PDF 10000 Vectorized 77.08 +Gamma PDF 20000 Parallel 140.54 +Gamma PDF 50000 Parallel 161.62 +Gamma PDF 100000 Parallel 236.50 +Gamma PDF 250000 Parallel 542.25 +Gamma PDF 500000 Parallel 1034.17 +Gaussian CDF 8 Parallel 0.12 +Gaussian CDF 16 Parallel 0.21 +Gaussian CDF 32 Parallel 0.42 +Gaussian CDF 64 Parallel 0.75 +Gaussian CDF 128 Vectorized 1.46 +Gaussian CDF 256 Vectorized 2.83 +Gaussian CDF 512 Vectorized 5.54 +Gaussian CDF 1000 Work-Stealing 10.71 +Gaussian CDF 2000 Vectorized 21.29 +Gaussian CDF 5000 Vectorized 52.96 +Gaussian CDF 10000 Work-Stealing 67.58 +Gaussian CDF 20000 Work-Stealing 96.46 +Gaussian CDF 50000 Work-Stealing 188.62 +Gaussian CDF 100000 Work-Stealing 342.33 +Gaussian CDF 250000 Work-Stealing 743.04 +Gaussian CDF 500000 Work-Stealing 1259.88 +Gaussian LogPDF 8 Work-Stealing 0.04 +Gaussian LogPDF 16 Parallel 0.04 +Gaussian LogPDF 32 Parallel 0.04 +Gaussian LogPDF 64 Parallel 0.04 +Gaussian LogPDF 128 Parallel 0.04 +Gaussian LogPDF 256 Work-Stealing 0.04 +Gaussian LogPDF 512 Parallel 0.08 +Gaussian LogPDF 1000 Parallel 0.17 +Gaussian LogPDF 2000 Vectorized 1.08 +Gaussian LogPDF 5000 Vectorized 2.67 +Gaussian LogPDF 10000 Vectorized 6.21 +Gaussian LogPDF 20000 Vectorized 11.58 +Gaussian LogPDF 50000 
Vectorized 27.50 +Gaussian LogPDF 100000 Work-Stealing 53.21 +Gaussian LogPDF 250000 Work-Stealing 101.96 +Gaussian LogPDF 500000 Work-Stealing 158.58 +Gaussian PDF 8 Parallel 0.08 +Gaussian PDF 16 Parallel 0.08 +Gaussian PDF 32 Parallel 0.12 +Gaussian PDF 64 Parallel 0.21 +Gaussian PDF 128 Parallel 0.38 +Gaussian PDF 256 Work-Stealing 0.71 +Gaussian PDF 512 Work-Stealing 1.38 +Gaussian PDF 1000 Work-Stealing 2.67 +Gaussian PDF 2000 Vectorized 6.50 +Gaussian PDF 5000 Vectorized 16.12 +Gaussian PDF 10000 Vectorized 33.12 +Gaussian PDF 20000 Vectorized 69.33 +Gaussian PDF 50000 Work-Stealing 90.75 +Gaussian PDF 100000 Parallel 129.83 +Gaussian PDF 250000 Parallel 229.38 +Gaussian PDF 500000 Work-Stealing 368.75 +Poisson CDF 8 Scalar 0.21 +Poisson CDF 16 Scalar 0.50 +Poisson CDF 32 Scalar 1.00 +Poisson CDF 64 Scalar 2.38 +Poisson CDF 128 Vectorized 4.46 +Poisson CDF 256 Vectorized 9.33 +Poisson CDF 512 Work-Stealing 19.58 +Poisson CDF 1000 Work-Stealing 38.58 +Poisson CDF 2000 Parallel 73.46 +Poisson CDF 5000 Parallel 107.79 +Poisson CDF 10000 Parallel 158.75 +Poisson CDF 20000 Parallel 255.67 +Poisson CDF 50000 Work-Stealing 615.29 +Poisson CDF 100000 Parallel 1184.88 +Poisson CDF 250000 Parallel 2731.83 +Poisson CDF 500000 Work-Stealing 4841.25 +Poisson LogPDF 8 Work-Stealing 0.04 +Poisson LogPDF 16 Vectorized 0.08 +Poisson LogPDF 32 Vectorized 0.12 +Poisson LogPDF 64 Vectorized 0.29 +Poisson LogPDF 128 Work-Stealing 0.46 +Poisson LogPDF 256 Work-Stealing 0.96 +Poisson LogPDF 512 Vectorized 1.88 +Poisson LogPDF 1000 Vectorized 3.50 +Poisson LogPDF 2000 Vectorized 7.50 +Poisson LogPDF 5000 Vectorized 20.96 +Poisson LogPDF 10000 Vectorized 44.29 +Poisson LogPDF 20000 Vectorized 94.46 +Poisson LogPDF 50000 Parallel 198.46 +Poisson LogPDF 100000 Parallel 189.67 +Poisson LogPDF 250000 Work-Stealing 448.25 +Poisson LogPDF 500000 Work-Stealing 772.29 +Poisson PDF 8 Vectorized 0.12 +Poisson PDF 16 Vectorized 0.21 +Poisson PDF 32 Vectorized 0.29 +Poisson PDF 64 Vectorized 
0.62 +Poisson PDF 128 Vectorized 1.17 +Poisson PDF 256 Vectorized 2.42 +Poisson PDF 512 Vectorized 4.75 +Poisson PDF 1000 Vectorized 9.17 +Poisson PDF 2000 Vectorized 18.33 +Poisson PDF 5000 Vectorized 45.46 +Poisson PDF 10000 Vectorized 90.96 +Poisson PDF 20000 Work-Stealing 132.96 +Poisson PDF 50000 Parallel 193.79 +Poisson PDF 100000 Parallel 293.92 +Poisson PDF 250000 Parallel 685.83 +Poisson PDF 500000 Work-Stealing 1274.42 +StudentT CDF 8 Parallel 0.67 +StudentT CDF 16 Vectorized 1.12 +StudentT CDF 32 Parallel 2.67 +StudentT CDF 64 Vectorized 5.33 +StudentT CDF 128 Vectorized 10.62 +StudentT CDF 256 Vectorized 22.29 +StudentT CDF 512 Vectorized 43.75 +StudentT CDF 1000 Vectorized 88.12 +StudentT CDF 2000 Parallel 176.46 +StudentT CDF 5000 Work-Stealing 447.54 +StudentT CDF 10000 Vectorized 889.96 +StudentT CDF 20000 Work-Stealing 1781.25 +StudentT CDF 50000 Parallel 4434.42 +StudentT CDF 100000 Vectorized 8873.58 +StudentT CDF 250000 Work-Stealing 22232.29 +StudentT CDF 500000 Parallel 44347.79 +StudentT LogPDF 8 Parallel 0.12 +StudentT LogPDF 16 Work-Stealing 0.12 +StudentT LogPDF 32 Parallel 0.17 +StudentT LogPDF 64 Work-Stealing 0.29 +StudentT LogPDF 128 Parallel 0.50 +StudentT LogPDF 256 Work-Stealing 0.92 +StudentT LogPDF 512 Work-Stealing 1.83 +StudentT LogPDF 1000 Work-Stealing 3.54 +StudentT LogPDF 2000 Parallel 7.54 +StudentT LogPDF 5000 Work-Stealing 21.12 +StudentT LogPDF 10000 Vectorized 51.62 +StudentT LogPDF 20000 Vectorized 101.58 +StudentT LogPDF 50000 Parallel 121.96 +StudentT LogPDF 100000 Work-Stealing 162.33 +StudentT LogPDF 250000 Work-Stealing 353.08 +StudentT LogPDF 500000 Parallel 680.62 +StudentT PDF 8 Vectorized 0.17 +StudentT PDF 16 Parallel 0.21 +StudentT PDF 32 Work-Stealing 0.29 +StudentT PDF 64 Vectorized 0.54 +StudentT PDF 128 Vectorized 0.96 +StudentT PDF 256 Vectorized 1.92 +StudentT PDF 512 Vectorized 3.79 +StudentT PDF 1000 Vectorized 7.17 +StudentT PDF 2000 Vectorized 14.21 +StudentT PDF 5000 Vectorized 36.38 +StudentT PDF 
10000 Vectorized 76.42 +StudentT PDF 20000 Work-Stealing 104.04 +StudentT PDF 50000 Work-Stealing 144.88 +StudentT PDF 100000 Parallel 219.00 +StudentT PDF 250000 Parallel 482.04 +StudentT PDF 500000 Work-Stealing 949.04 +Uniform CDF 8 Parallel 0.04 +Uniform CDF 16 Parallel 0.04 +Uniform CDF 32 Parallel 0.04 +Uniform CDF 64 Work-Stealing 0.04 +Uniform CDF 128 Work-Stealing 0.04 +Uniform CDF 256 Work-Stealing 0.08 +Uniform CDF 512 Work-Stealing 0.12 +Uniform CDF 1000 Work-Stealing 0.25 +Uniform CDF 2000 Vectorized 2.21 +Uniform CDF 5000 Vectorized 5.42 +Uniform CDF 10000 Vectorized 19.54 +Uniform CDF 20000 Work-Stealing 49.00 +Uniform CDF 50000 Work-Stealing 73.29 +Uniform CDF 100000 Work-Stealing 120.33 +Uniform CDF 250000 Parallel 253.42 +Uniform CDF 500000 Work-Stealing 453.17 +Uniform LogPDF 8 Vectorized 0.04 +Uniform LogPDF 16 Vectorized 0.04 +Uniform LogPDF 32 Vectorized 0.04 +Uniform LogPDF 64 Vectorized 0.04 +Uniform LogPDF 128 Vectorized 0.04 +Uniform LogPDF 256 Vectorized 0.08 +Uniform LogPDF 512 Vectorized 0.12 +Uniform LogPDF 1000 Vectorized 0.21 +Uniform LogPDF 2000 Vectorized 0.38 +Uniform LogPDF 5000 Vectorized 0.88 +Uniform LogPDF 10000 Vectorized 1.71 +Uniform LogPDF 20000 Vectorized 3.46 +Uniform LogPDF 50000 Vectorized 8.58 +Uniform LogPDF 100000 Vectorized 17.12 +Uniform LogPDF 250000 Vectorized 43.54 +Uniform LogPDF 500000 Vectorized 89.38 +Uniform PDF 8 Vectorized 0.04 +Uniform PDF 16 Vectorized 0.04 +Uniform PDF 32 Vectorized 0.04 +Uniform PDF 64 Vectorized 0.04 +Uniform PDF 128 Vectorized 0.04 +Uniform PDF 256 Vectorized 0.08 +Uniform PDF 512 Vectorized 0.12 +Uniform PDF 1000 Vectorized 0.21 +Uniform PDF 2000 Vectorized 0.38 +Uniform PDF 5000 Vectorized 0.88 +Uniform PDF 10000 Vectorized 1.71 +Uniform PDF 20000 Vectorized 3.71 +Uniform PDF 50000 Vectorized 9.25 +Uniform PDF 100000 Vectorized 17.92 +Uniform PDF 250000 Vectorized 44.88 +Uniform PDF 500000 Vectorized 89.75 + + +===================== + Crossover Summary +===================== + 
+Distribution Operation Sβ†’V Vβ†’P Pβ†’Work-Steal +-------------------------------------------------------------------------- +Beta CDF 8 never 128 +Beta LogPDF 16 8 64 +Beta PDF 16 8 256 +ChiSquared CDF 8 32 16 +ChiSquared LogPDF 8 8 16 +ChiSquared PDF 8 8 32 +Discrete CDF 8 512 64 +Discrete LogPDF 8 250000 32 +Discrete PDF 8 128 16 +Exponential CDF 8 8 128 +Exponential LogPDF 8 8 8 +Exponential PDF 8 8 64 +Gamma CDF 8 8 8 +Gamma LogPDF 8 8 8 +Gamma PDF 8 8 16 +Gaussian CDF 8 8 128 +Gaussian LogPDF 8 8 8 +Gaussian PDF 8 8 256 +Poisson CDF 128 2000 32 +Poisson LogPDF 8 50000 8 +Poisson PDF 8 50000 64 +StudentT CDF 8 8 64 +StudentT LogPDF 16 8 16 +StudentT PDF 8 16 32 +Uniform CDF 8 8 64 +Uniform LogPDF 8 never 32 +Uniform PDF 8 never 128 + +Results saved to /Users/wolfman/Development/libstats/build/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/strategy_profile_results.csv diff --git a/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/logs/system_inspector_performance.txt b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/logs/system_inspector_performance.txt new file mode 100644 index 0000000..7f24fc5 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/logs/system_inspector_performance.txt @@ -0,0 +1,103 @@ + +======================================= + System Inspector - Performance Mode +======================================= + +System capabilities analysis with performance measurements + +System: 8 logical cores, NEON SIMD, 0 KB L3 cache + + +--- CPU Features --- +Feature Support Description +------------------------------------------------------------ +AVX-512 No Foundation instructions +AVX2 No Advanced Vector Ext 2 +AVX No Advanced Vector Ext +SSE2 No Streaming SIMD Ext 2 +NEON Yes ARM SIMD instructions +FMA No Fused Multiply-Add 
+ + +--- Cache Information --- +Cache Level Size (KB) Line Size +------------------------------------------ +L1 64 64 bytes +L2 4096 64 bytes +L3 0 64 bytes + + +--- CPU Topology --- +Hardware Threads: 8 +Logical Cores: 8 +Physical Cores: 4 +Hyperthreading: Enabled + + +--- SIMD Capabilities --- +Instruction Support Vector Width Description +-------------------------------------------------------------- +SSE2 No 128-bit Basic SIMD operations +AVX No 256-bit Advanced vector ext +AVX2 No 256-bit Integer AVX operations +AVX-512 No 512-bit Foundation instructions +NEON Yes 128-bit ARM SIMD instructions + +Active SIMD Level: NEON + + +--- Performance Baselines --- +Operation Type Time (ΞΌs) Throughput (MOps/s) +------------------------------------------------------------ +SIMD Multiply 214 4657 +Scalar Multiply 209 4764 + +SIMD Speedup: 0.98x + + +--- Performance Dispatcher Configuration --- +Example Strategy Selections: +Batch Size Distribution Complexity Strategy +---------------------------------------------------------------------- +100 Uniform Simple Vectorized +100 Gaussian Simple Vectorized +100 Exponential Simple Vectorized +100 Poisson Simple Vectorized +100 Discrete Simple Vectorized +1000 Uniform Simple Vectorized +1000 Gaussian Simple Parallel +1000 Exponential Simple Vectorized +1000 Poisson Simple Parallel +1000 Discrete Simple Vectorized +10000 Uniform Simple Parallel +10000 Gaussian Simple Parallel +10000 Exponential Simple Parallel +10000 Poisson Simple Parallel +10000 Discrete Simple Parallel +100000 Uniform Simple Parallel +100000 Gaussian Simple Parallel +100000 Exponential Simple Parallel +100000 Poisson Simple Work-Stealing +100000 Discrete Simple Parallel + + +--- Platform Constants --- +Constant Value +-------------------------------------------------- +SIMD Block Size 48 doubles +Memory Alignment 128 bytes +Min SIMD Size 4 elements +Optimal Grain Size 384 elements +Fast Transcendental Support No + + +--- Adaptive Constants --- +Constant Value 
+-------------------------------------------------- +Min Elements for Parallel 1536 +Default Grain Size 256 +Simple Operation Grain Size 128 +Complex Operation Grain Size 512 + +Warning: L2 cache larger than L3 cache - may be normal on some systems +System inspection completed successfully. diff --git a/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/manifest.txt b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/manifest.txt new file mode 100644 index 0000000..174f79f --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/manifest.txt @@ -0,0 +1,14 @@ +Dispatcher profile bundle +========================= + +Run ID: 2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918 +Captured at (UTC): 2026-04-12T05-36-21Z + +Files: +- metadata.json +- summary.json +- crossovers.csv +- best_strategies.csv +- strategy_profile_results.csv +- logs/system_inspector_performance.txt +- logs/strategy_profile.txt diff --git a/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/metadata.json b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/metadata.json new file mode 100644 index 0000000..d871953 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/metadata.json @@ -0,0 +1,15 @@ +{ + "captured_at_utc": "2026-04-12T05-36-21Z", + "run_id": "2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918", + "git_branch": "investigate-gaussian-avx512-perf", + "git_sha": "6aef918", + "project_root": "/Users/wolfman/Development/libstats", + "build_dir": "/Users/wolfman/Development/libstats/build", + "build_type": "Release", + "cxx_compiler": "", + "os": "darwin", + "arch": "arm64", + "cpu_brand": 
"Apple M1", + "physical_cores": "8", + "logical_cores": "8" +} diff --git a/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/strategy_profile_results.csv b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/strategy_profile_results.csv new file mode 100644 index 0000000..76b4469 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/strategy_profile_results.csv @@ -0,0 +1,1729 @@ +Distribution,Operation,BatchSize,Strategy,MedianTime_us +Uniform,PDF,8,SCALAR,0.167000 +Uniform,PDF,8,VECTORIZED,0.041000 +Uniform,PDF,8,PARALLEL,0.041000 +Uniform,PDF,8,WORK_STEALING,0.042000 +Uniform,LogPDF,8,SCALAR,0.167000 +Uniform,LogPDF,8,VECTORIZED,0.042000 +Uniform,LogPDF,8,PARALLEL,0.042000 +Uniform,LogPDF,8,WORK_STEALING,0.042000 +Uniform,CDF,8,SCALAR,0.167000 +Uniform,CDF,8,VECTORIZED,0.042000 +Uniform,CDF,8,PARALLEL,0.041000 +Uniform,CDF,8,WORK_STEALING,0.042000 +Uniform,PDF,16,SCALAR,0.333000 +Uniform,PDF,16,VECTORIZED,0.042000 +Uniform,PDF,16,PARALLEL,0.042000 +Uniform,PDF,16,WORK_STEALING,0.042000 +Uniform,LogPDF,16,SCALAR,0.333000 +Uniform,LogPDF,16,VECTORIZED,0.042000 +Uniform,LogPDF,16,PARALLEL,0.042000 +Uniform,LogPDF,16,WORK_STEALING,0.042000 +Uniform,CDF,16,SCALAR,0.333000 +Uniform,CDF,16,VECTORIZED,0.083000 +Uniform,CDF,16,PARALLEL,0.042000 +Uniform,CDF,16,WORK_STEALING,0.042000 +Uniform,PDF,32,SCALAR,0.625000 +Uniform,PDF,32,VECTORIZED,0.041000 +Uniform,PDF,32,PARALLEL,0.042000 +Uniform,PDF,32,WORK_STEALING,0.042000 +Uniform,LogPDF,32,SCALAR,0.625000 +Uniform,LogPDF,32,VECTORIZED,0.041000 +Uniform,LogPDF,32,PARALLEL,0.042000 +Uniform,LogPDF,32,WORK_STEALING,0.041000 +Uniform,CDF,32,SCALAR,0.625000 +Uniform,CDF,32,VECTORIZED,0.083000 +Uniform,CDF,32,PARALLEL,0.042000 +Uniform,CDF,32,WORK_STEALING,0.042000 +Uniform,PDF,64,SCALAR,1.209000 +Uniform,PDF,64,VECTORIZED,0.041000 
+Uniform,PDF,64,PARALLEL,0.042000 +Uniform,PDF,64,WORK_STEALING,0.042000 +Uniform,LogPDF,64,SCALAR,1.208000 +Uniform,LogPDF,64,VECTORIZED,0.042000 +Uniform,LogPDF,64,PARALLEL,0.083000 +Uniform,LogPDF,64,WORK_STEALING,0.042000 +Uniform,CDF,64,SCALAR,1.208000 +Uniform,CDF,64,VECTORIZED,0.125000 +Uniform,CDF,64,PARALLEL,0.083000 +Uniform,CDF,64,WORK_STEALING,0.042000 +Uniform,PDF,128,SCALAR,2.500000 +Uniform,PDF,128,VECTORIZED,0.042000 +Uniform,PDF,128,PARALLEL,0.125000 +Uniform,PDF,128,WORK_STEALING,0.042000 +Uniform,LogPDF,128,SCALAR,2.417000 +Uniform,LogPDF,128,VECTORIZED,0.042000 +Uniform,LogPDF,128,PARALLEL,0.125000 +Uniform,LogPDF,128,WORK_STEALING,0.042000 +Uniform,CDF,128,SCALAR,2.459000 +Uniform,CDF,128,VECTORIZED,0.167000 +Uniform,CDF,128,PARALLEL,0.125000 +Uniform,CDF,128,WORK_STEALING,0.042000 +Uniform,PDF,256,SCALAR,5.042000 +Uniform,PDF,256,VECTORIZED,0.083000 +Uniform,PDF,256,PARALLEL,0.208000 +Uniform,PDF,256,WORK_STEALING,0.083000 +Uniform,LogPDF,256,SCALAR,4.917000 +Uniform,LogPDF,256,VECTORIZED,0.083000 +Uniform,LogPDF,256,PARALLEL,0.209000 +Uniform,LogPDF,256,WORK_STEALING,0.083000 +Uniform,CDF,256,SCALAR,4.917000 +Uniform,CDF,256,VECTORIZED,0.333000 +Uniform,CDF,256,PARALLEL,0.292000 +Uniform,CDF,256,WORK_STEALING,0.083000 +Uniform,PDF,512,SCALAR,9.958000 +Uniform,PDF,512,VECTORIZED,0.125000 +Uniform,PDF,512,PARALLEL,0.375000 +Uniform,PDF,512,WORK_STEALING,0.125000 +Uniform,LogPDF,512,SCALAR,9.959000 +Uniform,LogPDF,512,VECTORIZED,0.125000 +Uniform,LogPDF,512,PARALLEL,0.375000 +Uniform,LogPDF,512,WORK_STEALING,0.125000 +Uniform,CDF,512,SCALAR,9.958000 +Uniform,CDF,512,VECTORIZED,0.583000 +Uniform,CDF,512,PARALLEL,0.541000 +Uniform,CDF,512,WORK_STEALING,0.125000 +Uniform,PDF,1000,SCALAR,19.583000 +Uniform,PDF,1000,VECTORIZED,0.208000 +Uniform,PDF,1000,PARALLEL,0.750000 +Uniform,PDF,1000,WORK_STEALING,0.250000 +Uniform,LogPDF,1000,SCALAR,19.583000 +Uniform,LogPDF,1000,VECTORIZED,0.208000 +Uniform,LogPDF,1000,PARALLEL,0.791000 
+Uniform,LogPDF,1000,WORK_STEALING,0.208000 +Uniform,CDF,1000,SCALAR,19.375000 +Uniform,CDF,1000,VECTORIZED,1.125000 +Uniform,CDF,1000,PARALLEL,2.375000 +Uniform,CDF,1000,WORK_STEALING,0.250000 +Uniform,PDF,2000,SCALAR,39.458000 +Uniform,PDF,2000,VECTORIZED,0.375000 +Uniform,PDF,2000,PARALLEL,60.458000 +Uniform,PDF,2000,WORK_STEALING,10.541000 +Uniform,LogPDF,2000,SCALAR,43.833000 +Uniform,LogPDF,2000,VECTORIZED,0.375000 +Uniform,LogPDF,2000,PARALLEL,37.333000 +Uniform,LogPDF,2000,WORK_STEALING,17.791000 +Uniform,CDF,2000,SCALAR,38.459000 +Uniform,CDF,2000,VECTORIZED,2.208000 +Uniform,CDF,2000,PARALLEL,48.166000 +Uniform,CDF,2000,WORK_STEALING,11.458000 +Uniform,PDF,5000,SCALAR,96.792000 +Uniform,PDF,5000,VECTORIZED,0.875000 +Uniform,PDF,5000,PARALLEL,75.792000 +Uniform,PDF,5000,WORK_STEALING,21.291000 +Uniform,LogPDF,5000,SCALAR,96.667000 +Uniform,LogPDF,5000,VECTORIZED,0.875000 +Uniform,LogPDF,5000,PARALLEL,118.875000 +Uniform,LogPDF,5000,WORK_STEALING,23.583000 +Uniform,CDF,5000,SCALAR,97.125000 +Uniform,CDF,5000,VECTORIZED,5.417000 +Uniform,CDF,5000,PARALLEL,89.084000 +Uniform,CDF,5000,WORK_STEALING,28.250000 +Uniform,PDF,10000,SCALAR,197.875000 +Uniform,PDF,10000,VECTORIZED,1.709000 +Uniform,PDF,10000,PARALLEL,180.541000 +Uniform,PDF,10000,WORK_STEALING,32.250000 +Uniform,LogPDF,10000,SCALAR,201.167000 +Uniform,LogPDF,10000,VECTORIZED,1.708000 +Uniform,LogPDF,10000,PARALLEL,163.791000 +Uniform,LogPDF,10000,WORK_STEALING,27.292000 +Uniform,CDF,10000,SCALAR,194.583000 +Uniform,CDF,10000,VECTORIZED,19.541000 +Uniform,CDF,10000,PARALLEL,139.917000 +Uniform,CDF,10000,WORK_STEALING,27.000000 +Uniform,PDF,20000,SCALAR,393.166000 +Uniform,PDF,20000,VECTORIZED,3.708000 +Uniform,PDF,20000,PARALLEL,147.833000 +Uniform,PDF,20000,WORK_STEALING,42.292000 +Uniform,LogPDF,20000,SCALAR,390.917000 +Uniform,LogPDF,20000,VECTORIZED,3.458000 +Uniform,LogPDF,20000,PARALLEL,57.875000 +Uniform,LogPDF,20000,WORK_STEALING,42.250000 +Uniform,CDF,20000,SCALAR,391.875000 
+Uniform,CDF,20000,VECTORIZED,74.208000 +Uniform,CDF,20000,PARALLEL,94.875000 +Uniform,CDF,20000,WORK_STEALING,49.000000 +Uniform,PDF,50000,SCALAR,985.292000 +Uniform,PDF,50000,VECTORIZED,9.250000 +Uniform,PDF,50000,PARALLEL,102.166000 +Uniform,PDF,50000,WORK_STEALING,76.208000 +Uniform,LogPDF,50000,SCALAR,979.334000 +Uniform,LogPDF,50000,VECTORIZED,8.584000 +Uniform,LogPDF,50000,PARALLEL,109.958000 +Uniform,LogPDF,50000,WORK_STEALING,80.625000 +Uniform,CDF,50000,SCALAR,981.250000 +Uniform,CDF,50000,VECTORIZED,227.792000 +Uniform,CDF,50000,PARALLEL,116.417000 +Uniform,CDF,50000,WORK_STEALING,73.292000 +Uniform,PDF,100000,SCALAR,1962.458000 +Uniform,PDF,100000,VECTORIZED,17.916000 +Uniform,PDF,100000,PARALLEL,135.750000 +Uniform,PDF,100000,WORK_STEALING,143.166000 +Uniform,LogPDF,100000,SCALAR,1957.250000 +Uniform,LogPDF,100000,VECTORIZED,17.125000 +Uniform,LogPDF,100000,PARALLEL,157.875000 +Uniform,LogPDF,100000,WORK_STEALING,131.500000 +Uniform,CDF,100000,SCALAR,1972.792000 +Uniform,CDF,100000,VECTORIZED,486.459000 +Uniform,CDF,100000,PARALLEL,150.333000 +Uniform,CDF,100000,WORK_STEALING,120.334000 +Uniform,PDF,250000,SCALAR,4931.583000 +Uniform,PDF,250000,VECTORIZED,44.875000 +Uniform,PDF,250000,PARALLEL,262.500000 +Uniform,PDF,250000,WORK_STEALING,257.125000 +Uniform,LogPDF,250000,SCALAR,4904.166000 +Uniform,LogPDF,250000,VECTORIZED,43.542000 +Uniform,LogPDF,250000,PARALLEL,250.667000 +Uniform,LogPDF,250000,WORK_STEALING,301.084000 +Uniform,CDF,250000,SCALAR,4875.333000 +Uniform,CDF,250000,VECTORIZED,1221.333000 +Uniform,CDF,250000,PARALLEL,253.417000 +Uniform,CDF,250000,WORK_STEALING,267.375000 +Uniform,PDF,500000,SCALAR,9842.333000 +Uniform,PDF,500000,VECTORIZED,89.750000 +Uniform,PDF,500000,PARALLEL,458.625000 +Uniform,PDF,500000,WORK_STEALING,502.833000 +Uniform,LogPDF,500000,SCALAR,9815.458000 +Uniform,LogPDF,500000,VECTORIZED,89.375000 +Uniform,LogPDF,500000,PARALLEL,564.250000 +Uniform,LogPDF,500000,WORK_STEALING,484.292000 
+Uniform,CDF,500000,SCALAR,9690.000000 +Uniform,CDF,500000,VECTORIZED,2533.792000 +Uniform,CDF,500000,PARALLEL,483.000000 +Uniform,CDF,500000,WORK_STEALING,453.166000 +Gaussian,PDF,8,SCALAR,0.167000 +Gaussian,PDF,8,VECTORIZED,0.125000 +Gaussian,PDF,8,PARALLEL,0.083000 +Gaussian,PDF,8,WORK_STEALING,0.083000 +Gaussian,LogPDF,8,SCALAR,0.167000 +Gaussian,LogPDF,8,VECTORIZED,0.083000 +Gaussian,LogPDF,8,PARALLEL,0.042000 +Gaussian,LogPDF,8,WORK_STEALING,0.041000 +Gaussian,CDF,8,SCALAR,0.291000 +Gaussian,CDF,8,VECTORIZED,0.167000 +Gaussian,CDF,8,PARALLEL,0.125000 +Gaussian,CDF,8,WORK_STEALING,0.125000 +Gaussian,PDF,16,SCALAR,0.333000 +Gaussian,PDF,16,VECTORIZED,0.125000 +Gaussian,PDF,16,PARALLEL,0.083000 +Gaussian,PDF,16,WORK_STEALING,0.083000 +Gaussian,LogPDF,16,SCALAR,0.334000 +Gaussian,LogPDF,16,VECTORIZED,0.083000 +Gaussian,LogPDF,16,PARALLEL,0.042000 +Gaussian,LogPDF,16,WORK_STEALING,0.042000 +Gaussian,CDF,16,SCALAR,0.458000 +Gaussian,CDF,16,VECTORIZED,0.250000 +Gaussian,CDF,16,PARALLEL,0.208000 +Gaussian,CDF,16,WORK_STEALING,0.209000 +Gaussian,PDF,32,SCALAR,0.666000 +Gaussian,PDF,32,VECTORIZED,0.167000 +Gaussian,PDF,32,PARALLEL,0.125000 +Gaussian,PDF,32,WORK_STEALING,0.125000 +Gaussian,LogPDF,32,SCALAR,0.666000 +Gaussian,LogPDF,32,VECTORIZED,0.083000 +Gaussian,LogPDF,32,PARALLEL,0.042000 +Gaussian,LogPDF,32,WORK_STEALING,0.042000 +Gaussian,CDF,32,SCALAR,0.875000 +Gaussian,CDF,32,VECTORIZED,0.417000 +Gaussian,CDF,32,PARALLEL,0.416000 +Gaussian,CDF,32,WORK_STEALING,0.416000 +Gaussian,PDF,64,SCALAR,1.208000 +Gaussian,PDF,64,VECTORIZED,0.292000 +Gaussian,PDF,64,PARALLEL,0.208000 +Gaussian,PDF,64,WORK_STEALING,0.208000 +Gaussian,LogPDF,64,SCALAR,1.250000 +Gaussian,LogPDF,64,VECTORIZED,0.083000 +Gaussian,LogPDF,64,PARALLEL,0.042000 +Gaussian,LogPDF,64,WORK_STEALING,0.042000 +Gaussian,CDF,64,SCALAR,1.708000 +Gaussian,CDF,64,VECTORIZED,0.791000 +Gaussian,CDF,64,PARALLEL,0.750000 +Gaussian,CDF,64,WORK_STEALING,0.750000 +Gaussian,PDF,128,SCALAR,2.417000 
+Gaussian,PDF,128,VECTORIZED,0.459000 +Gaussian,PDF,128,PARALLEL,0.375000 +Gaussian,PDF,128,WORK_STEALING,0.375000 +Gaussian,LogPDF,128,SCALAR,2.458000 +Gaussian,LogPDF,128,VECTORIZED,0.125000 +Gaussian,LogPDF,128,PARALLEL,0.042000 +Gaussian,LogPDF,128,WORK_STEALING,0.042000 +Gaussian,CDF,128,SCALAR,3.292000 +Gaussian,CDF,128,VECTORIZED,1.458000 +Gaussian,CDF,128,PARALLEL,1.459000 +Gaussian,CDF,128,WORK_STEALING,1.458000 +Gaussian,PDF,256,SCALAR,4.833000 +Gaussian,PDF,256,VECTORIZED,0.917000 +Gaussian,PDF,256,PARALLEL,0.750000 +Gaussian,PDF,256,WORK_STEALING,0.708000 +Gaussian,LogPDF,256,SCALAR,4.958000 +Gaussian,LogPDF,256,VECTORIZED,0.208000 +Gaussian,LogPDF,256,PARALLEL,0.083000 +Gaussian,LogPDF,256,WORK_STEALING,0.042000 +Gaussian,CDF,256,SCALAR,6.625000 +Gaussian,CDF,256,VECTORIZED,2.833000 +Gaussian,CDF,256,PARALLEL,2.833000 +Gaussian,CDF,256,WORK_STEALING,2.833000 +Gaussian,PDF,512,SCALAR,9.625000 +Gaussian,PDF,512,VECTORIZED,1.750000 +Gaussian,PDF,512,PARALLEL,1.417000 +Gaussian,PDF,512,WORK_STEALING,1.375000 +Gaussian,LogPDF,512,SCALAR,9.916000 +Gaussian,LogPDF,512,VECTORIZED,0.333000 +Gaussian,LogPDF,512,PARALLEL,0.083000 +Gaussian,LogPDF,512,WORK_STEALING,0.083000 +Gaussian,CDF,512,SCALAR,13.167000 +Gaussian,CDF,512,VECTORIZED,5.542000 +Gaussian,CDF,512,PARALLEL,5.583000 +Gaussian,CDF,512,WORK_STEALING,5.542000 +Gaussian,PDF,1000,SCALAR,18.750000 +Gaussian,PDF,1000,VECTORIZED,3.292000 +Gaussian,PDF,1000,PARALLEL,2.750000 +Gaussian,PDF,1000,WORK_STEALING,2.667000 +Gaussian,LogPDF,1000,SCALAR,19.375000 +Gaussian,LogPDF,1000,VECTORIZED,0.584000 +Gaussian,LogPDF,1000,PARALLEL,0.166000 +Gaussian,LogPDF,1000,WORK_STEALING,0.167000 +Gaussian,CDF,1000,SCALAR,25.750000 +Gaussian,CDF,1000,VECTORIZED,10.750000 +Gaussian,CDF,1000,PARALLEL,10.791000 +Gaussian,CDF,1000,WORK_STEALING,10.708000 +Gaussian,PDF,2000,SCALAR,37.209000 +Gaussian,PDF,2000,VECTORIZED,6.500000 +Gaussian,PDF,2000,PARALLEL,47.375000 +Gaussian,PDF,2000,WORK_STEALING,22.125000 
+Gaussian,LogPDF,2000,SCALAR,38.584000 +Gaussian,LogPDF,2000,VECTORIZED,1.083000 +Gaussian,LogPDF,2000,PARALLEL,38.834000 +Gaussian,LogPDF,2000,WORK_STEALING,12.916000 +Gaussian,CDF,2000,SCALAR,51.500000 +Gaussian,CDF,2000,VECTORIZED,21.292000 +Gaussian,CDF,2000,PARALLEL,43.458000 +Gaussian,CDF,2000,WORK_STEALING,43.458000 +Gaussian,PDF,5000,SCALAR,93.541000 +Gaussian,PDF,5000,VECTORIZED,16.125000 +Gaussian,PDF,5000,PARALLEL,85.209000 +Gaussian,PDF,5000,WORK_STEALING,23.583000 +Gaussian,LogPDF,5000,SCALAR,95.125000 +Gaussian,LogPDF,5000,VECTORIZED,2.666000 +Gaussian,LogPDF,5000,PARALLEL,94.459000 +Gaussian,LogPDF,5000,WORK_STEALING,20.083000 +Gaussian,CDF,5000,SCALAR,128.667000 +Gaussian,CDF,5000,VECTORIZED,52.958000 +Gaussian,CDF,5000,PARALLEL,101.459000 +Gaussian,CDF,5000,WORK_STEALING,66.791000 +Gaussian,PDF,10000,SCALAR,186.083000 +Gaussian,PDF,10000,VECTORIZED,33.125000 +Gaussian,PDF,10000,PARALLEL,131.250000 +Gaussian,PDF,10000,WORK_STEALING,39.125000 +Gaussian,LogPDF,10000,SCALAR,192.042000 +Gaussian,LogPDF,10000,VECTORIZED,6.208000 +Gaussian,LogPDF,10000,PARALLEL,164.958000 +Gaussian,LogPDF,10000,WORK_STEALING,29.375000 +Gaussian,CDF,10000,SCALAR,257.458000 +Gaussian,CDF,10000,VECTORIZED,106.916000 +Gaussian,CDF,10000,PARALLEL,132.791000 +Gaussian,CDF,10000,WORK_STEALING,67.584000 +Gaussian,PDF,20000,SCALAR,380.292000 +Gaussian,PDF,20000,VECTORIZED,69.334000 +Gaussian,PDF,20000,PARALLEL,147.917000 +Gaussian,PDF,20000,WORK_STEALING,72.041000 +Gaussian,LogPDF,20000,SCALAR,385.584000 +Gaussian,LogPDF,20000,VECTORIZED,11.583000 +Gaussian,LogPDF,20000,PARALLEL,141.500000 +Gaussian,LogPDF,20000,WORK_STEALING,18.709000 +Gaussian,CDF,20000,SCALAR,516.542000 +Gaussian,CDF,20000,VECTORIZED,222.625000 +Gaussian,CDF,20000,PARALLEL,123.792000 +Gaussian,CDF,20000,WORK_STEALING,96.458000 +Gaussian,PDF,50000,SCALAR,936.000000 +Gaussian,PDF,50000,VECTORIZED,626.542000 +Gaussian,PDF,50000,PARALLEL,158.708000 +Gaussian,PDF,50000,WORK_STEALING,90.750000 
+Gaussian,LogPDF,50000,SCALAR,958.125000 +Gaussian,LogPDF,50000,VECTORIZED,27.500000 +Gaussian,LogPDF,50000,PARALLEL,141.292000 +Gaussian,LogPDF,50000,WORK_STEALING,46.000000 +Gaussian,CDF,50000,SCALAR,1285.500000 +Gaussian,CDF,50000,VECTORIZED,541.791000 +Gaussian,CDF,50000,PARALLEL,198.875000 +Gaussian,CDF,50000,WORK_STEALING,188.625000 +Gaussian,PDF,100000,SCALAR,1861.459000 +Gaussian,PDF,100000,VECTORIZED,323.333000 +Gaussian,PDF,100000,PARALLEL,129.833000 +Gaussian,PDF,100000,WORK_STEALING,151.208000 +Gaussian,LogPDF,100000,SCALAR,1917.875000 +Gaussian,LogPDF,100000,VECTORIZED,55.166000 +Gaussian,LogPDF,100000,PARALLEL,153.125000 +Gaussian,LogPDF,100000,WORK_STEALING,53.208000 +Gaussian,CDF,100000,SCALAR,2574.542000 +Gaussian,CDF,100000,VECTORIZED,1073.292000 +Gaussian,CDF,100000,PARALLEL,370.209000 +Gaussian,CDF,100000,WORK_STEALING,342.333000 +Gaussian,PDF,250000,SCALAR,4671.958000 +Gaussian,PDF,250000,VECTORIZED,814.292000 +Gaussian,PDF,250000,PARALLEL,229.375000 +Gaussian,PDF,250000,WORK_STEALING,251.416000 +Gaussian,LogPDF,250000,SCALAR,4797.375000 +Gaussian,LogPDF,250000,VECTORIZED,143.625000 +Gaussian,LogPDF,250000,PARALLEL,164.666000 +Gaussian,LogPDF,250000,WORK_STEALING,101.959000 +Gaussian,CDF,250000,SCALAR,6439.125000 +Gaussian,CDF,250000,VECTORIZED,2670.000000 +Gaussian,CDF,250000,PARALLEL,859.000000 +Gaussian,CDF,250000,WORK_STEALING,743.042000 +Gaussian,PDF,500000,SCALAR,10121.959000 +Gaussian,PDF,500000,VECTORIZED,1698.459000 +Gaussian,PDF,500000,PARALLEL,427.000000 +Gaussian,PDF,500000,WORK_STEALING,368.750000 +Gaussian,LogPDF,500000,SCALAR,9544.500000 +Gaussian,LogPDF,500000,VECTORIZED,342.250000 +Gaussian,LogPDF,500000,PARALLEL,186.833000 +Gaussian,LogPDF,500000,WORK_STEALING,158.584000 +Gaussian,CDF,500000,SCALAR,12872.916000 +Gaussian,CDF,500000,VECTORIZED,5411.292000 +Gaussian,CDF,500000,PARALLEL,1663.333000 +Gaussian,CDF,500000,WORK_STEALING,1259.875000 +Exponential,PDF,8,SCALAR,0.167000 +Exponential,PDF,8,VECTORIZED,0.083000 
+Exponential,PDF,8,PARALLEL,0.042000 +Exponential,PDF,8,WORK_STEALING,0.042000 +Exponential,LogPDF,8,SCALAR,0.167000 +Exponential,LogPDF,8,VECTORIZED,0.083000 +Exponential,LogPDF,8,PARALLEL,0.042000 +Exponential,LogPDF,8,WORK_STEALING,0.041000 +Exponential,CDF,8,SCALAR,0.167000 +Exponential,CDF,8,VECTORIZED,0.084000 +Exponential,CDF,8,PARALLEL,0.042000 +Exponential,CDF,8,WORK_STEALING,0.083000 +Exponential,PDF,16,SCALAR,0.333000 +Exponential,PDF,16,VECTORIZED,0.125000 +Exponential,PDF,16,PARALLEL,0.083000 +Exponential,PDF,16,WORK_STEALING,0.083000 +Exponential,LogPDF,16,SCALAR,0.333000 +Exponential,LogPDF,16,VECTORIZED,0.083000 +Exponential,LogPDF,16,PARALLEL,0.042000 +Exponential,LogPDF,16,WORK_STEALING,0.041000 +Exponential,CDF,16,SCALAR,0.333000 +Exponential,CDF,16,VECTORIZED,0.125000 +Exponential,CDF,16,PARALLEL,0.083000 +Exponential,CDF,16,WORK_STEALING,0.084000 +Exponential,PDF,32,SCALAR,0.625000 +Exponential,PDF,32,VECTORIZED,0.167000 +Exponential,PDF,32,PARALLEL,0.125000 +Exponential,PDF,32,WORK_STEALING,0.125000 +Exponential,LogPDF,32,SCALAR,0.625000 +Exponential,LogPDF,32,VECTORIZED,0.083000 +Exponential,LogPDF,32,PARALLEL,0.042000 +Exponential,LogPDF,32,WORK_STEALING,0.041000 +Exponential,CDF,32,SCALAR,0.625000 +Exponential,CDF,32,VECTORIZED,0.167000 +Exponential,CDF,32,PARALLEL,0.125000 +Exponential,CDF,32,WORK_STEALING,0.125000 +Exponential,PDF,64,SCALAR,1.209000 +Exponential,PDF,64,VECTORIZED,0.250000 +Exponential,PDF,64,PARALLEL,0.209000 +Exponential,PDF,64,WORK_STEALING,0.208000 +Exponential,LogPDF,64,SCALAR,1.291000 +Exponential,LogPDF,64,VECTORIZED,0.083000 +Exponential,LogPDF,64,PARALLEL,0.083000 +Exponential,LogPDF,64,WORK_STEALING,0.042000 +Exponential,CDF,64,SCALAR,1.250000 +Exponential,CDF,64,VECTORIZED,0.292000 +Exponential,CDF,64,PARALLEL,0.208000 +Exponential,CDF,64,WORK_STEALING,0.208000 +Exponential,PDF,128,SCALAR,2.417000 +Exponential,PDF,128,VECTORIZED,0.500000 +Exponential,PDF,128,PARALLEL,0.417000 
+Exponential,PDF,128,WORK_STEALING,0.416000 +Exponential,LogPDF,128,SCALAR,2.458000 +Exponential,LogPDF,128,VECTORIZED,0.125000 +Exponential,LogPDF,128,PARALLEL,0.083000 +Exponential,LogPDF,128,WORK_STEALING,0.042000 +Exponential,CDF,128,SCALAR,2.458000 +Exponential,CDF,128,VECTORIZED,0.500000 +Exponential,CDF,128,PARALLEL,0.417000 +Exponential,CDF,128,WORK_STEALING,0.375000 +Exponential,PDF,256,SCALAR,4.792000 +Exponential,PDF,256,VECTORIZED,0.959000 +Exponential,PDF,256,PARALLEL,0.750000 +Exponential,PDF,256,WORK_STEALING,0.709000 +Exponential,LogPDF,256,SCALAR,4.792000 +Exponential,LogPDF,256,VECTORIZED,0.209000 +Exponential,LogPDF,256,PARALLEL,0.208000 +Exponential,LogPDF,256,WORK_STEALING,0.083000 +Exponential,CDF,256,SCALAR,4.792000 +Exponential,CDF,256,VECTORIZED,1.000000 +Exponential,CDF,256,PARALLEL,0.834000 +Exponential,CDF,256,WORK_STEALING,0.750000 +Exponential,PDF,512,SCALAR,9.500000 +Exponential,PDF,512,VECTORIZED,1.792000 +Exponential,PDF,512,PARALLEL,1.459000 +Exponential,PDF,512,WORK_STEALING,1.417000 +Exponential,LogPDF,512,SCALAR,9.750000 +Exponential,LogPDF,512,VECTORIZED,0.417000 +Exponential,LogPDF,512,PARALLEL,0.333000 +Exponential,LogPDF,512,WORK_STEALING,0.084000 +Exponential,CDF,512,SCALAR,9.625000 +Exponential,CDF,512,VECTORIZED,1.958000 +Exponential,CDF,512,PARALLEL,1.625000 +Exponential,CDF,512,WORK_STEALING,1.417000 +Exponential,PDF,1000,SCALAR,18.542000 +Exponential,PDF,1000,VECTORIZED,3.500000 +Exponential,PDF,1000,PARALLEL,2.834000 +Exponential,PDF,1000,WORK_STEALING,2.791000 +Exponential,LogPDF,1000,SCALAR,19.000000 +Exponential,LogPDF,1000,VECTORIZED,0.750000 +Exponential,LogPDF,1000,PARALLEL,0.584000 +Exponential,LogPDF,1000,WORK_STEALING,0.167000 +Exponential,CDF,1000,SCALAR,18.791000 +Exponential,CDF,1000,VECTORIZED,3.625000 +Exponential,CDF,1000,PARALLEL,3.042000 +Exponential,CDF,1000,WORK_STEALING,2.791000 +Exponential,PDF,2000,SCALAR,36.917000 +Exponential,PDF,2000,VECTORIZED,6.833000 +Exponential,PDF,2000,PARALLEL,51.083000 
+Exponential,PDF,2000,WORK_STEALING,35.584000 +Exponential,LogPDF,2000,SCALAR,37.417000 +Exponential,LogPDF,2000,VECTORIZED,1.500000 +Exponential,LogPDF,2000,PARALLEL,50.542000 +Exponential,LogPDF,2000,WORK_STEALING,21.334000 +Exponential,CDF,2000,SCALAR,37.500000 +Exponential,CDF,2000,VECTORIZED,7.208000 +Exponential,CDF,2000,PARALLEL,44.042000 +Exponential,CDF,2000,WORK_STEALING,22.917000 +Exponential,PDF,5000,SCALAR,92.708000 +Exponential,PDF,5000,VECTORIZED,16.875000 +Exponential,PDF,5000,PARALLEL,129.417000 +Exponential,PDF,5000,WORK_STEALING,29.791000 +Exponential,LogPDF,5000,SCALAR,94.833000 +Exponential,LogPDF,5000,VECTORIZED,3.500000 +Exponential,LogPDF,5000,PARALLEL,109.041000 +Exponential,LogPDF,5000,WORK_STEALING,30.084000 +Exponential,CDF,5000,SCALAR,93.584000 +Exponential,CDF,5000,VECTORIZED,17.750000 +Exponential,CDF,5000,PARALLEL,114.375000 +Exponential,CDF,5000,WORK_STEALING,28.500000 +Exponential,PDF,10000,SCALAR,184.875000 +Exponential,PDF,10000,VECTORIZED,33.916000 +Exponential,PDF,10000,PARALLEL,169.583000 +Exponential,PDF,10000,WORK_STEALING,41.833000 +Exponential,LogPDF,10000,SCALAR,190.125000 +Exponential,LogPDF,10000,VECTORIZED,7.292000 +Exponential,LogPDF,10000,PARALLEL,152.458000 +Exponential,LogPDF,10000,WORK_STEALING,42.875000 +Exponential,CDF,10000,SCALAR,187.459000 +Exponential,CDF,10000,VECTORIZED,35.500000 +Exponential,CDF,10000,PARALLEL,214.167000 +Exponential,CDF,10000,WORK_STEALING,59.042000 +Exponential,PDF,20000,SCALAR,369.959000 +Exponential,PDF,20000,VECTORIZED,72.583000 +Exponential,PDF,20000,PARALLEL,169.750000 +Exponential,PDF,20000,WORK_STEALING,111.500000 +Exponential,LogPDF,20000,SCALAR,378.125000 +Exponential,LogPDF,20000,VECTORIZED,13.542000 +Exponential,LogPDF,20000,PARALLEL,221.583000 +Exponential,LogPDF,20000,WORK_STEALING,37.333000 +Exponential,CDF,20000,SCALAR,376.375000 +Exponential,CDF,20000,VECTORIZED,77.667000 +Exponential,CDF,20000,PARALLEL,197.792000 +Exponential,CDF,20000,WORK_STEALING,74.583000 
+Exponential,PDF,50000,SCALAR,926.584000 +Exponential,PDF,50000,VECTORIZED,168.750000 +Exponential,PDF,50000,PARALLEL,132.167000 +Exponential,PDF,50000,WORK_STEALING,118.583000 +Exponential,LogPDF,50000,SCALAR,949.625000 +Exponential,LogPDF,50000,VECTORIZED,33.959000 +Exponential,LogPDF,50000,PARALLEL,206.583000 +Exponential,LogPDF,50000,WORK_STEALING,49.958000 +Exponential,CDF,50000,SCALAR,941.708000 +Exponential,CDF,50000,VECTORIZED,177.292000 +Exponential,CDF,50000,PARALLEL,111.708000 +Exponential,CDF,50000,WORK_STEALING,76.333000 +Exponential,PDF,100000,SCALAR,1859.166000 +Exponential,PDF,100000,VECTORIZED,338.292000 +Exponential,PDF,100000,PARALLEL,135.750000 +Exponential,PDF,100000,WORK_STEALING,162.500000 +Exponential,LogPDF,100000,SCALAR,1892.917000 +Exponential,LogPDF,100000,VECTORIZED,68.208000 +Exponential,LogPDF,100000,PARALLEL,46.625000 +Exponential,LogPDF,100000,WORK_STEALING,51.375000 +Exponential,CDF,100000,SCALAR,1870.542000 +Exponential,CDF,100000,VECTORIZED,356.250000 +Exponential,CDF,100000,PARALLEL,144.250000 +Exponential,CDF,100000,WORK_STEALING,144.792000 +Exponential,PDF,250000,SCALAR,4647.375000 +Exponential,PDF,250000,VECTORIZED,847.334000 +Exponential,PDF,250000,PARALLEL,260.541000 +Exponential,PDF,250000,WORK_STEALING,248.459000 +Exponential,LogPDF,250000,SCALAR,4740.750000 +Exponential,LogPDF,250000,VECTORIZED,179.792000 +Exponential,LogPDF,250000,PARALLEL,124.916000 +Exponential,LogPDF,250000,WORK_STEALING,101.625000 +Exponential,CDF,250000,SCALAR,4703.750000 +Exponential,CDF,250000,VECTORIZED,885.500000 +Exponential,CDF,250000,PARALLEL,242.459000 +Exponential,CDF,250000,WORK_STEALING,296.500000 +Exponential,PDF,500000,SCALAR,9270.166000 +Exponential,PDF,500000,VECTORIZED,1730.416000 +Exponential,PDF,500000,PARALLEL,441.292000 +Exponential,PDF,500000,WORK_STEALING,444.750000 +Exponential,LogPDF,500000,SCALAR,9493.625000 +Exponential,LogPDF,500000,VECTORIZED,390.250000 +Exponential,LogPDF,500000,PARALLEL,101.250000 
+Exponential,LogPDF,500000,WORK_STEALING,197.917000 +Exponential,CDF,500000,SCALAR,9367.833000 +Exponential,CDF,500000,VECTORIZED,1867.083000 +Exponential,CDF,500000,PARALLEL,483.708000 +Exponential,CDF,500000,WORK_STEALING,535.500000 +Discrete,PDF,8,SCALAR,0.167000 +Discrete,PDF,8,VECTORIZED,0.042000 +Discrete,PDF,8,PARALLEL,0.042000 +Discrete,PDF,8,WORK_STEALING,0.042000 +Discrete,LogPDF,8,SCALAR,0.167000 +Discrete,LogPDF,8,VECTORIZED,0.042000 +Discrete,LogPDF,8,PARALLEL,0.042000 +Discrete,LogPDF,8,WORK_STEALING,0.042000 +Discrete,CDF,8,SCALAR,0.166000 +Discrete,CDF,8,VECTORIZED,0.042000 +Discrete,CDF,8,PARALLEL,0.042000 +Discrete,CDF,8,WORK_STEALING,0.042000 +Discrete,PDF,16,SCALAR,0.333000 +Discrete,PDF,16,VECTORIZED,0.041000 +Discrete,PDF,16,PARALLEL,0.042000 +Discrete,PDF,16,WORK_STEALING,0.041000 +Discrete,LogPDF,16,SCALAR,0.333000 +Discrete,LogPDF,16,VECTORIZED,0.041000 +Discrete,LogPDF,16,PARALLEL,0.042000 +Discrete,LogPDF,16,WORK_STEALING,0.042000 +Discrete,CDF,16,SCALAR,0.292000 +Discrete,CDF,16,VECTORIZED,0.042000 +Discrete,CDF,16,PARALLEL,0.042000 +Discrete,CDF,16,WORK_STEALING,0.042000 +Discrete,PDF,32,SCALAR,0.625000 +Discrete,PDF,32,VECTORIZED,0.042000 +Discrete,PDF,32,PARALLEL,0.042000 +Discrete,PDF,32,WORK_STEALING,0.083000 +Discrete,LogPDF,32,SCALAR,0.666000 +Discrete,LogPDF,32,VECTORIZED,0.042000 +Discrete,LogPDF,32,PARALLEL,0.083000 +Discrete,LogPDF,32,WORK_STEALING,0.042000 +Discrete,CDF,32,SCALAR,0.625000 +Discrete,CDF,32,VECTORIZED,0.042000 +Discrete,CDF,32,PARALLEL,0.042000 +Discrete,CDF,32,WORK_STEALING,0.042000 +Discrete,PDF,64,SCALAR,1.250000 +Discrete,PDF,64,VECTORIZED,0.083000 +Discrete,PDF,64,PARALLEL,0.125000 +Discrete,PDF,64,WORK_STEALING,0.125000 +Discrete,LogPDF,64,SCALAR,1.291000 +Discrete,LogPDF,64,VECTORIZED,0.083000 +Discrete,LogPDF,64,PARALLEL,0.125000 +Discrete,LogPDF,64,WORK_STEALING,0.084000 +Discrete,CDF,64,SCALAR,1.208000 +Discrete,CDF,64,VECTORIZED,0.083000 +Discrete,CDF,64,PARALLEL,0.125000 
+Discrete,CDF,64,WORK_STEALING,0.083000 +Discrete,PDF,128,SCALAR,2.459000 +Discrete,PDF,128,VECTORIZED,0.167000 +Discrete,PDF,128,PARALLEL,0.166000 +Discrete,PDF,128,WORK_STEALING,0.167000 +Discrete,LogPDF,128,SCALAR,2.500000 +Discrete,LogPDF,128,VECTORIZED,0.167000 +Discrete,LogPDF,128,PARALLEL,0.209000 +Discrete,LogPDF,128,WORK_STEALING,0.167000 +Discrete,CDF,128,SCALAR,2.209000 +Discrete,CDF,128,VECTORIZED,0.125000 +Discrete,CDF,128,PARALLEL,0.208000 +Discrete,CDF,128,WORK_STEALING,0.167000 +Discrete,PDF,256,SCALAR,4.917000 +Discrete,PDF,256,VECTORIZED,0.292000 +Discrete,PDF,256,PARALLEL,0.292000 +Discrete,PDF,256,WORK_STEALING,0.292000 +Discrete,LogPDF,256,SCALAR,4.917000 +Discrete,LogPDF,256,VECTORIZED,0.292000 +Discrete,LogPDF,256,PARALLEL,0.292000 +Discrete,LogPDF,256,WORK_STEALING,0.292000 +Discrete,CDF,256,SCALAR,4.542000 +Discrete,CDF,256,VECTORIZED,0.250000 +Discrete,CDF,256,PARALLEL,0.334000 +Discrete,CDF,256,WORK_STEALING,0.375000 +Discrete,PDF,512,SCALAR,9.750000 +Discrete,PDF,512,VECTORIZED,0.542000 +Discrete,PDF,512,PARALLEL,0.542000 +Discrete,PDF,512,WORK_STEALING,0.542000 +Discrete,LogPDF,512,SCALAR,9.792000 +Discrete,LogPDF,512,VECTORIZED,0.542000 +Discrete,LogPDF,512,PARALLEL,0.542000 +Discrete,LogPDF,512,WORK_STEALING,0.625000 +Discrete,CDF,512,SCALAR,8.833000 +Discrete,CDF,512,VECTORIZED,0.584000 +Discrete,CDF,512,PARALLEL,0.583000 +Discrete,CDF,512,WORK_STEALING,0.750000 +Discrete,PDF,1000,SCALAR,19.000000 +Discrete,PDF,1000,VECTORIZED,1.042000 +Discrete,PDF,1000,PARALLEL,1.042000 +Discrete,PDF,1000,WORK_STEALING,1.042000 +Discrete,LogPDF,1000,SCALAR,19.000000 +Discrete,LogPDF,1000,VECTORIZED,1.042000 +Discrete,LogPDF,1000,PARALLEL,1.083000 +Discrete,LogPDF,1000,WORK_STEALING,1.042000 +Discrete,CDF,1000,SCALAR,17.334000 +Discrete,CDF,1000,VECTORIZED,1.166000 +Discrete,CDF,1000,PARALLEL,1.208000 +Discrete,CDF,1000,WORK_STEALING,1.500000 +Discrete,PDF,2000,SCALAR,37.959000 +Discrete,PDF,2000,VECTORIZED,2.125000 
+Discrete,PDF,2000,PARALLEL,38.708000 +Discrete,PDF,2000,WORK_STEALING,19.667000 +Discrete,LogPDF,2000,SCALAR,38.125000 +Discrete,LogPDF,2000,VECTORIZED,2.125000 +Discrete,LogPDF,2000,PARALLEL,40.208000 +Discrete,LogPDF,2000,WORK_STEALING,24.708000 +Discrete,CDF,2000,SCALAR,35.167000 +Discrete,CDF,2000,VECTORIZED,2.333000 +Discrete,CDF,2000,PARALLEL,48.291000 +Discrete,CDF,2000,WORK_STEALING,27.792000 +Discrete,PDF,5000,SCALAR,94.834000 +Discrete,PDF,5000,VECTORIZED,5.125000 +Discrete,PDF,5000,PARALLEL,105.167000 +Discrete,PDF,5000,WORK_STEALING,42.125000 +Discrete,LogPDF,5000,SCALAR,95.292000 +Discrete,LogPDF,5000,VECTORIZED,5.125000 +Discrete,LogPDF,5000,PARALLEL,97.125000 +Discrete,LogPDF,5000,WORK_STEALING,23.167000 +Discrete,CDF,5000,SCALAR,86.417000 +Discrete,CDF,5000,VECTORIZED,6.375000 +Discrete,CDF,5000,PARALLEL,121.625000 +Discrete,CDF,5000,WORK_STEALING,42.292000 +Discrete,PDF,10000,SCALAR,189.666000 +Discrete,PDF,10000,VECTORIZED,10.125000 +Discrete,PDF,10000,PARALLEL,160.875000 +Discrete,PDF,10000,WORK_STEALING,23.375000 +Discrete,LogPDF,10000,SCALAR,190.166000 +Discrete,LogPDF,10000,VECTORIZED,10.125000 +Discrete,LogPDF,10000,PARALLEL,160.125000 +Discrete,LogPDF,10000,WORK_STEALING,53.959000 +Discrete,CDF,10000,SCALAR,174.334000 +Discrete,CDF,10000,VECTORIZED,13.375000 +Discrete,CDF,10000,PARALLEL,176.959000 +Discrete,CDF,10000,WORK_STEALING,62.291000 +Discrete,PDF,20000,SCALAR,381.083000 +Discrete,PDF,20000,VECTORIZED,20.208000 +Discrete,PDF,20000,PARALLEL,160.416000 +Discrete,PDF,20000,WORK_STEALING,50.125000 +Discrete,LogPDF,20000,SCALAR,380.084000 +Discrete,LogPDF,20000,VECTORIZED,20.125000 +Discrete,LogPDF,20000,PARALLEL,170.042000 +Discrete,LogPDF,20000,WORK_STEALING,63.417000 +Discrete,CDF,20000,SCALAR,348.167000 +Discrete,CDF,20000,VECTORIZED,27.125000 +Discrete,CDF,20000,PARALLEL,172.084000 +Discrete,CDF,20000,WORK_STEALING,56.625000 +Discrete,PDF,50000,SCALAR,950.458000 +Discrete,PDF,50000,VECTORIZED,50.250000 
+Discrete,PDF,50000,PARALLEL,153.625000 +Discrete,PDF,50000,WORK_STEALING,78.125000 +Discrete,LogPDF,50000,SCALAR,952.125000 +Discrete,LogPDF,50000,VECTORIZED,50.250000 +Discrete,LogPDF,50000,PARALLEL,175.959000 +Discrete,LogPDF,50000,WORK_STEALING,63.375000 +Discrete,CDF,50000,SCALAR,868.875000 +Discrete,CDF,50000,VECTORIZED,70.209000 +Discrete,CDF,50000,PARALLEL,170.500000 +Discrete,CDF,50000,WORK_STEALING,97.417000 +Discrete,PDF,100000,SCALAR,1897.708000 +Discrete,PDF,100000,VECTORIZED,100.833000 +Discrete,PDF,100000,PARALLEL,114.875000 +Discrete,PDF,100000,WORK_STEALING,111.833000 +Discrete,LogPDF,100000,SCALAR,1893.541000 +Discrete,LogPDF,100000,VECTORIZED,100.500000 +Discrete,LogPDF,100000,PARALLEL,139.625000 +Discrete,LogPDF,100000,WORK_STEALING,111.167000 +Discrete,CDF,100000,SCALAR,1741.541000 +Discrete,CDF,100000,VECTORIZED,142.708000 +Discrete,CDF,100000,PARALLEL,143.125000 +Discrete,CDF,100000,WORK_STEALING,114.708000 +Discrete,PDF,250000,SCALAR,4743.958000 +Discrete,PDF,250000,VECTORIZED,254.459000 +Discrete,PDF,250000,PARALLEL,127.500000 +Discrete,PDF,250000,WORK_STEALING,172.292000 +Discrete,LogPDF,250000,SCALAR,4753.917000 +Discrete,LogPDF,250000,VECTORIZED,253.500000 +Discrete,LogPDF,250000,PARALLEL,145.833000 +Discrete,LogPDF,250000,WORK_STEALING,182.166000 +Discrete,CDF,250000,SCALAR,4341.166000 +Discrete,CDF,250000,VECTORIZED,361.625000 +Discrete,CDF,250000,PARALLEL,174.833000 +Discrete,CDF,250000,WORK_STEALING,222.208000 +Discrete,PDF,500000,SCALAR,9496.709000 +Discrete,PDF,500000,VECTORIZED,505.167000 +Discrete,PDF,500000,PARALLEL,173.625000 +Discrete,PDF,500000,WORK_STEALING,270.708000 +Discrete,LogPDF,500000,SCALAR,9531.417000 +Discrete,LogPDF,500000,VECTORIZED,502.666000 +Discrete,LogPDF,500000,PARALLEL,221.250000 +Discrete,LogPDF,500000,WORK_STEALING,283.083000 +Discrete,CDF,500000,SCALAR,8669.417000 +Discrete,CDF,500000,VECTORIZED,724.042000 +Discrete,CDF,500000,PARALLEL,311.125000 +Discrete,CDF,500000,WORK_STEALING,341.708000 
+Poisson,PDF,8,SCALAR,0.208000 +Poisson,PDF,8,VECTORIZED,0.125000 +Poisson,PDF,8,PARALLEL,0.125000 +Poisson,PDF,8,WORK_STEALING,0.416000 +Poisson,LogPDF,8,SCALAR,0.459000 +Poisson,LogPDF,8,VECTORIZED,0.167000 +Poisson,LogPDF,8,PARALLEL,0.208000 +Poisson,LogPDF,8,WORK_STEALING,0.042000 +Poisson,CDF,8,SCALAR,0.208000 +Poisson,CDF,8,VECTORIZED,0.209000 +Poisson,CDF,8,PARALLEL,0.250000 +Poisson,CDF,8,WORK_STEALING,0.250000 +Poisson,PDF,16,SCALAR,0.417000 +Poisson,PDF,16,VECTORIZED,0.208000 +Poisson,PDF,16,PARALLEL,0.208000 +Poisson,PDF,16,WORK_STEALING,0.208000 +Poisson,LogPDF,16,SCALAR,0.292000 +Poisson,LogPDF,16,VECTORIZED,0.083000 +Poisson,LogPDF,16,PARALLEL,0.084000 +Poisson,LogPDF,16,WORK_STEALING,0.083000 +Poisson,CDF,16,SCALAR,0.500000 +Poisson,CDF,16,VECTORIZED,0.500000 +Poisson,CDF,16,PARALLEL,0.500000 +Poisson,CDF,16,WORK_STEALING,0.500000 +Poisson,PDF,32,SCALAR,0.792000 +Poisson,PDF,32,VECTORIZED,0.292000 +Poisson,PDF,32,PARALLEL,0.333000 +Poisson,PDF,32,WORK_STEALING,0.333000 +Poisson,LogPDF,32,SCALAR,0.625000 +Poisson,LogPDF,32,VECTORIZED,0.125000 +Poisson,LogPDF,32,PARALLEL,0.167000 +Poisson,LogPDF,32,WORK_STEALING,0.125000 +Poisson,CDF,32,SCALAR,1.000000 +Poisson,CDF,32,VECTORIZED,1.041000 +Poisson,CDF,32,PARALLEL,1.083000 +Poisson,CDF,32,WORK_STEALING,1.042000 +Poisson,PDF,64,SCALAR,1.542000 +Poisson,PDF,64,VECTORIZED,0.625000 +Poisson,PDF,64,PARALLEL,0.708000 +Poisson,PDF,64,WORK_STEALING,0.625000 +Poisson,LogPDF,64,SCALAR,1.208000 +Poisson,LogPDF,64,VECTORIZED,0.292000 +Poisson,LogPDF,64,PARALLEL,0.292000 +Poisson,LogPDF,64,WORK_STEALING,0.292000 +Poisson,CDF,64,SCALAR,2.375000 +Poisson,CDF,64,VECTORIZED,2.416000 +Poisson,CDF,64,PARALLEL,2.500000 +Poisson,CDF,64,WORK_STEALING,2.417000 +Poisson,PDF,128,SCALAR,3.042000 +Poisson,PDF,128,VECTORIZED,1.166000 +Poisson,PDF,128,PARALLEL,1.250000 +Poisson,PDF,128,WORK_STEALING,1.167000 +Poisson,LogPDF,128,SCALAR,2.458000 +Poisson,LogPDF,128,VECTORIZED,0.500000 +Poisson,LogPDF,128,PARALLEL,0.500000 
+Poisson,LogPDF,128,WORK_STEALING,0.458000 +Poisson,CDF,128,SCALAR,4.500000 +Poisson,CDF,128,VECTORIZED,4.458000 +Poisson,CDF,128,PARALLEL,4.625000 +Poisson,CDF,128,WORK_STEALING,4.500000 +Poisson,PDF,256,SCALAR,6.125000 +Poisson,PDF,256,VECTORIZED,2.416000 +Poisson,PDF,256,PARALLEL,2.541000 +Poisson,PDF,256,WORK_STEALING,2.417000 +Poisson,LogPDF,256,SCALAR,4.875000 +Poisson,LogPDF,256,VECTORIZED,1.000000 +Poisson,LogPDF,256,PARALLEL,1.125000 +Poisson,LogPDF,256,WORK_STEALING,0.958000 +Poisson,CDF,256,SCALAR,9.458000 +Poisson,CDF,256,VECTORIZED,9.333000 +Poisson,CDF,256,PARALLEL,9.709000 +Poisson,CDF,256,WORK_STEALING,9.375000 +Poisson,PDF,512,SCALAR,12.167000 +Poisson,PDF,512,VECTORIZED,4.750000 +Poisson,PDF,512,PARALLEL,5.083000 +Poisson,PDF,512,WORK_STEALING,4.792000 +Poisson,LogPDF,512,SCALAR,9.625000 +Poisson,LogPDF,512,VECTORIZED,1.875000 +Poisson,LogPDF,512,PARALLEL,2.125000 +Poisson,LogPDF,512,WORK_STEALING,1.875000 +Poisson,CDF,512,SCALAR,19.667000 +Poisson,CDF,512,VECTORIZED,19.708000 +Poisson,CDF,512,PARALLEL,20.542000 +Poisson,CDF,512,WORK_STEALING,19.583000 +Poisson,PDF,1000,SCALAR,23.833000 +Poisson,PDF,1000,VECTORIZED,9.166000 +Poisson,PDF,1000,PARALLEL,9.750000 +Poisson,PDF,1000,WORK_STEALING,9.292000 +Poisson,LogPDF,1000,SCALAR,18.792000 +Poisson,LogPDF,1000,VECTORIZED,3.500000 +Poisson,LogPDF,1000,PARALLEL,3.958000 +Poisson,LogPDF,1000,WORK_STEALING,3.584000 +Poisson,CDF,1000,SCALAR,38.875000 +Poisson,CDF,1000,VECTORIZED,38.667000 +Poisson,CDF,1000,PARALLEL,40.458000 +Poisson,CDF,1000,WORK_STEALING,38.583000 +Poisson,PDF,2000,SCALAR,47.667000 +Poisson,PDF,2000,VECTORIZED,18.333000 +Poisson,PDF,2000,PARALLEL,57.709000 +Poisson,PDF,2000,WORK_STEALING,55.750000 +Poisson,LogPDF,2000,SCALAR,37.500000 +Poisson,LogPDF,2000,VECTORIZED,7.500000 +Poisson,LogPDF,2000,PARALLEL,50.416000 +Poisson,LogPDF,2000,WORK_STEALING,43.375000 +Poisson,CDF,2000,SCALAR,78.292000 +Poisson,CDF,2000,VECTORIZED,77.792000 +Poisson,CDF,2000,PARALLEL,73.458000 
+Poisson,CDF,2000,WORK_STEALING,96.458000 +Poisson,PDF,5000,SCALAR,149.791000 +Poisson,PDF,5000,VECTORIZED,45.458000 +Poisson,PDF,5000,PARALLEL,123.917000 +Poisson,PDF,5000,WORK_STEALING,85.417000 +Poisson,LogPDF,5000,SCALAR,93.666000 +Poisson,LogPDF,5000,VECTORIZED,20.959000 +Poisson,LogPDF,5000,PARALLEL,107.000000 +Poisson,LogPDF,5000,WORK_STEALING,61.083000 +Poisson,CDF,5000,SCALAR,197.791000 +Poisson,CDF,5000,VECTORIZED,196.417000 +Poisson,CDF,5000,PARALLEL,107.791000 +Poisson,CDF,5000,WORK_STEALING,148.125000 +Poisson,PDF,10000,SCALAR,238.000000 +Poisson,PDF,10000,VECTORIZED,90.959000 +Poisson,PDF,10000,PARALLEL,149.334000 +Poisson,PDF,10000,WORK_STEALING,119.208000 +Poisson,LogPDF,10000,SCALAR,187.292000 +Poisson,LogPDF,10000,VECTORIZED,44.291000 +Poisson,LogPDF,10000,PARALLEL,219.083000 +Poisson,LogPDF,10000,WORK_STEALING,77.917000 +Poisson,CDF,10000,SCALAR,398.667000 +Poisson,CDF,10000,VECTORIZED,395.834000 +Poisson,CDF,10000,PARALLEL,158.750000 +Poisson,CDF,10000,WORK_STEALING,213.500000 +Poisson,PDF,20000,SCALAR,476.041000 +Poisson,PDF,20000,VECTORIZED,181.959000 +Poisson,PDF,20000,PARALLEL,239.458000 +Poisson,PDF,20000,WORK_STEALING,132.958000 +Poisson,LogPDF,20000,SCALAR,374.625000 +Poisson,LogPDF,20000,VECTORIZED,94.458000 +Poisson,LogPDF,20000,PARALLEL,144.458000 +Poisson,LogPDF,20000,WORK_STEALING,102.875000 +Poisson,CDF,20000,SCALAR,794.500000 +Poisson,CDF,20000,VECTORIZED,791.208000 +Poisson,CDF,20000,PARALLEL,255.666000 +Poisson,CDF,20000,WORK_STEALING,382.333000 +Poisson,PDF,50000,SCALAR,1190.000000 +Poisson,PDF,50000,VECTORIZED,454.625000 +Poisson,PDF,50000,PARALLEL,193.792000 +Poisson,PDF,50000,WORK_STEALING,228.041000 +Poisson,LogPDF,50000,SCALAR,936.375000 +Poisson,LogPDF,50000,VECTORIZED,236.292000 +Poisson,LogPDF,50000,PARALLEL,198.458000 +Poisson,LogPDF,50000,WORK_STEALING,218.333000 +Poisson,CDF,50000,SCALAR,1989.916000 +Poisson,CDF,50000,VECTORIZED,1983.084000 +Poisson,CDF,50000,PARALLEL,636.333000 
+Poisson,CDF,50000,WORK_STEALING,615.292000 +Poisson,PDF,100000,SCALAR,2380.041000 +Poisson,PDF,100000,VECTORIZED,911.334000 +Poisson,PDF,100000,PARALLEL,293.917000 +Poisson,PDF,100000,WORK_STEALING,437.875000 +Poisson,LogPDF,100000,SCALAR,1873.916000 +Poisson,LogPDF,100000,VECTORIZED,475.167000 +Poisson,LogPDF,100000,PARALLEL,189.667000 +Poisson,LogPDF,100000,WORK_STEALING,263.542000 +Poisson,CDF,100000,SCALAR,4007.167000 +Poisson,CDF,100000,VECTORIZED,3979.583000 +Poisson,CDF,100000,PARALLEL,1184.875000 +Poisson,CDF,100000,WORK_STEALING,1233.667000 +Poisson,PDF,250000,SCALAR,5961.750000 +Poisson,PDF,250000,VECTORIZED,2295.417000 +Poisson,PDF,250000,PARALLEL,685.833000 +Poisson,PDF,250000,WORK_STEALING,832.709000 +Poisson,LogPDF,250000,SCALAR,4690.542000 +Poisson,LogPDF,250000,VECTORIZED,1202.167000 +Poisson,LogPDF,250000,PARALLEL,466.750000 +Poisson,LogPDF,250000,WORK_STEALING,448.250000 +Poisson,CDF,250000,SCALAR,10009.167000 +Poisson,CDF,250000,VECTORIZED,9927.083000 +Poisson,CDF,250000,PARALLEL,2731.833000 +Poisson,CDF,250000,WORK_STEALING,2768.583000 +Poisson,PDF,500000,SCALAR,11908.416000 +Poisson,PDF,500000,VECTORIZED,4564.042000 +Poisson,PDF,500000,PARALLEL,1282.250000 +Poisson,PDF,500000,WORK_STEALING,1274.417000 +Poisson,LogPDF,500000,SCALAR,9371.958000 +Poisson,LogPDF,500000,VECTORIZED,2409.916000 +Poisson,LogPDF,500000,PARALLEL,892.292000 +Poisson,LogPDF,500000,WORK_STEALING,772.291000 +Poisson,CDF,500000,SCALAR,19976.834000 +Poisson,CDF,500000,VECTORIZED,19850.875000 +Poisson,CDF,500000,PARALLEL,5677.625000 +Poisson,CDF,500000,WORK_STEALING,4841.250000 +Gamma,PDF,8,SCALAR,0.333000 +Gamma,PDF,8,VECTORIZED,0.167000 +Gamma,PDF,8,PARALLEL,0.083000 +Gamma,PDF,8,WORK_STEALING,0.125000 +Gamma,LogPDF,8,SCALAR,0.167000 +Gamma,LogPDF,8,VECTORIZED,0.125000 +Gamma,LogPDF,8,PARALLEL,0.083000 +Gamma,LogPDF,8,WORK_STEALING,0.042000 +Gamma,CDF,8,SCALAR,0.333000 +Gamma,CDF,8,VECTORIZED,0.250000 +Gamma,CDF,8,PARALLEL,0.209000 +Gamma,CDF,8,WORK_STEALING,0.208000 
+Gamma,PDF,16,SCALAR,0.625000 +Gamma,PDF,16,VECTORIZED,0.208000 +Gamma,PDF,16,PARALLEL,0.167000 +Gamma,PDF,16,WORK_STEALING,0.166000 +Gamma,LogPDF,16,SCALAR,0.333000 +Gamma,LogPDF,16,VECTORIZED,0.166000 +Gamma,LogPDF,16,PARALLEL,0.083000 +Gamma,LogPDF,16,WORK_STEALING,0.084000 +Gamma,CDF,16,SCALAR,0.584000 +Gamma,CDF,16,VECTORIZED,0.375000 +Gamma,CDF,16,PARALLEL,0.333000 +Gamma,CDF,16,WORK_STEALING,0.334000 +Gamma,PDF,32,SCALAR,1.209000 +Gamma,PDF,32,VECTORIZED,0.333000 +Gamma,PDF,32,PARALLEL,0.292000 +Gamma,PDF,32,WORK_STEALING,0.250000 +Gamma,LogPDF,32,SCALAR,0.625000 +Gamma,LogPDF,32,VECTORIZED,0.208000 +Gamma,LogPDF,32,PARALLEL,0.167000 +Gamma,LogPDF,32,WORK_STEALING,0.125000 +Gamma,CDF,32,SCALAR,1.375000 +Gamma,CDF,32,VECTORIZED,0.708000 +Gamma,CDF,32,PARALLEL,0.667000 +Gamma,CDF,32,WORK_STEALING,0.709000 +Gamma,PDF,64,SCALAR,2.417000 +Gamma,PDF,64,VECTORIZED,0.541000 +Gamma,PDF,64,PARALLEL,0.500000 +Gamma,PDF,64,WORK_STEALING,0.500000 +Gamma,LogPDF,64,SCALAR,1.250000 +Gamma,LogPDF,64,VECTORIZED,0.334000 +Gamma,LogPDF,64,PARALLEL,0.291000 +Gamma,LogPDF,64,WORK_STEALING,0.208000 +Gamma,CDF,64,SCALAR,3.083000 +Gamma,CDF,64,VECTORIZED,1.542000 +Gamma,CDF,64,PARALLEL,1.542000 +Gamma,CDF,64,WORK_STEALING,1.583000 +Gamma,PDF,128,SCALAR,4.833000 +Gamma,PDF,128,VECTORIZED,0.958000 +Gamma,PDF,128,PARALLEL,0.958000 +Gamma,PDF,128,WORK_STEALING,0.958000 +Gamma,LogPDF,128,SCALAR,2.458000 +Gamma,LogPDF,128,VECTORIZED,0.583000 +Gamma,LogPDF,128,PARALLEL,0.500000 +Gamma,LogPDF,128,WORK_STEALING,0.458000 +Gamma,CDF,128,SCALAR,6.250000 +Gamma,CDF,128,VECTORIZED,3.125000 +Gamma,CDF,128,PARALLEL,3.166000 +Gamma,CDF,128,WORK_STEALING,2.791000 +Gamma,PDF,256,SCALAR,9.625000 +Gamma,PDF,256,VECTORIZED,1.834000 +Gamma,PDF,256,PARALLEL,1.917000 +Gamma,PDF,256,WORK_STEALING,1.875000 +Gamma,LogPDF,256,SCALAR,4.875000 +Gamma,LogPDF,256,VECTORIZED,1.125000 +Gamma,LogPDF,256,PARALLEL,0.917000 +Gamma,LogPDF,256,WORK_STEALING,0.834000 +Gamma,CDF,256,SCALAR,13.000000 
+Gamma,CDF,256,VECTORIZED,6.500000 +Gamma,CDF,256,PARALLEL,6.834000 +Gamma,CDF,256,WORK_STEALING,6.959000 +Gamma,PDF,512,SCALAR,19.167000 +Gamma,PDF,512,VECTORIZED,3.584000 +Gamma,PDF,512,PARALLEL,3.833000 +Gamma,PDF,512,WORK_STEALING,3.750000 +Gamma,LogPDF,512,SCALAR,9.708000 +Gamma,LogPDF,512,VECTORIZED,2.250000 +Gamma,LogPDF,512,PARALLEL,1.875000 +Gamma,LogPDF,512,WORK_STEALING,1.708000 +Gamma,CDF,512,SCALAR,26.458000 +Gamma,CDF,512,VECTORIZED,14.875000 +Gamma,CDF,512,PARALLEL,14.667000 +Gamma,CDF,512,WORK_STEALING,14.666000 +Gamma,PDF,1000,SCALAR,37.417000 +Gamma,PDF,1000,VECTORIZED,7.042000 +Gamma,PDF,1000,PARALLEL,7.500000 +Gamma,PDF,1000,WORK_STEALING,7.333000 +Gamma,LogPDF,1000,SCALAR,18.958000 +Gamma,LogPDF,1000,VECTORIZED,4.167000 +Gamma,LogPDF,1000,PARALLEL,3.625000 +Gamma,LogPDF,1000,WORK_STEALING,3.375000 +Gamma,CDF,1000,SCALAR,53.375000 +Gamma,CDF,1000,VECTORIZED,30.917000 +Gamma,CDF,1000,PARALLEL,33.542000 +Gamma,CDF,1000,WORK_STEALING,33.667000 +Gamma,PDF,2000,SCALAR,74.917000 +Gamma,PDF,2000,VECTORIZED,13.750000 +Gamma,PDF,2000,PARALLEL,58.667000 +Gamma,PDF,2000,WORK_STEALING,69.916000 +Gamma,LogPDF,2000,SCALAR,37.875000 +Gamma,LogPDF,2000,VECTORIZED,8.667000 +Gamma,LogPDF,2000,PARALLEL,52.333000 +Gamma,LogPDF,2000,WORK_STEALING,73.208000 +Gamma,CDF,2000,SCALAR,104.500000 +Gamma,CDF,2000,VECTORIZED,66.291000 +Gamma,CDF,2000,PARALLEL,65.167000 +Gamma,CDF,2000,WORK_STEALING,102.333000 +Gamma,PDF,5000,SCALAR,187.292000 +Gamma,PDF,5000,VECTORIZED,35.834000 +Gamma,PDF,5000,PARALLEL,99.333000 +Gamma,PDF,5000,WORK_STEALING,56.791000 +Gamma,LogPDF,5000,SCALAR,94.625000 +Gamma,LogPDF,5000,VECTORIZED,22.375000 +Gamma,LogPDF,5000,PARALLEL,110.875000 +Gamma,LogPDF,5000,WORK_STEALING,56.042000 +Gamma,CDF,5000,SCALAR,264.500000 +Gamma,CDF,5000,VECTORIZED,180.208000 +Gamma,CDF,5000,PARALLEL,93.625000 +Gamma,CDF,5000,WORK_STEALING,111.167000 +Gamma,PDF,10000,SCALAR,374.542000 +Gamma,PDF,10000,VECTORIZED,77.083000 +Gamma,PDF,10000,PARALLEL,127.500000 
+Gamma,PDF,10000,WORK_STEALING,117.125000 +Gamma,LogPDF,10000,SCALAR,190.000000 +Gamma,LogPDF,10000,VECTORIZED,50.833000 +Gamma,LogPDF,10000,PARALLEL,152.000000 +Gamma,LogPDF,10000,WORK_STEALING,66.333000 +Gamma,CDF,10000,SCALAR,529.167000 +Gamma,CDF,10000,VECTORIZED,365.292000 +Gamma,CDF,10000,PARALLEL,151.625000 +Gamma,CDF,10000,WORK_STEALING,249.792000 +Gamma,PDF,20000,SCALAR,749.250000 +Gamma,PDF,20000,VECTORIZED,156.708000 +Gamma,PDF,20000,PARALLEL,140.541000 +Gamma,PDF,20000,WORK_STEALING,149.208000 +Gamma,LogPDF,20000,SCALAR,378.750000 +Gamma,LogPDF,20000,VECTORIZED,102.542000 +Gamma,LogPDF,20000,PARALLEL,158.958000 +Gamma,LogPDF,20000,WORK_STEALING,95.916000 +Gamma,CDF,20000,SCALAR,1060.958000 +Gamma,CDF,20000,VECTORIZED,740.875000 +Gamma,CDF,20000,PARALLEL,236.375000 +Gamma,CDF,20000,WORK_STEALING,397.542000 +Gamma,PDF,50000,SCALAR,1873.375000 +Gamma,PDF,50000,VECTORIZED,396.750000 +Gamma,PDF,50000,PARALLEL,161.625000 +Gamma,PDF,50000,WORK_STEALING,231.166000 +Gamma,LogPDF,50000,SCALAR,946.459000 +Gamma,LogPDF,50000,VECTORIZED,262.500000 +Gamma,LogPDF,50000,PARALLEL,133.208000 +Gamma,LogPDF,50000,WORK_STEALING,171.792000 +Gamma,CDF,50000,SCALAR,2648.666000 +Gamma,CDF,50000,VECTORIZED,1850.083000 +Gamma,CDF,50000,PARALLEL,497.125000 +Gamma,CDF,50000,WORK_STEALING,630.458000 +Gamma,PDF,100000,SCALAR,3745.041000 +Gamma,PDF,100000,VECTORIZED,793.125000 +Gamma,PDF,100000,PARALLEL,236.500000 +Gamma,PDF,100000,WORK_STEALING,351.708000 +Gamma,LogPDF,100000,SCALAR,1896.708000 +Gamma,LogPDF,100000,VECTORIZED,524.459000 +Gamma,LogPDF,100000,PARALLEL,164.375000 +Gamma,LogPDF,100000,WORK_STEALING,284.834000 +Gamma,CDF,100000,SCALAR,6832.584000 +Gamma,CDF,100000,VECTORIZED,3736.708000 +Gamma,CDF,100000,PARALLEL,1035.417000 +Gamma,CDF,100000,WORK_STEALING,1178.583000 +Gamma,PDF,250000,SCALAR,9364.375000 +Gamma,PDF,250000,VECTORIZED,2008.416000 +Gamma,PDF,250000,PARALLEL,542.250000 +Gamma,PDF,250000,WORK_STEALING,632.750000 +Gamma,LogPDF,250000,SCALAR,4738.875000 
+Gamma,LogPDF,250000,VECTORIZED,1339.584000 +Gamma,LogPDF,250000,PARALLEL,301.792000 +Gamma,LogPDF,250000,WORK_STEALING,548.375000 +Gamma,CDF,250000,SCALAR,13286.084000 +Gamma,CDF,250000,VECTORIZED,9333.208000 +Gamma,CDF,250000,PARALLEL,2457.334000 +Gamma,CDF,250000,WORK_STEALING,2327.291000 +Gamma,PDF,500000,SCALAR,19928.333000 +Gamma,PDF,500000,VECTORIZED,4096.084000 +Gamma,PDF,500000,PARALLEL,1034.167000 +Gamma,PDF,500000,WORK_STEALING,1238.542000 +Gamma,LogPDF,500000,SCALAR,9478.750000 +Gamma,LogPDF,500000,VECTORIZED,2760.459000 +Gamma,LogPDF,500000,PARALLEL,586.208000 +Gamma,LogPDF,500000,WORK_STEALING,671.208000 +Gamma,CDF,500000,SCALAR,26626.750000 +Gamma,CDF,500000,VECTORIZED,18527.583000 +Gamma,CDF,500000,PARALLEL,4684.458000 +Gamma,CDF,500000,WORK_STEALING,4233.333000 +StudentT,PDF,8,SCALAR,0.209000 +StudentT,PDF,8,VECTORIZED,0.167000 +StudentT,PDF,8,PARALLEL,0.167000 +StudentT,PDF,8,WORK_STEALING,0.167000 +StudentT,LogPDF,8,SCALAR,0.167000 +StudentT,LogPDF,8,VECTORIZED,0.167000 +StudentT,LogPDF,8,PARALLEL,0.125000 +StudentT,LogPDF,8,WORK_STEALING,0.125000 +StudentT,CDF,8,SCALAR,0.833000 +StudentT,CDF,8,VECTORIZED,0.667000 +StudentT,CDF,8,PARALLEL,0.666000 +StudentT,CDF,8,WORK_STEALING,0.666000 +StudentT,PDF,16,SCALAR,0.416000 +StudentT,PDF,16,VECTORIZED,0.250000 +StudentT,PDF,16,PARALLEL,0.208000 +StudentT,PDF,16,WORK_STEALING,0.208000 +StudentT,LogPDF,16,SCALAR,0.334000 +StudentT,LogPDF,16,VECTORIZED,0.166000 +StudentT,LogPDF,16,PARALLEL,0.166000 +StudentT,LogPDF,16,WORK_STEALING,0.125000 +StudentT,CDF,16,SCALAR,1.417000 +StudentT,CDF,16,VECTORIZED,1.125000 +StudentT,CDF,16,PARALLEL,1.125000 +StudentT,CDF,16,WORK_STEALING,1.125000 +StudentT,PDF,32,SCALAR,0.750000 +StudentT,PDF,32,VECTORIZED,0.333000 +StudentT,PDF,32,PARALLEL,0.333000 +StudentT,PDF,32,WORK_STEALING,0.292000 +StudentT,LogPDF,32,SCALAR,0.625000 +StudentT,LogPDF,32,VECTORIZED,0.209000 +StudentT,LogPDF,32,PARALLEL,0.167000 +StudentT,LogPDF,32,WORK_STEALING,0.208000 
+StudentT,CDF,32,SCALAR,3.417000 +StudentT,CDF,32,VECTORIZED,2.667000 +StudentT,CDF,32,PARALLEL,2.666000 +StudentT,CDF,32,WORK_STEALING,2.666000 +StudentT,PDF,64,SCALAR,1.500000 +StudentT,PDF,64,VECTORIZED,0.542000 +StudentT,PDF,64,PARALLEL,0.542000 +StudentT,PDF,64,WORK_STEALING,0.542000 +StudentT,LogPDF,64,SCALAR,1.250000 +StudentT,LogPDF,64,VECTORIZED,0.333000 +StudentT,LogPDF,64,PARALLEL,0.292000 +StudentT,LogPDF,64,WORK_STEALING,0.291000 +StudentT,CDF,64,SCALAR,6.666000 +StudentT,CDF,64,VECTORIZED,5.334000 +StudentT,CDF,64,PARALLEL,5.417000 +StudentT,CDF,64,WORK_STEALING,5.416000 +StudentT,PDF,128,SCALAR,2.833000 +StudentT,PDF,128,VECTORIZED,0.958000 +StudentT,PDF,128,PARALLEL,1.041000 +StudentT,PDF,128,WORK_STEALING,1.042000 +StudentT,LogPDF,128,SCALAR,2.500000 +StudentT,LogPDF,128,VECTORIZED,0.583000 +StudentT,LogPDF,128,PARALLEL,0.500000 +StudentT,LogPDF,128,WORK_STEALING,0.541000 +StudentT,CDF,128,SCALAR,13.167000 +StudentT,CDF,128,VECTORIZED,10.625000 +StudentT,CDF,128,PARALLEL,10.625000 +StudentT,CDF,128,WORK_STEALING,10.667000 +StudentT,PDF,256,SCALAR,5.625000 +StudentT,PDF,256,VECTORIZED,1.917000 +StudentT,PDF,256,PARALLEL,2.000000 +StudentT,PDF,256,WORK_STEALING,2.000000 +StudentT,LogPDF,256,SCALAR,4.917000 +StudentT,LogPDF,256,VECTORIZED,1.166000 +StudentT,LogPDF,256,PARALLEL,0.958000 +StudentT,LogPDF,256,WORK_STEALING,0.917000 +StudentT,CDF,256,SCALAR,26.708000 +StudentT,CDF,256,VECTORIZED,22.292000 +StudentT,CDF,256,PARALLEL,22.292000 +StudentT,CDF,256,WORK_STEALING,22.333000 +StudentT,PDF,512,SCALAR,11.291000 +StudentT,PDF,512,VECTORIZED,3.792000 +StudentT,PDF,512,PARALLEL,3.958000 +StudentT,PDF,512,WORK_STEALING,3.917000 +StudentT,LogPDF,512,SCALAR,9.750000 +StudentT,LogPDF,512,VECTORIZED,2.292000 +StudentT,LogPDF,512,PARALLEL,1.916000 +StudentT,LogPDF,512,WORK_STEALING,1.834000 +StudentT,CDF,512,SCALAR,52.417000 +StudentT,CDF,512,VECTORIZED,43.750000 +StudentT,CDF,512,PARALLEL,43.834000 +StudentT,CDF,512,WORK_STEALING,43.792000 
+StudentT,PDF,1000,SCALAR,21.958000 +StudentT,PDF,1000,VECTORIZED,7.167000 +StudentT,PDF,1000,PARALLEL,7.667000 +StudentT,PDF,1000,WORK_STEALING,7.667000 +StudentT,LogPDF,1000,SCALAR,19.083000 +StudentT,LogPDF,1000,VECTORIZED,4.459000 +StudentT,LogPDF,1000,PARALLEL,3.583000 +StudentT,LogPDF,1000,WORK_STEALING,3.542000 +StudentT,CDF,1000,SCALAR,104.833000 +StudentT,CDF,1000,VECTORIZED,88.125000 +StudentT,CDF,1000,PARALLEL,88.167000 +StudentT,CDF,1000,WORK_STEALING,88.167000 +StudentT,PDF,2000,SCALAR,43.875000 +StudentT,PDF,2000,VECTORIZED,14.209000 +StudentT,PDF,2000,PARALLEL,15.541000 +StudentT,PDF,2000,WORK_STEALING,15.500000 +StudentT,LogPDF,2000,SCALAR,38.083000 +StudentT,LogPDF,2000,VECTORIZED,8.792000 +StudentT,LogPDF,2000,PARALLEL,7.542000 +StudentT,LogPDF,2000,WORK_STEALING,7.583000 +StudentT,CDF,2000,SCALAR,212.250000 +StudentT,CDF,2000,VECTORIZED,177.625000 +StudentT,CDF,2000,PARALLEL,176.459000 +StudentT,CDF,2000,WORK_STEALING,179.042000 +StudentT,PDF,5000,SCALAR,109.667000 +StudentT,PDF,5000,VECTORIZED,36.375000 +StudentT,PDF,5000,PARALLEL,38.875000 +StudentT,PDF,5000,WORK_STEALING,38.750000 +StudentT,LogPDF,5000,SCALAR,94.916000 +StudentT,LogPDF,5000,VECTORIZED,23.292000 +StudentT,LogPDF,5000,PARALLEL,21.250000 +StudentT,LogPDF,5000,WORK_STEALING,21.125000 +StudentT,CDF,5000,SCALAR,525.791000 +StudentT,CDF,5000,VECTORIZED,450.833000 +StudentT,CDF,5000,PARALLEL,450.833000 +StudentT,CDF,5000,WORK_STEALING,447.542000 +StudentT,PDF,10000,SCALAR,222.750000 +StudentT,PDF,10000,VECTORIZED,76.416000 +StudentT,PDF,10000,PARALLEL,151.500000 +StudentT,PDF,10000,WORK_STEALING,193.667000 +StudentT,LogPDF,10000,SCALAR,192.166000 +StudentT,LogPDF,10000,VECTORIZED,51.625000 +StudentT,LogPDF,10000,PARALLEL,187.167000 +StudentT,LogPDF,10000,WORK_STEALING,161.500000 +StudentT,CDF,10000,SCALAR,1059.917000 +StudentT,CDF,10000,VECTORIZED,889.958000 +StudentT,CDF,10000,PARALLEL,890.041000 +StudentT,CDF,10000,WORK_STEALING,890.292000 +StudentT,PDF,20000,SCALAR,439.416000 
+StudentT,PDF,20000,VECTORIZED,154.916000 +StudentT,PDF,20000,PARALLEL,124.083000 +StudentT,PDF,20000,WORK_STEALING,104.042000 +StudentT,LogPDF,20000,SCALAR,380.583000 +StudentT,LogPDF,20000,VECTORIZED,101.583000 +StudentT,LogPDF,20000,PARALLEL,226.292000 +StudentT,LogPDF,20000,WORK_STEALING,188.500000 +StudentT,CDF,20000,SCALAR,2104.041000 +StudentT,CDF,20000,VECTORIZED,1793.209000 +StudentT,CDF,20000,PARALLEL,1801.667000 +StudentT,CDF,20000,WORK_STEALING,1781.250000 +StudentT,PDF,50000,SCALAR,1096.792000 +StudentT,PDF,50000,VECTORIZED,386.084000 +StudentT,PDF,50000,PARALLEL,150.584000 +StudentT,PDF,50000,WORK_STEALING,144.875000 +StudentT,LogPDF,50000,SCALAR,950.292000 +StudentT,LogPDF,50000,VECTORIZED,251.208000 +StudentT,LogPDF,50000,PARALLEL,121.958000 +StudentT,LogPDF,50000,WORK_STEALING,124.125000 +StudentT,CDF,50000,SCALAR,5279.375000 +StudentT,CDF,50000,VECTORIZED,4434.625000 +StudentT,CDF,50000,PARALLEL,4434.417000 +StudentT,CDF,50000,WORK_STEALING,4448.250000 +StudentT,PDF,100000,SCALAR,2194.708000 +StudentT,PDF,100000,VECTORIZED,768.708000 +StudentT,PDF,100000,PARALLEL,219.000000 +StudentT,PDF,100000,WORK_STEALING,226.042000 +StudentT,LogPDF,100000,SCALAR,1899.917000 +StudentT,LogPDF,100000,VECTORIZED,500.292000 +StudentT,LogPDF,100000,PARALLEL,167.458000 +StudentT,LogPDF,100000,WORK_STEALING,162.333000 +StudentT,CDF,100000,SCALAR,10490.792000 +StudentT,CDF,100000,VECTORIZED,8873.583000 +StudentT,CDF,100000,PARALLEL,8879.166000 +StudentT,CDF,100000,WORK_STEALING,8875.250000 +StudentT,PDF,250000,SCALAR,5491.125000 +StudentT,PDF,250000,VECTORIZED,1936.250000 +StudentT,PDF,250000,PARALLEL,482.042000 +StudentT,PDF,250000,WORK_STEALING,486.458000 +StudentT,LogPDF,250000,SCALAR,4753.750000 +StudentT,LogPDF,250000,VECTORIZED,1268.917000 +StudentT,LogPDF,250000,PARALLEL,359.292000 +StudentT,LogPDF,250000,WORK_STEALING,353.084000 +StudentT,CDF,250000,SCALAR,26765.708000 +StudentT,CDF,250000,VECTORIZED,22263.459000 +StudentT,CDF,250000,PARALLEL,22272.625000 
+StudentT,CDF,250000,WORK_STEALING,22232.292000 +StudentT,PDF,500000,SCALAR,10971.917000 +StudentT,PDF,500000,VECTORIZED,4001.125000 +StudentT,PDF,500000,PARALLEL,1015.500000 +StudentT,PDF,500000,WORK_STEALING,949.042000 +StudentT,LogPDF,500000,SCALAR,9505.459000 +StudentT,LogPDF,500000,VECTORIZED,2672.042000 +StudentT,LogPDF,500000,PARALLEL,680.625000 +StudentT,LogPDF,500000,WORK_STEALING,691.417000 +StudentT,CDF,500000,SCALAR,52478.042000 +StudentT,CDF,500000,VECTORIZED,44435.291000 +StudentT,CDF,500000,PARALLEL,44347.791000 +StudentT,CDF,500000,WORK_STEALING,44718.875000 +Beta,PDF,8,SCALAR,0.208000 +Beta,PDF,8,VECTORIZED,0.250000 +Beta,PDF,8,PARALLEL,0.167000 +Beta,PDF,8,WORK_STEALING,0.167000 +Beta,LogPDF,8,SCALAR,0.167000 +Beta,LogPDF,8,VECTORIZED,0.208000 +Beta,LogPDF,8,PARALLEL,0.125000 +Beta,LogPDF,8,WORK_STEALING,0.125000 +Beta,CDF,8,SCALAR,0.500000 +Beta,CDF,8,VECTORIZED,0.375000 +Beta,CDF,8,PARALLEL,0.500000 +Beta,CDF,8,WORK_STEALING,0.500000 +Beta,PDF,16,SCALAR,0.375000 +Beta,PDF,16,VECTORIZED,0.333000 +Beta,PDF,16,PARALLEL,0.250000 +Beta,PDF,16,WORK_STEALING,0.250000 +Beta,LogPDF,16,SCALAR,0.334000 +Beta,LogPDF,16,VECTORIZED,0.292000 +Beta,LogPDF,16,PARALLEL,0.208000 +Beta,LogPDF,16,WORK_STEALING,0.209000 +Beta,CDF,16,SCALAR,1.000000 +Beta,CDF,16,VECTORIZED,0.750000 +Beta,CDF,16,PARALLEL,1.042000 +Beta,CDF,16,WORK_STEALING,1.042000 +Beta,PDF,32,SCALAR,0.791000 +Beta,PDF,32,VECTORIZED,0.583000 +Beta,PDF,32,PARALLEL,0.458000 +Beta,PDF,32,WORK_STEALING,0.458000 +Beta,LogPDF,32,SCALAR,0.709000 +Beta,LogPDF,32,VECTORIZED,0.417000 +Beta,LogPDF,32,PARALLEL,0.334000 +Beta,LogPDF,32,WORK_STEALING,0.334000 +Beta,CDF,32,SCALAR,2.000000 +Beta,CDF,32,VECTORIZED,1.416000 +Beta,CDF,32,PARALLEL,2.000000 +Beta,CDF,32,WORK_STEALING,2.000000 +Beta,PDF,64,SCALAR,1.583000 +Beta,PDF,64,VECTORIZED,1.000000 +Beta,PDF,64,PARALLEL,0.875000 +Beta,PDF,64,WORK_STEALING,0.875000 +Beta,LogPDF,64,SCALAR,1.416000 +Beta,LogPDF,64,VECTORIZED,0.875000 +Beta,LogPDF,64,PARALLEL,0.666000 
+Beta,LogPDF,64,WORK_STEALING,0.625000 +Beta,CDF,64,SCALAR,3.500000 +Beta,CDF,64,VECTORIZED,2.625000 +Beta,CDF,64,PARALLEL,3.500000 +Beta,CDF,64,WORK_STEALING,3.500000 +Beta,PDF,128,SCALAR,3.459000 +Beta,PDF,128,VECTORIZED,1.750000 +Beta,PDF,128,PARALLEL,1.500000 +Beta,PDF,128,WORK_STEALING,1.541000 +Beta,LogPDF,128,SCALAR,2.916000 +Beta,LogPDF,128,VECTORIZED,1.333000 +Beta,LogPDF,128,PARALLEL,1.083000 +Beta,LogPDF,128,WORK_STEALING,1.125000 +Beta,CDF,128,SCALAR,7.667000 +Beta,CDF,128,VECTORIZED,5.625000 +Beta,CDF,128,PARALLEL,7.834000 +Beta,CDF,128,WORK_STEALING,7.833000 +Beta,PDF,256,SCALAR,7.250000 +Beta,PDF,256,VECTORIZED,3.209000 +Beta,PDF,256,PARALLEL,2.875000 +Beta,PDF,256,WORK_STEALING,2.834000 +Beta,LogPDF,256,SCALAR,5.750000 +Beta,LogPDF,256,VECTORIZED,2.541000 +Beta,LogPDF,256,PARALLEL,2.042000 +Beta,LogPDF,256,WORK_STEALING,1.958000 +Beta,CDF,256,SCALAR,16.250000 +Beta,CDF,256,VECTORIZED,11.959000 +Beta,CDF,256,PARALLEL,16.292000 +Beta,CDF,256,WORK_STEALING,16.334000 +Beta,PDF,512,SCALAR,14.625000 +Beta,PDF,512,VECTORIZED,6.875000 +Beta,PDF,512,PARALLEL,6.083000 +Beta,PDF,512,WORK_STEALING,5.959000 +Beta,LogPDF,512,SCALAR,11.584000 +Beta,LogPDF,512,VECTORIZED,5.458000 +Beta,LogPDF,512,PARALLEL,4.417000 +Beta,LogPDF,512,WORK_STEALING,4.291000 +Beta,CDF,512,SCALAR,30.291000 +Beta,CDF,512,VECTORIZED,22.666000 +Beta,CDF,512,PARALLEL,30.250000 +Beta,CDF,512,WORK_STEALING,30.209000 +Beta,PDF,1000,SCALAR,29.208000 +Beta,PDF,1000,VECTORIZED,13.833000 +Beta,PDF,1000,PARALLEL,12.542000 +Beta,PDF,1000,WORK_STEALING,12.250000 +Beta,LogPDF,1000,SCALAR,22.416000 +Beta,LogPDF,1000,VECTORIZED,10.500000 +Beta,LogPDF,1000,PARALLEL,8.916000 +Beta,LogPDF,1000,WORK_STEALING,8.750000 +Beta,CDF,1000,SCALAR,60.334000 +Beta,CDF,1000,VECTORIZED,44.833000 +Beta,CDF,1000,PARALLEL,60.250000 +Beta,CDF,1000,WORK_STEALING,60.209000 +Beta,PDF,2000,SCALAR,60.833000 +Beta,PDF,2000,VECTORIZED,32.917000 +Beta,PDF,2000,PARALLEL,28.708000 +Beta,PDF,2000,WORK_STEALING,95.750000 
+Beta,LogPDF,2000,SCALAR,45.208000 +Beta,LogPDF,2000,VECTORIZED,22.583000 +Beta,LogPDF,2000,PARALLEL,20.708000 +Beta,LogPDF,2000,WORK_STEALING,20.583000 +Beta,CDF,2000,SCALAR,123.000000 +Beta,CDF,2000,VECTORIZED,92.500000 +Beta,CDF,2000,PARALLEL,123.125000 +Beta,CDF,2000,WORK_STEALING,122.667000 +Beta,PDF,5000,SCALAR,151.208000 +Beta,PDF,5000,VECTORIZED,104.791000 +Beta,PDF,5000,PARALLEL,87.500000 +Beta,PDF,5000,WORK_STEALING,86.417000 +Beta,LogPDF,5000,SCALAR,112.375000 +Beta,LogPDF,5000,VECTORIZED,71.542000 +Beta,LogPDF,5000,PARALLEL,56.500000 +Beta,LogPDF,5000,WORK_STEALING,54.416000 +Beta,CDF,5000,SCALAR,307.375000 +Beta,CDF,5000,VECTORIZED,229.541000 +Beta,CDF,5000,PARALLEL,306.625000 +Beta,CDF,5000,WORK_STEALING,305.792000 +Beta,PDF,10000,SCALAR,304.042000 +Beta,PDF,10000,VECTORIZED,226.625000 +Beta,PDF,10000,PARALLEL,642.250000 +Beta,PDF,10000,WORK_STEALING,666.458000 +Beta,LogPDF,10000,SCALAR,226.042000 +Beta,LogPDF,10000,VECTORIZED,204.083000 +Beta,LogPDF,10000,PARALLEL,501.916000 +Beta,LogPDF,10000,WORK_STEALING,489.208000 +Beta,CDF,10000,SCALAR,610.750000 +Beta,CDF,10000,VECTORIZED,458.042000 +Beta,CDF,10000,PARALLEL,610.375000 +Beta,CDF,10000,WORK_STEALING,768.166000 +Beta,PDF,20000,SCALAR,611.167000 +Beta,PDF,20000,VECTORIZED,462.500000 +Beta,PDF,20000,PARALLEL,1298.125000 +Beta,PDF,20000,WORK_STEALING,1322.208000 +Beta,LogPDF,20000,SCALAR,448.375000 +Beta,LogPDF,20000,VECTORIZED,326.166000 +Beta,LogPDF,20000,PARALLEL,960.958000 +Beta,LogPDF,20000,WORK_STEALING,939.250000 +Beta,CDF,20000,SCALAR,1214.250000 +Beta,CDF,20000,VECTORIZED,912.083000 +Beta,CDF,20000,PARALLEL,1213.542000 +Beta,CDF,20000,WORK_STEALING,1211.000000 +Beta,PDF,50000,SCALAR,1520.875000 +Beta,PDF,50000,VECTORIZED,1192.542000 +Beta,PDF,50000,PARALLEL,3252.625000 +Beta,PDF,50000,WORK_STEALING,3302.750000 +Beta,LogPDF,50000,SCALAR,1125.750000 +Beta,LogPDF,50000,VECTORIZED,832.625000 +Beta,LogPDF,50000,PARALLEL,2099.792000 +Beta,LogPDF,50000,WORK_STEALING,2267.375000 
+Beta,CDF,50000,SCALAR,3097.417000 +Beta,CDF,50000,VECTORIZED,2372.000000 +Beta,CDF,50000,PARALLEL,3066.334000 +Beta,CDF,50000,WORK_STEALING,3084.708000 +Beta,PDF,100000,SCALAR,3046.250000 +Beta,PDF,100000,VECTORIZED,2425.792000 +Beta,PDF,100000,PARALLEL,6294.625000 +Beta,PDF,100000,WORK_STEALING,5738.834000 +Beta,LogPDF,100000,SCALAR,2244.708000 +Beta,LogPDF,100000,VECTORIZED,1660.291000 +Beta,LogPDF,100000,PARALLEL,4758.709000 +Beta,LogPDF,100000,WORK_STEALING,4942.750000 +Beta,CDF,100000,SCALAR,7648.958000 +Beta,CDF,100000,VECTORIZED,5903.666000 +Beta,CDF,100000,PARALLEL,7236.709000 +Beta,CDF,100000,WORK_STEALING,6666.625000 +Beta,PDF,250000,SCALAR,7630.375000 +Beta,PDF,250000,VECTORIZED,5993.750000 +Beta,PDF,250000,PARALLEL,16433.250000 +Beta,PDF,250000,WORK_STEALING,17377.458000 +Beta,LogPDF,250000,SCALAR,6379.333000 +Beta,LogPDF,250000,VECTORIZED,4218.208000 +Beta,LogPDF,250000,PARALLEL,11317.292000 +Beta,LogPDF,250000,WORK_STEALING,11753.958000 +Beta,CDF,250000,SCALAR,15244.875000 +Beta,CDF,250000,VECTORIZED,11486.542000 +Beta,CDF,250000,PARALLEL,15221.917000 +Beta,CDF,250000,WORK_STEALING,15255.125000 +Beta,PDF,500000,SCALAR,15195.792000 +Beta,PDF,500000,VECTORIZED,16395.792000 +Beta,PDF,500000,PARALLEL,33924.875000 +Beta,PDF,500000,WORK_STEALING,32549.958000 +Beta,LogPDF,500000,SCALAR,11310.375000 +Beta,LogPDF,500000,VECTORIZED,8464.750000 +Beta,LogPDF,500000,PARALLEL,22130.042000 +Beta,LogPDF,500000,WORK_STEALING,22490.125000 +Beta,CDF,500000,SCALAR,30511.208000 +Beta,CDF,500000,VECTORIZED,22979.416000 +Beta,CDF,500000,PARALLEL,30506.958000 +Beta,CDF,500000,WORK_STEALING,30510.833000 +ChiSquared,PDF,8,SCALAR,0.333000 +ChiSquared,PDF,8,VECTORIZED,0.167000 +ChiSquared,PDF,8,PARALLEL,0.084000 +ChiSquared,PDF,8,WORK_STEALING,0.125000 +ChiSquared,LogPDF,8,SCALAR,0.167000 +ChiSquared,LogPDF,8,VECTORIZED,0.125000 +ChiSquared,LogPDF,8,PARALLEL,0.042000 +ChiSquared,LogPDF,8,WORK_STEALING,0.042000 +ChiSquared,CDF,8,SCALAR,0.333000 
+ChiSquared,CDF,8,VECTORIZED,0.208000 +ChiSquared,CDF,8,PARALLEL,0.209000 +ChiSquared,CDF,8,WORK_STEALING,0.209000 +ChiSquared,PDF,16,SCALAR,0.625000 +ChiSquared,PDF,16,VECTORIZED,0.208000 +ChiSquared,PDF,16,PARALLEL,0.166000 +ChiSquared,PDF,16,WORK_STEALING,0.166000 +ChiSquared,LogPDF,16,SCALAR,0.334000 +ChiSquared,LogPDF,16,VECTORIZED,0.167000 +ChiSquared,LogPDF,16,PARALLEL,0.084000 +ChiSquared,LogPDF,16,WORK_STEALING,0.083000 +ChiSquared,CDF,16,SCALAR,0.709000 +ChiSquared,CDF,16,VECTORIZED,0.417000 +ChiSquared,CDF,16,PARALLEL,0.417000 +ChiSquared,CDF,16,WORK_STEALING,0.416000 +ChiSquared,PDF,32,SCALAR,1.250000 +ChiSquared,PDF,32,VECTORIZED,0.375000 +ChiSquared,PDF,32,PARALLEL,0.292000 +ChiSquared,PDF,32,WORK_STEALING,0.250000 +ChiSquared,LogPDF,32,SCALAR,0.667000 +ChiSquared,LogPDF,32,VECTORIZED,0.208000 +ChiSquared,LogPDF,32,PARALLEL,0.166000 +ChiSquared,LogPDF,32,WORK_STEALING,0.166000 +ChiSquared,CDF,32,SCALAR,1.458000 +ChiSquared,CDF,32,VECTORIZED,0.792000 +ChiSquared,CDF,32,PARALLEL,0.750000 +ChiSquared,CDF,32,WORK_STEALING,0.750000 +ChiSquared,PDF,64,SCALAR,2.417000 +ChiSquared,PDF,64,VECTORIZED,0.542000 +ChiSquared,PDF,64,PARALLEL,0.500000 +ChiSquared,PDF,64,WORK_STEALING,0.500000 +ChiSquared,LogPDF,64,SCALAR,1.250000 +ChiSquared,LogPDF,64,VECTORIZED,0.334000 +ChiSquared,LogPDF,64,PARALLEL,0.250000 +ChiSquared,LogPDF,64,WORK_STEALING,0.209000 +ChiSquared,CDF,64,SCALAR,3.250000 +ChiSquared,CDF,64,VECTORIZED,1.667000 +ChiSquared,CDF,64,PARALLEL,1.583000 +ChiSquared,CDF,64,WORK_STEALING,1.417000 +ChiSquared,PDF,128,SCALAR,4.833000 +ChiSquared,PDF,128,VECTORIZED,0.959000 +ChiSquared,PDF,128,PARALLEL,0.958000 +ChiSquared,PDF,128,WORK_STEALING,0.958000 +ChiSquared,LogPDF,128,SCALAR,2.458000 +ChiSquared,LogPDF,128,VECTORIZED,0.583000 +ChiSquared,LogPDF,128,PARALLEL,0.541000 +ChiSquared,LogPDF,128,WORK_STEALING,0.458000 +ChiSquared,CDF,128,SCALAR,6.792000 +ChiSquared,CDF,128,VECTORIZED,3.250000 +ChiSquared,CDF,128,PARALLEL,3.416000 
+ChiSquared,CDF,128,WORK_STEALING,3.292000 +ChiSquared,PDF,256,SCALAR,9.625000 +ChiSquared,PDF,256,VECTORIZED,1.917000 +ChiSquared,PDF,256,PARALLEL,1.875000 +ChiSquared,PDF,256,WORK_STEALING,1.875000 +ChiSquared,LogPDF,256,SCALAR,4.875000 +ChiSquared,LogPDF,256,VECTORIZED,1.250000 +ChiSquared,LogPDF,256,PARALLEL,1.000000 +ChiSquared,LogPDF,256,WORK_STEALING,0.875000 +ChiSquared,CDF,256,SCALAR,14.250000 +ChiSquared,CDF,256,VECTORIZED,6.125000 +ChiSquared,CDF,256,PARALLEL,6.875000 +ChiSquared,CDF,256,WORK_STEALING,6.875000 +ChiSquared,PDF,512,SCALAR,19.167000 +ChiSquared,PDF,512,VECTORIZED,3.667000 +ChiSquared,PDF,512,PARALLEL,3.791000 +ChiSquared,PDF,512,WORK_STEALING,3.625000 +ChiSquared,LogPDF,512,SCALAR,9.708000 +ChiSquared,LogPDF,512,VECTORIZED,2.291000 +ChiSquared,LogPDF,512,PARALLEL,1.958000 +ChiSquared,LogPDF,512,WORK_STEALING,1.709000 +ChiSquared,CDF,512,SCALAR,28.250000 +ChiSquared,CDF,512,VECTORIZED,14.208000 +ChiSquared,CDF,512,PARALLEL,15.917000 +ChiSquared,CDF,512,WORK_STEALING,16.167000 +ChiSquared,PDF,1000,SCALAR,37.625000 +ChiSquared,PDF,1000,VECTORIZED,7.084000 +ChiSquared,PDF,1000,PARALLEL,7.292000 +ChiSquared,PDF,1000,WORK_STEALING,7.083000 +ChiSquared,LogPDF,1000,SCALAR,18.959000 +ChiSquared,LogPDF,1000,VECTORIZED,4.291000 +ChiSquared,LogPDF,1000,PARALLEL,3.709000 +ChiSquared,LogPDF,1000,WORK_STEALING,3.333000 +ChiSquared,CDF,1000,SCALAR,56.208000 +ChiSquared,CDF,1000,VECTORIZED,32.500000 +ChiSquared,CDF,1000,PARALLEL,36.708000 +ChiSquared,CDF,1000,WORK_STEALING,36.291000 +ChiSquared,PDF,2000,SCALAR,74.833000 +ChiSquared,PDF,2000,VECTORIZED,14.250000 +ChiSquared,PDF,2000,PARALLEL,69.542000 +ChiSquared,PDF,2000,WORK_STEALING,58.791000 +ChiSquared,LogPDF,2000,SCALAR,37.875000 +ChiSquared,LogPDF,2000,VECTORIZED,8.792000 +ChiSquared,LogPDF,2000,PARALLEL,47.875000 +ChiSquared,LogPDF,2000,WORK_STEALING,42.375000 +ChiSquared,CDF,2000,SCALAR,113.791000 +ChiSquared,CDF,2000,VECTORIZED,73.125000 +ChiSquared,CDF,2000,PARALLEL,63.625000 
+ChiSquared,CDF,2000,WORK_STEALING,93.750000 +ChiSquared,PDF,5000,SCALAR,187.375000 +ChiSquared,PDF,5000,VECTORIZED,35.875000 +ChiSquared,PDF,5000,PARALLEL,99.417000 +ChiSquared,PDF,5000,WORK_STEALING,69.791000 +ChiSquared,LogPDF,5000,SCALAR,94.541000 +ChiSquared,LogPDF,5000,VECTORIZED,23.000000 +ChiSquared,LogPDF,5000,PARALLEL,119.000000 +ChiSquared,LogPDF,5000,WORK_STEALING,60.167000 +ChiSquared,CDF,5000,SCALAR,286.916000 +ChiSquared,CDF,5000,VECTORIZED,201.250000 +ChiSquared,CDF,5000,PARALLEL,105.084000 +ChiSquared,CDF,5000,WORK_STEALING,152.250000 +ChiSquared,PDF,10000,SCALAR,374.708000 +ChiSquared,PDF,10000,VECTORIZED,75.542000 +ChiSquared,PDF,10000,PARALLEL,216.541000 +ChiSquared,PDF,10000,WORK_STEALING,105.042000 +ChiSquared,LogPDF,10000,SCALAR,189.208000 +ChiSquared,LogPDF,10000,VECTORIZED,48.041000 +ChiSquared,LogPDF,10000,PARALLEL,185.458000 +ChiSquared,LogPDF,10000,WORK_STEALING,80.250000 +ChiSquared,CDF,10000,SCALAR,1348.375000 +ChiSquared,CDF,10000,VECTORIZED,409.375000 +ChiSquared,CDF,10000,PARALLEL,193.666000 +ChiSquared,CDF,10000,WORK_STEALING,217.958000 +ChiSquared,PDF,20000,SCALAR,747.958000 +ChiSquared,PDF,20000,VECTORIZED,149.333000 +ChiSquared,PDF,20000,PARALLEL,185.125000 +ChiSquared,PDF,20000,WORK_STEALING,128.583000 +ChiSquared,LogPDF,20000,SCALAR,378.292000 +ChiSquared,LogPDF,20000,VECTORIZED,95.542000 +ChiSquared,LogPDF,20000,PARALLEL,204.042000 +ChiSquared,LogPDF,20000,WORK_STEALING,156.500000 +ChiSquared,CDF,20000,SCALAR,1146.208000 +ChiSquared,CDF,20000,VECTORIZED,824.958000 +ChiSquared,CDF,20000,PARALLEL,287.750000 +ChiSquared,CDF,20000,WORK_STEALING,414.667000 +ChiSquared,PDF,50000,SCALAR,1874.167000 +ChiSquared,PDF,50000,VECTORIZED,374.917000 +ChiSquared,PDF,50000,PARALLEL,242.959000 +ChiSquared,PDF,50000,WORK_STEALING,245.416000 +ChiSquared,LogPDF,50000,SCALAR,945.542000 +ChiSquared,LogPDF,50000,VECTORIZED,241.166000 +ChiSquared,LogPDF,50000,PARALLEL,192.167000 +ChiSquared,LogPDF,50000,WORK_STEALING,179.958000 
+ChiSquared,CDF,50000,SCALAR,2883.291000 +ChiSquared,CDF,50000,VECTORIZED,2074.292000 +ChiSquared,CDF,50000,PARALLEL,534.500000 +ChiSquared,CDF,50000,WORK_STEALING,624.958000 +ChiSquared,PDF,100000,SCALAR,3748.834000 +ChiSquared,PDF,100000,VECTORIZED,752.750000 +ChiSquared,PDF,100000,PARALLEL,266.417000 +ChiSquared,PDF,100000,WORK_STEALING,343.750000 +ChiSquared,LogPDF,100000,SCALAR,1892.875000 +ChiSquared,LogPDF,100000,VECTORIZED,484.708000 +ChiSquared,LogPDF,100000,PARALLEL,157.042000 +ChiSquared,LogPDF,100000,WORK_STEALING,282.958000 +ChiSquared,CDF,100000,SCALAR,5748.333000 +ChiSquared,CDF,100000,VECTORIZED,4128.500000 +ChiSquared,CDF,100000,PARALLEL,1124.417000 +ChiSquared,CDF,100000,WORK_STEALING,1268.459000 +ChiSquared,PDF,250000,SCALAR,9369.291000 +ChiSquared,PDF,250000,VECTORIZED,1897.792000 +ChiSquared,PDF,250000,PARALLEL,543.292000 +ChiSquared,PDF,250000,WORK_STEALING,616.875000 +ChiSquared,LogPDF,250000,SCALAR,4736.375000 +ChiSquared,LogPDF,250000,VECTORIZED,1229.334000 +ChiSquared,LogPDF,250000,PARALLEL,303.500000 +ChiSquared,LogPDF,250000,WORK_STEALING,414.792000 +ChiSquared,CDF,250000,SCALAR,14364.125000 +ChiSquared,CDF,250000,VECTORIZED,10392.917000 +ChiSquared,CDF,250000,PARALLEL,2544.417000 +ChiSquared,CDF,250000,WORK_STEALING,2675.709000 +ChiSquared,PDF,500000,SCALAR,18767.625000 +ChiSquared,PDF,500000,VECTORIZED,4068.166000 +ChiSquared,PDF,500000,PARALLEL,1522.083000 +ChiSquared,PDF,500000,WORK_STEALING,1644.792000 +ChiSquared,LogPDF,500000,SCALAR,11370.666000 +ChiSquared,LogPDF,500000,VECTORIZED,2557.167000 +ChiSquared,LogPDF,500000,PARALLEL,536.334000 +ChiSquared,LogPDF,500000,WORK_STEALING,674.500000 +ChiSquared,CDF,500000,SCALAR,34489.792000 +ChiSquared,CDF,500000,VECTORIZED,21886.833000 +ChiSquared,CDF,500000,PARALLEL,5311.209000 +ChiSquared,CDF,500000,WORK_STEALING,8817.667000 diff --git a/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/summary.json 
b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/summary.json new file mode 100644 index 0000000..1cba3ea --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918/summary.json @@ -0,0 +1,188 @@ +{ + "run_id": "2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918", + "data_source": "strategy_profile_results.csv", + "metadata": { + "captured_at_utc": "2026-04-12T05-36-21Z", + "run_id": "2026-04-12T05-36-21Z_darwin-arm64_investigate-gaussian-avx512-perf_sha-6aef918", + "git_branch": "investigate-gaussian-avx512-perf", + "git_sha": "6aef918", + "project_root": "/Users/wolfman/Development/libstats", + "build_dir": "/Users/wolfman/Development/libstats/build", + "build_type": "Release", + "cxx_compiler": "", + "os": "darwin", + "arch": "arm64", + "cpu_brand": "Apple M1", + "physical_cores": "8", + "logical_cores": "8" + }, + "coverage": { + "distributions": [ + "Beta", + "ChiSquared", + "Discrete", + "Exponential", + "Gamma", + "Gaussian", + "Poisson", + "StudentT", + "Uniform" + ], + "operations": [ + "CDF", + "LogPDF", + "PDF" + ], + "batch_sizes": [ + 8, + 16, + 32, + 64, + 128, + 256, + 512, + 1000, + 2000, + 5000, + 10000, + 20000, + 50000, + 100000, + 250000, + 500000 + ], + "total_measurements": 1728 + }, + "strategy_win_counts": { + "VECTORIZED": 193, + "PARALLEL": 121, + "WORK_STEALING": 113, + "SCALAR": 5 + }, + "crossover_summary": { + "groups": 27, + "vectorized_never_wins": [], + "parallel_crossover_sizes": [ + { + "distribution": "Beta", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Beta", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "ChiSquared", + "operation": "CDF", + "vectorized_to_parallel": 32 + }, + { + "distribution": "ChiSquared", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "ChiSquared", + 
"operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Discrete", + "operation": "CDF", + "vectorized_to_parallel": 512 + }, + { + "distribution": "Discrete", + "operation": "LogPDF", + "vectorized_to_parallel": 250000 + }, + { + "distribution": "Discrete", + "operation": "PDF", + "vectorized_to_parallel": 128 + }, + { + "distribution": "Exponential", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Exponential", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Exponential", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gamma", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gamma", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gamma", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gaussian", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gaussian", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gaussian", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Poisson", + "operation": "CDF", + "vectorized_to_parallel": 2000 + }, + { + "distribution": "Poisson", + "operation": "LogPDF", + "vectorized_to_parallel": 50000 + }, + { + "distribution": "Poisson", + "operation": "PDF", + "vectorized_to_parallel": 50000 + }, + { + "distribution": "StudentT", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "StudentT", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "StudentT", + "operation": "PDF", + "vectorized_to_parallel": 16 + }, + { + "distribution": "Uniform", + "operation": "CDF", + "vectorized_to_parallel": 8 + } + ] + } +} diff --git a/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/best_strategies.csv 
b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/best_strategies.csv new file mode 100644 index 0000000..4a8eda1 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/best_strategies.csv @@ -0,0 +1,433 @@ +distribution,operation,batch_size,best_strategy,best_time_us,scalar_time_us,speedup_vs_scalar +Beta,CDF,8,VECTORIZED,1.824,2.278,1.249 +Beta,CDF,16,VECTORIZED,3.687,4.48,1.215 +Beta,CDF,32,VECTORIZED,6.567,8.273,1.26 +Beta,CDF,64,VECTORIZED,12.087,15.262,1.263 +Beta,CDF,128,VECTORIZED,26.992,34.087,1.263 +Beta,CDF,256,VECTORIZED,55.603,70.395,1.266 +Beta,CDF,512,VECTORIZED,104.853,133.026,1.269 +Beta,CDF,1000,VECTORIZED,204.678,261.676,1.278 +Beta,CDF,2000,VECTORIZED,415.186,528.214,1.272 +Beta,CDF,5000,VECTORIZED,1031.968,1312.694,1.272 +Beta,CDF,10000,VECTORIZED,2055.77,2638.883,1.284 +Beta,CDF,20000,VECTORIZED,4121.391,5303.753,1.287 +Beta,CDF,50000,VECTORIZED,10509.694,13645.976,1.298 +Beta,CDF,100000,VECTORIZED,21408.717,27187.49,1.27 +Beta,CDF,250000,VECTORIZED,55862.159,70341.281,1.259 +Beta,CDF,500000,VECTORIZED,108210.939,136394.416,1.26 +Beta,LogPDF,8,WORK_STEALING,0.789,0.841,1.066 +Beta,LogPDF,16,PARALLEL,1.134,1.614,1.423 +Beta,LogPDF,32,WORK_STEALING,1.774,3.107,1.751 +Beta,LogPDF,64,WORK_STEALING,3.265,5.909,1.81 +Beta,LogPDF,128,VECTORIZED,5.16,12.21,2.366 +Beta,LogPDF,256,VECTORIZED,9.32,24.528,2.632 +Beta,LogPDF,512,VECTORIZED,19.206,48.958,2.549 +Beta,LogPDF,1000,VECTORIZED,37.28,95.221,2.554 +Beta,LogPDF,2000,VECTORIZED,72.463,190.196,2.625 +Beta,LogPDF,5000,VECTORIZED,188.49,474.843,2.519 +Beta,LogPDF,10000,VECTORIZED,377.767,951.118,2.518 +Beta,LogPDF,20000,VECTORIZED,766.098,1919.34,2.505 +Beta,LogPDF,50000,VECTORIZED,1918.572,4855.107,2.531 +Beta,LogPDF,100000,VECTORIZED,3937.124,10055.888,2.554 +Beta,LogPDF,250000,VECTORIZED,10306.767,26880.563,2.608 +Beta,LogPDF,500000,VECTORIZED,21277.706,54250.002,2.55 
+Beta,PDF,8,WORK_STEALING,0.912,1.037,1.137 +Beta,PDF,16,PARALLEL,1.362,1.967,1.444 +Beta,PDF,32,PARALLEL,2.396,3.689,1.54 +Beta,PDF,64,WORK_STEALING,4.294,7.117,1.657 +Beta,PDF,128,VECTORIZED,6.006,14.715,2.45 +Beta,PDF,256,VECTORIZED,10.98,29.791,2.713 +Beta,PDF,512,VECTORIZED,22.243,58.461,2.628 +Beta,PDF,1000,VECTORIZED,43.503,114.29,2.627 +Beta,PDF,2000,VECTORIZED,86.042,228.795,2.659 +Beta,PDF,5000,VECTORIZED,223.072,571.624,2.563 +Beta,PDF,10000,VECTORIZED,440.552,1139.424,2.586 +Beta,PDF,20000,VECTORIZED,895.672,2299.9,2.568 +Beta,PDF,50000,VECTORIZED,2238.532,5801.108,2.591 +Beta,PDF,100000,VECTORIZED,4499.069,11534.897,2.564 +Beta,PDF,250000,VECTORIZED,11863.217,29528.332,2.489 +Beta,PDF,500000,VECTORIZED,24361.364,59917.84,2.46 +ChiSquared,CDF,8,PARALLEL,0.928,1.518,1.636 +ChiSquared,CDF,16,PARALLEL,1.92,2.977,1.551 +ChiSquared,CDF,32,PARALLEL,3.513,5.915,1.684 +ChiSquared,CDF,64,PARALLEL,7.138,11.891,1.666 +ChiSquared,CDF,128,PARALLEL,14.178,23.74,1.674 +ChiSquared,CDF,256,WORK_STEALING,28.433,47.412,1.667 +ChiSquared,CDF,512,WORK_STEALING,57.368,94.417,1.646 +ChiSquared,CDF,1000,WORK_STEALING,114.515,185.249,1.618 +ChiSquared,CDF,2000,WORK_STEALING,231.114,371.916,1.609 +ChiSquared,CDF,5000,WORK_STEALING,383.755,931.059,2.426 +ChiSquared,CDF,10000,WORK_STEALING,406.415,1865.907,4.591 +ChiSquared,CDF,20000,WORK_STEALING,633.81,3735.991,5.894 +ChiSquared,CDF,50000,WORK_STEALING,1137.076,9178.469,8.072 +ChiSquared,CDF,100000,WORK_STEALING,2166.615,18474.985,8.527 +ChiSquared,CDF,250000,WORK_STEALING,4681.841,46089.342,9.844 +ChiSquared,CDF,500000,WORK_STEALING,9477.315,94949.249,10.019 +ChiSquared,LogPDF,8,PARALLEL,0.319,0.839,2.63 +ChiSquared,LogPDF,16,PARALLEL,0.478,1.498,3.134 +ChiSquared,LogPDF,32,PARALLEL,0.804,2.954,3.674 +ChiSquared,LogPDF,64,PARALLEL,1.419,5.811,4.095 +ChiSquared,LogPDF,128,VECTORIZED,2.348,11.219,4.778 +ChiSquared,LogPDF,256,VECTORIZED,4.109,22.606,5.502 +ChiSquared,LogPDF,512,VECTORIZED,7.404,44.971,6.074 
+ChiSquared,LogPDF,1000,VECTORIZED,14.023,88.035,6.278 +ChiSquared,LogPDF,2000,VECTORIZED,28.153,176.214,6.259 +ChiSquared,LogPDF,5000,VECTORIZED,72.013,442.043,6.138 +ChiSquared,LogPDF,10000,VECTORIZED,144.79,883.501,6.102 +ChiSquared,LogPDF,20000,WORK_STEALING,266.186,1770.347,6.651 +ChiSquared,LogPDF,50000,WORK_STEALING,325.851,4289.635,13.164 +ChiSquared,LogPDF,100000,WORK_STEALING,466.344,8666.265,18.583 +ChiSquared,LogPDF,250000,WORK_STEALING,854.389,21812.825,25.53 +ChiSquared,LogPDF,500000,WORK_STEALING,1734.745,45162.959,26.034 +ChiSquared,PDF,8,PARALLEL,0.492,1.427,2.9 +ChiSquared,PDF,16,PARALLEL,0.772,2.848,3.689 +ChiSquared,PDF,32,PARALLEL,1.428,5.558,3.892 +ChiSquared,PDF,64,VECTORIZED,2.106,10.809,5.132 +ChiSquared,PDF,128,VECTORIZED,3.18,21.874,6.879 +ChiSquared,PDF,256,VECTORIZED,5.637,43.672,7.747 +ChiSquared,PDF,512,VECTORIZED,10.682,87.323,8.175 +ChiSquared,PDF,1000,VECTORIZED,20.383,170.551,8.367 +ChiSquared,PDF,2000,VECTORIZED,40.817,341.84,8.375 +ChiSquared,PDF,5000,VECTORIZED,103.628,855.697,8.257 +ChiSquared,PDF,10000,VECTORIZED,206.983,1707.995,8.252 +ChiSquared,PDF,20000,WORK_STEALING,377.527,3456.736,9.156 +ChiSquared,PDF,50000,WORK_STEALING,556.281,8768.906,15.763 +ChiSquared,PDF,100000,WORK_STEALING,693.799,17466.45,25.175 +ChiSquared,PDF,250000,WORK_STEALING,1425.777,44439.148,31.168 +ChiSquared,PDF,500000,WORK_STEALING,2833.422,89011.909,31.415 +Discrete,CDF,8,VECTORIZED,0.207,0.631,3.048 +Discrete,CDF,16,VECTORIZED,0.219,1.123,5.128 +Discrete,CDF,32,VECTORIZED,0.289,2.318,8.021 +Discrete,CDF,64,VECTORIZED,0.461,4.609,9.998 +Discrete,CDF,128,VECTORIZED,0.713,8.704,12.208 +Discrete,CDF,256,VECTORIZED,1.264,17.93,14.185 +Discrete,CDF,512,VECTORIZED,2.532,35.202,13.903 +Discrete,CDF,1000,VECTORIZED,4.436,68.886,15.529 +Discrete,CDF,2000,VECTORIZED,8.879,139.339,15.693 +Discrete,CDF,5000,VECTORIZED,22.081,331.741,15.024 +Discrete,CDF,10000,VECTORIZED,47.613,628.89,13.208 +Discrete,CDF,20000,VECTORIZED,86.728,1206.387,13.91 
+Discrete,CDF,50000,WORK_STEALING,183.581,3199.611,17.429 +Discrete,CDF,100000,WORK_STEALING,254.203,6356.678,25.006 +Discrete,CDF,250000,WORK_STEALING,451.442,15791.279,34.98 +Discrete,CDF,500000,WORK_STEALING,829.916,37855.593,45.614 +Discrete,LogPDF,8,VECTORIZED,0.22,0.662,3.009 +Discrete,LogPDF,16,VECTORIZED,0.238,1.16,4.874 +Discrete,LogPDF,32,VECTORIZED,0.375,2.283,6.088 +Discrete,LogPDF,64,VECTORIZED,0.491,4.53,9.226 +Discrete,LogPDF,128,VECTORIZED,0.857,8.898,10.383 +Discrete,LogPDF,256,VECTORIZED,1.6,17.763,11.102 +Discrete,LogPDF,512,VECTORIZED,2.943,35.168,11.95 +Discrete,LogPDF,1000,VECTORIZED,5.664,68.992,12.181 +Discrete,LogPDF,2000,VECTORIZED,11.306,137.467,12.159 +Discrete,LogPDF,5000,VECTORIZED,27.267,333.496,12.231 +Discrete,LogPDF,10000,VECTORIZED,52.569,664.807,12.646 +Discrete,LogPDF,20000,VECTORIZED,94.611,1181.772,12.491 +Discrete,LogPDF,50000,WORK_STEALING,164.871,3225.592,19.564 +Discrete,LogPDF,100000,WORK_STEALING,211.907,6207.493,29.293 +Discrete,LogPDF,250000,WORK_STEALING,343.007,15877.295,46.289 +Discrete,LogPDF,500000,WORK_STEALING,756.407,36277.922,47.961 +Discrete,PDF,8,VECTORIZED,0.207,0.67,3.237 +Discrete,PDF,16,VECTORIZED,0.219,1.076,4.913 +Discrete,PDF,32,VECTORIZED,0.369,2.343,6.35 +Discrete,PDF,64,VECTORIZED,0.486,4.632,9.531 +Discrete,PDF,128,PARALLEL,0.747,9.092,12.171 +Discrete,PDF,256,PARALLEL,1.244,18.086,14.539 +Discrete,PDF,512,PARALLEL,2.26,36.294,16.059 +Discrete,PDF,1000,PARALLEL,4.185,70.856,16.931 +Discrete,PDF,2000,VECTORIZED,8.343,140.988,16.899 +Discrete,PDF,5000,VECTORIZED,20.445,352.164,17.225 +Discrete,PDF,10000,VECTORIZED,38.898,666.563,17.136 +Discrete,PDF,20000,VECTORIZED,71.544,1248.761,17.454 +Discrete,PDF,50000,WORK_STEALING,153.473,3130.039,20.395 +Discrete,PDF,100000,WORK_STEALING,197.324,6231.586,31.58 +Discrete,PDF,250000,WORK_STEALING,349.807,16372.408,46.804 +Discrete,PDF,500000,WORK_STEALING,555.283,34087.722,61.388 +Exponential,CDF,8,PARALLEL,0.284,0.736,2.592 
+Exponential,CDF,16,PARALLEL,0.412,1.37,3.325 +Exponential,CDF,32,VECTORIZED,0.551,2.709,4.917 +Exponential,CDF,64,VECTORIZED,0.932,5.342,5.732 +Exponential,CDF,128,VECTORIZED,1.299,10.463,8.055 +Exponential,CDF,256,VECTORIZED,2.185,20.867,9.55 +Exponential,CDF,512,VECTORIZED,4.195,41.518,9.897 +Exponential,CDF,1000,VECTORIZED,7.962,81.149,10.192 +Exponential,CDF,2000,VECTORIZED,15.829,162.325,10.255 +Exponential,CDF,5000,VECTORIZED,39.0,393.918,10.1 +Exponential,CDF,10000,VECTORIZED,74.195,804.394,10.842 +Exponential,CDF,20000,WORK_STEALING,129.432,1570.004,12.13 +Exponential,CDF,50000,WORK_STEALING,215.983,3819.556,17.685 +Exponential,CDF,100000,WORK_STEALING,307.863,7690.138,24.979 +Exponential,CDF,250000,WORK_STEALING,727.349,19635.843,26.996 +Exponential,CDF,500000,WORK_STEALING,1263.703,40751.03,32.247 +Exponential,LogPDF,8,PARALLEL,0.186,0.579,3.113 +Exponential,LogPDF,16,PARALLEL,0.204,1.043,5.113 +Exponential,LogPDF,32,WORK_STEALING,0.242,2.019,8.343 +Exponential,LogPDF,64,WORK_STEALING,0.272,4.01,14.743 +Exponential,LogPDF,128,WORK_STEALING,0.313,7.993,25.537 +Exponential,LogPDF,256,WORK_STEALING,0.433,15.657,36.159 +Exponential,LogPDF,512,WORK_STEALING,0.571,31.277,54.776 +Exponential,LogPDF,1000,WORK_STEALING,0.86,61.475,71.483 +Exponential,LogPDF,2000,WORK_STEALING,1.493,122.601,82.117 +Exponential,LogPDF,5000,VECTORIZED,8.185,300.155,36.671 +Exponential,LogPDF,10000,VECTORIZED,16.24,578.087,35.596 +Exponential,LogPDF,20000,VECTORIZED,35.94,1220.231,33.952 +Exponential,LogPDF,50000,VECTORIZED,92.204,2981.963,32.341 +Exponential,LogPDF,100000,WORK_STEALING,138.083,5869.572,42.508 +Exponential,LogPDF,250000,PARALLEL,231.28,14761.885,63.827 +Exponential,LogPDF,500000,WORK_STEALING,400.822,30639.062,76.441 +Exponential,PDF,8,PARALLEL,0.288,0.706,2.451 +Exponential,PDF,16,PARALLEL,0.4,1.345,3.362 +Exponential,PDF,32,VECTORIZED,0.511,2.592,5.072 +Exponential,PDF,64,VECTORIZED,0.823,5.093,6.188 +Exponential,PDF,128,VECTORIZED,1.21,9.998,8.263 
+Exponential,PDF,256,VECTORIZED,2.293,19.979,8.713 +Exponential,PDF,512,VECTORIZED,4.028,40.06,9.945 +Exponential,PDF,1000,VECTORIZED,7.781,77.954,10.019 +Exponential,PDF,2000,VECTORIZED,15.169,155.197,10.231 +Exponential,PDF,5000,VECTORIZED,37.32,380.839,10.205 +Exponential,PDF,10000,VECTORIZED,71.694,745.394,10.397 +Exponential,PDF,20000,WORK_STEALING,125.271,1469.837,11.733 +Exponential,PDF,50000,WORK_STEALING,199.268,3702.037,18.578 +Exponential,PDF,100000,WORK_STEALING,321.44,7580.54,23.583 +Exponential,PDF,250000,WORK_STEALING,664.963,21106.764,31.741 +Exponential,PDF,500000,WORK_STEALING,1356.76,43044.825,31.726 +Gamma,CDF,8,PARALLEL,1.01,1.517,1.502 +Gamma,CDF,16,WORK_STEALING,1.819,2.854,1.569 +Gamma,CDF,32,PARALLEL,3.542,5.887,1.662 +Gamma,CDF,64,PARALLEL,6.773,11.52,1.701 +Gamma,CDF,128,WORK_STEALING,13.451,21.818,1.622 +Gamma,CDF,256,WORK_STEALING,27.007,46.095,1.707 +Gamma,CDF,512,WORK_STEALING,55.901,92.689,1.658 +Gamma,CDF,1000,VECTORIZED,108.05,182.828,1.692 +Gamma,CDF,2000,WORK_STEALING,219.786,361.622,1.645 +Gamma,CDF,5000,WORK_STEALING,309.932,908.043,2.93 +Gamma,CDF,10000,WORK_STEALING,416.177,1812.897,4.356 +Gamma,CDF,20000,WORK_STEALING,560.97,3632.194,6.475 +Gamma,CDF,50000,WORK_STEALING,1179.907,9253.649,7.843 +Gamma,CDF,100000,WORK_STEALING,1987.041,19061.954,9.593 +Gamma,CDF,250000,WORK_STEALING,5523.179,47939.196,8.68 +Gamma,CDF,500000,WORK_STEALING,8921.626,92261.452,10.341 +Gamma,LogPDF,8,PARALLEL,0.334,0.765,2.29 +Gamma,LogPDF,16,PARALLEL,0.476,1.449,3.044 +Gamma,LogPDF,32,PARALLEL,0.796,2.827,3.552 +Gamma,LogPDF,64,WORK_STEALING,1.39,5.557,3.998 +Gamma,LogPDF,128,WORK_STEALING,2.584,10.846,4.197 +Gamma,LogPDF,256,VECTORIZED,4.066,21.919,5.391 +Gamma,LogPDF,512,VECTORIZED,6.998,43.742,6.251 +Gamma,LogPDF,1000,VECTORIZED,14.202,83.513,5.88 +Gamma,LogPDF,2000,VECTORIZED,27.938,170.925,6.118 +Gamma,LogPDF,5000,VECTORIZED,72.973,429.17,5.881 +Gamma,LogPDF,10000,VECTORIZED,143.351,855.422,5.967 
+Gamma,LogPDF,20000,WORK_STEALING,261.039,1712.371,6.56 +Gamma,LogPDF,50000,WORK_STEALING,377.836,4307.949,11.402 +Gamma,LogPDF,100000,WORK_STEALING,500.433,9005.219,17.995 +Gamma,LogPDF,250000,WORK_STEALING,1075.012,22616.24,21.038 +Gamma,LogPDF,500000,WORK_STEALING,1606.919,43755.764,27.23 +Gamma,PDF,8,PARALLEL,0.496,1.465,2.954 +Gamma,PDF,16,PARALLEL,0.762,2.652,3.48 +Gamma,PDF,32,VECTORIZED,1.366,5.212,3.816 +Gamma,PDF,64,VECTORIZED,2.228,10.862,4.875 +Gamma,PDF,128,VECTORIZED,3.278,21.679,6.613 +Gamma,PDF,256,VECTORIZED,5.808,43.353,7.464 +Gamma,PDF,512,VECTORIZED,10.916,85.901,7.869 +Gamma,PDF,1000,VECTORIZED,20.882,169.317,8.108 +Gamma,PDF,2000,VECTORIZED,40.69,338.696,8.324 +Gamma,PDF,5000,VECTORIZED,104.54,850.715,8.138 +Gamma,PDF,10000,VECTORIZED,208.252,1695.141,8.14 +Gamma,PDF,20000,WORK_STEALING,383.257,3439.624,8.975 +Gamma,PDF,50000,WORK_STEALING,573.799,8915.021,15.537 +Gamma,PDF,100000,WORK_STEALING,845.147,18062.75,21.372 +Gamma,PDF,250000,WORK_STEALING,1640.754,47023.807,28.66 +Gamma,PDF,500000,WORK_STEALING,3124.022,93606.819,29.964 +Gaussian,CDF,8,VECTORIZED,0.381,0.876,2.299 +Gaussian,CDF,16,VECTORIZED,0.473,1.82,3.848 +Gaussian,CDF,32,VECTORIZED,0.676,3.402,5.033 +Gaussian,CDF,64,VECTORIZED,1.055,6.864,6.506 +Gaussian,CDF,128,VECTORIZED,1.861,13.324,7.16 +Gaussian,CDF,256,VECTORIZED,3.526,26.739,7.583 +Gaussian,CDF,512,VECTORIZED,6.759,53.658,7.939 +Gaussian,CDF,1000,VECTORIZED,12.812,104.268,8.138 +Gaussian,CDF,2000,VECTORIZED,25.783,230.143,8.926 +Gaussian,CDF,5000,VECTORIZED,64.012,520.793,8.136 +Gaussian,CDF,10000,VECTORIZED,127.955,1043.482,8.155 +Gaussian,CDF,20000,WORK_STEALING,206.21,2125.346,10.307 +Gaussian,CDF,50000,WORK_STEALING,452.512,5244.991,11.591 +Gaussian,CDF,100000,WORK_STEALING,829.468,10496.903,12.655 +Gaussian,CDF,250000,WORK_STEALING,1760.534,26679.933,15.154 +Gaussian,CDF,500000,WORK_STEALING,3611.076,54242.521,15.021 +Gaussian,LogPDF,8,PARALLEL,0.146,0.477,3.267 +Gaussian,LogPDF,16,PARALLEL,0.153,0.957,6.255 
+Gaussian,LogPDF,32,PARALLEL,0.171,1.865,10.906 +Gaussian,LogPDF,64,PARALLEL,0.185,3.676,19.87 +Gaussian,LogPDF,128,PARALLEL,0.219,7.338,33.507 +Gaussian,LogPDF,256,PARALLEL,0.288,14.572,50.597 +Gaussian,LogPDF,512,PARALLEL,0.434,28.852,66.479 +Gaussian,LogPDF,1000,PARALLEL,0.671,55.671,82.967 +Gaussian,LogPDF,2000,PARALLEL,1.268,139.102,109.702 +Gaussian,LogPDF,5000,VECTORIZED,6.598,281.195,42.618 +Gaussian,LogPDF,10000,VECTORIZED,13.321,563.089,42.271 +Gaussian,LogPDF,20000,VECTORIZED,29.394,1126.417,38.321 +Gaussian,LogPDF,50000,WORK_STEALING,57.41,2816.569,49.061 +Gaussian,LogPDF,100000,PARALLEL,68.628,5657.184,82.433 +Gaussian,LogPDF,250000,PARALLEL,149.495,14193.064,94.94 +Gaussian,LogPDF,500000,PARALLEL,238.987,28308.526,118.452 +Gaussian,PDF,8,PARALLEL,0.249,0.663,2.663 +Gaussian,PDF,16,PARALLEL,0.34,1.269,3.732 +Gaussian,PDF,32,PARALLEL,0.539,2.433,4.514 +Gaussian,PDF,64,PARALLEL,0.934,4.946,5.296 +Gaussian,PDF,128,VECTORIZED,1.15,9.499,8.26 +Gaussian,PDF,256,VECTORIZED,1.971,19.157,9.719 +Gaussian,PDF,512,VECTORIZED,3.571,37.914,10.617 +Gaussian,PDF,1000,VECTORIZED,6.683,78.496,11.746 +Gaussian,PDF,2000,VECTORIZED,13.259,166.993,12.595 +Gaussian,PDF,5000,VECTORIZED,33.102,414.916,12.534 +Gaussian,PDF,10000,VECTORIZED,67.803,727.539,10.73 +Gaussian,PDF,20000,WORK_STEALING,87.063,1491.612,17.133 +Gaussian,PDF,50000,WORK_STEALING,149.182,3749.025,25.131 +Gaussian,PDF,100000,WORK_STEALING,238.977,7493.858,31.358 +Gaussian,PDF,250000,WORK_STEALING,526.68,18872.819,35.834 +Gaussian,PDF,500000,WORK_STEALING,1038.997,38109.428,36.679 +Poisson,CDF,8,SCALAR,1.326,1.326,1.0 +Poisson,CDF,16,SCALAR,2.722,2.722,1.0 +Poisson,CDF,32,SCALAR,5.383,5.383,1.0 +Poisson,CDF,64,SCALAR,12.018,12.018,1.0 +Poisson,CDF,128,VECTORIZED,22.1,22.358,1.012 +Poisson,CDF,256,WORK_STEALING,44.765,45.511,1.017 +Poisson,CDF,512,VECTORIZED,89.994,91.604,1.018 +Poisson,CDF,1000,VECTORIZED,172.592,175.62,1.018 +Poisson,CDF,2000,VECTORIZED,346.864,350.721,1.011 
+Poisson,CDF,5000,WORK_STEALING,374.889,904.462,2.413 +Poisson,CDF,10000,WORK_STEALING,537.192,1759.176,3.275 +Poisson,CDF,20000,WORK_STEALING,779.629,3530.499,4.528 +Poisson,CDF,50000,WORK_STEALING,1554.58,8893.41,5.721 +Poisson,CDF,100000,WORK_STEALING,2771.316,17828.967,6.433 +Poisson,CDF,250000,WORK_STEALING,7150.195,41468.784,5.8 +Poisson,CDF,500000,WORK_STEALING,12707.912,80156.317,6.308 +Poisson,LogPDF,8,VECTORIZED,0.332,0.819,2.467 +Poisson,LogPDF,16,VECTORIZED,0.523,1.584,3.029 +Poisson,LogPDF,32,VECTORIZED,0.881,3.223,3.658 +Poisson,LogPDF,64,VECTORIZED,1.807,6.315,3.495 +Poisson,LogPDF,128,VECTORIZED,3.1,12.055,3.889 +Poisson,LogPDF,256,VECTORIZED,6.228,24.478,3.93 +Poisson,LogPDF,512,WORK_STEALING,12.216,48.765,3.992 +Poisson,LogPDF,1000,WORK_STEALING,23.4,94.873,4.054 +Poisson,LogPDF,2000,WORK_STEALING,47.149,188.323,3.994 +Poisson,LogPDF,5000,VECTORIZED,121.161,472.86,3.903 +Poisson,LogPDF,10000,WORK_STEALING,238.073,943.729,3.964 +Poisson,LogPDF,20000,WORK_STEALING,285.532,1887.053,6.609 +Poisson,LogPDF,50000,WORK_STEALING,376.003,4731.683,12.584 +Poisson,LogPDF,100000,WORK_STEALING,656.007,9605.554,14.642 +Poisson,LogPDF,250000,WORK_STEALING,1045.822,22302.845,21.326 +Poisson,LogPDF,500000,WORK_STEALING,2188.146,43186.21,19.736 +Poisson,PDF,8,VECTORIZED,0.754,1.29,1.711 +Poisson,PDF,16,VECTORIZED,1.308,2.331,1.782 +Poisson,PDF,32,VECTORIZED,2.311,4.723,2.044 +Poisson,PDF,64,VECTORIZED,4.61,9.36,2.03 +Poisson,PDF,128,VECTORIZED,8.799,18.397,2.091 +Poisson,PDF,256,VECTORIZED,17.822,37.144,2.084 +Poisson,PDF,512,VECTORIZED,35.611,74.352,2.088 +Poisson,PDF,1000,VECTORIZED,69.233,144.968,2.094 +Poisson,PDF,2000,WORK_STEALING,139.204,304.184,2.185 +Poisson,PDF,5000,WORK_STEALING,267.167,764.759,2.862 +Poisson,PDF,10000,WORK_STEALING,291.499,1456.656,4.997 +Poisson,PDF,20000,WORK_STEALING,398.339,2890.646,7.257 +Poisson,PDF,50000,WORK_STEALING,696.05,7334.977,10.538 +Poisson,PDF,100000,WORK_STEALING,1258.094,14674.027,11.664 
+Poisson,PDF,250000,WORK_STEALING,2804.03,35545.913,12.677 +Poisson,PDF,500000,WORK_STEALING,4655.665,66119.149,14.202 +StudentT,CDF,8,WORK_STEALING,2.961,3.495,1.18 +StudentT,CDF,16,WORK_STEALING,5.376,6.626,1.233 +StudentT,CDF,32,WORK_STEALING,11.52,13.726,1.191 +StudentT,CDF,64,WORK_STEALING,22.512,27.099,1.204 +StudentT,CDF,128,VECTORIZED,43.813,53.198,1.214 +StudentT,CDF,256,VECTORIZED,88.351,106.645,1.207 +StudentT,CDF,512,PARALLEL,168.854,211.31,1.251 +StudentT,CDF,1000,PARALLEL,334.115,403.88,1.209 +StudentT,CDF,2000,PARALLEL,668.523,808.536,1.209 +StudentT,CDF,5000,PARALLEL,1675.067,2039.758,1.218 +StudentT,CDF,10000,PARALLEL,3345.139,4060.666,1.214 +StudentT,CDF,20000,VECTORIZED,6813.721,8208.717,1.205 +StudentT,CDF,50000,VECTORIZED,17565.398,21188.633,1.206 +StudentT,CDF,100000,WORK_STEALING,33989.196,42454.443,1.249 +StudentT,CDF,250000,WORK_STEALING,84879.62,102617.177,1.209 +StudentT,CDF,500000,PARALLEL,169793.972,206548.826,1.216 +StudentT,LogPDF,8,VECTORIZED,0.536,0.836,1.56 +StudentT,LogPDF,16,VECTORIZED,0.626,1.521,2.43 +StudentT,LogPDF,32,VECTORIZED,0.849,2.99,3.522 +StudentT,LogPDF,64,VECTORIZED,1.244,5.87,4.719 +StudentT,LogPDF,128,VECTORIZED,2.021,11.552,5.716 +StudentT,LogPDF,256,VECTORIZED,3.722,22.859,6.142 +StudentT,LogPDF,512,VECTORIZED,7.019,45.558,6.491 +StudentT,LogPDF,1000,VECTORIZED,12.917,86.064,6.663 +StudentT,LogPDF,2000,VECTORIZED,25.542,172.765,6.764 +StudentT,LogPDF,5000,VECTORIZED,65.763,406.758,6.185 +StudentT,LogPDF,10000,VECTORIZED,131.128,866.203,6.606 +StudentT,LogPDF,20000,VECTORIZED,263.697,1734.06,6.576 +StudentT,LogPDF,50000,VECTORIZED,710.186,4387.641,6.178 +StudentT,LogPDF,100000,WORK_STEALING,736.153,9083.086,12.339 +StudentT,LogPDF,250000,WORK_STEALING,1302.958,22120.386,16.977 +StudentT,LogPDF,500000,WORK_STEALING,2484.48,44361.856,17.856 +StudentT,PDF,8,PARALLEL,0.744,0.976,1.312 +StudentT,PDF,16,PARALLEL,1.019,1.913,1.877 +StudentT,PDF,32,VECTORIZED,1.291,3.524,2.73 +StudentT,PDF,64,VECTORIZED,1.739,7.201,4.141 
+StudentT,PDF,128,VECTORIZED,2.938,14.329,4.877 +StudentT,PDF,256,VECTORIZED,5.581,28.596,5.124 +StudentT,PDF,512,VECTORIZED,10.407,56.821,5.46 +StudentT,PDF,1000,VECTORIZED,19.654,107.776,5.484 +StudentT,PDF,2000,VECTORIZED,38.523,216.422,5.618 +StudentT,PDF,5000,VECTORIZED,98.028,542.172,5.531 +StudentT,PDF,10000,VECTORIZED,195.096,1081.366,5.543 +StudentT,PDF,20000,VECTORIZED,393.142,2172.542,5.526 +StudentT,PDF,50000,VECTORIZED,1001.4,5512.219,5.505 +StudentT,PDF,100000,WORK_STEALING,1254.442,11371.579,9.065 +StudentT,PDF,250000,PARALLEL,2324.528,27521.818,11.84 +StudentT,PDF,500000,WORK_STEALING,4707.626,54990.904,11.681 +Uniform,CDF,8,PARALLEL,0.225,0.707,3.142 +Uniform,CDF,16,PARALLEL,0.254,1.411,5.555 +Uniform,CDF,32,PARALLEL,0.309,2.67,8.641 +Uniform,CDF,64,PARALLEL,0.379,5.222,13.778 +Uniform,CDF,128,PARALLEL,0.588,10.436,17.748 +Uniform,CDF,256,WORK_STEALING,0.444,20.694,46.608 +Uniform,CDF,512,WORK_STEALING,0.646,35.064,54.279 +Uniform,CDF,1000,PARALLEL,1.489,67.333,45.22 +Uniform,CDF,2000,WORK_STEALING,4.694,125.523,26.741 +Uniform,CDF,5000,VECTORIZED,13.004,291.14,22.388 +Uniform,CDF,10000,WORK_STEALING,28.363,589.7,20.791 +Uniform,CDF,20000,WORK_STEALING,39.174,1308.494,33.402 +Uniform,CDF,50000,WORK_STEALING,68.871,3104.694,45.08 +Uniform,CDF,100000,WORK_STEALING,133.885,5990.628,44.745 +Uniform,CDF,250000,WORK_STEALING,303.761,14895.358,49.036 +Uniform,CDF,500000,WORK_STEALING,601.362,29842.841,49.625 +Uniform,LogPDF,8,VECTORIZED,0.138,0.556,4.029 +Uniform,LogPDF,16,VECTORIZED,0.192,1.344,7.0 +Uniform,LogPDF,32,VECTORIZED,0.227,2.624,11.559 +Uniform,LogPDF,64,VECTORIZED,0.168,4.299,25.589 +Uniform,LogPDF,128,VECTORIZED,0.318,10.057,31.626 +Uniform,LogPDF,256,VECTORIZED,0.461,18.397,39.907 +Uniform,LogPDF,512,VECTORIZED,0.575,32.281,56.141 +Uniform,LogPDF,1000,WORK_STEALING,0.844,65.839,78.008 +Uniform,LogPDF,2000,WORK_STEALING,0.863,115.567,133.913 +Uniform,LogPDF,5000,VECTORIZED,3.81,285.688,74.984 
+Uniform,LogPDF,10000,VECTORIZED,7.489,567.083,75.722 +Uniform,LogPDF,20000,VECTORIZED,11.028,1254.58,113.763 +Uniform,LogPDF,50000,VECTORIZED,41.376,3136.259,75.799 +Uniform,LogPDF,100000,VECTORIZED,82.751,6133.268,74.117 +Uniform,LogPDF,250000,VECTORIZED,200.611,14511.75,72.338 +Uniform,LogPDF,500000,VECTORIZED,484.381,29022.239,59.916 +Uniform,PDF,8,VECTORIZED,0.131,0.575,4.389 +Uniform,PDF,16,VECTORIZED,0.182,1.277,7.016 +Uniform,PDF,32,VECTORIZED,0.205,2.679,13.068 +Uniform,PDF,64,VECTORIZED,0.228,4.563,20.013 +Uniform,PDF,128,VECTORIZED,0.295,10.458,35.451 +Uniform,PDF,256,VECTORIZED,0.392,20.562,52.454 +Uniform,PDF,512,VECTORIZED,0.462,34.86,75.455 +Uniform,PDF,1000,VECTORIZED,0.771,65.445,84.883 +Uniform,PDF,2000,WORK_STEALING,0.854,121.171,141.886 +Uniform,PDF,5000,VECTORIZED,3.16,294.0,93.038 +Uniform,PDF,10000,VECTORIZED,6.284,582.847,92.751 +Uniform,PDF,20000,VECTORIZED,10.194,1180.011,115.755 +Uniform,PDF,50000,VECTORIZED,37.992,3218.146,84.706 +Uniform,PDF,100000,VECTORIZED,80.621,6159.961,76.406 +Uniform,PDF,250000,VECTORIZED,155.432,14728.921,94.761 +Uniform,PDF,500000,VECTORIZED,446.261,29749.137,66.663 diff --git a/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/crossovers.csv b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/crossovers.csv new file mode 100644 index 0000000..aa338d2 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/crossovers.csv @@ -0,0 +1,28 @@ +distribution,operation,scalar_to_vectorized,vectorized_to_parallel,parallel_to_work_stealing,best_strategy_at_max_size,best_time_us_at_max_size,max_batch_size +Beta,CDF,8,,16,VECTORIZED,108210.939,500000 +Beta,LogPDF,32,8,8,VECTORIZED,21277.706,500000 +Beta,PDF,32,8,8,VECTORIZED,24361.364,500000 +ChiSquared,CDF,16,8,256,WORK_STEALING,9477.315,500000 
+ChiSquared,LogPDF,16,8,128,WORK_STEALING,1734.745,500000 +ChiSquared,PDF,8,8,512,WORK_STEALING,2833.422,500000 +Discrete,CDF,8,100000,64,WORK_STEALING,829.916,500000 +Discrete,LogPDF,8,100000,64,WORK_STEALING,756.407,500000 +Discrete,PDF,8,128,5000,WORK_STEALING,555.283,500000 +Exponential,CDF,8,8,64,WORK_STEALING,1263.703,500000 +Exponential,LogPDF,8,8,32,WORK_STEALING,400.822,500000 +Exponential,PDF,8,8,128,WORK_STEALING,1356.76,500000 +Gamma,CDF,16,8,16,WORK_STEALING,8921.626,500000 +Gamma,LogPDF,16,8,64,WORK_STEALING,1606.919,500000 +Gamma,PDF,8,8,64,WORK_STEALING,3124.022,500000 +Gaussian,CDF,8,,2000,WORK_STEALING,3611.076,500000 +Gaussian,LogPDF,8,8,5000,PARALLEL,238.987,500000 +Gaussian,PDF,8,8,1000,WORK_STEALING,1038.997,500000 +Poisson,CDF,128,64,32,WORK_STEALING,12707.912,500000 +Poisson,LogPDF,8,50000,64,WORK_STEALING,2188.146,500000 +Poisson,PDF,8,2000,64,WORK_STEALING,4655.665,500000 +StudentT,CDF,8,8,8,PARALLEL,169793.972,500000 +StudentT,LogPDF,8,100000,8,WORK_STEALING,2484.48,500000 +StudentT,PDF,16,8,64,WORK_STEALING,4707.626,500000 +Uniform,CDF,8,8,256,WORK_STEALING,601.362,500000 +Uniform,LogPDF,8,,64,VECTORIZED,484.381,500000 +Uniform,PDF,8,,8,VECTORIZED,446.261,500000 diff --git a/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/logs/strategy_profile.txt b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/logs/strategy_profile.txt new file mode 100644 index 0000000..f2bcc57 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/logs/strategy_profile.txt @@ -0,0 +1,658 @@ + +==================== + Strategy Profile +==================== + +Forced-strategy timing profiler for dispatcher threshold tuning + +System: 8 logical cores, AVX SIMD, 8192 KB L3 cache + +Batch sizes: 8 16 32 64 128 256 512 1000 2000 5000 10000 20000 50000 100000 250000 500000 + + +--- Uniform 
Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Gaussian Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Exponential Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Discrete Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... 
βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Poisson Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Gamma Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- StudentT Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... 
βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Beta Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- ChiSquared Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... 
βœ“ + + +========================= + Best Strategy Summary +========================= + +Distribution Operation Size Best Strategy Time (ΞΌs) +---------------------------------------------------------------- +Beta CDF 8 Vectorized 1.82 +Beta CDF 16 Vectorized 3.69 +Beta CDF 32 Vectorized 6.57 +Beta CDF 64 Vectorized 12.09 +Beta CDF 128 Vectorized 26.99 +Beta CDF 256 Vectorized 55.60 +Beta CDF 512 Vectorized 104.85 +Beta CDF 1000 Vectorized 204.68 +Beta CDF 2000 Vectorized 415.19 +Beta CDF 5000 Vectorized 1031.97 +Beta CDF 10000 Vectorized 2055.77 +Beta CDF 20000 Vectorized 4121.39 +Beta CDF 50000 Vectorized 10509.69 +Beta CDF 100000 Vectorized 21408.72 +Beta CDF 250000 Vectorized 55862.16 +Beta CDF 500000 Vectorized 108210.94 +Beta LogPDF 8 Work-Stealing 0.79 +Beta LogPDF 16 Parallel 1.13 +Beta LogPDF 32 Work-Stealing 1.77 +Beta LogPDF 64 Work-Stealing 3.27 +Beta LogPDF 128 Vectorized 5.16 +Beta LogPDF 256 Vectorized 9.32 +Beta LogPDF 512 Vectorized 19.21 +Beta LogPDF 1000 Vectorized 37.28 +Beta LogPDF 2000 Vectorized 72.46 +Beta LogPDF 5000 Vectorized 188.49 +Beta LogPDF 10000 Vectorized 377.77 +Beta LogPDF 20000 Vectorized 766.10 +Beta LogPDF 50000 Vectorized 1918.57 +Beta LogPDF 100000 Vectorized 3937.12 +Beta LogPDF 250000 Vectorized 10306.77 +Beta LogPDF 500000 Vectorized 21277.71 +Beta PDF 8 Work-Stealing 0.91 +Beta PDF 16 Parallel 1.36 +Beta PDF 32 Parallel 2.40 +Beta PDF 64 Work-Stealing 4.29 +Beta PDF 128 Vectorized 6.01 +Beta PDF 256 Vectorized 10.98 +Beta PDF 512 Vectorized 22.24 +Beta PDF 1000 Vectorized 43.50 +Beta PDF 2000 Vectorized 86.04 +Beta PDF 5000 Vectorized 223.07 +Beta PDF 10000 Vectorized 440.55 +Beta PDF 20000 Vectorized 895.67 +Beta PDF 50000 Vectorized 2238.53 +Beta PDF 100000 Vectorized 4499.07 +Beta PDF 250000 Vectorized 11863.22 +Beta PDF 500000 Vectorized 24361.36 +ChiSquared CDF 8 Parallel 0.93 +ChiSquared CDF 16 Parallel 1.92 +ChiSquared CDF 32 Parallel 3.51 +ChiSquared CDF 64 Parallel 7.14 +ChiSquared CDF 128 Parallel 14.18 
+ChiSquared CDF 256 Work-Stealing 28.43 +ChiSquared CDF 512 Work-Stealing 57.37 +ChiSquared CDF 1000 Work-Stealing 114.52 +ChiSquared CDF 2000 Work-Stealing 231.11 +ChiSquared CDF 5000 Work-Stealing 383.75 +ChiSquared CDF 10000 Work-Stealing 406.42 +ChiSquared CDF 20000 Work-Stealing 633.81 +ChiSquared CDF 50000 Work-Stealing 1137.08 +ChiSquared CDF 100000 Work-Stealing 2166.61 +ChiSquared CDF 250000 Work-Stealing 4681.84 +ChiSquared CDF 500000 Work-Stealing 9477.32 +ChiSquared LogPDF 8 Parallel 0.32 +ChiSquared LogPDF 16 Parallel 0.48 +ChiSquared LogPDF 32 Parallel 0.80 +ChiSquared LogPDF 64 Parallel 1.42 +ChiSquared LogPDF 128 Vectorized 2.35 +ChiSquared LogPDF 256 Vectorized 4.11 +ChiSquared LogPDF 512 Vectorized 7.40 +ChiSquared LogPDF 1000 Vectorized 14.02 +ChiSquared LogPDF 2000 Vectorized 28.15 +ChiSquared LogPDF 5000 Vectorized 72.01 +ChiSquared LogPDF 10000 Vectorized 144.79 +ChiSquared LogPDF 20000 Work-Stealing 266.19 +ChiSquared LogPDF 50000 Work-Stealing 325.85 +ChiSquared LogPDF 100000 Work-Stealing 466.34 +ChiSquared LogPDF 250000 Work-Stealing 854.39 +ChiSquared LogPDF 500000 Work-Stealing 1734.74 +ChiSquared PDF 8 Parallel 0.49 +ChiSquared PDF 16 Parallel 0.77 +ChiSquared PDF 32 Parallel 1.43 +ChiSquared PDF 64 Vectorized 2.11 +ChiSquared PDF 128 Vectorized 3.18 +ChiSquared PDF 256 Vectorized 5.64 +ChiSquared PDF 512 Vectorized 10.68 +ChiSquared PDF 1000 Vectorized 20.38 +ChiSquared PDF 2000 Vectorized 40.82 +ChiSquared PDF 5000 Vectorized 103.63 +ChiSquared PDF 10000 Vectorized 206.98 +ChiSquared PDF 20000 Work-Stealing 377.53 +ChiSquared PDF 50000 Work-Stealing 556.28 +ChiSquared PDF 100000 Work-Stealing 693.80 +ChiSquared PDF 250000 Work-Stealing 1425.78 +ChiSquared PDF 500000 Work-Stealing 2833.42 +Discrete CDF 8 Vectorized 0.21 +Discrete CDF 16 Vectorized 0.22 +Discrete CDF 32 Vectorized 0.29 +Discrete CDF 64 Vectorized 0.46 +Discrete CDF 128 Vectorized 0.71 +Discrete CDF 256 Vectorized 1.26 +Discrete CDF 512 Vectorized 2.53 +Discrete CDF 1000 
Vectorized 4.44 +Discrete CDF 2000 Vectorized 8.88 +Discrete CDF 5000 Vectorized 22.08 +Discrete CDF 10000 Vectorized 47.61 +Discrete CDF 20000 Vectorized 86.73 +Discrete CDF 50000 Work-Stealing 183.58 +Discrete CDF 100000 Work-Stealing 254.20 +Discrete CDF 250000 Work-Stealing 451.44 +Discrete CDF 500000 Work-Stealing 829.92 +Discrete LogPDF 8 Vectorized 0.22 +Discrete LogPDF 16 Vectorized 0.24 +Discrete LogPDF 32 Vectorized 0.38 +Discrete LogPDF 64 Vectorized 0.49 +Discrete LogPDF 128 Vectorized 0.86 +Discrete LogPDF 256 Vectorized 1.60 +Discrete LogPDF 512 Vectorized 2.94 +Discrete LogPDF 1000 Vectorized 5.66 +Discrete LogPDF 2000 Vectorized 11.31 +Discrete LogPDF 5000 Vectorized 27.27 +Discrete LogPDF 10000 Vectorized 52.57 +Discrete LogPDF 20000 Vectorized 94.61 +Discrete LogPDF 50000 Work-Stealing 164.87 +Discrete LogPDF 100000 Work-Stealing 211.91 +Discrete LogPDF 250000 Work-Stealing 343.01 +Discrete LogPDF 500000 Work-Stealing 756.41 +Discrete PDF 8 Vectorized 0.21 +Discrete PDF 16 Vectorized 0.22 +Discrete PDF 32 Vectorized 0.37 +Discrete PDF 64 Vectorized 0.49 +Discrete PDF 128 Parallel 0.75 +Discrete PDF 256 Parallel 1.24 +Discrete PDF 512 Parallel 2.26 +Discrete PDF 1000 Parallel 4.18 +Discrete PDF 2000 Vectorized 8.34 +Discrete PDF 5000 Vectorized 20.45 +Discrete PDF 10000 Vectorized 38.90 +Discrete PDF 20000 Vectorized 71.54 +Discrete PDF 50000 Work-Stealing 153.47 +Discrete PDF 100000 Work-Stealing 197.32 +Discrete PDF 250000 Work-Stealing 349.81 +Discrete PDF 500000 Work-Stealing 555.28 +Exponential CDF 8 Parallel 0.28 +Exponential CDF 16 Parallel 0.41 +Exponential CDF 32 Vectorized 0.55 +Exponential CDF 64 Vectorized 0.93 +Exponential CDF 128 Vectorized 1.30 +Exponential CDF 256 Vectorized 2.19 +Exponential CDF 512 Vectorized 4.20 +Exponential CDF 1000 Vectorized 7.96 +Exponential CDF 2000 Vectorized 15.83 +Exponential CDF 5000 Vectorized 39.00 +Exponential CDF 10000 Vectorized 74.19 +Exponential CDF 20000 Work-Stealing 129.43 +Exponential CDF 
50000 Work-Stealing 215.98 +Exponential CDF 100000 Work-Stealing 307.86 +Exponential CDF 250000 Work-Stealing 727.35 +Exponential CDF 500000 Work-Stealing 1263.70 +Exponential LogPDF 8 Parallel 0.19 +Exponential LogPDF 16 Parallel 0.20 +Exponential LogPDF 32 Work-Stealing 0.24 +Exponential LogPDF 64 Work-Stealing 0.27 +Exponential LogPDF 128 Work-Stealing 0.31 +Exponential LogPDF 256 Work-Stealing 0.43 +Exponential LogPDF 512 Work-Stealing 0.57 +Exponential LogPDF 1000 Work-Stealing 0.86 +Exponential LogPDF 2000 Work-Stealing 1.49 +Exponential LogPDF 5000 Vectorized 8.19 +Exponential LogPDF 10000 Vectorized 16.24 +Exponential LogPDF 20000 Vectorized 35.94 +Exponential LogPDF 50000 Vectorized 92.20 +Exponential LogPDF 100000 Work-Stealing 138.08 +Exponential LogPDF 250000 Parallel 231.28 +Exponential LogPDF 500000 Work-Stealing 400.82 +Exponential PDF 8 Parallel 0.29 +Exponential PDF 16 Parallel 0.40 +Exponential PDF 32 Vectorized 0.51 +Exponential PDF 64 Vectorized 0.82 +Exponential PDF 128 Vectorized 1.21 +Exponential PDF 256 Vectorized 2.29 +Exponential PDF 512 Vectorized 4.03 +Exponential PDF 1000 Vectorized 7.78 +Exponential PDF 2000 Vectorized 15.17 +Exponential PDF 5000 Vectorized 37.32 +Exponential PDF 10000 Vectorized 71.69 +Exponential PDF 20000 Work-Stealing 125.27 +Exponential PDF 50000 Work-Stealing 199.27 +Exponential PDF 100000 Work-Stealing 321.44 +Exponential PDF 250000 Work-Stealing 664.96 +Exponential PDF 500000 Work-Stealing 1356.76 +Gamma CDF 8 Parallel 1.01 +Gamma CDF 16 Work-Stealing 1.82 +Gamma CDF 32 Parallel 3.54 +Gamma CDF 64 Parallel 6.77 +Gamma CDF 128 Work-Stealing 13.45 +Gamma CDF 256 Work-Stealing 27.01 +Gamma CDF 512 Work-Stealing 55.90 +Gamma CDF 1000 Vectorized 108.05 +Gamma CDF 2000 Work-Stealing 219.79 +Gamma CDF 5000 Work-Stealing 309.93 +Gamma CDF 10000 Work-Stealing 416.18 +Gamma CDF 20000 Work-Stealing 560.97 +Gamma CDF 50000 Work-Stealing 1179.91 +Gamma CDF 100000 Work-Stealing 1987.04 +Gamma CDF 250000 Work-Stealing 5523.18 
+Gamma CDF 500000 Work-Stealing 8921.63 +Gamma LogPDF 8 Parallel 0.33 +Gamma LogPDF 16 Parallel 0.48 +Gamma LogPDF 32 Parallel 0.80 +Gamma LogPDF 64 Work-Stealing 1.39 +Gamma LogPDF 128 Work-Stealing 2.58 +Gamma LogPDF 256 Vectorized 4.07 +Gamma LogPDF 512 Vectorized 7.00 +Gamma LogPDF 1000 Vectorized 14.20 +Gamma LogPDF 2000 Vectorized 27.94 +Gamma LogPDF 5000 Vectorized 72.97 +Gamma LogPDF 10000 Vectorized 143.35 +Gamma LogPDF 20000 Work-Stealing 261.04 +Gamma LogPDF 50000 Work-Stealing 377.84 +Gamma LogPDF 100000 Work-Stealing 500.43 +Gamma LogPDF 250000 Work-Stealing 1075.01 +Gamma LogPDF 500000 Work-Stealing 1606.92 +Gamma PDF 8 Parallel 0.50 +Gamma PDF 16 Parallel 0.76 +Gamma PDF 32 Vectorized 1.37 +Gamma PDF 64 Vectorized 2.23 +Gamma PDF 128 Vectorized 3.28 +Gamma PDF 256 Vectorized 5.81 +Gamma PDF 512 Vectorized 10.92 +Gamma PDF 1000 Vectorized 20.88 +Gamma PDF 2000 Vectorized 40.69 +Gamma PDF 5000 Vectorized 104.54 +Gamma PDF 10000 Vectorized 208.25 +Gamma PDF 20000 Work-Stealing 383.26 +Gamma PDF 50000 Work-Stealing 573.80 +Gamma PDF 100000 Work-Stealing 845.15 +Gamma PDF 250000 Work-Stealing 1640.75 +Gamma PDF 500000 Work-Stealing 3124.02 +Gaussian CDF 8 Vectorized 0.38 +Gaussian CDF 16 Vectorized 0.47 +Gaussian CDF 32 Vectorized 0.68 +Gaussian CDF 64 Vectorized 1.05 +Gaussian CDF 128 Vectorized 1.86 +Gaussian CDF 256 Vectorized 3.53 +Gaussian CDF 512 Vectorized 6.76 +Gaussian CDF 1000 Vectorized 12.81 +Gaussian CDF 2000 Vectorized 25.78 +Gaussian CDF 5000 Vectorized 64.01 +Gaussian CDF 10000 Vectorized 127.95 +Gaussian CDF 20000 Work-Stealing 206.21 +Gaussian CDF 50000 Work-Stealing 452.51 +Gaussian CDF 100000 Work-Stealing 829.47 +Gaussian CDF 250000 Work-Stealing 1760.53 +Gaussian CDF 500000 Work-Stealing 3611.08 +Gaussian LogPDF 8 Parallel 0.15 +Gaussian LogPDF 16 Parallel 0.15 +Gaussian LogPDF 32 Parallel 0.17 +Gaussian LogPDF 64 Parallel 0.18 +Gaussian LogPDF 128 Parallel 0.22 +Gaussian LogPDF 256 Parallel 0.29 +Gaussian LogPDF 512 Parallel 0.43 
+Gaussian LogPDF 1000 Parallel 0.67 +Gaussian LogPDF 2000 Parallel 1.27 +Gaussian LogPDF 5000 Vectorized 6.60 +Gaussian LogPDF 10000 Vectorized 13.32 +Gaussian LogPDF 20000 Vectorized 29.39 +Gaussian LogPDF 50000 Work-Stealing 57.41 +Gaussian LogPDF 100000 Parallel 68.63 +Gaussian LogPDF 250000 Parallel 149.50 +Gaussian LogPDF 500000 Parallel 238.99 +Gaussian PDF 8 Parallel 0.25 +Gaussian PDF 16 Parallel 0.34 +Gaussian PDF 32 Parallel 0.54 +Gaussian PDF 64 Parallel 0.93 +Gaussian PDF 128 Vectorized 1.15 +Gaussian PDF 256 Vectorized 1.97 +Gaussian PDF 512 Vectorized 3.57 +Gaussian PDF 1000 Vectorized 6.68 +Gaussian PDF 2000 Vectorized 13.26 +Gaussian PDF 5000 Vectorized 33.10 +Gaussian PDF 10000 Vectorized 67.80 +Gaussian PDF 20000 Work-Stealing 87.06 +Gaussian PDF 50000 Work-Stealing 149.18 +Gaussian PDF 100000 Work-Stealing 238.98 +Gaussian PDF 250000 Work-Stealing 526.68 +Gaussian PDF 500000 Work-Stealing 1039.00 +Poisson CDF 8 Scalar 1.33 +Poisson CDF 16 Scalar 2.72 +Poisson CDF 32 Scalar 5.38 +Poisson CDF 64 Scalar 12.02 +Poisson CDF 128 Vectorized 22.10 +Poisson CDF 256 Work-Stealing 44.77 +Poisson CDF 512 Vectorized 89.99 +Poisson CDF 1000 Vectorized 172.59 +Poisson CDF 2000 Vectorized 346.86 +Poisson CDF 5000 Work-Stealing 374.89 +Poisson CDF 10000 Work-Stealing 537.19 +Poisson CDF 20000 Work-Stealing 779.63 +Poisson CDF 50000 Work-Stealing 1554.58 +Poisson CDF 100000 Work-Stealing 2771.32 +Poisson CDF 250000 Work-Stealing 7150.19 +Poisson CDF 500000 Work-Stealing 12707.91 +Poisson LogPDF 8 Vectorized 0.33 +Poisson LogPDF 16 Vectorized 0.52 +Poisson LogPDF 32 Vectorized 0.88 +Poisson LogPDF 64 Vectorized 1.81 +Poisson LogPDF 128 Vectorized 3.10 +Poisson LogPDF 256 Vectorized 6.23 +Poisson LogPDF 512 Work-Stealing 12.22 +Poisson LogPDF 1000 Work-Stealing 23.40 +Poisson LogPDF 2000 Work-Stealing 47.15 +Poisson LogPDF 5000 Vectorized 121.16 +Poisson LogPDF 10000 Work-Stealing 238.07 +Poisson LogPDF 20000 Work-Stealing 285.53 +Poisson LogPDF 50000 Work-Stealing 
376.00 +Poisson LogPDF 100000 Work-Stealing 656.01 +Poisson LogPDF 250000 Work-Stealing 1045.82 +Poisson LogPDF 500000 Work-Stealing 2188.15 +Poisson PDF 8 Vectorized 0.75 +Poisson PDF 16 Vectorized 1.31 +Poisson PDF 32 Vectorized 2.31 +Poisson PDF 64 Vectorized 4.61 +Poisson PDF 128 Vectorized 8.80 +Poisson PDF 256 Vectorized 17.82 +Poisson PDF 512 Vectorized 35.61 +Poisson PDF 1000 Vectorized 69.23 +Poisson PDF 2000 Work-Stealing 139.20 +Poisson PDF 5000 Work-Stealing 267.17 +Poisson PDF 10000 Work-Stealing 291.50 +Poisson PDF 20000 Work-Stealing 398.34 +Poisson PDF 50000 Work-Stealing 696.05 +Poisson PDF 100000 Work-Stealing 1258.09 +Poisson PDF 250000 Work-Stealing 2804.03 +Poisson PDF 500000 Work-Stealing 4655.66 +StudentT CDF 8 Work-Stealing 2.96 +StudentT CDF 16 Work-Stealing 5.38 +StudentT CDF 32 Work-Stealing 11.52 +StudentT CDF 64 Work-Stealing 22.51 +StudentT CDF 128 Vectorized 43.81 +StudentT CDF 256 Vectorized 88.35 +StudentT CDF 512 Parallel 168.85 +StudentT CDF 1000 Parallel 334.12 +StudentT CDF 2000 Parallel 668.52 +StudentT CDF 5000 Parallel 1675.07 +StudentT CDF 10000 Parallel 3345.14 +StudentT CDF 20000 Vectorized 6813.72 +StudentT CDF 50000 Vectorized 17565.40 +StudentT CDF 100000 Work-Stealing 33989.20 +StudentT CDF 250000 Work-Stealing 84879.62 +StudentT CDF 500000 Parallel 169793.97 +StudentT LogPDF 8 Vectorized 0.54 +StudentT LogPDF 16 Vectorized 0.63 +StudentT LogPDF 32 Vectorized 0.85 +StudentT LogPDF 64 Vectorized 1.24 +StudentT LogPDF 128 Vectorized 2.02 +StudentT LogPDF 256 Vectorized 3.72 +StudentT LogPDF 512 Vectorized 7.02 +StudentT LogPDF 1000 Vectorized 12.92 +StudentT LogPDF 2000 Vectorized 25.54 +StudentT LogPDF 5000 Vectorized 65.76 +StudentT LogPDF 10000 Vectorized 131.13 +StudentT LogPDF 20000 Vectorized 263.70 +StudentT LogPDF 50000 Vectorized 710.19 +StudentT LogPDF 100000 Work-Stealing 736.15 +StudentT LogPDF 250000 Work-Stealing 1302.96 +StudentT LogPDF 500000 Work-Stealing 2484.48 +StudentT PDF 8 Parallel 0.74 +StudentT 
PDF 16 Parallel 1.02 +StudentT PDF 32 Vectorized 1.29 +StudentT PDF 64 Vectorized 1.74 +StudentT PDF 128 Vectorized 2.94 +StudentT PDF 256 Vectorized 5.58 +StudentT PDF 512 Vectorized 10.41 +StudentT PDF 1000 Vectorized 19.65 +StudentT PDF 2000 Vectorized 38.52 +StudentT PDF 5000 Vectorized 98.03 +StudentT PDF 10000 Vectorized 195.10 +StudentT PDF 20000 Vectorized 393.14 +StudentT PDF 50000 Vectorized 1001.40 +StudentT PDF 100000 Work-Stealing 1254.44 +StudentT PDF 250000 Parallel 2324.53 +StudentT PDF 500000 Work-Stealing 4707.63 +Uniform CDF 8 Parallel 0.23 +Uniform CDF 16 Parallel 0.25 +Uniform CDF 32 Parallel 0.31 +Uniform CDF 64 Parallel 0.38 +Uniform CDF 128 Parallel 0.59 +Uniform CDF 256 Work-Stealing 0.44 +Uniform CDF 512 Work-Stealing 0.65 +Uniform CDF 1000 Parallel 1.49 +Uniform CDF 2000 Work-Stealing 4.69 +Uniform CDF 5000 Vectorized 13.00 +Uniform CDF 10000 Work-Stealing 28.36 +Uniform CDF 20000 Work-Stealing 39.17 +Uniform CDF 50000 Work-Stealing 68.87 +Uniform CDF 100000 Work-Stealing 133.88 +Uniform CDF 250000 Work-Stealing 303.76 +Uniform CDF 500000 Work-Stealing 601.36 +Uniform LogPDF 8 Vectorized 0.14 +Uniform LogPDF 16 Vectorized 0.19 +Uniform LogPDF 32 Vectorized 0.23 +Uniform LogPDF 64 Vectorized 0.17 +Uniform LogPDF 128 Vectorized 0.32 +Uniform LogPDF 256 Vectorized 0.46 +Uniform LogPDF 512 Vectorized 0.57 +Uniform LogPDF 1000 Work-Stealing 0.84 +Uniform LogPDF 2000 Work-Stealing 0.86 +Uniform LogPDF 5000 Vectorized 3.81 +Uniform LogPDF 10000 Vectorized 7.49 +Uniform LogPDF 20000 Vectorized 11.03 +Uniform LogPDF 50000 Vectorized 41.38 +Uniform LogPDF 100000 Vectorized 82.75 +Uniform LogPDF 250000 Vectorized 200.61 +Uniform LogPDF 500000 Vectorized 484.38 +Uniform PDF 8 Vectorized 0.13 +Uniform PDF 16 Vectorized 0.18 +Uniform PDF 32 Vectorized 0.20 +Uniform PDF 64 Vectorized 0.23 +Uniform PDF 128 Vectorized 0.29 +Uniform PDF 256 Vectorized 0.39 +Uniform PDF 512 Vectorized 0.46 +Uniform PDF 1000 Vectorized 0.77 +Uniform PDF 2000 Work-Stealing 
0.85 +Uniform PDF 5000 Vectorized 3.16 +Uniform PDF 10000 Vectorized 6.28 +Uniform PDF 20000 Vectorized 10.19 +Uniform PDF 50000 Vectorized 37.99 +Uniform PDF 100000 Vectorized 80.62 +Uniform PDF 250000 Vectorized 155.43 +Uniform PDF 500000 Vectorized 446.26 + + +===================== + Crossover Summary +===================== + +Distribution Operation Sβ†’V Vβ†’P Pβ†’Work-Steal +-------------------------------------------------------------------------- +Beta CDF 8 never 16 +Beta LogPDF 32 8 8 +Beta PDF 32 8 8 +ChiSquared CDF 16 8 256 +ChiSquared LogPDF 16 8 128 +ChiSquared PDF 8 8 512 +Discrete CDF 8 100000 64 +Discrete LogPDF 8 100000 64 +Discrete PDF 8 128 5000 +Exponential CDF 8 8 64 +Exponential LogPDF 8 8 32 +Exponential PDF 8 8 128 +Gamma CDF 16 8 16 +Gamma LogPDF 16 8 64 +Gamma PDF 8 8 64 +Gaussian CDF 8 never 2000 +Gaussian LogPDF 8 8 5000 +Gaussian PDF 8 8 1000 +Poisson CDF 128 64 32 +Poisson LogPDF 8 50000 64 +Poisson PDF 8 2000 64 +StudentT CDF 8 8 8 +StudentT LogPDF 8 100000 8 +StudentT PDF 16 8 64 +Uniform CDF 8 8 256 +Uniform LogPDF 8 never 64 +Uniform PDF 8 never 8 + +Results saved to /Users/wolfman/Development/libstats/build/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/strategy_profile_results.csv diff --git a/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/logs/system_inspector_performance.txt b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/logs/system_inspector_performance.txt new file mode 100644 index 0000000..5bc454d --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/logs/system_inspector_performance.txt @@ -0,0 +1,102 @@ + +======================================= + System Inspector - Performance Mode +======================================= + +System capabilities analysis with performance measurements 
+ +System: 8 logical cores, AVX SIMD, 8192 KB L3 cache + + +--- CPU Features --- +Feature Support Description +------------------------------------------------------------ +AVX-512 No Foundation instructions +AVX2 No Advanced Vector Ext 2 +AVX Yes Advanced Vector Ext +SSE2 Yes Streaming SIMD Ext 2 +NEON No ARM SIMD instructions +FMA No Fused Multiply-Add + + +--- Cache Information --- +Cache Level Size (KB) Line Size +------------------------------------------ +L1 32 64 bytes +L2 256 64 bytes +L3 8192 64 bytes + + +--- CPU Topology --- +Hardware Threads: 8 +Logical Cores: 8 +Physical Cores: 4 +Hyperthreading: Enabled + + +--- SIMD Capabilities --- +Instruction Support Vector Width Description +-------------------------------------------------------------- +SSE2 Yes 128-bit Basic SIMD operations +AVX Yes 256-bit Advanced vector ext +AVX2 No 256-bit Integer AVX operations +AVX-512 No 512-bit Foundation instructions +NEON No 128-bit ARM SIMD instructions + +Active SIMD Level: AVX + + +--- Performance Baselines --- +Operation Type Time (ΞΌs) Throughput (MOps/s) +------------------------------------------------------------ +SIMD Multiply 1251 799 +Scalar Multiply 1181 846 + +SIMD Speedup: 0.94x + + +--- Performance Dispatcher Configuration --- +Example Strategy Selections: +Batch Size Distribution Complexity Strategy +---------------------------------------------------------------------- +100 Uniform Simple Vectorized +100 Gaussian Simple Vectorized +100 Exponential Simple Vectorized +100 Poisson Simple Vectorized +100 Discrete Simple Vectorized +1000 Uniform Simple Vectorized +1000 Gaussian Simple Vectorized +1000 Exponential Simple Vectorized +1000 Poisson Simple Vectorized +1000 Discrete Simple Vectorized +10000 Uniform Simple Parallel +10000 Gaussian Simple Parallel +10000 Exponential Simple Parallel +10000 Poisson Simple Parallel +10000 Discrete Simple Parallel +100000 Uniform Simple Parallel +100000 Gaussian Simple Parallel +100000 Exponential Simple Parallel 
+100000 Poisson Simple Work-Stealing +100000 Discrete Simple Parallel + + +--- Platform Constants --- +Constant Value +-------------------------------------------------- +SIMD Block Size 4 doubles +Memory Alignment 32 bytes +Min SIMD Size 8 elements +Optimal Grain Size 32 elements +Fast Transcendental Support No + + +--- Adaptive Constants --- +Constant Value +-------------------------------------------------- +Min Elements for Parallel 4096 +Default Grain Size 32768 +Simple Operation Grain Size 256 +Complex Operation Grain Size 1024 + +System inspection completed successfully. diff --git a/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/manifest.txt b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/manifest.txt new file mode 100644 index 0000000..96e0bd6 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/manifest.txt @@ -0,0 +1,14 @@ +Dispatcher profile bundle +========================= + +Run ID: 2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3 +Captured at (UTC): 2026-04-12T05-55-52Z + +Files: +- metadata.json +- summary.json +- crossovers.csv +- best_strategies.csv +- strategy_profile_results.csv +- logs/system_inspector_performance.txt +- logs/strategy_profile.txt diff --git a/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/metadata.json b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/metadata.json new file mode 100644 index 0000000..2c49346 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/metadata.json @@ -0,0 +1,15 @@ +{ + "captured_at_utc": "2026-04-12T05-55-52Z", + "run_id": 
"2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3", + "git_branch": "investigate-gaussian-avx512-perf", + "git_sha": "e75c6e3", + "project_root": "/Users/wolfman/Development/libstats", + "build_dir": "/Users/wolfman/Development/libstats/build", + "build_type": "Release", + "cxx_compiler": "", + "os": "darwin", + "arch": "x86_64", + "cpu_brand": "Intel(R) Core(TM) i7-3820QM CPU @ 2.70GHz", + "physical_cores": "4", + "logical_cores": "8" +} diff --git a/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/strategy_profile_results.csv b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/strategy_profile_results.csv new file mode 100644 index 0000000..b6f7a4d --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/strategy_profile_results.csv @@ -0,0 +1,1729 @@ +Distribution,Operation,BatchSize,Strategy,MedianTime_us +Uniform,PDF,8,SCALAR,0.575000 +Uniform,PDF,8,VECTORIZED,0.131000 +Uniform,PDF,8,PARALLEL,0.143000 +Uniform,PDF,8,WORK_STEALING,0.139000 +Uniform,LogPDF,8,SCALAR,0.556000 +Uniform,LogPDF,8,VECTORIZED,0.138000 +Uniform,LogPDF,8,PARALLEL,0.148000 +Uniform,LogPDF,8,WORK_STEALING,0.278000 +Uniform,CDF,8,SCALAR,0.707000 +Uniform,CDF,8,VECTORIZED,0.291000 +Uniform,CDF,8,PARALLEL,0.225000 +Uniform,CDF,8,WORK_STEALING,0.295000 +Uniform,PDF,16,SCALAR,1.277000 +Uniform,PDF,16,VECTORIZED,0.182000 +Uniform,PDF,16,PARALLEL,0.215000 +Uniform,PDF,16,WORK_STEALING,0.269000 +Uniform,LogPDF,16,SCALAR,1.344000 +Uniform,LogPDF,16,VECTORIZED,0.192000 +Uniform,LogPDF,16,PARALLEL,0.256000 +Uniform,LogPDF,16,WORK_STEALING,0.300000 +Uniform,CDF,16,SCALAR,1.411000 +Uniform,CDF,16,VECTORIZED,0.342000 +Uniform,CDF,16,PARALLEL,0.254000 +Uniform,CDF,16,WORK_STEALING,0.266000 +Uniform,PDF,32,SCALAR,2.679000 +Uniform,PDF,32,VECTORIZED,0.205000 +Uniform,PDF,32,PARALLEL,0.266000 
+Uniform,PDF,32,WORK_STEALING,0.295000 +Uniform,LogPDF,32,SCALAR,2.624000 +Uniform,LogPDF,32,VECTORIZED,0.227000 +Uniform,LogPDF,32,PARALLEL,0.273000 +Uniform,LogPDF,32,WORK_STEALING,0.306000 +Uniform,CDF,32,SCALAR,2.670000 +Uniform,CDF,32,VECTORIZED,0.460000 +Uniform,CDF,32,PARALLEL,0.309000 +Uniform,CDF,32,WORK_STEALING,0.349000 +Uniform,PDF,64,SCALAR,4.563000 +Uniform,PDF,64,VECTORIZED,0.228000 +Uniform,PDF,64,PARALLEL,0.329000 +Uniform,PDF,64,WORK_STEALING,0.315000 +Uniform,LogPDF,64,SCALAR,4.299000 +Uniform,LogPDF,64,VECTORIZED,0.168000 +Uniform,LogPDF,64,PARALLEL,0.229000 +Uniform,LogPDF,64,WORK_STEALING,0.169000 +Uniform,CDF,64,SCALAR,5.222000 +Uniform,CDF,64,VECTORIZED,0.452000 +Uniform,CDF,64,PARALLEL,0.379000 +Uniform,CDF,64,WORK_STEALING,0.417000 +Uniform,PDF,128,SCALAR,10.458000 +Uniform,PDF,128,VECTORIZED,0.295000 +Uniform,PDF,128,PARALLEL,0.465000 +Uniform,PDF,128,WORK_STEALING,0.328000 +Uniform,LogPDF,128,SCALAR,10.057000 +Uniform,LogPDF,128,VECTORIZED,0.318000 +Uniform,LogPDF,128,PARALLEL,0.548000 +Uniform,LogPDF,128,WORK_STEALING,0.381000 +Uniform,CDF,128,SCALAR,10.436000 +Uniform,CDF,128,VECTORIZED,0.595000 +Uniform,CDF,128,PARALLEL,0.588000 +Uniform,CDF,128,WORK_STEALING,0.632000 +Uniform,PDF,256,SCALAR,20.562000 +Uniform,PDF,256,VECTORIZED,0.392000 +Uniform,PDF,256,PARALLEL,0.828000 +Uniform,PDF,256,WORK_STEALING,0.475000 +Uniform,LogPDF,256,SCALAR,18.397000 +Uniform,LogPDF,256,VECTORIZED,0.461000 +Uniform,LogPDF,256,PARALLEL,0.868000 +Uniform,LogPDF,256,WORK_STEALING,0.467000 +Uniform,CDF,256,SCALAR,20.694000 +Uniform,CDF,256,VECTORIZED,0.523000 +Uniform,CDF,256,PARALLEL,0.539000 +Uniform,CDF,256,WORK_STEALING,0.444000 +Uniform,PDF,512,SCALAR,34.860000 +Uniform,PDF,512,VECTORIZED,0.462000 +Uniform,PDF,512,PARALLEL,1.069000 +Uniform,PDF,512,WORK_STEALING,0.542000 +Uniform,LogPDF,512,SCALAR,32.281000 +Uniform,LogPDF,512,VECTORIZED,0.575000 +Uniform,LogPDF,512,PARALLEL,1.280000 +Uniform,LogPDF,512,WORK_STEALING,0.595000 
+Uniform,CDF,512,SCALAR,35.064000 +Uniform,CDF,512,VECTORIZED,1.093000 +Uniform,CDF,512,PARALLEL,0.799000 +Uniform,CDF,512,WORK_STEALING,0.646000 +Uniform,PDF,1000,SCALAR,65.445000 +Uniform,PDF,1000,VECTORIZED,0.771000 +Uniform,PDF,1000,PARALLEL,1.954000 +Uniform,PDF,1000,WORK_STEALING,0.848000 +Uniform,LogPDF,1000,SCALAR,65.839000 +Uniform,LogPDF,1000,VECTORIZED,0.959000 +Uniform,LogPDF,1000,PARALLEL,2.442000 +Uniform,LogPDF,1000,WORK_STEALING,0.844000 +Uniform,CDF,1000,SCALAR,67.333000 +Uniform,CDF,1000,VECTORIZED,1.852000 +Uniform,CDF,1000,PARALLEL,1.489000 +Uniform,CDF,1000,WORK_STEALING,2.236000 +Uniform,PDF,2000,SCALAR,121.171000 +Uniform,PDF,2000,VECTORIZED,1.350000 +Uniform,PDF,2000,PARALLEL,2.015000 +Uniform,PDF,2000,WORK_STEALING,0.854000 +Uniform,LogPDF,2000,SCALAR,115.567000 +Uniform,LogPDF,2000,VECTORIZED,1.677000 +Uniform,LogPDF,2000,PARALLEL,5.268000 +Uniform,LogPDF,2000,WORK_STEALING,0.863000 +Uniform,CDF,2000,SCALAR,125.523000 +Uniform,CDF,2000,VECTORIZED,4.754000 +Uniform,CDF,2000,PARALLEL,5.367000 +Uniform,CDF,2000,WORK_STEALING,4.694000 +Uniform,PDF,5000,SCALAR,294.000000 +Uniform,PDF,5000,VECTORIZED,3.160000 +Uniform,PDF,5000,PARALLEL,47.071000 +Uniform,PDF,5000,WORK_STEALING,26.734000 +Uniform,LogPDF,5000,SCALAR,285.688000 +Uniform,LogPDF,5000,VECTORIZED,3.810000 +Uniform,LogPDF,5000,PARALLEL,52.203000 +Uniform,LogPDF,5000,WORK_STEALING,20.367000 +Uniform,CDF,5000,SCALAR,291.140000 +Uniform,CDF,5000,VECTORIZED,13.004000 +Uniform,CDF,5000,PARALLEL,51.054000 +Uniform,CDF,5000,WORK_STEALING,19.673000 +Uniform,PDF,10000,SCALAR,582.847000 +Uniform,PDF,10000,VECTORIZED,6.284000 +Uniform,PDF,10000,PARALLEL,87.022000 +Uniform,PDF,10000,WORK_STEALING,27.718000 +Uniform,LogPDF,10000,SCALAR,567.083000 +Uniform,LogPDF,10000,VECTORIZED,7.489000 +Uniform,LogPDF,10000,PARALLEL,83.305000 +Uniform,LogPDF,10000,WORK_STEALING,25.330000 +Uniform,CDF,10000,SCALAR,589.700000 +Uniform,CDF,10000,VECTORIZED,38.171000 +Uniform,CDF,10000,PARALLEL,92.296000 
+Uniform,CDF,10000,WORK_STEALING,28.363000 +Uniform,PDF,20000,SCALAR,1180.011000 +Uniform,PDF,20000,VECTORIZED,10.194000 +Uniform,PDF,20000,PARALLEL,139.147000 +Uniform,PDF,20000,WORK_STEALING,38.061000 +Uniform,LogPDF,20000,SCALAR,1254.580000 +Uniform,LogPDF,20000,VECTORIZED,11.028000 +Uniform,LogPDF,20000,PARALLEL,142.835000 +Uniform,LogPDF,20000,WORK_STEALING,35.878000 +Uniform,CDF,20000,SCALAR,1308.494000 +Uniform,CDF,20000,VECTORIZED,114.316000 +Uniform,CDF,20000,PARALLEL,147.620000 +Uniform,CDF,20000,WORK_STEALING,39.174000 +Uniform,PDF,50000,SCALAR,3218.146000 +Uniform,PDF,50000,VECTORIZED,37.992000 +Uniform,PDF,50000,PARALLEL,232.942000 +Uniform,PDF,50000,WORK_STEALING,68.675000 +Uniform,LogPDF,50000,SCALAR,3136.259000 +Uniform,LogPDF,50000,VECTORIZED,41.376000 +Uniform,LogPDF,50000,PARALLEL,231.295000 +Uniform,LogPDF,50000,WORK_STEALING,65.579000 +Uniform,CDF,50000,SCALAR,3104.694000 +Uniform,CDF,50000,VECTORIZED,270.618000 +Uniform,CDF,50000,PARALLEL,226.690000 +Uniform,CDF,50000,WORK_STEALING,68.871000 +Uniform,PDF,100000,SCALAR,6159.961000 +Uniform,PDF,100000,VECTORIZED,80.621000 +Uniform,PDF,100000,PARALLEL,246.306000 +Uniform,PDF,100000,WORK_STEALING,111.782000 +Uniform,LogPDF,100000,SCALAR,6133.268000 +Uniform,LogPDF,100000,VECTORIZED,82.751000 +Uniform,LogPDF,100000,PARALLEL,246.681000 +Uniform,LogPDF,100000,WORK_STEALING,140.192000 +Uniform,CDF,100000,SCALAR,5990.628000 +Uniform,CDF,100000,VECTORIZED,569.441000 +Uniform,CDF,100000,PARALLEL,259.885000 +Uniform,CDF,100000,WORK_STEALING,133.885000 +Uniform,PDF,250000,SCALAR,14728.921000 +Uniform,PDF,250000,VECTORIZED,155.432000 +Uniform,PDF,250000,PARALLEL,479.054000 +Uniform,PDF,250000,WORK_STEALING,389.541000 +Uniform,LogPDF,250000,SCALAR,14511.750000 +Uniform,LogPDF,250000,VECTORIZED,200.611000 +Uniform,LogPDF,250000,PARALLEL,447.207000 +Uniform,LogPDF,250000,WORK_STEALING,345.203000 +Uniform,CDF,250000,SCALAR,14895.358000 +Uniform,CDF,250000,VECTORIZED,1395.982000 
+Uniform,CDF,250000,PARALLEL,498.651000 +Uniform,CDF,250000,WORK_STEALING,303.761000 +Uniform,PDF,500000,SCALAR,29749.137000 +Uniform,PDF,500000,VECTORIZED,446.261000 +Uniform,PDF,500000,PARALLEL,868.807000 +Uniform,PDF,500000,WORK_STEALING,487.857000 +Uniform,LogPDF,500000,SCALAR,29022.239000 +Uniform,LogPDF,500000,VECTORIZED,484.381000 +Uniform,LogPDF,500000,PARALLEL,948.402000 +Uniform,LogPDF,500000,WORK_STEALING,530.778000 +Uniform,CDF,500000,SCALAR,29842.841000 +Uniform,CDF,500000,VECTORIZED,3063.196000 +Uniform,CDF,500000,PARALLEL,935.017000 +Uniform,CDF,500000,WORK_STEALING,601.362000 +Gaussian,PDF,8,SCALAR,0.663000 +Gaussian,PDF,8,VECTORIZED,0.347000 +Gaussian,PDF,8,PARALLEL,0.249000 +Gaussian,PDF,8,WORK_STEALING,0.266000 +Gaussian,LogPDF,8,SCALAR,0.477000 +Gaussian,LogPDF,8,VECTORIZED,0.244000 +Gaussian,LogPDF,8,PARALLEL,0.146000 +Gaussian,LogPDF,8,WORK_STEALING,0.202000 +Gaussian,CDF,8,SCALAR,0.876000 +Gaussian,CDF,8,VECTORIZED,0.381000 +Gaussian,CDF,8,PARALLEL,0.482000 +Gaussian,CDF,8,WORK_STEALING,0.546000 +Gaussian,PDF,16,SCALAR,1.269000 +Gaussian,PDF,16,VECTORIZED,0.389000 +Gaussian,PDF,16,PARALLEL,0.340000 +Gaussian,PDF,16,WORK_STEALING,0.400000 +Gaussian,LogPDF,16,SCALAR,0.957000 +Gaussian,LogPDF,16,VECTORIZED,0.277000 +Gaussian,LogPDF,16,PARALLEL,0.153000 +Gaussian,LogPDF,16,WORK_STEALING,0.207000 +Gaussian,CDF,16,SCALAR,1.820000 +Gaussian,CDF,16,VECTORIZED,0.473000 +Gaussian,CDF,16,PARALLEL,0.941000 +Gaussian,CDF,16,WORK_STEALING,1.004000 +Gaussian,PDF,32,SCALAR,2.433000 +Gaussian,PDF,32,VECTORIZED,0.859000 +Gaussian,PDF,32,PARALLEL,0.539000 +Gaussian,PDF,32,WORK_STEALING,0.599000 +Gaussian,LogPDF,32,SCALAR,1.865000 +Gaussian,LogPDF,32,VECTORIZED,0.270000 +Gaussian,LogPDF,32,PARALLEL,0.171000 +Gaussian,LogPDF,32,WORK_STEALING,0.217000 +Gaussian,CDF,32,SCALAR,3.402000 +Gaussian,CDF,32,VECTORIZED,0.676000 +Gaussian,CDF,32,PARALLEL,1.616000 +Gaussian,CDF,32,WORK_STEALING,1.659000 +Gaussian,PDF,64,SCALAR,4.946000 +Gaussian,PDF,64,VECTORIZED,1.136000 
+Gaussian,PDF,64,PARALLEL,0.934000 +Gaussian,PDF,64,WORK_STEALING,0.961000 +Gaussian,LogPDF,64,SCALAR,3.676000 +Gaussian,LogPDF,64,VECTORIZED,0.304000 +Gaussian,LogPDF,64,PARALLEL,0.185000 +Gaussian,LogPDF,64,WORK_STEALING,0.225000 +Gaussian,CDF,64,SCALAR,6.864000 +Gaussian,CDF,64,VECTORIZED,1.055000 +Gaussian,CDF,64,PARALLEL,3.162000 +Gaussian,CDF,64,WORK_STEALING,3.200000 +Gaussian,PDF,128,SCALAR,9.499000 +Gaussian,PDF,128,VECTORIZED,1.150000 +Gaussian,PDF,128,PARALLEL,1.701000 +Gaussian,PDF,128,WORK_STEALING,1.774000 +Gaussian,LogPDF,128,SCALAR,7.338000 +Gaussian,LogPDF,128,VECTORIZED,0.353000 +Gaussian,LogPDF,128,PARALLEL,0.219000 +Gaussian,LogPDF,128,WORK_STEALING,0.265000 +Gaussian,CDF,128,SCALAR,13.324000 +Gaussian,CDF,128,VECTORIZED,1.861000 +Gaussian,CDF,128,PARALLEL,6.130000 +Gaussian,CDF,128,WORK_STEALING,6.148000 +Gaussian,PDF,256,SCALAR,19.157000 +Gaussian,PDF,256,VECTORIZED,1.971000 +Gaussian,PDF,256,PARALLEL,3.259000 +Gaussian,PDF,256,WORK_STEALING,3.337000 +Gaussian,LogPDF,256,SCALAR,14.572000 +Gaussian,LogPDF,256,VECTORIZED,0.468000 +Gaussian,LogPDF,256,PARALLEL,0.288000 +Gaussian,LogPDF,256,WORK_STEALING,0.344000 +Gaussian,CDF,256,SCALAR,26.739000 +Gaussian,CDF,256,VECTORIZED,3.526000 +Gaussian,CDF,256,PARALLEL,12.063000 +Gaussian,CDF,256,WORK_STEALING,12.222000 +Gaussian,PDF,512,SCALAR,37.914000 +Gaussian,PDF,512,VECTORIZED,3.571000 +Gaussian,PDF,512,PARALLEL,6.404000 +Gaussian,PDF,512,WORK_STEALING,6.423000 +Gaussian,LogPDF,512,SCALAR,28.852000 +Gaussian,LogPDF,512,VECTORIZED,0.692000 +Gaussian,LogPDF,512,PARALLEL,0.434000 +Gaussian,LogPDF,512,WORK_STEALING,0.467000 +Gaussian,CDF,512,SCALAR,53.658000 +Gaussian,CDF,512,VECTORIZED,6.759000 +Gaussian,CDF,512,PARALLEL,23.982000 +Gaussian,CDF,512,WORK_STEALING,24.068000 +Gaussian,PDF,1000,SCALAR,78.496000 +Gaussian,PDF,1000,VECTORIZED,6.683000 +Gaussian,PDF,1000,PARALLEL,12.383000 +Gaussian,PDF,1000,WORK_STEALING,12.328000 +Gaussian,LogPDF,1000,SCALAR,55.671000 
+Gaussian,LogPDF,1000,VECTORIZED,1.141000 +Gaussian,LogPDF,1000,PARALLEL,0.671000 +Gaussian,LogPDF,1000,WORK_STEALING,0.718000 +Gaussian,CDF,1000,SCALAR,104.268000 +Gaussian,CDF,1000,VECTORIZED,12.812000 +Gaussian,CDF,1000,PARALLEL,46.495000 +Gaussian,CDF,1000,WORK_STEALING,46.563000 +Gaussian,PDF,2000,SCALAR,166.993000 +Gaussian,PDF,2000,VECTORIZED,13.259000 +Gaussian,PDF,2000,PARALLEL,24.547000 +Gaussian,PDF,2000,WORK_STEALING,24.535000 +Gaussian,LogPDF,2000,SCALAR,139.102000 +Gaussian,LogPDF,2000,VECTORIZED,2.341000 +Gaussian,LogPDF,2000,PARALLEL,1.268000 +Gaussian,LogPDF,2000,WORK_STEALING,1.332000 +Gaussian,CDF,2000,SCALAR,230.143000 +Gaussian,CDF,2000,VECTORIZED,25.783000 +Gaussian,CDF,2000,PARALLEL,98.499000 +Gaussian,CDF,2000,WORK_STEALING,92.955000 +Gaussian,PDF,5000,SCALAR,414.916000 +Gaussian,PDF,5000,VECTORIZED,33.102000 +Gaussian,PDF,5000,PARALLEL,100.432000 +Gaussian,PDF,5000,WORK_STEALING,53.502000 +Gaussian,LogPDF,5000,SCALAR,281.195000 +Gaussian,LogPDF,5000,VECTORIZED,6.598000 +Gaussian,LogPDF,5000,PARALLEL,32.899000 +Gaussian,LogPDF,5000,WORK_STEALING,25.978000 +Gaussian,CDF,5000,SCALAR,520.793000 +Gaussian,CDF,5000,VECTORIZED,64.012000 +Gaussian,CDF,5000,PARALLEL,272.919000 +Gaussian,CDF,5000,WORK_STEALING,93.924000 +Gaussian,PDF,10000,SCALAR,727.539000 +Gaussian,PDF,10000,VECTORIZED,67.803000 +Gaussian,PDF,10000,PARALLEL,165.819000 +Gaussian,PDF,10000,WORK_STEALING,75.115000 +Gaussian,LogPDF,10000,SCALAR,563.089000 +Gaussian,LogPDF,10000,VECTORIZED,13.321000 +Gaussian,LogPDF,10000,PARALLEL,35.399000 +Gaussian,LogPDF,10000,WORK_STEALING,33.238000 +Gaussian,CDF,10000,SCALAR,1043.482000 +Gaussian,CDF,10000,VECTORIZED,127.955000 +Gaussian,CDF,10000,PARALLEL,505.706000 +Gaussian,CDF,10000,WORK_STEALING,151.470000 +Gaussian,PDF,20000,SCALAR,1491.612000 +Gaussian,PDF,20000,VECTORIZED,137.611000 +Gaussian,PDF,20000,PARALLEL,288.203000 +Gaussian,PDF,20000,WORK_STEALING,87.063000 +Gaussian,LogPDF,20000,SCALAR,1126.417000 
+Gaussian,LogPDF,20000,VECTORIZED,29.394000 +Gaussian,LogPDF,20000,PARALLEL,50.356000 +Gaussian,LogPDF,20000,WORK_STEALING,34.099000 +Gaussian,CDF,20000,SCALAR,2125.346000 +Gaussian,CDF,20000,VECTORIZED,258.729000 +Gaussian,CDF,20000,PARALLEL,975.009000 +Gaussian,CDF,20000,WORK_STEALING,206.210000 +Gaussian,PDF,50000,SCALAR,3749.025000 +Gaussian,PDF,50000,VECTORIZED,353.082000 +Gaussian,PDF,50000,PARALLEL,444.183000 +Gaussian,PDF,50000,WORK_STEALING,149.182000 +Gaussian,LogPDF,50000,SCALAR,2816.569000 +Gaussian,LogPDF,50000,VECTORIZED,82.922000 +Gaussian,LogPDF,50000,PARALLEL,59.518000 +Gaussian,LogPDF,50000,WORK_STEALING,57.410000 +Gaussian,CDF,50000,SCALAR,5244.991000 +Gaussian,CDF,50000,VECTORIZED,652.749000 +Gaussian,CDF,50000,PARALLEL,1462.932000 +Gaussian,CDF,50000,WORK_STEALING,452.512000 +Gaussian,PDF,100000,SCALAR,7493.858000 +Gaussian,PDF,100000,VECTORIZED,706.287000 +Gaussian,PDF,100000,PARALLEL,445.967000 +Gaussian,PDF,100000,WORK_STEALING,238.977000 +Gaussian,LogPDF,100000,SCALAR,5657.184000 +Gaussian,LogPDF,100000,VECTORIZED,168.290000 +Gaussian,LogPDF,100000,PARALLEL,68.628000 +Gaussian,LogPDF,100000,WORK_STEALING,97.518000 +Gaussian,CDF,100000,SCALAR,10496.903000 +Gaussian,CDF,100000,VECTORIZED,1306.920000 +Gaussian,CDF,100000,PARALLEL,1882.092000 +Gaussian,CDF,100000,WORK_STEALING,829.468000 +Gaussian,PDF,250000,SCALAR,18872.819000 +Gaussian,PDF,250000,VECTORIZED,1777.422000 +Gaussian,PDF,250000,PARALLEL,909.350000 +Gaussian,PDF,250000,WORK_STEALING,526.680000 +Gaussian,LogPDF,250000,SCALAR,14193.064000 +Gaussian,LogPDF,250000,VECTORIZED,424.674000 +Gaussian,LogPDF,250000,PARALLEL,149.495000 +Gaussian,LogPDF,250000,WORK_STEALING,189.653000 +Gaussian,CDF,250000,SCALAR,26679.933000 +Gaussian,CDF,250000,VECTORIZED,3406.828000 +Gaussian,CDF,250000,PARALLEL,3849.269000 +Gaussian,CDF,250000,WORK_STEALING,1760.534000 +Gaussian,PDF,500000,SCALAR,38109.428000 +Gaussian,PDF,500000,VECTORIZED,3699.536000 +Gaussian,PDF,500000,PARALLEL,1830.914000 
+Gaussian,PDF,500000,WORK_STEALING,1038.997000 +Gaussian,LogPDF,500000,SCALAR,28308.526000 +Gaussian,LogPDF,500000,VECTORIZED,1014.902000 +Gaussian,LogPDF,500000,PARALLEL,238.987000 +Gaussian,LogPDF,500000,WORK_STEALING,321.062000 +Gaussian,CDF,500000,SCALAR,54242.521000 +Gaussian,CDF,500000,VECTORIZED,7073.789000 +Gaussian,CDF,500000,PARALLEL,7320.565000 +Gaussian,CDF,500000,WORK_STEALING,3611.076000 +Exponential,PDF,8,SCALAR,0.706000 +Exponential,PDF,8,VECTORIZED,0.607000 +Exponential,PDF,8,PARALLEL,0.288000 +Exponential,PDF,8,WORK_STEALING,0.346000 +Exponential,LogPDF,8,SCALAR,0.579000 +Exponential,LogPDF,8,VECTORIZED,0.259000 +Exponential,LogPDF,8,PARALLEL,0.186000 +Exponential,LogPDF,8,WORK_STEALING,0.212000 +Exponential,CDF,8,SCALAR,0.736000 +Exponential,CDF,8,VECTORIZED,0.407000 +Exponential,CDF,8,PARALLEL,0.284000 +Exponential,CDF,8,WORK_STEALING,0.342000 +Exponential,PDF,16,SCALAR,1.345000 +Exponential,PDF,16,VECTORIZED,0.620000 +Exponential,PDF,16,PARALLEL,0.400000 +Exponential,PDF,16,WORK_STEALING,0.433000 +Exponential,LogPDF,16,SCALAR,1.043000 +Exponential,LogPDF,16,VECTORIZED,0.268000 +Exponential,LogPDF,16,PARALLEL,0.204000 +Exponential,LogPDF,16,WORK_STEALING,0.216000 +Exponential,CDF,16,SCALAR,1.370000 +Exponential,CDF,16,VECTORIZED,0.437000 +Exponential,CDF,16,PARALLEL,0.412000 +Exponential,CDF,16,WORK_STEALING,0.434000 +Exponential,PDF,32,SCALAR,2.592000 +Exponential,PDF,32,VECTORIZED,0.511000 +Exponential,PDF,32,PARALLEL,0.600000 +Exponential,PDF,32,WORK_STEALING,0.664000 +Exponential,LogPDF,32,SCALAR,2.019000 +Exponential,LogPDF,32,VECTORIZED,0.296000 +Exponential,LogPDF,32,PARALLEL,0.248000 +Exponential,LogPDF,32,WORK_STEALING,0.242000 +Exponential,CDF,32,SCALAR,2.709000 +Exponential,CDF,32,VECTORIZED,0.551000 +Exponential,CDF,32,PARALLEL,0.648000 +Exponential,CDF,32,WORK_STEALING,0.674000 +Exponential,PDF,64,SCALAR,5.093000 +Exponential,PDF,64,VECTORIZED,0.823000 +Exponential,PDF,64,PARALLEL,1.107000 +Exponential,PDF,64,WORK_STEALING,1.122000 
+Exponential,LogPDF,64,SCALAR,4.010000 +Exponential,LogPDF,64,VECTORIZED,0.329000 +Exponential,LogPDF,64,PARALLEL,0.319000 +Exponential,LogPDF,64,WORK_STEALING,0.272000 +Exponential,CDF,64,SCALAR,5.342000 +Exponential,CDF,64,VECTORIZED,0.932000 +Exponential,CDF,64,PARALLEL,1.152000 +Exponential,CDF,64,WORK_STEALING,1.130000 +Exponential,PDF,128,SCALAR,9.998000 +Exponential,PDF,128,VECTORIZED,1.210000 +Exponential,PDF,128,PARALLEL,2.057000 +Exponential,PDF,128,WORK_STEALING,2.015000 +Exponential,LogPDF,128,SCALAR,7.993000 +Exponential,LogPDF,128,VECTORIZED,0.433000 +Exponential,LogPDF,128,PARALLEL,0.474000 +Exponential,LogPDF,128,WORK_STEALING,0.313000 +Exponential,CDF,128,SCALAR,10.463000 +Exponential,CDF,128,VECTORIZED,1.299000 +Exponential,CDF,128,PARALLEL,2.106000 +Exponential,CDF,128,WORK_STEALING,2.070000 +Exponential,PDF,256,SCALAR,19.979000 +Exponential,PDF,256,VECTORIZED,2.293000 +Exponential,PDF,256,PARALLEL,3.970000 +Exponential,PDF,256,WORK_STEALING,3.786000 +Exponential,LogPDF,256,SCALAR,15.657000 +Exponential,LogPDF,256,VECTORIZED,0.584000 +Exponential,LogPDF,256,PARALLEL,0.747000 +Exponential,LogPDF,256,WORK_STEALING,0.433000 +Exponential,CDF,256,SCALAR,20.867000 +Exponential,CDF,256,VECTORIZED,2.185000 +Exponential,CDF,256,PARALLEL,4.016000 +Exponential,CDF,256,WORK_STEALING,3.941000 +Exponential,PDF,512,SCALAR,40.060000 +Exponential,PDF,512,VECTORIZED,4.028000 +Exponential,PDF,512,PARALLEL,7.648000 +Exponential,PDF,512,WORK_STEALING,7.373000 +Exponential,LogPDF,512,SCALAR,31.277000 +Exponential,LogPDF,512,VECTORIZED,0.985000 +Exponential,LogPDF,512,PARALLEL,1.345000 +Exponential,LogPDF,512,WORK_STEALING,0.571000 +Exponential,CDF,512,SCALAR,41.518000 +Exponential,CDF,512,VECTORIZED,4.195000 +Exponential,CDF,512,PARALLEL,7.898000 +Exponential,CDF,512,WORK_STEALING,7.614000 +Exponential,PDF,1000,SCALAR,77.954000 +Exponential,PDF,1000,VECTORIZED,7.781000 +Exponential,PDF,1000,PARALLEL,14.738000 +Exponential,PDF,1000,WORK_STEALING,14.097000 
+Exponential,LogPDF,1000,SCALAR,61.475000 +Exponential,LogPDF,1000,VECTORIZED,1.580000 +Exponential,LogPDF,1000,PARALLEL,2.439000 +Exponential,LogPDF,1000,WORK_STEALING,0.860000 +Exponential,CDF,1000,SCALAR,81.149000 +Exponential,CDF,1000,VECTORIZED,7.962000 +Exponential,CDF,1000,PARALLEL,15.301000 +Exponential,CDF,1000,WORK_STEALING,14.534000 +Exponential,PDF,2000,SCALAR,155.197000 +Exponential,PDF,2000,VECTORIZED,15.169000 +Exponential,PDF,2000,PARALLEL,29.357000 +Exponential,PDF,2000,WORK_STEALING,27.930000 +Exponential,LogPDF,2000,SCALAR,122.601000 +Exponential,LogPDF,2000,VECTORIZED,3.084000 +Exponential,LogPDF,2000,PARALLEL,4.265000 +Exponential,LogPDF,2000,WORK_STEALING,1.493000 +Exponential,CDF,2000,SCALAR,162.325000 +Exponential,CDF,2000,VECTORIZED,15.829000 +Exponential,CDF,2000,PARALLEL,30.346000 +Exponential,CDF,2000,WORK_STEALING,29.580000 +Exponential,PDF,5000,SCALAR,380.839000 +Exponential,PDF,5000,VECTORIZED,37.320000 +Exponential,PDF,5000,PARALLEL,143.326000 +Exponential,PDF,5000,WORK_STEALING,85.703000 +Exponential,LogPDF,5000,SCALAR,300.155000 +Exponential,LogPDF,5000,VECTORIZED,8.185000 +Exponential,LogPDF,5000,PARALLEL,68.096000 +Exponential,LogPDF,5000,WORK_STEALING,58.660000 +Exponential,CDF,5000,SCALAR,393.918000 +Exponential,CDF,5000,VECTORIZED,39.000000 +Exponential,CDF,5000,PARALLEL,132.793000 +Exponential,CDF,5000,WORK_STEALING,85.740000 +Exponential,PDF,10000,SCALAR,745.394000 +Exponential,PDF,10000,VECTORIZED,71.694000 +Exponential,PDF,10000,PARALLEL,202.094000 +Exponential,PDF,10000,WORK_STEALING,98.572000 +Exponential,LogPDF,10000,SCALAR,578.087000 +Exponential,LogPDF,10000,VECTORIZED,16.240000 +Exponential,LogPDF,10000,PARALLEL,78.787000 +Exponential,LogPDF,10000,WORK_STEALING,61.926000 +Exponential,CDF,10000,SCALAR,804.394000 +Exponential,CDF,10000,VECTORIZED,74.195000 +Exponential,CDF,10000,PARALLEL,213.281000 +Exponential,CDF,10000,WORK_STEALING,99.287000 +Exponential,PDF,20000,SCALAR,1469.837000 
+Exponential,PDF,20000,VECTORIZED,145.108000 +Exponential,PDF,20000,PARALLEL,334.426000 +Exponential,PDF,20000,WORK_STEALING,125.271000 +Exponential,LogPDF,20000,SCALAR,1220.231000 +Exponential,LogPDF,20000,VECTORIZED,35.940000 +Exponential,LogPDF,20000,PARALLEL,106.459000 +Exponential,LogPDF,20000,WORK_STEALING,79.867000 +Exponential,CDF,20000,SCALAR,1570.004000 +Exponential,CDF,20000,VECTORIZED,154.019000 +Exponential,CDF,20000,PARALLEL,348.791000 +Exponential,CDF,20000,WORK_STEALING,129.432000 +Exponential,PDF,50000,SCALAR,3702.037000 +Exponential,PDF,50000,VECTORIZED,368.011000 +Exponential,PDF,50000,PARALLEL,511.114000 +Exponential,PDF,50000,WORK_STEALING,199.268000 +Exponential,LogPDF,50000,SCALAR,2981.963000 +Exponential,LogPDF,50000,VECTORIZED,92.204000 +Exponential,LogPDF,50000,PARALLEL,132.322000 +Exponential,LogPDF,50000,WORK_STEALING,96.263000 +Exponential,CDF,50000,SCALAR,3819.556000 +Exponential,CDF,50000,VECTORIZED,393.785000 +Exponential,CDF,50000,PARALLEL,537.281000 +Exponential,CDF,50000,WORK_STEALING,215.983000 +Exponential,PDF,100000,SCALAR,7580.540000 +Exponential,PDF,100000,VECTORIZED,739.567000 +Exponential,PDF,100000,PARALLEL,546.505000 +Exponential,PDF,100000,WORK_STEALING,321.440000 +Exponential,LogPDF,100000,SCALAR,5869.572000 +Exponential,LogPDF,100000,VECTORIZED,197.236000 +Exponential,LogPDF,100000,PARALLEL,141.244000 +Exponential,LogPDF,100000,WORK_STEALING,138.083000 +Exponential,CDF,100000,SCALAR,7690.138000 +Exponential,CDF,100000,VECTORIZED,813.188000 +Exponential,CDF,100000,PARALLEL,569.786000 +Exponential,CDF,100000,WORK_STEALING,307.863000 +Exponential,PDF,250000,SCALAR,21106.764000 +Exponential,PDF,250000,VECTORIZED,1972.284000 +Exponential,PDF,250000,PARALLEL,1053.016000 +Exponential,PDF,250000,WORK_STEALING,664.963000 +Exponential,LogPDF,250000,SCALAR,14761.885000 +Exponential,LogPDF,250000,VECTORIZED,483.931000 +Exponential,LogPDF,250000,PARALLEL,231.280000 +Exponential,LogPDF,250000,WORK_STEALING,239.007000 
+Exponential,CDF,250000,SCALAR,19635.843000 +Exponential,CDF,250000,VECTORIZED,2057.278000 +Exponential,CDF,250000,PARALLEL,1098.142000 +Exponential,CDF,250000,WORK_STEALING,727.349000 +Exponential,PDF,500000,SCALAR,43044.825000 +Exponential,PDF,500000,VECTORIZED,4114.834000 +Exponential,PDF,500000,PARALLEL,2218.950000 +Exponential,PDF,500000,WORK_STEALING,1356.760000 +Exponential,LogPDF,500000,SCALAR,30639.062000 +Exponential,LogPDF,500000,VECTORIZED,1206.767000 +Exponential,LogPDF,500000,PARALLEL,452.717000 +Exponential,LogPDF,500000,WORK_STEALING,400.822000 +Exponential,CDF,500000,SCALAR,40751.030000 +Exponential,CDF,500000,VECTORIZED,4378.876000 +Exponential,CDF,500000,PARALLEL,2257.244000 +Exponential,CDF,500000,WORK_STEALING,1263.703000 +Discrete,PDF,8,SCALAR,0.670000 +Discrete,PDF,8,VECTORIZED,0.207000 +Discrete,PDF,8,PARALLEL,0.210000 +Discrete,PDF,8,WORK_STEALING,0.279000 +Discrete,LogPDF,8,SCALAR,0.662000 +Discrete,LogPDF,8,VECTORIZED,0.220000 +Discrete,LogPDF,8,PARALLEL,0.229000 +Discrete,LogPDF,8,WORK_STEALING,0.278000 +Discrete,CDF,8,SCALAR,0.631000 +Discrete,CDF,8,VECTORIZED,0.207000 +Discrete,CDF,8,PARALLEL,0.235000 +Discrete,CDF,8,WORK_STEALING,0.279000 +Discrete,PDF,16,SCALAR,1.076000 +Discrete,PDF,16,VECTORIZED,0.219000 +Discrete,PDF,16,PARALLEL,0.231000 +Discrete,PDF,16,WORK_STEALING,0.294000 +Discrete,LogPDF,16,SCALAR,1.160000 +Discrete,LogPDF,16,VECTORIZED,0.238000 +Discrete,LogPDF,16,PARALLEL,0.274000 +Discrete,LogPDF,16,WORK_STEALING,0.333000 +Discrete,CDF,16,SCALAR,1.123000 +Discrete,CDF,16,VECTORIZED,0.219000 +Discrete,CDF,16,PARALLEL,0.287000 +Discrete,CDF,16,WORK_STEALING,0.306000 +Discrete,PDF,32,SCALAR,2.343000 +Discrete,PDF,32,VECTORIZED,0.369000 +Discrete,PDF,32,PARALLEL,0.376000 +Discrete,PDF,32,WORK_STEALING,0.431000 +Discrete,LogPDF,32,SCALAR,2.283000 +Discrete,LogPDF,32,VECTORIZED,0.375000 +Discrete,LogPDF,32,PARALLEL,0.393000 +Discrete,LogPDF,32,WORK_STEALING,0.411000 +Discrete,CDF,32,SCALAR,2.318000 
+Discrete,CDF,32,VECTORIZED,0.289000 +Discrete,CDF,32,PARALLEL,0.388000 +Discrete,CDF,32,WORK_STEALING,0.425000 +Discrete,PDF,64,SCALAR,4.632000 +Discrete,PDF,64,VECTORIZED,0.486000 +Discrete,PDF,64,PARALLEL,0.496000 +Discrete,PDF,64,WORK_STEALING,0.621000 +Discrete,LogPDF,64,SCALAR,4.530000 +Discrete,LogPDF,64,VECTORIZED,0.491000 +Discrete,LogPDF,64,PARALLEL,0.611000 +Discrete,LogPDF,64,WORK_STEALING,0.600000 +Discrete,CDF,64,SCALAR,4.609000 +Discrete,CDF,64,VECTORIZED,0.461000 +Discrete,CDF,64,PARALLEL,0.600000 +Discrete,CDF,64,WORK_STEALING,0.589000 +Discrete,PDF,128,SCALAR,9.092000 +Discrete,PDF,128,VECTORIZED,0.782000 +Discrete,PDF,128,PARALLEL,0.747000 +Discrete,PDF,128,WORK_STEALING,0.932000 +Discrete,LogPDF,128,SCALAR,8.898000 +Discrete,LogPDF,128,VECTORIZED,0.857000 +Discrete,LogPDF,128,PARALLEL,0.948000 +Discrete,LogPDF,128,WORK_STEALING,0.933000 +Discrete,CDF,128,SCALAR,8.704000 +Discrete,CDF,128,VECTORIZED,0.713000 +Discrete,CDF,128,PARALLEL,1.006000 +Discrete,CDF,128,WORK_STEALING,0.903000 +Discrete,PDF,256,SCALAR,18.086000 +Discrete,PDF,256,VECTORIZED,1.278000 +Discrete,PDF,256,PARALLEL,1.244000 +Discrete,PDF,256,WORK_STEALING,1.643000 +Discrete,LogPDF,256,SCALAR,17.763000 +Discrete,LogPDF,256,VECTORIZED,1.600000 +Discrete,LogPDF,256,PARALLEL,1.777000 +Discrete,LogPDF,256,WORK_STEALING,1.734000 +Discrete,CDF,256,SCALAR,17.930000 +Discrete,CDF,256,VECTORIZED,1.264000 +Discrete,CDF,256,PARALLEL,1.840000 +Discrete,CDF,256,WORK_STEALING,1.680000 +Discrete,PDF,512,SCALAR,36.294000 +Discrete,PDF,512,VECTORIZED,2.445000 +Discrete,PDF,512,PARALLEL,2.260000 +Discrete,PDF,512,WORK_STEALING,3.038000 +Discrete,LogPDF,512,SCALAR,35.168000 +Discrete,LogPDF,512,VECTORIZED,2.943000 +Discrete,LogPDF,512,PARALLEL,3.339000 +Discrete,LogPDF,512,WORK_STEALING,3.012000 +Discrete,CDF,512,SCALAR,35.202000 +Discrete,CDF,512,VECTORIZED,2.532000 +Discrete,CDF,512,PARALLEL,3.461000 +Discrete,CDF,512,WORK_STEALING,3.161000 +Discrete,PDF,1000,SCALAR,70.856000 
+Discrete,PDF,1000,VECTORIZED,4.419000 +Discrete,PDF,1000,PARALLEL,4.185000 +Discrete,PDF,1000,WORK_STEALING,5.664000 +Discrete,LogPDF,1000,SCALAR,68.992000 +Discrete,LogPDF,1000,VECTORIZED,5.664000 +Discrete,LogPDF,1000,PARALLEL,6.282000 +Discrete,LogPDF,1000,WORK_STEALING,5.769000 +Discrete,CDF,1000,SCALAR,68.886000 +Discrete,CDF,1000,VECTORIZED,4.436000 +Discrete,CDF,1000,PARALLEL,6.714000 +Discrete,CDF,1000,WORK_STEALING,5.949000 +Discrete,PDF,2000,SCALAR,140.988000 +Discrete,PDF,2000,VECTORIZED,8.343000 +Discrete,PDF,2000,PARALLEL,8.345000 +Discrete,PDF,2000,WORK_STEALING,11.081000 +Discrete,LogPDF,2000,SCALAR,137.467000 +Discrete,LogPDF,2000,VECTORIZED,11.306000 +Discrete,LogPDF,2000,PARALLEL,12.631000 +Discrete,LogPDF,2000,WORK_STEALING,11.751000 +Discrete,CDF,2000,SCALAR,139.339000 +Discrete,CDF,2000,VECTORIZED,8.879000 +Discrete,CDF,2000,PARALLEL,13.642000 +Discrete,CDF,2000,WORK_STEALING,11.217000 +Discrete,PDF,5000,SCALAR,352.164000 +Discrete,PDF,5000,VECTORIZED,20.445000 +Discrete,PDF,5000,PARALLEL,112.869000 +Discrete,PDF,5000,WORK_STEALING,92.912000 +Discrete,LogPDF,5000,SCALAR,333.496000 +Discrete,LogPDF,5000,VECTORIZED,27.267000 +Discrete,LogPDF,5000,PARALLEL,125.344000 +Discrete,LogPDF,5000,WORK_STEALING,95.559000 +Discrete,CDF,5000,SCALAR,331.741000 +Discrete,CDF,5000,VECTORIZED,22.081000 +Discrete,CDF,5000,PARALLEL,127.857000 +Discrete,CDF,5000,WORK_STEALING,97.722000 +Discrete,PDF,10000,SCALAR,666.563000 +Discrete,PDF,10000,VECTORIZED,38.898000 +Discrete,PDF,10000,PARALLEL,143.364000 +Discrete,PDF,10000,WORK_STEALING,98.963000 +Discrete,LogPDF,10000,SCALAR,664.807000 +Discrete,LogPDF,10000,VECTORIZED,52.569000 +Discrete,LogPDF,10000,PARALLEL,144.431000 +Discrete,LogPDF,10000,WORK_STEALING,98.184000 +Discrete,CDF,10000,SCALAR,628.890000 +Discrete,CDF,10000,VECTORIZED,47.613000 +Discrete,CDF,10000,PARALLEL,157.830000 +Discrete,CDF,10000,WORK_STEALING,107.665000 +Discrete,PDF,20000,SCALAR,1248.761000 +Discrete,PDF,20000,VECTORIZED,71.544000 
+Discrete,PDF,20000,PARALLEL,173.573000 +Discrete,PDF,20000,WORK_STEALING,107.466000 +Discrete,LogPDF,20000,SCALAR,1181.772000 +Discrete,LogPDF,20000,VECTORIZED,94.611000 +Discrete,LogPDF,20000,PARALLEL,180.232000 +Discrete,LogPDF,20000,WORK_STEALING,106.612000 +Discrete,CDF,20000,SCALAR,1206.387000 +Discrete,CDF,20000,VECTORIZED,86.728000 +Discrete,CDF,20000,PARALLEL,212.270000 +Discrete,CDF,20000,WORK_STEALING,120.549000 +Discrete,PDF,50000,SCALAR,3130.039000 +Discrete,PDF,50000,VECTORIZED,178.592000 +Discrete,PDF,50000,PARALLEL,236.943000 +Discrete,PDF,50000,WORK_STEALING,153.473000 +Discrete,LogPDF,50000,SCALAR,3225.592000 +Discrete,LogPDF,50000,VECTORIZED,258.045000 +Discrete,LogPDF,50000,PARALLEL,276.565000 +Discrete,LogPDF,50000,WORK_STEALING,164.871000 +Discrete,CDF,50000,SCALAR,3199.611000 +Discrete,CDF,50000,VECTORIZED,234.130000 +Discrete,CDF,50000,PARALLEL,333.572000 +Discrete,CDF,50000,WORK_STEALING,183.581000 +Discrete,PDF,100000,SCALAR,6231.586000 +Discrete,PDF,100000,VECTORIZED,346.642000 +Discrete,PDF,100000,PARALLEL,232.031000 +Discrete,PDF,100000,WORK_STEALING,197.324000 +Discrete,LogPDF,100000,SCALAR,6207.493000 +Discrete,LogPDF,100000,VECTORIZED,500.633000 +Discrete,LogPDF,100000,PARALLEL,277.506000 +Discrete,LogPDF,100000,WORK_STEALING,211.907000 +Discrete,CDF,100000,SCALAR,6356.678000 +Discrete,CDF,100000,VECTORIZED,471.635000 +Discrete,CDF,100000,PARALLEL,329.414000 +Discrete,CDF,100000,WORK_STEALING,254.203000 +Discrete,PDF,250000,SCALAR,16372.408000 +Discrete,PDF,250000,VECTORIZED,919.808000 +Discrete,PDF,250000,PARALLEL,413.468000 +Discrete,PDF,250000,WORK_STEALING,349.807000 +Discrete,LogPDF,250000,SCALAR,15877.295000 +Discrete,LogPDF,250000,VECTORIZED,1286.873000 +Discrete,LogPDF,250000,PARALLEL,472.393000 +Discrete,LogPDF,250000,WORK_STEALING,343.007000 +Discrete,CDF,250000,SCALAR,15791.279000 +Discrete,CDF,250000,VECTORIZED,1185.310000 +Discrete,CDF,250000,PARALLEL,582.200000 +Discrete,CDF,250000,WORK_STEALING,451.442000 
+Discrete,PDF,500000,SCALAR,34087.722000 +Discrete,PDF,500000,VECTORIZED,2072.175000 +Discrete,PDF,500000,PARALLEL,819.262000 +Discrete,PDF,500000,WORK_STEALING,555.283000 +Discrete,LogPDF,500000,SCALAR,36277.922000 +Discrete,LogPDF,500000,VECTORIZED,2879.678000 +Discrete,LogPDF,500000,PARALLEL,988.258000 +Discrete,LogPDF,500000,WORK_STEALING,756.407000 +Discrete,CDF,500000,SCALAR,37855.593000 +Discrete,CDF,500000,VECTORIZED,2887.526000 +Discrete,CDF,500000,PARALLEL,1425.542000 +Discrete,CDF,500000,WORK_STEALING,829.916000 +Poisson,PDF,8,SCALAR,1.290000 +Poisson,PDF,8,VECTORIZED,0.754000 +Poisson,PDF,8,PARALLEL,0.784000 +Poisson,PDF,8,WORK_STEALING,0.906000 +Poisson,LogPDF,8,SCALAR,0.819000 +Poisson,LogPDF,8,VECTORIZED,0.332000 +Poisson,LogPDF,8,PARALLEL,0.384000 +Poisson,LogPDF,8,WORK_STEALING,0.419000 +Poisson,CDF,8,SCALAR,1.326000 +Poisson,CDF,8,VECTORIZED,1.392000 +Poisson,CDF,8,PARALLEL,1.396000 +Poisson,CDF,8,WORK_STEALING,1.437000 +Poisson,PDF,16,SCALAR,2.331000 +Poisson,PDF,16,VECTORIZED,1.308000 +Poisson,PDF,16,PARALLEL,1.358000 +Poisson,PDF,16,WORK_STEALING,1.438000 +Poisson,LogPDF,16,SCALAR,1.584000 +Poisson,LogPDF,16,VECTORIZED,0.523000 +Poisson,LogPDF,16,PARALLEL,0.572000 +Poisson,LogPDF,16,WORK_STEALING,0.603000 +Poisson,CDF,16,SCALAR,2.722000 +Poisson,CDF,16,VECTORIZED,2.813000 +Poisson,CDF,16,PARALLEL,2.866000 +Poisson,CDF,16,WORK_STEALING,2.900000 +Poisson,PDF,32,SCALAR,4.723000 +Poisson,PDF,32,VECTORIZED,2.311000 +Poisson,PDF,32,PARALLEL,2.335000 +Poisson,PDF,32,WORK_STEALING,2.385000 +Poisson,LogPDF,32,SCALAR,3.223000 +Poisson,LogPDF,32,VECTORIZED,0.881000 +Poisson,LogPDF,32,PARALLEL,0.918000 +Poisson,LogPDF,32,WORK_STEALING,0.942000 +Poisson,CDF,32,SCALAR,5.383000 +Poisson,CDF,32,VECTORIZED,5.429000 +Poisson,CDF,32,PARALLEL,5.644000 +Poisson,CDF,32,WORK_STEALING,5.612000 +Poisson,PDF,64,SCALAR,9.360000 +Poisson,PDF,64,VECTORIZED,4.610000 +Poisson,PDF,64,PARALLEL,4.746000 +Poisson,PDF,64,WORK_STEALING,4.739000 +Poisson,LogPDF,64,SCALAR,6.315000 
+Poisson,LogPDF,64,VECTORIZED,1.807000 +Poisson,LogPDF,64,PARALLEL,1.932000 +Poisson,LogPDF,64,WORK_STEALING,1.846000 +Poisson,CDF,64,SCALAR,12.018000 +Poisson,CDF,64,VECTORIZED,12.101000 +Poisson,CDF,64,PARALLEL,12.100000 +Poisson,CDF,64,WORK_STEALING,12.079000 +Poisson,PDF,128,SCALAR,18.397000 +Poisson,PDF,128,VECTORIZED,8.799000 +Poisson,PDF,128,PARALLEL,9.112000 +Poisson,PDF,128,WORK_STEALING,9.045000 +Poisson,LogPDF,128,SCALAR,12.055000 +Poisson,LogPDF,128,VECTORIZED,3.100000 +Poisson,LogPDF,128,PARALLEL,3.362000 +Poisson,LogPDF,128,WORK_STEALING,3.133000 +Poisson,CDF,128,SCALAR,22.358000 +Poisson,CDF,128,VECTORIZED,22.100000 +Poisson,CDF,128,PARALLEL,22.401000 +Poisson,CDF,128,WORK_STEALING,22.330000 +Poisson,PDF,256,SCALAR,37.144000 +Poisson,PDF,256,VECTORIZED,17.822000 +Poisson,PDF,256,PARALLEL,18.427000 +Poisson,PDF,256,WORK_STEALING,18.089000 +Poisson,LogPDF,256,SCALAR,24.478000 +Poisson,LogPDF,256,VECTORIZED,6.228000 +Poisson,LogPDF,256,PARALLEL,6.759000 +Poisson,LogPDF,256,WORK_STEALING,6.290000 +Poisson,CDF,256,SCALAR,45.511000 +Poisson,CDF,256,VECTORIZED,45.034000 +Poisson,CDF,256,PARALLEL,45.468000 +Poisson,CDF,256,WORK_STEALING,44.765000 +Poisson,PDF,512,SCALAR,74.352000 +Poisson,PDF,512,VECTORIZED,35.611000 +Poisson,PDF,512,PARALLEL,36.818000 +Poisson,PDF,512,WORK_STEALING,36.212000 +Poisson,LogPDF,512,SCALAR,48.765000 +Poisson,LogPDF,512,VECTORIZED,12.479000 +Poisson,LogPDF,512,PARALLEL,13.347000 +Poisson,LogPDF,512,WORK_STEALING,12.216000 +Poisson,CDF,512,SCALAR,91.604000 +Poisson,CDF,512,VECTORIZED,89.994000 +Poisson,CDF,512,PARALLEL,91.225000 +Poisson,CDF,512,WORK_STEALING,90.285000 +Poisson,PDF,1000,SCALAR,144.968000 +Poisson,PDF,1000,VECTORIZED,69.233000 +Poisson,PDF,1000,PARALLEL,77.012000 +Poisson,PDF,1000,WORK_STEALING,70.009000 +Poisson,LogPDF,1000,SCALAR,94.873000 +Poisson,LogPDF,1000,VECTORIZED,23.753000 +Poisson,LogPDF,1000,PARALLEL,25.141000 +Poisson,LogPDF,1000,WORK_STEALING,23.400000 +Poisson,CDF,1000,SCALAR,175.620000 
+Poisson,CDF,1000,VECTORIZED,172.592000 +Poisson,CDF,1000,PARALLEL,174.714000 +Poisson,CDF,1000,WORK_STEALING,174.036000 +Poisson,PDF,2000,SCALAR,304.184000 +Poisson,PDF,2000,VECTORIZED,143.161000 +Poisson,PDF,2000,PARALLEL,142.521000 +Poisson,PDF,2000,WORK_STEALING,139.204000 +Poisson,LogPDF,2000,SCALAR,188.323000 +Poisson,LogPDF,2000,VECTORIZED,47.972000 +Poisson,LogPDF,2000,PARALLEL,51.225000 +Poisson,LogPDF,2000,WORK_STEALING,47.149000 +Poisson,CDF,2000,SCALAR,350.721000 +Poisson,CDF,2000,VECTORIZED,346.864000 +Poisson,CDF,2000,PARALLEL,351.399000 +Poisson,CDF,2000,WORK_STEALING,355.468000 +Poisson,PDF,5000,SCALAR,764.759000 +Poisson,PDF,5000,VECTORIZED,366.991000 +Poisson,PDF,5000,PARALLEL,485.509000 +Poisson,PDF,5000,WORK_STEALING,267.167000 +Poisson,LogPDF,5000,SCALAR,472.860000 +Poisson,LogPDF,5000,VECTORIZED,121.161000 +Poisson,LogPDF,5000,PARALLEL,269.505000 +Poisson,LogPDF,5000,WORK_STEALING,180.438000 +Poisson,CDF,5000,SCALAR,904.462000 +Poisson,CDF,5000,VECTORIZED,909.851000 +Poisson,CDF,5000,PARALLEL,994.100000 +Poisson,CDF,5000,WORK_STEALING,374.889000 +Poisson,PDF,10000,SCALAR,1456.656000 +Poisson,PDF,10000,VECTORIZED,684.435000 +Poisson,PDF,10000,PARALLEL,843.162000 +Poisson,PDF,10000,WORK_STEALING,291.499000 +Poisson,LogPDF,10000,SCALAR,943.729000 +Poisson,LogPDF,10000,VECTORIZED,242.636000 +Poisson,LogPDF,10000,PARALLEL,403.143000 +Poisson,LogPDF,10000,WORK_STEALING,238.073000 +Poisson,CDF,10000,SCALAR,1759.176000 +Poisson,CDF,10000,VECTORIZED,1736.772000 +Poisson,CDF,10000,PARALLEL,1892.522000 +Poisson,CDF,10000,WORK_STEALING,537.192000 +Poisson,PDF,20000,SCALAR,2890.646000 +Poisson,PDF,20000,VECTORIZED,1368.740000 +Poisson,PDF,20000,PARALLEL,1551.435000 +Poisson,PDF,20000,WORK_STEALING,398.339000 +Poisson,LogPDF,20000,SCALAR,1887.053000 +Poisson,LogPDF,20000,VECTORIZED,486.652000 +Poisson,LogPDF,20000,PARALLEL,668.746000 +Poisson,LogPDF,20000,WORK_STEALING,285.532000 +Poisson,CDF,20000,SCALAR,3530.499000 
+Poisson,CDF,20000,VECTORIZED,3476.569000 +Poisson,CDF,20000,PARALLEL,3683.467000 +Poisson,CDF,20000,WORK_STEALING,779.629000 +Poisson,PDF,50000,SCALAR,7334.977000 +Poisson,PDF,50000,VECTORIZED,3408.521000 +Poisson,PDF,50000,PARALLEL,2475.941000 +Poisson,PDF,50000,WORK_STEALING,696.050000 +Poisson,LogPDF,50000,SCALAR,4731.683000 +Poisson,LogPDF,50000,VECTORIZED,1218.768000 +Poisson,LogPDF,50000,PARALLEL,1027.157000 +Poisson,LogPDF,50000,WORK_STEALING,376.003000 +Poisson,CDF,50000,SCALAR,8893.410000 +Poisson,CDF,50000,VECTORIZED,8807.073000 +Poisson,CDF,50000,PARALLEL,5906.741000 +Poisson,CDF,50000,WORK_STEALING,1554.580000 +Poisson,PDF,100000,SCALAR,14674.027000 +Poisson,PDF,100000,VECTORIZED,6930.527000 +Poisson,PDF,100000,PARALLEL,2493.896000 +Poisson,PDF,100000,WORK_STEALING,1258.094000 +Poisson,LogPDF,100000,SCALAR,9605.554000 +Poisson,LogPDF,100000,VECTORIZED,2432.604000 +Poisson,LogPDF,100000,PARALLEL,1034.697000 +Poisson,LogPDF,100000,WORK_STEALING,656.007000 +Poisson,CDF,100000,SCALAR,17828.967000 +Poisson,CDF,100000,VECTORIZED,17003.202000 +Poisson,CDF,100000,PARALLEL,5739.661000 +Poisson,CDF,100000,WORK_STEALING,2771.316000 +Poisson,PDF,250000,SCALAR,35545.913000 +Poisson,PDF,250000,VECTORIZED,16140.049000 +Poisson,PDF,250000,PARALLEL,4785.092000 +Poisson,PDF,250000,WORK_STEALING,2804.030000 +Poisson,LogPDF,250000,SCALAR,22302.845000 +Poisson,LogPDF,250000,VECTORIZED,5735.302000 +Poisson,LogPDF,250000,PARALLEL,1965.956000 +Poisson,LogPDF,250000,WORK_STEALING,1045.822000 +Poisson,CDF,250000,SCALAR,41468.784000 +Poisson,CDF,250000,VECTORIZED,39616.187000 +Poisson,CDF,250000,PARALLEL,11109.288000 +Poisson,CDF,250000,WORK_STEALING,7150.195000 +Poisson,PDF,500000,SCALAR,66119.149000 +Poisson,PDF,500000,VECTORIZED,31202.601000 +Poisson,PDF,500000,PARALLEL,9109.537000 +Poisson,PDF,500000,WORK_STEALING,4655.665000 +Poisson,LogPDF,500000,SCALAR,43186.210000 +Poisson,LogPDF,500000,VECTORIZED,11174.342000 +Poisson,LogPDF,500000,PARALLEL,3671.548000 
+Poisson,LogPDF,500000,WORK_STEALING,2188.146000 +Poisson,CDF,500000,SCALAR,80156.317000 +Poisson,CDF,500000,VECTORIZED,79216.049000 +Poisson,CDF,500000,PARALLEL,22779.102000 +Poisson,CDF,500000,WORK_STEALING,12707.912000 +Gamma,PDF,8,SCALAR,1.465000 +Gamma,PDF,8,VECTORIZED,1.394000 +Gamma,PDF,8,PARALLEL,0.496000 +Gamma,PDF,8,WORK_STEALING,0.549000 +Gamma,LogPDF,8,SCALAR,0.765000 +Gamma,LogPDF,8,VECTORIZED,1.040000 +Gamma,LogPDF,8,PARALLEL,0.334000 +Gamma,LogPDF,8,WORK_STEALING,0.399000 +Gamma,CDF,8,SCALAR,1.517000 +Gamma,CDF,8,VECTORIZED,1.794000 +Gamma,CDF,8,PARALLEL,1.010000 +Gamma,CDF,8,WORK_STEALING,1.078000 +Gamma,PDF,16,SCALAR,2.652000 +Gamma,PDF,16,VECTORIZED,1.308000 +Gamma,PDF,16,PARALLEL,0.762000 +Gamma,PDF,16,WORK_STEALING,0.852000 +Gamma,LogPDF,16,SCALAR,1.449000 +Gamma,LogPDF,16,VECTORIZED,1.118000 +Gamma,LogPDF,16,PARALLEL,0.476000 +Gamma,LogPDF,16,WORK_STEALING,0.521000 +Gamma,CDF,16,SCALAR,2.854000 +Gamma,CDF,16,VECTORIZED,2.506000 +Gamma,CDF,16,PARALLEL,1.896000 +Gamma,CDF,16,WORK_STEALING,1.819000 +Gamma,PDF,32,SCALAR,5.212000 +Gamma,PDF,32,VECTORIZED,1.366000 +Gamma,PDF,32,PARALLEL,1.396000 +Gamma,PDF,32,WORK_STEALING,1.453000 +Gamma,LogPDF,32,SCALAR,2.827000 +Gamma,LogPDF,32,VECTORIZED,1.223000 +Gamma,LogPDF,32,PARALLEL,0.796000 +Gamma,LogPDF,32,WORK_STEALING,0.817000 +Gamma,CDF,32,SCALAR,5.887000 +Gamma,CDF,32,VECTORIZED,4.156000 +Gamma,CDF,32,PARALLEL,3.542000 +Gamma,CDF,32,WORK_STEALING,3.608000 +Gamma,PDF,64,SCALAR,10.862000 +Gamma,PDF,64,VECTORIZED,2.228000 +Gamma,PDF,64,PARALLEL,2.657000 +Gamma,PDF,64,WORK_STEALING,2.644000 +Gamma,LogPDF,64,SCALAR,5.557000 +Gamma,LogPDF,64,VECTORIZED,1.654000 +Gamma,LogPDF,64,PARALLEL,1.404000 +Gamma,LogPDF,64,WORK_STEALING,1.390000 +Gamma,CDF,64,SCALAR,11.520000 +Gamma,CDF,64,VECTORIZED,7.517000 +Gamma,CDF,64,PARALLEL,6.773000 +Gamma,CDF,64,WORK_STEALING,6.787000 +Gamma,PDF,128,SCALAR,21.679000 +Gamma,PDF,128,VECTORIZED,3.278000 +Gamma,PDF,128,PARALLEL,5.145000 +Gamma,PDF,128,WORK_STEALING,5.292000 
+Gamma,LogPDF,128,SCALAR,10.846000 +Gamma,LogPDF,128,VECTORIZED,2.772000 +Gamma,LogPDF,128,PARALLEL,2.733000 +Gamma,LogPDF,128,WORK_STEALING,2.584000 +Gamma,CDF,128,SCALAR,21.818000 +Gamma,CDF,128,VECTORIZED,13.695000 +Gamma,CDF,128,PARALLEL,13.513000 +Gamma,CDF,128,WORK_STEALING,13.451000 +Gamma,PDF,256,SCALAR,43.353000 +Gamma,PDF,256,VECTORIZED,5.808000 +Gamma,PDF,256,PARALLEL,10.158000 +Gamma,PDF,256,WORK_STEALING,10.212000 +Gamma,LogPDF,256,SCALAR,21.919000 +Gamma,LogPDF,256,VECTORIZED,4.066000 +Gamma,LogPDF,256,PARALLEL,4.982000 +Gamma,LogPDF,256,WORK_STEALING,4.640000 +Gamma,CDF,256,SCALAR,46.095000 +Gamma,CDF,256,VECTORIZED,27.611000 +Gamma,CDF,256,PARALLEL,27.451000 +Gamma,CDF,256,WORK_STEALING,27.007000 +Gamma,PDF,512,SCALAR,85.901000 +Gamma,PDF,512,VECTORIZED,10.916000 +Gamma,PDF,512,PARALLEL,20.132000 +Gamma,PDF,512,WORK_STEALING,20.027000 +Gamma,LogPDF,512,SCALAR,43.742000 +Gamma,LogPDF,512,VECTORIZED,6.998000 +Gamma,LogPDF,512,PARALLEL,9.785000 +Gamma,LogPDF,512,WORK_STEALING,9.501000 +Gamma,CDF,512,SCALAR,92.689000 +Gamma,CDF,512,VECTORIZED,56.736000 +Gamma,CDF,512,PARALLEL,56.804000 +Gamma,CDF,512,WORK_STEALING,55.901000 +Gamma,PDF,1000,SCALAR,169.317000 +Gamma,PDF,1000,VECTORIZED,20.882000 +Gamma,PDF,1000,PARALLEL,39.178000 +Gamma,PDF,1000,WORK_STEALING,38.976000 +Gamma,LogPDF,1000,SCALAR,83.513000 +Gamma,LogPDF,1000,VECTORIZED,14.202000 +Gamma,LogPDF,1000,PARALLEL,19.897000 +Gamma,LogPDF,1000,WORK_STEALING,18.505000 +Gamma,CDF,1000,SCALAR,182.828000 +Gamma,CDF,1000,VECTORIZED,108.050000 +Gamma,CDF,1000,PARALLEL,112.077000 +Gamma,CDF,1000,WORK_STEALING,115.507000 +Gamma,PDF,2000,SCALAR,338.696000 +Gamma,PDF,2000,VECTORIZED,40.690000 +Gamma,PDF,2000,PARALLEL,77.713000 +Gamma,PDF,2000,WORK_STEALING,77.457000 +Gamma,LogPDF,2000,SCALAR,170.925000 +Gamma,LogPDF,2000,VECTORIZED,27.938000 +Gamma,LogPDF,2000,PARALLEL,39.711000 +Gamma,LogPDF,2000,WORK_STEALING,36.656000 +Gamma,CDF,2000,SCALAR,361.622000 +Gamma,CDF,2000,VECTORIZED,221.231000 
+Gamma,CDF,2000,PARALLEL,221.138000 +Gamma,CDF,2000,WORK_STEALING,219.786000 +Gamma,PDF,5000,SCALAR,850.715000 +Gamma,PDF,5000,VECTORIZED,104.540000 +Gamma,PDF,5000,PARALLEL,524.478000 +Gamma,PDF,5000,WORK_STEALING,262.414000 +Gamma,LogPDF,5000,SCALAR,429.170000 +Gamma,LogPDF,5000,VECTORIZED,72.973000 +Gamma,LogPDF,5000,PARALLEL,349.312000 +Gamma,LogPDF,5000,WORK_STEALING,187.072000 +Gamma,CDF,5000,SCALAR,908.043000 +Gamma,CDF,5000,VECTORIZED,559.736000 +Gamma,CDF,5000,PARALLEL,813.947000 +Gamma,CDF,5000,WORK_STEALING,309.932000 +Gamma,PDF,10000,SCALAR,1695.141000 +Gamma,PDF,10000,VECTORIZED,208.252000 +Gamma,PDF,10000,PARALLEL,644.749000 +Gamma,PDF,10000,WORK_STEALING,291.314000 +Gamma,LogPDF,10000,SCALAR,855.422000 +Gamma,LogPDF,10000,VECTORIZED,143.351000 +Gamma,LogPDF,10000,PARALLEL,443.370000 +Gamma,LogPDF,10000,WORK_STEALING,223.445000 +Gamma,CDF,10000,SCALAR,1812.897000 +Gamma,CDF,10000,VECTORIZED,1119.914000 +Gamma,CDF,10000,PARALLEL,1353.690000 +Gamma,CDF,10000,WORK_STEALING,416.177000 +Gamma,PDF,20000,SCALAR,3439.624000 +Gamma,PDF,20000,VECTORIZED,415.386000 +Gamma,PDF,20000,PARALLEL,1038.302000 +Gamma,PDF,20000,WORK_STEALING,383.257000 +Gamma,LogPDF,20000,SCALAR,1712.371000 +Gamma,LogPDF,20000,VECTORIZED,288.472000 +Gamma,LogPDF,20000,PARALLEL,643.004000 +Gamma,LogPDF,20000,WORK_STEALING,261.039000 +Gamma,CDF,20000,SCALAR,3632.194000 +Gamma,CDF,20000,VECTORIZED,2226.338000 +Gamma,CDF,20000,PARALLEL,2547.388000 +Gamma,CDF,20000,WORK_STEALING,560.970000 +Gamma,PDF,50000,SCALAR,8915.021000 +Gamma,PDF,50000,VECTORIZED,1040.646000 +Gamma,PDF,50000,PARALLEL,1550.213000 +Gamma,PDF,50000,WORK_STEALING,573.799000 +Gamma,LogPDF,50000,SCALAR,4307.949000 +Gamma,LogPDF,50000,VECTORIZED,728.715000 +Gamma,LogPDF,50000,PARALLEL,829.447000 +Gamma,LogPDF,50000,WORK_STEALING,377.836000 +Gamma,CDF,50000,SCALAR,9253.649000 +Gamma,CDF,50000,VECTORIZED,5733.050000 +Gamma,CDF,50000,PARALLEL,3852.924000 +Gamma,CDF,50000,WORK_STEALING,1179.907000 
+Gamma,PDF,100000,SCALAR,18062.750000 +Gamma,PDF,100000,VECTORIZED,2091.026000 +Gamma,PDF,100000,PARALLEL,1492.717000 +Gamma,PDF,100000,WORK_STEALING,845.147000 +Gamma,LogPDF,100000,SCALAR,9005.219000 +Gamma,LogPDF,100000,VECTORIZED,1510.385000 +Gamma,LogPDF,100000,PARALLEL,862.994000 +Gamma,LogPDF,100000,WORK_STEALING,500.433000 +Gamma,CDF,100000,SCALAR,19061.954000 +Gamma,CDF,100000,VECTORIZED,11752.497000 +Gamma,CDF,100000,PARALLEL,4409.069000 +Gamma,CDF,100000,WORK_STEALING,1987.041000 +Gamma,PDF,250000,SCALAR,47023.807000 +Gamma,PDF,250000,VECTORIZED,5733.149000 +Gamma,PDF,250000,PARALLEL,2993.486000 +Gamma,PDF,250000,WORK_STEALING,1640.754000 +Gamma,LogPDF,250000,SCALAR,22616.240000 +Gamma,LogPDF,250000,VECTORIZED,3906.495000 +Gamma,LogPDF,250000,PARALLEL,1558.849000 +Gamma,LogPDF,250000,WORK_STEALING,1075.012000 +Gamma,CDF,250000,SCALAR,47939.196000 +Gamma,CDF,250000,VECTORIZED,29578.568000 +Gamma,CDF,250000,PARALLEL,8731.487000 +Gamma,CDF,250000,WORK_STEALING,5523.179000 +Gamma,PDF,500000,SCALAR,93606.819000 +Gamma,PDF,500000,VECTORIZED,11316.357000 +Gamma,PDF,500000,PARALLEL,6331.536000 +Gamma,PDF,500000,WORK_STEALING,3124.022000 +Gamma,LogPDF,500000,SCALAR,43755.764000 +Gamma,LogPDF,500000,VECTORIZED,8282.161000 +Gamma,LogPDF,500000,PARALLEL,2922.826000 +Gamma,LogPDF,500000,WORK_STEALING,1606.919000 +Gamma,CDF,500000,SCALAR,92261.452000 +Gamma,CDF,500000,VECTORIZED,56917.679000 +Gamma,CDF,500000,PARALLEL,16074.814000 +Gamma,CDF,500000,WORK_STEALING,8921.626000 +StudentT,PDF,8,SCALAR,0.976000 +StudentT,PDF,8,VECTORIZED,0.996000 +StudentT,PDF,8,PARALLEL,0.744000 +StudentT,PDF,8,WORK_STEALING,0.773000 +StudentT,LogPDF,8,SCALAR,0.836000 +StudentT,LogPDF,8,VECTORIZED,0.536000 +StudentT,LogPDF,8,PARALLEL,0.614000 +StudentT,LogPDF,8,WORK_STEALING,0.610000 +StudentT,CDF,8,SCALAR,3.495000 +StudentT,CDF,8,VECTORIZED,2.983000 +StudentT,CDF,8,PARALLEL,2.980000 +StudentT,CDF,8,WORK_STEALING,2.961000 +StudentT,PDF,16,SCALAR,1.913000 +StudentT,PDF,16,VECTORIZED,1.168000 
+StudentT,PDF,16,PARALLEL,1.019000 +StudentT,PDF,16,WORK_STEALING,1.021000 +StudentT,LogPDF,16,SCALAR,1.521000 +StudentT,LogPDF,16,VECTORIZED,0.626000 +StudentT,LogPDF,16,PARALLEL,0.765000 +StudentT,LogPDF,16,WORK_STEALING,0.769000 +StudentT,CDF,16,SCALAR,6.626000 +StudentT,CDF,16,VECTORIZED,5.502000 +StudentT,CDF,16,PARALLEL,5.419000 +StudentT,CDF,16,WORK_STEALING,5.376000 +StudentT,PDF,32,SCALAR,3.524000 +StudentT,PDF,32,VECTORIZED,1.291000 +StudentT,PDF,32,PARALLEL,1.588000 +StudentT,PDF,32,WORK_STEALING,1.595000 +StudentT,LogPDF,32,SCALAR,2.990000 +StudentT,LogPDF,32,VECTORIZED,0.849000 +StudentT,LogPDF,32,PARALLEL,1.068000 +StudentT,LogPDF,32,WORK_STEALING,1.034000 +StudentT,CDF,32,SCALAR,13.726000 +StudentT,CDF,32,VECTORIZED,11.595000 +StudentT,CDF,32,PARALLEL,11.572000 +StudentT,CDF,32,WORK_STEALING,11.520000 +StudentT,PDF,64,SCALAR,7.201000 +StudentT,PDF,64,VECTORIZED,1.739000 +StudentT,PDF,64,PARALLEL,2.654000 +StudentT,PDF,64,WORK_STEALING,2.626000 +StudentT,LogPDF,64,SCALAR,5.870000 +StudentT,LogPDF,64,VECTORIZED,1.244000 +StudentT,LogPDF,64,PARALLEL,1.581000 +StudentT,LogPDF,64,WORK_STEALING,1.619000 +StudentT,CDF,64,SCALAR,27.099000 +StudentT,CDF,64,VECTORIZED,22.582000 +StudentT,CDF,64,PARALLEL,22.623000 +StudentT,CDF,64,WORK_STEALING,22.512000 +StudentT,PDF,128,SCALAR,14.329000 +StudentT,PDF,128,VECTORIZED,2.938000 +StudentT,PDF,128,PARALLEL,4.939000 +StudentT,PDF,128,WORK_STEALING,4.965000 +StudentT,LogPDF,128,SCALAR,11.552000 +StudentT,LogPDF,128,VECTORIZED,2.021000 +StudentT,LogPDF,128,PARALLEL,2.647000 +StudentT,LogPDF,128,WORK_STEALING,2.681000 +StudentT,CDF,128,SCALAR,53.198000 +StudentT,CDF,128,VECTORIZED,43.813000 +StudentT,CDF,128,PARALLEL,43.820000 +StudentT,CDF,128,WORK_STEALING,44.067000 +StudentT,PDF,256,SCALAR,28.596000 +StudentT,PDF,256,VECTORIZED,5.581000 +StudentT,PDF,256,PARALLEL,9.106000 +StudentT,PDF,256,WORK_STEALING,9.209000 +StudentT,LogPDF,256,SCALAR,22.859000 +StudentT,LogPDF,256,VECTORIZED,3.722000 
+StudentT,LogPDF,256,PARALLEL,4.917000 +StudentT,LogPDF,256,WORK_STEALING,4.859000 +StudentT,CDF,256,SCALAR,106.645000 +StudentT,CDF,256,VECTORIZED,88.351000 +StudentT,CDF,256,PARALLEL,88.721000 +StudentT,CDF,256,WORK_STEALING,88.394000 +StudentT,PDF,512,SCALAR,56.821000 +StudentT,PDF,512,VECTORIZED,10.407000 +StudentT,PDF,512,PARALLEL,17.799000 +StudentT,PDF,512,WORK_STEALING,17.740000 +StudentT,LogPDF,512,SCALAR,45.558000 +StudentT,LogPDF,512,VECTORIZED,7.019000 +StudentT,LogPDF,512,PARALLEL,9.317000 +StudentT,LogPDF,512,WORK_STEALING,9.301000 +StudentT,CDF,512,SCALAR,211.310000 +StudentT,CDF,512,VECTORIZED,174.892000 +StudentT,CDF,512,PARALLEL,168.854000 +StudentT,CDF,512,WORK_STEALING,169.003000 +StudentT,PDF,1000,SCALAR,107.776000 +StudentT,PDF,1000,VECTORIZED,19.654000 +StudentT,PDF,1000,PARALLEL,33.507000 +StudentT,PDF,1000,WORK_STEALING,33.183000 +StudentT,LogPDF,1000,SCALAR,86.064000 +StudentT,LogPDF,1000,VECTORIZED,12.917000 +StudentT,LogPDF,1000,PARALLEL,16.994000 +StudentT,LogPDF,1000,WORK_STEALING,17.017000 +StudentT,CDF,1000,SCALAR,403.880000 +StudentT,CDF,1000,VECTORIZED,334.739000 +StudentT,CDF,1000,PARALLEL,334.115000 +StudentT,CDF,1000,WORK_STEALING,334.797000 +StudentT,PDF,2000,SCALAR,216.422000 +StudentT,PDF,2000,VECTORIZED,38.523000 +StudentT,PDF,2000,PARALLEL,65.587000 +StudentT,PDF,2000,WORK_STEALING,65.608000 +StudentT,LogPDF,2000,SCALAR,172.765000 +StudentT,LogPDF,2000,VECTORIZED,25.542000 +StudentT,LogPDF,2000,PARALLEL,33.720000 +StudentT,LogPDF,2000,WORK_STEALING,33.537000 +StudentT,CDF,2000,SCALAR,808.536000 +StudentT,CDF,2000,VECTORIZED,669.052000 +StudentT,CDF,2000,PARALLEL,668.523000 +StudentT,CDF,2000,WORK_STEALING,672.188000 +StudentT,PDF,5000,SCALAR,542.172000 +StudentT,PDF,5000,VECTORIZED,98.028000 +StudentT,PDF,5000,PARALLEL,164.676000 +StudentT,PDF,5000,WORK_STEALING,164.665000 +StudentT,LogPDF,5000,SCALAR,406.758000 +StudentT,LogPDF,5000,VECTORIZED,65.763000 +StudentT,LogPDF,5000,PARALLEL,84.007000 
+StudentT,LogPDF,5000,WORK_STEALING,84.175000 +StudentT,CDF,5000,SCALAR,2039.758000 +StudentT,CDF,5000,VECTORIZED,1677.093000 +StudentT,CDF,5000,PARALLEL,1675.067000 +StudentT,CDF,5000,WORK_STEALING,1683.329000 +StudentT,PDF,10000,SCALAR,1081.366000 +StudentT,PDF,10000,VECTORIZED,195.096000 +StudentT,PDF,10000,PARALLEL,584.598000 +StudentT,PDF,10000,WORK_STEALING,583.751000 +StudentT,LogPDF,10000,SCALAR,866.203000 +StudentT,LogPDF,10000,VECTORIZED,131.128000 +StudentT,LogPDF,10000,PARALLEL,397.540000 +StudentT,LogPDF,10000,WORK_STEALING,403.633000 +StudentT,CDF,10000,SCALAR,4060.666000 +StudentT,CDF,10000,VECTORIZED,3389.890000 +StudentT,CDF,10000,PARALLEL,3345.139000 +StudentT,CDF,10000,WORK_STEALING,3418.150000 +StudentT,PDF,20000,SCALAR,2172.542000 +StudentT,PDF,20000,VECTORIZED,393.142000 +StudentT,PDF,20000,PARALLEL,800.877000 +StudentT,PDF,20000,WORK_STEALING,790.994000 +StudentT,LogPDF,20000,SCALAR,1734.060000 +StudentT,LogPDF,20000,VECTORIZED,263.697000 +StudentT,LogPDF,20000,PARALLEL,481.335000 +StudentT,LogPDF,20000,WORK_STEALING,494.675000 +StudentT,CDF,20000,SCALAR,8208.717000 +StudentT,CDF,20000,VECTORIZED,6813.721000 +StudentT,CDF,20000,PARALLEL,6823.052000 +StudentT,CDF,20000,WORK_STEALING,6825.536000 +StudentT,PDF,50000,SCALAR,5512.219000 +StudentT,PDF,50000,VECTORIZED,1001.400000 +StudentT,PDF,50000,PARALLEL,1210.417000 +StudentT,PDF,50000,WORK_STEALING,1208.057000 +StudentT,LogPDF,50000,SCALAR,4387.641000 +StudentT,LogPDF,50000,VECTORIZED,710.186000 +StudentT,LogPDF,50000,PARALLEL,739.512000 +StudentT,LogPDF,50000,WORK_STEALING,738.532000 +StudentT,CDF,50000,SCALAR,21188.633000 +StudentT,CDF,50000,VECTORIZED,17565.398000 +StudentT,CDF,50000,PARALLEL,17602.362000 +StudentT,CDF,50000,WORK_STEALING,17599.932000 +StudentT,PDF,100000,SCALAR,11371.579000 +StudentT,PDF,100000,VECTORIZED,2073.694000 +StudentT,PDF,100000,PARALLEL,1330.404000 +StudentT,PDF,100000,WORK_STEALING,1254.442000 +StudentT,LogPDF,100000,SCALAR,9083.086000 
+StudentT,LogPDF,100000,VECTORIZED,1415.772000 +StudentT,LogPDF,100000,PARALLEL,793.391000 +StudentT,LogPDF,100000,WORK_STEALING,736.153000 +StudentT,CDF,100000,SCALAR,42454.443000 +StudentT,CDF,100000,VECTORIZED,35276.985000 +StudentT,CDF,100000,PARALLEL,35173.533000 +StudentT,CDF,100000,WORK_STEALING,33989.196000 +StudentT,PDF,250000,SCALAR,27521.818000 +StudentT,PDF,250000,VECTORIZED,5176.539000 +StudentT,PDF,250000,PARALLEL,2324.528000 +StudentT,PDF,250000,WORK_STEALING,2410.880000 +StudentT,LogPDF,250000,SCALAR,22120.386000 +StudentT,LogPDF,250000,VECTORIZED,3532.784000 +StudentT,LogPDF,250000,PARALLEL,1319.596000 +StudentT,LogPDF,250000,WORK_STEALING,1302.958000 +StudentT,CDF,250000,SCALAR,102617.177000 +StudentT,CDF,250000,VECTORIZED,85140.881000 +StudentT,CDF,250000,PARALLEL,85261.530000 +StudentT,CDF,250000,WORK_STEALING,84879.620000 +StudentT,PDF,500000,SCALAR,54990.904000 +StudentT,PDF,500000,VECTORIZED,10525.945000 +StudentT,PDF,500000,PARALLEL,4710.908000 +StudentT,PDF,500000,WORK_STEALING,4707.626000 +StudentT,LogPDF,500000,SCALAR,44361.856000 +StudentT,LogPDF,500000,VECTORIZED,7222.205000 +StudentT,LogPDF,500000,PARALLEL,2581.095000 +StudentT,LogPDF,500000,WORK_STEALING,2484.480000 +StudentT,CDF,500000,SCALAR,206548.826000 +StudentT,CDF,500000,VECTORIZED,170116.467000 +StudentT,CDF,500000,PARALLEL,169793.972000 +StudentT,CDF,500000,WORK_STEALING,169815.252000 +Beta,PDF,8,SCALAR,1.037000 +Beta,PDF,8,VECTORIZED,1.599000 +Beta,PDF,8,PARALLEL,0.958000 +Beta,PDF,8,WORK_STEALING,0.912000 +Beta,LogPDF,8,SCALAR,0.841000 +Beta,LogPDF,8,VECTORIZED,1.330000 +Beta,LogPDF,8,PARALLEL,0.805000 +Beta,LogPDF,8,WORK_STEALING,0.789000 +Beta,CDF,8,SCALAR,2.278000 +Beta,CDF,8,VECTORIZED,1.824000 +Beta,CDF,8,PARALLEL,2.273000 +Beta,CDF,8,WORK_STEALING,2.309000 +Beta,PDF,16,SCALAR,1.967000 +Beta,PDF,16,VECTORIZED,2.055000 +Beta,PDF,16,PARALLEL,1.362000 +Beta,PDF,16,WORK_STEALING,1.458000 +Beta,LogPDF,16,SCALAR,1.614000 +Beta,LogPDF,16,VECTORIZED,1.644000 
+Beta,LogPDF,16,PARALLEL,1.134000 +Beta,LogPDF,16,WORK_STEALING,1.134000 +Beta,CDF,16,SCALAR,4.480000 +Beta,CDF,16,VECTORIZED,3.687000 +Beta,CDF,16,PARALLEL,4.576000 +Beta,CDF,16,WORK_STEALING,4.529000 +Beta,PDF,32,SCALAR,3.689000 +Beta,PDF,32,VECTORIZED,2.711000 +Beta,PDF,32,PARALLEL,2.396000 +Beta,PDF,32,WORK_STEALING,2.398000 +Beta,LogPDF,32,SCALAR,3.107000 +Beta,LogPDF,32,VECTORIZED,2.391000 +Beta,LogPDF,32,PARALLEL,1.785000 +Beta,LogPDF,32,WORK_STEALING,1.774000 +Beta,CDF,32,SCALAR,8.273000 +Beta,CDF,32,VECTORIZED,6.567000 +Beta,CDF,32,PARALLEL,8.289000 +Beta,CDF,32,WORK_STEALING,8.297000 +Beta,PDF,64,SCALAR,7.117000 +Beta,PDF,64,VECTORIZED,5.176000 +Beta,PDF,64,PARALLEL,4.325000 +Beta,PDF,64,WORK_STEALING,4.294000 +Beta,LogPDF,64,SCALAR,5.909000 +Beta,LogPDF,64,VECTORIZED,4.471000 +Beta,LogPDF,64,PARALLEL,3.340000 +Beta,LogPDF,64,WORK_STEALING,3.265000 +Beta,CDF,64,SCALAR,15.262000 +Beta,CDF,64,VECTORIZED,12.087000 +Beta,CDF,64,PARALLEL,15.225000 +Beta,CDF,64,WORK_STEALING,15.294000 +Beta,PDF,128,SCALAR,14.715000 +Beta,PDF,128,VECTORIZED,6.006000 +Beta,PDF,128,PARALLEL,8.035000 +Beta,PDF,128,WORK_STEALING,7.800000 +Beta,LogPDF,128,SCALAR,12.210000 +Beta,LogPDF,128,VECTORIZED,5.160000 +Beta,LogPDF,128,PARALLEL,5.605000 +Beta,LogPDF,128,WORK_STEALING,5.695000 +Beta,CDF,128,SCALAR,34.087000 +Beta,CDF,128,VECTORIZED,26.992000 +Beta,CDF,128,PARALLEL,34.312000 +Beta,CDF,128,WORK_STEALING,34.411000 +Beta,PDF,256,SCALAR,29.791000 +Beta,PDF,256,VECTORIZED,10.980000 +Beta,PDF,256,PARALLEL,15.127000 +Beta,PDF,256,WORK_STEALING,15.072000 +Beta,LogPDF,256,SCALAR,24.528000 +Beta,LogPDF,256,VECTORIZED,9.320000 +Beta,LogPDF,256,PARALLEL,10.333000 +Beta,LogPDF,256,WORK_STEALING,10.303000 +Beta,CDF,256,SCALAR,70.395000 +Beta,CDF,256,VECTORIZED,55.603000 +Beta,CDF,256,PARALLEL,70.602000 +Beta,CDF,256,WORK_STEALING,70.613000 +Beta,PDF,512,SCALAR,58.461000 +Beta,PDF,512,VECTORIZED,22.243000 +Beta,PDF,512,PARALLEL,30.360000 +Beta,PDF,512,WORK_STEALING,30.146000 
+Beta,LogPDF,512,SCALAR,48.958000 +Beta,LogPDF,512,VECTORIZED,19.206000 +Beta,LogPDF,512,PARALLEL,21.396000 +Beta,LogPDF,512,WORK_STEALING,21.451000 +Beta,CDF,512,SCALAR,133.026000 +Beta,CDF,512,VECTORIZED,104.853000 +Beta,CDF,512,PARALLEL,133.426000 +Beta,CDF,512,WORK_STEALING,133.835000 +Beta,PDF,1000,SCALAR,114.290000 +Beta,PDF,1000,VECTORIZED,43.503000 +Beta,PDF,1000,PARALLEL,59.329000 +Beta,PDF,1000,WORK_STEALING,59.060000 +Beta,LogPDF,1000,SCALAR,95.221000 +Beta,LogPDF,1000,VECTORIZED,37.280000 +Beta,LogPDF,1000,PARALLEL,41.877000 +Beta,LogPDF,1000,WORK_STEALING,41.426000 +Beta,CDF,1000,SCALAR,261.676000 +Beta,CDF,1000,VECTORIZED,204.678000 +Beta,CDF,1000,PARALLEL,260.923000 +Beta,CDF,1000,WORK_STEALING,261.168000 +Beta,PDF,2000,SCALAR,228.795000 +Beta,PDF,2000,VECTORIZED,86.042000 +Beta,PDF,2000,PARALLEL,117.490000 +Beta,PDF,2000,WORK_STEALING,117.510000 +Beta,LogPDF,2000,SCALAR,190.196000 +Beta,LogPDF,2000,VECTORIZED,72.463000 +Beta,LogPDF,2000,PARALLEL,82.983000 +Beta,LogPDF,2000,WORK_STEALING,82.728000 +Beta,CDF,2000,SCALAR,528.214000 +Beta,CDF,2000,VECTORIZED,415.186000 +Beta,CDF,2000,PARALLEL,528.132000 +Beta,CDF,2000,WORK_STEALING,524.447000 +Beta,PDF,5000,SCALAR,571.624000 +Beta,PDF,5000,VECTORIZED,223.072000 +Beta,PDF,5000,PARALLEL,296.640000 +Beta,PDF,5000,WORK_STEALING,294.176000 +Beta,LogPDF,5000,SCALAR,474.843000 +Beta,LogPDF,5000,VECTORIZED,188.490000 +Beta,LogPDF,5000,PARALLEL,210.079000 +Beta,LogPDF,5000,WORK_STEALING,208.948000 +Beta,CDF,5000,SCALAR,1312.694000 +Beta,CDF,5000,VECTORIZED,1031.968000 +Beta,CDF,5000,PARALLEL,1317.140000 +Beta,CDF,5000,WORK_STEALING,1312.706000 +Beta,PDF,10000,SCALAR,1139.424000 +Beta,PDF,10000,VECTORIZED,440.552000 +Beta,PDF,10000,PARALLEL,743.900000 +Beta,PDF,10000,WORK_STEALING,754.032000 +Beta,LogPDF,10000,SCALAR,951.118000 +Beta,LogPDF,10000,VECTORIZED,377.767000 +Beta,LogPDF,10000,PARALLEL,579.703000 +Beta,LogPDF,10000,WORK_STEALING,575.570000 +Beta,CDF,10000,SCALAR,2638.883000 
+Beta,CDF,10000,VECTORIZED,2055.770000 +Beta,CDF,10000,PARALLEL,2617.753000 +Beta,CDF,10000,WORK_STEALING,2610.802000 +Beta,PDF,20000,SCALAR,2299.900000 +Beta,PDF,20000,VECTORIZED,895.672000 +Beta,PDF,20000,PARALLEL,1354.222000 +Beta,PDF,20000,WORK_STEALING,1353.656000 +Beta,LogPDF,20000,SCALAR,1919.340000 +Beta,LogPDF,20000,VECTORIZED,766.098000 +Beta,LogPDF,20000,PARALLEL,1028.715000 +Beta,LogPDF,20000,WORK_STEALING,1025.967000 +Beta,CDF,20000,SCALAR,5303.753000 +Beta,CDF,20000,VECTORIZED,4121.391000 +Beta,CDF,20000,PARALLEL,5305.967000 +Beta,CDF,20000,WORK_STEALING,5210.427000 +Beta,PDF,50000,SCALAR,5801.108000 +Beta,PDF,50000,VECTORIZED,2238.532000 +Beta,PDF,50000,PARALLEL,3097.306000 +Beta,PDF,50000,WORK_STEALING,3132.227000 +Beta,LogPDF,50000,SCALAR,4855.107000 +Beta,LogPDF,50000,VECTORIZED,1918.572000 +Beta,LogPDF,50000,PARALLEL,2663.504000 +Beta,LogPDF,50000,WORK_STEALING,2660.711000 +Beta,CDF,50000,SCALAR,13645.976000 +Beta,CDF,50000,VECTORIZED,10509.694000 +Beta,CDF,50000,PARALLEL,13288.983000 +Beta,CDF,50000,WORK_STEALING,13244.898000 +Beta,PDF,100000,SCALAR,11534.897000 +Beta,PDF,100000,VECTORIZED,4499.069000 +Beta,PDF,100000,PARALLEL,7390.987000 +Beta,PDF,100000,WORK_STEALING,6733.591000 +Beta,LogPDF,100000,SCALAR,10055.888000 +Beta,LogPDF,100000,VECTORIZED,3937.124000 +Beta,LogPDF,100000,PARALLEL,5223.059000 +Beta,LogPDF,100000,WORK_STEALING,5292.639000 +Beta,CDF,100000,SCALAR,27187.490000 +Beta,CDF,100000,VECTORIZED,21408.717000 +Beta,CDF,100000,PARALLEL,26603.161000 +Beta,CDF,100000,WORK_STEALING,26540.190000 +Beta,PDF,250000,SCALAR,29528.332000 +Beta,PDF,250000,VECTORIZED,11863.217000 +Beta,PDF,250000,PARALLEL,17156.378000 +Beta,PDF,250000,WORK_STEALING,17336.405000 +Beta,LogPDF,250000,SCALAR,26880.563000 +Beta,LogPDF,250000,VECTORIZED,10306.767000 +Beta,LogPDF,250000,PARALLEL,13671.685000 +Beta,LogPDF,250000,WORK_STEALING,13680.404000 +Beta,CDF,250000,SCALAR,70341.281000 +Beta,CDF,250000,VECTORIZED,55862.159000 
+Beta,CDF,250000,PARALLEL,68742.652000 +Beta,CDF,250000,WORK_STEALING,68656.675000 +Beta,PDF,500000,SCALAR,59917.840000 +Beta,PDF,500000,VECTORIZED,24361.364000 +Beta,PDF,500000,PARALLEL,34466.177000 +Beta,PDF,500000,WORK_STEALING,34291.895000 +Beta,LogPDF,500000,SCALAR,54250.002000 +Beta,LogPDF,500000,VECTORIZED,21277.706000 +Beta,LogPDF,500000,PARALLEL,26340.200000 +Beta,LogPDF,500000,WORK_STEALING,26255.335000 +Beta,CDF,500000,SCALAR,136394.416000 +Beta,CDF,500000,VECTORIZED,108210.939000 +Beta,CDF,500000,PARALLEL,133025.700000 +Beta,CDF,500000,WORK_STEALING,137383.802000 +ChiSquared,PDF,8,SCALAR,1.427000 +ChiSquared,PDF,8,VECTORIZED,1.223000 +ChiSquared,PDF,8,PARALLEL,0.492000 +ChiSquared,PDF,8,WORK_STEALING,0.542000 +ChiSquared,LogPDF,8,SCALAR,0.839000 +ChiSquared,LogPDF,8,VECTORIZED,1.075000 +ChiSquared,LogPDF,8,PARALLEL,0.319000 +ChiSquared,LogPDF,8,WORK_STEALING,0.385000 +ChiSquared,CDF,8,SCALAR,1.518000 +ChiSquared,CDF,8,VECTORIZED,1.562000 +ChiSquared,CDF,8,PARALLEL,0.928000 +ChiSquared,CDF,8,WORK_STEALING,1.030000 +ChiSquared,PDF,16,SCALAR,2.848000 +ChiSquared,PDF,16,VECTORIZED,1.349000 +ChiSquared,PDF,16,PARALLEL,0.772000 +ChiSquared,PDF,16,WORK_STEALING,0.857000 +ChiSquared,LogPDF,16,SCALAR,1.498000 +ChiSquared,LogPDF,16,VECTORIZED,1.200000 +ChiSquared,LogPDF,16,PARALLEL,0.478000 +ChiSquared,LogPDF,16,WORK_STEALING,0.519000 +ChiSquared,CDF,16,SCALAR,2.977000 +ChiSquared,CDF,16,VECTORIZED,2.748000 +ChiSquared,CDF,16,PARALLEL,1.920000 +ChiSquared,CDF,16,WORK_STEALING,1.985000 +ChiSquared,PDF,32,SCALAR,5.558000 +ChiSquared,PDF,32,VECTORIZED,1.740000 +ChiSquared,PDF,32,PARALLEL,1.428000 +ChiSquared,PDF,32,WORK_STEALING,1.448000 +ChiSquared,LogPDF,32,SCALAR,2.954000 +ChiSquared,LogPDF,32,VECTORIZED,1.491000 +ChiSquared,LogPDF,32,PARALLEL,0.804000 +ChiSquared,LogPDF,32,WORK_STEALING,0.805000 +ChiSquared,CDF,32,SCALAR,5.915000 +ChiSquared,CDF,32,VECTORIZED,4.370000 +ChiSquared,CDF,32,PARALLEL,3.513000 +ChiSquared,CDF,32,WORK_STEALING,3.518000 
+ChiSquared,PDF,64,SCALAR,10.809000 +ChiSquared,PDF,64,VECTORIZED,2.106000 +ChiSquared,PDF,64,PARALLEL,2.685000 +ChiSquared,PDF,64,WORK_STEALING,2.729000 +ChiSquared,LogPDF,64,SCALAR,5.811000 +ChiSquared,LogPDF,64,VECTORIZED,1.728000 +ChiSquared,LogPDF,64,PARALLEL,1.419000 +ChiSquared,LogPDF,64,WORK_STEALING,1.431000 +ChiSquared,CDF,64,SCALAR,11.891000 +ChiSquared,CDF,64,VECTORIZED,7.823000 +ChiSquared,CDF,64,PARALLEL,7.138000 +ChiSquared,CDF,64,WORK_STEALING,7.199000 +ChiSquared,PDF,128,SCALAR,21.874000 +ChiSquared,PDF,128,VECTORIZED,3.180000 +ChiSquared,PDF,128,PARALLEL,5.155000 +ChiSquared,PDF,128,WORK_STEALING,5.241000 +ChiSquared,LogPDF,128,SCALAR,11.219000 +ChiSquared,LogPDF,128,VECTORIZED,2.348000 +ChiSquared,LogPDF,128,PARALLEL,2.684000 +ChiSquared,LogPDF,128,WORK_STEALING,2.586000 +ChiSquared,CDF,128,SCALAR,23.740000 +ChiSquared,CDF,128,VECTORIZED,14.334000 +ChiSquared,CDF,128,PARALLEL,14.178000 +ChiSquared,CDF,128,WORK_STEALING,14.215000 +ChiSquared,PDF,256,SCALAR,43.672000 +ChiSquared,PDF,256,VECTORIZED,5.637000 +ChiSquared,PDF,256,PARALLEL,10.162000 +ChiSquared,PDF,256,WORK_STEALING,10.229000 +ChiSquared,LogPDF,256,SCALAR,22.606000 +ChiSquared,LogPDF,256,VECTORIZED,4.109000 +ChiSquared,LogPDF,256,PARALLEL,5.203000 +ChiSquared,LogPDF,256,WORK_STEALING,4.915000 +ChiSquared,CDF,256,SCALAR,47.412000 +ChiSquared,CDF,256,VECTORIZED,28.998000 +ChiSquared,CDF,256,PARALLEL,28.640000 +ChiSquared,CDF,256,WORK_STEALING,28.433000 +ChiSquared,PDF,512,SCALAR,87.323000 +ChiSquared,PDF,512,VECTORIZED,10.682000 +ChiSquared,PDF,512,PARALLEL,20.041000 +ChiSquared,PDF,512,WORK_STEALING,19.970000 +ChiSquared,LogPDF,512,SCALAR,44.971000 +ChiSquared,LogPDF,512,VECTORIZED,7.404000 +ChiSquared,LogPDF,512,PARALLEL,10.257000 +ChiSquared,LogPDF,512,WORK_STEALING,9.559000 +ChiSquared,CDF,512,SCALAR,94.417000 +ChiSquared,CDF,512,VECTORIZED,57.812000 +ChiSquared,CDF,512,PARALLEL,57.725000 +ChiSquared,CDF,512,WORK_STEALING,57.368000 +ChiSquared,PDF,1000,SCALAR,170.551000 
+ChiSquared,PDF,1000,VECTORIZED,20.383000 +ChiSquared,PDF,1000,PARALLEL,39.028000 +ChiSquared,PDF,1000,WORK_STEALING,38.749000 +ChiSquared,LogPDF,1000,SCALAR,88.035000 +ChiSquared,LogPDF,1000,VECTORIZED,14.023000 +ChiSquared,LogPDF,1000,PARALLEL,19.719000 +ChiSquared,LogPDF,1000,WORK_STEALING,18.480000 +ChiSquared,CDF,1000,SCALAR,185.249000 +ChiSquared,CDF,1000,VECTORIZED,115.506000 +ChiSquared,CDF,1000,PARALLEL,114.997000 +ChiSquared,CDF,1000,WORK_STEALING,114.515000 +ChiSquared,PDF,2000,SCALAR,341.840000 +ChiSquared,PDF,2000,VECTORIZED,40.817000 +ChiSquared,PDF,2000,PARALLEL,77.592000 +ChiSquared,PDF,2000,WORK_STEALING,77.248000 +ChiSquared,LogPDF,2000,SCALAR,176.214000 +ChiSquared,LogPDF,2000,VECTORIZED,28.153000 +ChiSquared,LogPDF,2000,PARALLEL,39.213000 +ChiSquared,LogPDF,2000,WORK_STEALING,36.797000 +ChiSquared,CDF,2000,SCALAR,371.916000 +ChiSquared,CDF,2000,VECTORIZED,235.901000 +ChiSquared,CDF,2000,PARALLEL,234.523000 +ChiSquared,CDF,2000,WORK_STEALING,231.114000 +ChiSquared,PDF,5000,SCALAR,855.697000 +ChiSquared,PDF,5000,VECTORIZED,103.628000 +ChiSquared,PDF,5000,PARALLEL,333.975000 +ChiSquared,PDF,5000,WORK_STEALING,214.486000 +ChiSquared,LogPDF,5000,SCALAR,442.043000 +ChiSquared,LogPDF,5000,VECTORIZED,72.013000 +ChiSquared,LogPDF,5000,PARALLEL,230.630000 +ChiSquared,LogPDF,5000,WORK_STEALING,168.750000 +ChiSquared,CDF,5000,SCALAR,931.059000 +ChiSquared,CDF,5000,VECTORIZED,592.076000 +ChiSquared,CDF,5000,PARALLEL,723.990000 +ChiSquared,CDF,5000,WORK_STEALING,383.755000 +ChiSquared,PDF,10000,SCALAR,1707.995000 +ChiSquared,PDF,10000,VECTORIZED,206.983000 +ChiSquared,PDF,10000,PARALLEL,534.753000 +ChiSquared,PDF,10000,WORK_STEALING,258.106000 +ChiSquared,LogPDF,10000,SCALAR,883.501000 +ChiSquared,LogPDF,10000,VECTORIZED,144.790000 +ChiSquared,LogPDF,10000,PARALLEL,333.037000 +ChiSquared,LogPDF,10000,WORK_STEALING,196.366000 +ChiSquared,CDF,10000,SCALAR,1865.907000 +ChiSquared,CDF,10000,VECTORIZED,1179.908000 +ChiSquared,CDF,10000,PARALLEL,1317.084000 
+ChiSquared,CDF,10000,WORK_STEALING,406.415000 +ChiSquared,PDF,20000,SCALAR,3456.736000 +ChiSquared,PDF,20000,VECTORIZED,415.075000 +ChiSquared,PDF,20000,PARALLEL,936.874000 +ChiSquared,PDF,20000,WORK_STEALING,377.527000 +ChiSquared,LogPDF,20000,SCALAR,1770.347000 +ChiSquared,LogPDF,20000,VECTORIZED,289.590000 +ChiSquared,LogPDF,20000,PARALLEL,549.121000 +ChiSquared,LogPDF,20000,WORK_STEALING,266.186000 +ChiSquared,CDF,20000,SCALAR,3735.991000 +ChiSquared,CDF,20000,VECTORIZED,2414.656000 +ChiSquared,CDF,20000,PARALLEL,2525.131000 +ChiSquared,CDF,20000,WORK_STEALING,633.810000 +ChiSquared,PDF,50000,SCALAR,8768.906000 +ChiSquared,PDF,50000,VECTORIZED,1037.996000 +ChiSquared,PDF,50000,PARALLEL,1412.632000 +ChiSquared,PDF,50000,WORK_STEALING,556.281000 +ChiSquared,LogPDF,50000,SCALAR,4289.635000 +ChiSquared,LogPDF,50000,VECTORIZED,708.823000 +ChiSquared,LogPDF,50000,PARALLEL,784.367000 +ChiSquared,LogPDF,50000,WORK_STEALING,325.851000 +ChiSquared,CDF,50000,SCALAR,9178.469000 +ChiSquared,CDF,50000,VECTORIZED,5830.911000 +ChiSquared,CDF,50000,PARALLEL,3901.768000 +ChiSquared,CDF,50000,WORK_STEALING,1137.076000 +ChiSquared,PDF,100000,SCALAR,17466.450000 +ChiSquared,PDF,100000,VECTORIZED,2014.483000 +ChiSquared,PDF,100000,PARALLEL,1407.913000 +ChiSquared,PDF,100000,WORK_STEALING,693.799000 +ChiSquared,LogPDF,100000,SCALAR,8666.265000 +ChiSquared,LogPDF,100000,VECTORIZED,1399.352000 +ChiSquared,LogPDF,100000,PARALLEL,790.514000 +ChiSquared,LogPDF,100000,WORK_STEALING,466.344000 +ChiSquared,CDF,100000,SCALAR,18474.985000 +ChiSquared,CDF,100000,VECTORIZED,11649.111000 +ChiSquared,CDF,100000,PARALLEL,4300.780000 +ChiSquared,CDF,100000,WORK_STEALING,2166.615000 +ChiSquared,PDF,250000,SCALAR,44439.148000 +ChiSquared,PDF,250000,VECTORIZED,5223.320000 +ChiSquared,PDF,250000,PARALLEL,2713.922000 +ChiSquared,PDF,250000,WORK_STEALING,1425.777000 +ChiSquared,LogPDF,250000,SCALAR,21812.825000 +ChiSquared,LogPDF,250000,VECTORIZED,3717.870000 
+ChiSquared,LogPDF,250000,PARALLEL,1468.757000 +ChiSquared,LogPDF,250000,WORK_STEALING,854.389000 +ChiSquared,CDF,250000,SCALAR,46089.342000 +ChiSquared,CDF,250000,VECTORIZED,29105.854000 +ChiSquared,CDF,250000,PARALLEL,8053.222000 +ChiSquared,CDF,250000,WORK_STEALING,4681.841000 +ChiSquared,PDF,500000,SCALAR,89011.909000 +ChiSquared,PDF,500000,VECTORIZED,11348.492000 +ChiSquared,PDF,500000,PARALLEL,6347.036000 +ChiSquared,PDF,500000,WORK_STEALING,2833.422000 +ChiSquared,LogPDF,500000,SCALAR,45162.959000 +ChiSquared,LogPDF,500000,VECTORIZED,8176.144000 +ChiSquared,LogPDF,500000,PARALLEL,2883.237000 +ChiSquared,LogPDF,500000,WORK_STEALING,1734.745000 +ChiSquared,CDF,500000,SCALAR,94949.249000 +ChiSquared,CDF,500000,VECTORIZED,58402.911000 +ChiSquared,CDF,500000,PARALLEL,15998.711000 +ChiSquared,CDF,500000,WORK_STEALING,9477.315000 diff --git a/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/summary.json b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/summary.json new file mode 100644 index 0000000..e0c2be1 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3/summary.json @@ -0,0 +1,183 @@ +{ + "run_id": "2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3", + "data_source": "strategy_profile_results.csv", + "metadata": { + "captured_at_utc": "2026-04-12T05-55-52Z", + "run_id": "2026-04-12T05-55-52Z_darwin-x86_64_investigate-gaussian-avx512-perf_sha-e75c6e3", + "git_branch": "investigate-gaussian-avx512-perf", + "git_sha": "e75c6e3", + "project_root": "/Users/wolfman/Development/libstats", + "build_dir": "/Users/wolfman/Development/libstats/build", + "build_type": "Release", + "cxx_compiler": "", + "os": "darwin", + "arch": "x86_64", + "cpu_brand": "Intel(R) Core(TM) i7-3820QM CPU @ 2.70GHz", + "physical_cores": "4", + "logical_cores": "8" + }, + 
"coverage": { + "distributions": [ + "Beta", + "ChiSquared", + "Discrete", + "Exponential", + "Gamma", + "Gaussian", + "Poisson", + "StudentT", + "Uniform" + ], + "operations": [ + "CDF", + "LogPDF", + "PDF" + ], + "batch_sizes": [ + 8, + 16, + 32, + 64, + 128, + 256, + 512, + 1000, + 2000, + 5000, + 10000, + 20000, + 50000, + 100000, + 250000, + 500000 + ], + "total_measurements": 1728 + }, + "strategy_win_counts": { + "VECTORIZED": 223, + "WORK_STEALING": 140, + "PARALLEL": 65, + "SCALAR": 4 + }, + "crossover_summary": { + "groups": 27, + "vectorized_never_wins": [], + "parallel_crossover_sizes": [ + { + "distribution": "Beta", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Beta", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "ChiSquared", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "ChiSquared", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "ChiSquared", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Discrete", + "operation": "CDF", + "vectorized_to_parallel": 100000 + }, + { + "distribution": "Discrete", + "operation": "LogPDF", + "vectorized_to_parallel": 100000 + }, + { + "distribution": "Discrete", + "operation": "PDF", + "vectorized_to_parallel": 128 + }, + { + "distribution": "Exponential", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Exponential", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Exponential", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gamma", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gamma", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gamma", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gaussian", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + 
}, + { + "distribution": "Gaussian", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Poisson", + "operation": "CDF", + "vectorized_to_parallel": 64 + }, + { + "distribution": "Poisson", + "operation": "LogPDF", + "vectorized_to_parallel": 50000 + }, + { + "distribution": "Poisson", + "operation": "PDF", + "vectorized_to_parallel": 2000 + }, + { + "distribution": "StudentT", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "StudentT", + "operation": "LogPDF", + "vectorized_to_parallel": 100000 + }, + { + "distribution": "StudentT", + "operation": "PDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Uniform", + "operation": "CDF", + "vectorized_to_parallel": 8 + } + ] + } +} diff --git a/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/best_strategies.csv b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/best_strategies.csv new file mode 100644 index 0000000..dcaa9c9 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/best_strategies.csv @@ -0,0 +1,433 @@ +distribution,operation,batch_size,best_strategy,best_time_us,scalar_time_us,speedup_vs_scalar +Beta,CDF,8,VECTORIZED,1.0,1.1,1.1 +Beta,CDF,16,VECTORIZED,2.3,2.4,1.043 +Beta,CDF,32,VECTORIZED,4.4,4.7,1.068 +Beta,CDF,64,VECTORIZED,8.0,8.9,1.113 +Beta,CDF,128,WORK_STEALING,12.6,18.9,1.5 +Beta,CDF,256,VECTORIZED,23.1,25.6,1.108 +Beta,CDF,512,VECTORIZED,46.2,51.0,1.104 +Beta,CDF,1000,VECTORIZED,90.5,98.9,1.093 +Beta,CDF,2000,VECTORIZED,185.6,203.5,1.096 +Beta,CDF,5000,VECTORIZED,461.1,501.7,1.088 +Beta,CDF,10000,VECTORIZED,944.3,1013.0,1.073 +Beta,CDF,20000,VECTORIZED,1918.2,2066.0,1.077 +Beta,CDF,50000,VECTORIZED,4867.0,5197.1,1.068 +Beta,CDF,100000,VECTORIZED,9742.3,10620.1,1.09 +Beta,CDF,250000,VECTORIZED,24705.2,26920.1,1.09 
+Beta,CDF,500000,SCALAR,53781.2,53781.2,1.0 +Beta,LogPDF,8,SCALAR,0.1,0.1,1.0 +Beta,LogPDF,16,WORK_STEALING,0.3,0.5,1.667 +Beta,LogPDF,32,VECTORIZED,0.5,0.6,1.2 +Beta,LogPDF,64,VECTORIZED,1.2,1.8,1.5 +Beta,LogPDF,128,VECTORIZED,1.3,2.1,1.615 +Beta,LogPDF,256,VECTORIZED,1.5,2.8,1.867 +Beta,LogPDF,512,VECTORIZED,2.8,5.5,1.964 +Beta,LogPDF,1000,VECTORIZED,5.2,10.8,2.077 +Beta,LogPDF,2000,VECTORIZED,10.2,21.6,2.118 +Beta,LogPDF,5000,VECTORIZED,25.8,53.7,2.081 +Beta,LogPDF,10000,VECTORIZED,101.4,186.8,1.842 +Beta,LogPDF,20000,VECTORIZED,105.5,238.5,2.261 +Beta,LogPDF,50000,VECTORIZED,284.5,751.5,2.641 +Beta,LogPDF,100000,VECTORIZED,980.6,1266.9,1.292 +Beta,LogPDF,250000,VECTORIZED,2005.5,3220.7,1.606 +Beta,LogPDF,500000,VECTORIZED,4433.4,6434.6,1.451 +Beta,PDF,8,SCALAR,0.2,0.2,1.0 +Beta,PDF,16,VECTORIZED,0.4,0.5,1.25 +Beta,PDF,32,VECTORIZED,0.6,0.8,1.333 +Beta,PDF,64,VECTORIZED,1.5,2.4,1.6 +Beta,PDF,128,VECTORIZED,1.6,3.0,1.875 +Beta,PDF,256,VECTORIZED,1.8,4.1,2.278 +Beta,PDF,512,VECTORIZED,3.5,8.0,2.286 +Beta,PDF,1000,VECTORIZED,6.6,15.6,2.364 +Beta,PDF,2000,VECTORIZED,12.8,31.6,2.469 +Beta,PDF,5000,VECTORIZED,32.5,78.4,2.412 +Beta,PDF,10000,VECTORIZED,65.0,157.3,2.42 +Beta,PDF,20000,VECTORIZED,181.1,341.5,1.886 +Beta,PDF,50000,VECTORIZED,369.3,848.0,2.296 +Beta,PDF,100000,VECTORIZED,835.5,1741.7,2.085 +Beta,PDF,250000,VECTORIZED,2744.4,4498.8,1.639 +Beta,PDF,500000,VECTORIZED,5384.4,9004.9,1.672 +ChiSquared,CDF,8,WORK_STEALING,0.3,0.4,1.333 +ChiSquared,CDF,16,PARALLEL,0.7,0.8,1.143 +ChiSquared,CDF,32,VECTORIZED,1.4,1.6,1.143 +ChiSquared,CDF,64,VECTORIZED,2.7,3.0,1.111 +ChiSquared,CDF,128,PARALLEL,5.3,6.1,1.151 +ChiSquared,CDF,256,PARALLEL,10.4,12.1,1.163 +ChiSquared,CDF,512,PARALLEL,20.8,24.2,1.163 +ChiSquared,CDF,1000,WORK_STEALING,40.6,47.1,1.16 +ChiSquared,CDF,2000,VECTORIZED,82.0,94.3,1.15 +ChiSquared,CDF,5000,WORK_STEALING,227.3,254.7,1.121 +ChiSquared,CDF,10000,PARALLEL,163.4,535.2,3.275 +ChiSquared,CDF,20000,PARALLEL,280.7,1044.8,3.722 
+ChiSquared,CDF,50000,PARALLEL,592.0,2686.0,4.537 +ChiSquared,CDF,100000,PARALLEL,1022.5,5470.9,5.351 +ChiSquared,CDF,250000,PARALLEL,3035.4,13978.1,4.605 +ChiSquared,CDF,500000,PARALLEL,6003.6,27969.0,4.659 +ChiSquared,LogPDF,8,SCALAR,0.1,0.1,1.0 +ChiSquared,LogPDF,16,PARALLEL,0.1,0.2,2.0 +ChiSquared,LogPDF,32,WORK_STEALING,0.1,0.3,3.0 +ChiSquared,LogPDF,64,WORK_STEALING,0.2,0.5,2.5 +ChiSquared,LogPDF,128,VECTORIZED,0.5,1.1,2.2 +ChiSquared,LogPDF,256,VECTORIZED,0.9,2.1,2.333 +ChiSquared,LogPDF,512,VECTORIZED,1.3,4.3,3.308 +ChiSquared,LogPDF,1000,VECTORIZED,2.5,8.4,3.36 +ChiSquared,LogPDF,2000,VECTORIZED,4.8,16.6,3.458 +ChiSquared,LogPDF,5000,VECTORIZED,12.1,41.7,3.446 +ChiSquared,LogPDF,10000,VECTORIZED,36.6,125.4,3.426 +ChiSquared,LogPDF,20000,VECTORIZED,48.0,167.4,3.488 +ChiSquared,LogPDF,50000,WORK_STEALING,112.4,753.6,6.705 +ChiSquared,LogPDF,100000,PARALLEL,142.9,851.1,5.956 +ChiSquared,LogPDF,250000,PARALLEL,344.4,2148.7,6.239 +ChiSquared,LogPDF,500000,PARALLEL,551.4,4438.6,8.05 +ChiSquared,PDF,8,VECTORIZED,0.1,0.2,2.0 +ChiSquared,PDF,16,VECTORIZED,0.2,0.4,2.0 +ChiSquared,PDF,32,VECTORIZED,0.3,0.7,2.333 +ChiSquared,PDF,64,VECTORIZED,0.4,1.4,3.5 +ChiSquared,PDF,128,VECTORIZED,0.7,2.7,3.857 +ChiSquared,PDF,256,VECTORIZED,1.1,5.4,4.909 +ChiSquared,PDF,512,VECTORIZED,2.0,10.8,5.4 +ChiSquared,PDF,1000,VECTORIZED,3.7,21.0,5.676 +ChiSquared,PDF,2000,VECTORIZED,7.4,42.1,5.689 +ChiSquared,PDF,5000,VECTORIZED,23.4,168.6,7.205 +ChiSquared,PDF,10000,VECTORIZED,37.2,213.6,5.742 +ChiSquared,PDF,20000,VECTORIZED,74.8,423.1,5.656 +ChiSquared,PDF,50000,PARALLEL,127.9,1068.9,8.357 +ChiSquared,PDF,100000,PARALLEL,208.7,2160.8,10.354 +ChiSquared,PDF,250000,PARALLEL,522.6,5528.4,10.579 +ChiSquared,PDF,500000,PARALLEL,998.0,11167.5,11.19 +Discrete,CDF,8,WORK_STEALING,0.0,0.1, +Discrete,CDF,16,VECTORIZED,0.0,0.1, +Discrete,CDF,32,VECTORIZED,0.1,0.2,2.0 +Discrete,CDF,64,VECTORIZED,0.1,0.4,4.0 +Discrete,CDF,128,VECTORIZED,0.2,0.8,4.0 +Discrete,CDF,256,VECTORIZED,0.3,1.5,5.0 
+Discrete,CDF,512,VECTORIZED,0.5,2.8,5.6 +Discrete,CDF,1000,VECTORIZED,1.0,5.6,5.6 +Discrete,CDF,2000,VECTORIZED,2.0,11.1,5.55 +Discrete,CDF,5000,VECTORIZED,5.0,27.9,5.58 +Discrete,CDF,10000,VECTORIZED,16.6,86.8,5.229 +Discrete,CDF,20000,VECTORIZED,26.6,118.7,4.462 +Discrete,CDF,50000,WORK_STEALING,55.3,458.6,8.293 +Discrete,CDF,100000,PARALLEL,119.0,623.2,5.237 +Discrete,CDF,250000,PARALLEL,172.1,1597.0,9.279 +Discrete,CDF,500000,PARALLEL,318.9,3439.1,10.784 +Discrete,LogPDF,8,VECTORIZED,0.0,0.1, +Discrete,LogPDF,16,VECTORIZED,0.0,0.1, +Discrete,LogPDF,32,VECTORIZED,0.0,0.2, +Discrete,LogPDF,64,VECTORIZED,0.0,0.4, +Discrete,LogPDF,128,VECTORIZED,0.1,0.8,8.0 +Discrete,LogPDF,256,VECTORIZED,0.2,1.6,8.0 +Discrete,LogPDF,512,VECTORIZED,0.5,3.1,6.2 +Discrete,LogPDF,1000,VECTORIZED,0.9,6.3,7.0 +Discrete,LogPDF,2000,VECTORIZED,1.7,12.0,7.059 +Discrete,LogPDF,5000,VECTORIZED,4.2,30.4,7.238 +Discrete,LogPDF,10000,VECTORIZED,8.5,90.9,10.694 +Discrete,LogPDF,20000,VECTORIZED,16.9,121.8,7.207 +Discrete,LogPDF,50000,VECTORIZED,42.5,334.4,7.868 +Discrete,LogPDF,100000,VECTORIZED,84.6,613.0,7.246 +Discrete,LogPDF,250000,PARALLEL,157.2,1857.6,11.817 +Discrete,LogPDF,500000,PARALLEL,260.7,3046.6,11.686 +Discrete,PDF,8,VECTORIZED,0.0,0.1, +Discrete,PDF,16,VECTORIZED,0.0,0.1, +Discrete,PDF,32,VECTORIZED,0.1,0.2,2.0 +Discrete,PDF,64,VECTORIZED,0.0,0.4, +Discrete,PDF,128,PARALLEL,0.1,0.9,9.0 +Discrete,PDF,256,VECTORIZED,0.2,1.6,8.0 +Discrete,PDF,512,VECTORIZED,0.5,3.2,6.4 +Discrete,PDF,1000,VECTORIZED,0.9,6.5,7.222 +Discrete,PDF,2000,VECTORIZED,1.7,13.2,7.765 +Discrete,PDF,5000,VECTORIZED,4.3,31.4,7.302 +Discrete,PDF,10000,VECTORIZED,8.5,63.7,7.494 +Discrete,PDF,20000,VECTORIZED,27.7,125.8,4.542 +Discrete,PDF,50000,PARALLEL,56.4,412.8,7.319 +Discrete,PDF,100000,PARALLEL,65.4,631.1,9.65 +Discrete,PDF,250000,PARALLEL,129.9,1559.9,12.008 +Discrete,PDF,500000,PARALLEL,183.9,4138.2,22.502 +Exponential,CDF,8,VECTORIZED,0.0,0.1, +Exponential,CDF,16,VECTORIZED,0.1,0.2,2.0 
+Exponential,CDF,32,VECTORIZED,0.2,0.4,2.0 +Exponential,CDF,64,VECTORIZED,0.3,0.7,2.333 +Exponential,CDF,128,VECTORIZED,0.5,1.5,3.0 +Exponential,CDF,256,VECTORIZED,0.8,2.9,3.625 +Exponential,CDF,512,VECTORIZED,1.0,3.9,3.9 +Exponential,CDF,1000,VECTORIZED,1.8,7.6,4.222 +Exponential,CDF,2000,VECTORIZED,3.7,15.2,4.108 +Exponential,CDF,5000,VECTORIZED,9.3,38.1,4.097 +Exponential,CDF,10000,VECTORIZED,27.8,196.4,7.065 +Exponential,CDF,20000,WORK_STEALING,47.9,228.9,4.779 +Exponential,CDF,50000,VECTORIZED,93.4,535.0,5.728 +Exponential,CDF,100000,PARALLEL,147.3,769.4,5.223 +Exponential,CDF,250000,WORK_STEALING,212.6,1932.2,9.088 +Exponential,CDF,500000,PARALLEL,411.1,4310.9,10.486 +Exponential,LogPDF,8,PARALLEL,0.0,0.1, +Exponential,LogPDF,16,PARALLEL,0.0,0.2, +Exponential,LogPDF,32,PARALLEL,0.0,0.3, +Exponential,LogPDF,64,VECTORIZED,0.1,0.5,5.0 +Exponential,LogPDF,128,VECTORIZED,0.1,1.1,11.0 +Exponential,LogPDF,256,VECTORIZED,0.2,2.0,10.0 +Exponential,LogPDF,512,VECTORIZED,0.2,2.7,13.5 +Exponential,LogPDF,1000,VECTORIZED,0.5,5.3,10.6 +Exponential,LogPDF,2000,VECTORIZED,1.0,10.6,10.6 +Exponential,LogPDF,5000,VECTORIZED,2.5,26.9,10.76 +Exponential,LogPDF,10000,VECTORIZED,5.0,53.6,10.72 +Exponential,LogPDF,20000,VECTORIZED,10.1,105.7,10.465 +Exponential,LogPDF,50000,VECTORIZED,25.0,268.4,10.736 +Exponential,LogPDF,100000,VECTORIZED,54.3,534.1,9.836 +Exponential,LogPDF,250000,WORK_STEALING,130.5,1329.4,10.187 +Exponential,LogPDF,500000,PARALLEL,138.7,2688.3,19.382 +Exponential,PDF,8,VECTORIZED,0.1,0.3,3.0 +Exponential,PDF,16,VECTORIZED,0.1,0.2,2.0 +Exponential,PDF,32,VECTORIZED,0.1,0.4,4.0 +Exponential,PDF,64,VECTORIZED,0.2,0.8,4.0 +Exponential,PDF,128,VECTORIZED,0.4,1.5,3.75 +Exponential,PDF,256,VECTORIZED,0.8,3.0,3.75 +Exponential,PDF,512,VECTORIZED,0.9,3.9,4.333 +Exponential,PDF,1000,VECTORIZED,1.8,7.6,4.222 +Exponential,PDF,2000,VECTORIZED,3.5,15.2,4.343 +Exponential,PDF,5000,VECTORIZED,9.0,38.1,4.233 +Exponential,PDF,10000,VECTORIZED,17.9,76.5,4.274 
+Exponential,PDF,20000,VECTORIZED,44.2,177.8,4.023 +Exponential,PDF,50000,PARALLEL,77.3,381.0,4.929 +Exponential,PDF,100000,PARALLEL,120.0,763.6,6.363 +Exponential,PDF,250000,PARALLEL,258.9,1928.6,7.449 +Exponential,PDF,500000,PARALLEL,426.8,4075.6,9.549 +Gamma,CDF,8,PARALLEL,0.4,0.6,1.5 +Gamma,CDF,16,PARALLEL,1.0,1.2,1.2 +Gamma,CDF,32,PARALLEL,2.0,2.3,1.15 +Gamma,CDF,64,PARALLEL,3.9,4.5,1.154 +Gamma,CDF,128,PARALLEL,7.7,9.0,1.169 +Gamma,CDF,256,PARALLEL,10.2,17.9,1.755 +Gamma,CDF,512,PARALLEL,20.5,23.9,1.166 +Gamma,CDF,1000,WORK_STEALING,39.7,47.0,1.184 +Gamma,CDF,2000,WORK_STEALING,79.7,93.9,1.178 +Gamma,CDF,5000,WORK_STEALING,220.6,252.5,1.145 +Gamma,CDF,10000,PARALLEL,145.1,536.3,3.696 +Gamma,CDF,20000,PARALLEL,256.8,1088.4,4.238 +Gamma,CDF,50000,PARALLEL,618.4,2852.4,4.613 +Gamma,CDF,100000,PARALLEL,1032.3,5343.4,5.176 +Gamma,CDF,250000,PARALLEL,2586.1,13709.6,5.301 +Gamma,CDF,500000,PARALLEL,5087.1,28113.7,5.526 +Gamma,LogPDF,8,VECTORIZED,0.1,0.2,2.0 +Gamma,LogPDF,16,PARALLEL,0.1,0.3,3.0 +Gamma,LogPDF,32,PARALLEL,0.2,0.4,2.0 +Gamma,LogPDF,64,PARALLEL,0.4,0.9,2.25 +Gamma,LogPDF,128,VECTORIZED,0.7,1.6,2.286 +Gamma,LogPDF,256,VECTORIZED,1.1,3.2,2.909 +Gamma,LogPDF,512,VECTORIZED,1.4,4.3,3.071 +Gamma,LogPDF,1000,VECTORIZED,3.7,12.7,3.432 +Gamma,LogPDF,2000,VECTORIZED,4.8,16.9,3.521 +Gamma,LogPDF,5000,VECTORIZED,12.1,42.3,3.496 +Gamma,LogPDF,10000,VECTORIZED,24.1,212.9,8.834 +Gamma,LogPDF,20000,VECTORIZED,48.0,173.8,3.621 +Gamma,LogPDF,50000,PARALLEL,83.8,424.0,5.06 +Gamma,LogPDF,100000,PARALLEL,148.7,913.7,6.145 +Gamma,LogPDF,250000,PARALLEL,346.1,2133.0,6.163 +Gamma,LogPDF,500000,PARALLEL,664.7,4298.3,6.467 +Gamma,PDF,8,VECTORIZED,0.1,0.3,3.0 +Gamma,PDF,16,VECTORIZED,0.3,0.5,1.667 +Gamma,PDF,32,VECTORIZED,0.4,1.1,2.75 +Gamma,PDF,64,VECTORIZED,0.7,2.1,3.0 +Gamma,PDF,128,VECTORIZED,1.0,4.1,4.1 +Gamma,PDF,256,VECTORIZED,1.7,8.1,4.765 +Gamma,PDF,512,VECTORIZED,2.1,10.8,5.143 +Gamma,PDF,1000,VECTORIZED,5.7,31.8,5.579 +Gamma,PDF,2000,VECTORIZED,7.5,42.1,5.613 
+Gamma,PDF,5000,VECTORIZED,18.6,106.1,5.704 +Gamma,PDF,10000,VECTORIZED,37.0,211.1,5.705 +Gamma,PDF,20000,WORK_STEALING,67.4,425.7,6.316 +Gamma,PDF,50000,PARALLEL,128.1,1082.2,8.448 +Gamma,PDF,100000,PARALLEL,277.4,2124.3,7.658 +Gamma,PDF,250000,PARALLEL,624.4,5380.3,8.617 +Gamma,PDF,500000,PARALLEL,1218.5,11730.4,9.627 +Gaussian,CDF,8,SCALAR,0.2,0.2,1.0 +Gaussian,CDF,16,VECTORIZED,0.3,0.5,1.667 +Gaussian,CDF,32,VECTORIZED,0.4,2.1,5.25 +Gaussian,CDF,64,VECTORIZED,0.7,1.8,2.571 +Gaussian,CDF,128,VECTORIZED,1.2,3.6,3.0 +Gaussian,CDF,256,VECTORIZED,2.3,7.3,3.174 +Gaussian,CDF,512,VECTORIZED,4.2,14.3,3.405 +Gaussian,CDF,1000,VECTORIZED,7.9,26.4,3.342 +Gaussian,CDF,2000,VECTORIZED,10.5,34.3,3.267 +Gaussian,CDF,5000,VECTORIZED,40.2,142.9,3.555 +Gaussian,CDF,10000,VECTORIZED,53.9,156.9,2.911 +Gaussian,CDF,20000,VECTORIZED,109.3,347.7,3.181 +Gaussian,CDF,50000,PARALLEL,115.6,757.8,6.555 +Gaussian,CDF,100000,PARALLEL,239.8,1071.6,4.469 +Gaussian,CDF,250000,PARALLEL,371.7,2723.3,7.327 +Gaussian,CDF,500000,PARALLEL,825.6,5476.1,6.633 +Gaussian,LogPDF,8,VECTORIZED,0.0,0.1, +Gaussian,LogPDF,16,VECTORIZED,0.1,0.3,3.0 +Gaussian,LogPDF,32,VECTORIZED,0.1,0.4,4.0 +Gaussian,LogPDF,64,VECTORIZED,0.2,0.8,4.0 +Gaussian,LogPDF,128,VECTORIZED,0.2,1.8,9.0 +Gaussian,LogPDF,256,VECTORIZED,0.3,3.7,12.333 +Gaussian,LogPDF,512,VECTORIZED,0.4,7.3,18.25 +Gaussian,LogPDF,1000,VECTORIZED,0.5,11.5,23.0 +Gaussian,LogPDF,2000,VECTORIZED,0.5,17.9,35.8 +Gaussian,LogPDF,5000,VECTORIZED,1.5,38.5,25.667 +Gaussian,LogPDF,10000,VECTORIZED,3.2,88.7,27.719 +Gaussian,LogPDF,20000,VECTORIZED,8.7,167.8,19.287 +Gaussian,LogPDF,50000,VECTORIZED,11.3,253.8,22.46 +Gaussian,LogPDF,100000,VECTORIZED,26.8,508.9,18.989 +Gaussian,LogPDF,250000,VECTORIZED,122.8,1276.0,10.391 +Gaussian,LogPDF,500000,VECTORIZED,137.8,2573.2,18.673 +Gaussian,PDF,8,VECTORIZED,0.0,0.1, +Gaussian,PDF,16,VECTORIZED,0.2,0.4,2.0 +Gaussian,PDF,32,VECTORIZED,0.3,0.7,2.333 +Gaussian,PDF,64,VECTORIZED,0.4,1.4,3.5 
+Gaussian,PDF,128,VECTORIZED,0.5,2.6,5.2 +Gaussian,PDF,256,VECTORIZED,0.9,5.9,6.556 +Gaussian,PDF,512,VECTORIZED,1.7,11.3,6.647 +Gaussian,PDF,1000,VECTORIZED,2.9,20.3,7.0 +Gaussian,PDF,2000,VECTORIZED,5.6,34.6,6.179 +Gaussian,PDF,5000,VECTORIZED,9.2,68.5,7.446 +Gaussian,PDF,10000,VECTORIZED,28.6,209.0,7.308 +Gaussian,PDF,20000,VECTORIZED,37.2,236.0,6.344 +Gaussian,PDF,50000,VECTORIZED,81.2,461.4,5.682 +Gaussian,PDF,100000,PARALLEL,108.9,798.0,7.328 +Gaussian,PDF,250000,PARALLEL,258.8,2099.1,8.111 +Gaussian,PDF,500000,PARALLEL,532.0,4065.2,7.641 +Poisson,CDF,8,SCALAR,0.4,0.4,1.0 +Poisson,CDF,16,VECTORIZED,0.8,0.9,1.125 +Poisson,CDF,32,SCALAR,1.7,1.7,1.0 +Poisson,CDF,64,VECTORIZED,3.2,3.3,1.031 +Poisson,CDF,128,VECTORIZED,6.2,6.5,1.048 +Poisson,CDF,256,VECTORIZED,12.7,13.2,1.039 +Poisson,CDF,512,VECTORIZED,24.8,25.9,1.044 +Poisson,CDF,1000,VECTORIZED,48.9,51.0,1.043 +Poisson,CDF,2000,VECTORIZED,99.0,102.9,1.039 +Poisson,CDF,5000,VECTORIZED,266.3,277.3,1.041 +Poisson,CDF,10000,PARALLEL,143.9,873.1,6.067 +Poisson,CDF,20000,PARALLEL,224.3,1158.8,5.166 +Poisson,CDF,50000,PARALLEL,565.8,2970.0,5.249 +Poisson,CDF,100000,PARALLEL,1344.7,5867.2,4.363 +Poisson,CDF,250000,PARALLEL,3236.8,15729.1,4.859 +Poisson,CDF,500000,PARALLEL,5648.4,31162.6,5.517 +Poisson,LogPDF,8,SCALAR,0.1,0.1,1.0 +Poisson,LogPDF,16,VECTORIZED,0.1,0.2,2.0 +Poisson,LogPDF,32,VECTORIZED,0.2,0.4,2.0 +Poisson,LogPDF,64,VECTORIZED,0.4,0.7,1.75 +Poisson,LogPDF,128,VECTORIZED,0.6,1.3,2.167 +Poisson,LogPDF,256,VECTORIZED,1.3,2.7,2.077 +Poisson,LogPDF,512,VECTORIZED,2.5,5.2,2.08 +Poisson,LogPDF,1000,VECTORIZED,4.9,10.2,2.082 +Poisson,LogPDF,2000,VECTORIZED,9.6,47.6,4.958 +Poisson,LogPDF,5000,VECTORIZED,24.1,51.1,2.12 +Poisson,LogPDF,10000,VECTORIZED,47.9,105.0,2.192 +Poisson,LogPDF,20000,PARALLEL,59.3,225.1,3.796 +Poisson,LogPDF,50000,PARALLEL,116.0,579.2,4.993 +Poisson,LogPDF,100000,PARALLEL,176.3,1167.8,6.624 +Poisson,LogPDF,250000,WORK_STEALING,386.1,2954.2,7.651 
+Poisson,LogPDF,500000,PARALLEL,769.7,6152.7,7.994 +Poisson,PDF,8,VECTORIZED,0.1,0.2,2.0 +Poisson,PDF,16,VECTORIZED,0.2,0.4,2.0 +Poisson,PDF,32,VECTORIZED,0.4,0.6,1.5 +Poisson,PDF,64,VECTORIZED,0.8,1.2,1.5 +Poisson,PDF,128,VECTORIZED,1.5,2.5,1.667 +Poisson,PDF,256,VECTORIZED,2.9,4.9,1.69 +Poisson,PDF,512,VECTORIZED,5.7,9.6,1.684 +Poisson,PDF,1000,VECTORIZED,11.2,18.9,1.688 +Poisson,PDF,2000,VECTORIZED,22.2,37.6,1.694 +Poisson,PDF,5000,VECTORIZED,55.4,94.4,1.704 +Poisson,PDF,10000,PARALLEL,85.4,188.1,2.203 +Poisson,PDF,20000,PARALLEL,86.6,384.6,4.441 +Poisson,PDF,50000,PARALLEL,164.1,985.9,6.008 +Poisson,PDF,100000,PARALLEL,337.8,2014.6,5.964 +Poisson,PDF,250000,PARALLEL,736.6,5409.8,7.344 +Poisson,PDF,500000,WORK_STEALING,1567.0,10170.7,6.491 +StudentT,CDF,8,VECTORIZED,1.2,1.3,1.083 +StudentT,CDF,16,VECTORIZED,2.6,3.0,1.154 +StudentT,CDF,32,WORK_STEALING,5.3,5.5,1.038 +StudentT,CDF,64,PARALLEL,10.6,11.5,1.085 +StudentT,CDF,128,WORK_STEALING,21.1,22.6,1.071 +StudentT,CDF,256,WORK_STEALING,30.8,32.4,1.052 +StudentT,CDF,512,VECTORIZED,61.8,65.6,1.061 +StudentT,CDF,1000,PARALLEL,124.2,131.3,1.057 +StudentT,CDF,2000,WORK_STEALING,250.3,265.9,1.062 +StudentT,CDF,5000,PARALLEL,640.4,683.1,1.067 +StudentT,CDF,10000,PARALLEL,1289.8,1367.2,1.06 +StudentT,CDF,20000,PARALLEL,2590.8,2727.2,1.053 +StudentT,CDF,50000,WORK_STEALING,6588.8,7067.0,1.073 +StudentT,CDF,100000,WORK_STEALING,13545.2,14498.4,1.07 +StudentT,CDF,250000,PARALLEL,33262.5,35694.7,1.073 +StudentT,CDF,500000,VECTORIZED,67776.8,71873.5,1.06 +StudentT,LogPDF,8,SCALAR,0.1,0.1,1.0 +StudentT,LogPDF,16,SCALAR,0.2,0.2,1.0 +StudentT,LogPDF,32,VECTORIZED,0.2,0.4,2.0 +StudentT,LogPDF,64,VECTORIZED,0.3,0.8,2.667 +StudentT,LogPDF,128,VECTORIZED,0.5,1.8,3.6 +StudentT,LogPDF,256,VECTORIZED,0.8,2.2,2.75 +StudentT,LogPDF,512,VECTORIZED,1.0,4.2,4.2 +StudentT,LogPDF,1000,VECTORIZED,2.0,8.3,4.15 +StudentT,LogPDF,2000,VECTORIZED,4.0,16.7,4.175 +StudentT,LogPDF,5000,VECTORIZED,10.0,43.0,4.3 
+StudentT,LogPDF,10000,VECTORIZED,20.1,105.6,5.254 +StudentT,LogPDF,20000,VECTORIZED,39.9,237.8,5.96 +StudentT,LogPDF,50000,WORK_STEALING,98.5,850.7,8.637 +StudentT,LogPDF,100000,VECTORIZED,209.4,1242.6,5.934 +StudentT,LogPDF,250000,WORK_STEALING,396.5,3116.1,7.859 +StudentT,LogPDF,500000,PARALLEL,975.4,6325.2,6.485 +StudentT,PDF,8,VECTORIZED,0.1,0.2,2.0 +StudentT,PDF,16,VECTORIZED,0.2,0.4,2.0 +StudentT,PDF,32,VECTORIZED,0.3,0.7,2.333 +StudentT,PDF,64,VECTORIZED,0.4,1.4,3.5 +StudentT,PDF,128,VECTORIZED,0.7,2.7,3.857 +StudentT,PDF,256,VECTORIZED,1.0,5.3,5.3 +StudentT,PDF,512,VECTORIZED,1.8,6.9,3.833 +StudentT,PDF,1000,VECTORIZED,3.3,13.6,4.121 +StudentT,PDF,2000,VECTORIZED,6.5,28.2,4.338 +StudentT,PDF,5000,VECTORIZED,16.5,73.6,4.461 +StudentT,PDF,10000,VECTORIZED,33.5,152.7,4.558 +StudentT,PDF,20000,PARALLEL,72.1,555.5,7.705 +StudentT,PDF,50000,WORK_STEALING,134.2,809.5,6.032 +StudentT,PDF,100000,PARALLEL,287.8,1968.3,6.839 +StudentT,PDF,250000,WORK_STEALING,544.2,4182.0,7.685 +StudentT,PDF,500000,PARALLEL,1167.8,9291.3,7.956 +Uniform,CDF,8,VECTORIZED,0.0,0.1, +Uniform,CDF,16,VECTORIZED,0.0,0.1, +Uniform,CDF,32,WORK_STEALING,0.0,0.2, +Uniform,CDF,64,PARALLEL,0.1,0.5,5.0 +Uniform,CDF,128,PARALLEL,0.2,1.2,6.0 +Uniform,CDF,256,PARALLEL,0.2,1.5,7.5 +Uniform,CDF,512,PARALLEL,0.4,2.8,7.0 +Uniform,CDF,1000,PARALLEL,0.8,6.0,7.5 +Uniform,CDF,2000,WORK_STEALING,1.6,15.7,9.812 +Uniform,CDF,5000,WORK_STEALING,6.3,41.9,6.651 +Uniform,CDF,10000,VECTORIZED,16.5,85.5,5.182 +Uniform,CDF,20000,VECTORIZED,32.8,203.2,6.195 +Uniform,CDF,50000,PARALLEL,89.7,470.4,5.244 +Uniform,CDF,100000,PARALLEL,121.9,933.8,7.66 +Uniform,CDF,250000,PARALLEL,244.2,2482.8,10.167 +Uniform,CDF,500000,PARALLEL,551.7,7710.0,13.975 +Uniform,LogPDF,8,SCALAR,0.1,0.1,1.0 +Uniform,LogPDF,16,VECTORIZED,0.0,0.1, +Uniform,LogPDF,32,VECTORIZED,0.0,0.2, +Uniform,LogPDF,64,VECTORIZED,0.1,0.5,5.0 +Uniform,LogPDF,128,VECTORIZED,0.1,1.0,10.0 +Uniform,LogPDF,256,VECTORIZED,0.2,2.2,11.0 
+Uniform,LogPDF,512,WORK_STEALING,0.3,4.3,14.333 +Uniform,LogPDF,1000,VECTORIZED,1.0,6.6,6.6 +Uniform,LogPDF,2000,VECTORIZED,1.4,10.7,7.643 +Uniform,LogPDF,5000,VECTORIZED,5.5,44.9,8.164 +Uniform,LogPDF,10000,VECTORIZED,8.9,93.3,10.483 +Uniform,LogPDF,20000,VECTORIZED,15.2,173.5,11.414 +Uniform,LogPDF,50000,PARALLEL,94.8,447.2,4.717 +Uniform,LogPDF,100000,PARALLEL,126.6,888.1,7.015 +Uniform,LogPDF,250000,PARALLEL,258.3,2398.1,9.284 +Uniform,LogPDF,500000,PARALLEL,527.2,6841.0,12.976 +Uniform,PDF,8,VECTORIZED,0.0,0.1, +Uniform,PDF,16,VECTORIZED,0.0,0.1, +Uniform,PDF,32,VECTORIZED,0.0,0.2, +Uniform,PDF,64,VECTORIZED,0.1,0.5,5.0 +Uniform,PDF,128,VECTORIZED,0.1,0.7,7.0 +Uniform,PDF,256,PARALLEL,0.2,2.4,12.0 +Uniform,PDF,512,VECTORIZED,0.5,2.9,5.8 +Uniform,PDF,1000,PARALLEL,0.6,5.6,9.333 +Uniform,PDF,2000,PARALLEL,1.7,15.7,9.235 +Uniform,PDF,5000,WORK_STEALING,5.5,28.5,5.182 +Uniform,PDF,10000,VECTORIZED,11.3,86.3,7.637 +Uniform,PDF,20000,VECTORIZED,20.3,245.5,12.094 +Uniform,PDF,50000,VECTORIZED,127.0,462.6,3.643 +Uniform,PDF,100000,PARALLEL,130.7,926.2,7.086 +Uniform,PDF,250000,PARALLEL,250.4,2371.6,9.471 +Uniform,PDF,500000,PARALLEL,490.3,5427.6,11.07 diff --git a/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/crossovers.csv b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/crossovers.csv new file mode 100644 index 0000000..75de349 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/crossovers.csv @@ -0,0 +1,28 @@ +distribution,operation,scalar_to_vectorized,vectorized_to_parallel,parallel_to_work_stealing,best_strategy_at_max_size,best_time_us_at_max_size,max_batch_size +Beta,CDF,8,500000,64,SCALAR,53781.2,500000 +Beta,LogPDF,16,,8,VECTORIZED,4433.4,500000 +Beta,PDF,16,,64,VECTORIZED,5384.4,500000 +ChiSquared,CDF,32,8,8,PARALLEL,6003.6,500000 
+ChiSquared,LogPDF,32,16,32,PARALLEL,551.4,500000 +ChiSquared,PDF,8,50000,2000,PARALLEL,998.0,500000 +Discrete,CDF,16,50000,8,PARALLEL,318.9,500000 +Discrete,LogPDF,8,250000,10000,PARALLEL,260.7,500000 +Discrete,PDF,8,128,8,PARALLEL,183.9,500000 +Exponential,CDF,8,100000,1000,PARALLEL,411.1,500000 +Exponential,LogPDF,16,8,10000,PARALLEL,138.7,500000 +Exponential,PDF,8,50000,512,PARALLEL,426.8,500000 +Gamma,CDF,8,8,1000,PARALLEL,5087.1,500000 +Gamma,LogPDF,8,16,128,PARALLEL,664.7,500000 +Gamma,PDF,8,20000,256,PARALLEL,1218.5,500000 +Gaussian,CDF,16,50000,64,PARALLEL,825.6,500000 +Gaussian,LogPDF,8,,10000,VECTORIZED,137.8,500000 +Gaussian,PDF,8,100000,256,PARALLEL,532.0,500000 +Poisson,CDF,16,10000,16,PARALLEL,5648.4,500000 +Poisson,LogPDF,16,20000,128,PARALLEL,769.7,500000 +Poisson,PDF,8,10000,8,WORK_STEALING,1567.0,500000 +StudentT,CDF,8,64,8,VECTORIZED,67776.8,500000 +StudentT,LogPDF,32,250000,16,PARALLEL,975.4,500000 +StudentT,PDF,8,20000,8,PARALLEL,1167.8,500000 +Uniform,CDF,8,64,8,PARALLEL,551.7,500000 +Uniform,LogPDF,16,50000,512,PARALLEL,527.2,500000 +Uniform,PDF,8,256,5000,PARALLEL,490.3,500000 diff --git a/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/logs/strategy_profile.txt b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/logs/strategy_profile.txt new file mode 100644 index 0000000..ff81a73 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/logs/strategy_profile.txt @@ -0,0 +1,658 @@ + +==================== + Strategy Profile +==================== + +Forced-strategy timing profiler for dispatcher threshold tuning + +System: 12 logical cores, AVX-512 SIMD, 16384 KB L3 cache + +Batch sizes: 8 16 32 64 128 256 512 1000 2000 5000 10000 20000 50000 100000 250000 500000 + + +--- Uniform Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... 
βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Gaussian Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Exponential Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Discrete Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... 
βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Poisson Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Gamma Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- StudentT Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... 
βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- Beta Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... βœ“ + + +--- ChiSquared Strategy Profile --- + Profiling batch size 8... βœ“ + Profiling batch size 16... βœ“ + Profiling batch size 32... βœ“ + Profiling batch size 64... βœ“ + Profiling batch size 128... βœ“ + Profiling batch size 256... βœ“ + Profiling batch size 512... βœ“ + Profiling batch size 1000... βœ“ + Profiling batch size 2000... βœ“ + Profiling batch size 5000... βœ“ + Profiling batch size 10000... βœ“ + Profiling batch size 20000... βœ“ + Profiling batch size 50000... βœ“ + Profiling batch size 100000... βœ“ + Profiling batch size 250000... βœ“ + Profiling batch size 500000... 
βœ“ + + +========================= + Best Strategy Summary +========================= + +Distribution Operation Size Best Strategy Time (ΞΌs) +---------------------------------------------------------------- +Beta CDF 8 Vectorized 1.00 +Beta CDF 16 Vectorized 2.30 +Beta CDF 32 Vectorized 4.40 +Beta CDF 64 Vectorized 8.00 +Beta CDF 128 Work-Stealing 12.60 +Beta CDF 256 Vectorized 23.10 +Beta CDF 512 Vectorized 46.20 +Beta CDF 1000 Vectorized 90.50 +Beta CDF 2000 Vectorized 185.60 +Beta CDF 5000 Vectorized 461.10 +Beta CDF 10000 Vectorized 944.30 +Beta CDF 20000 Vectorized 1918.20 +Beta CDF 50000 Vectorized 4867.00 +Beta CDF 100000 Vectorized 9742.30 +Beta CDF 250000 Vectorized 24705.20 +Beta CDF 500000 Scalar 53781.20 +Beta LogPDF 8 Scalar 0.10 +Beta LogPDF 16 Work-Stealing 0.30 +Beta LogPDF 32 Vectorized 0.50 +Beta LogPDF 64 Vectorized 1.20 +Beta LogPDF 128 Vectorized 1.30 +Beta LogPDF 256 Vectorized 1.50 +Beta LogPDF 512 Vectorized 2.80 +Beta LogPDF 1000 Vectorized 5.20 +Beta LogPDF 2000 Vectorized 10.20 +Beta LogPDF 5000 Vectorized 25.80 +Beta LogPDF 10000 Vectorized 101.40 +Beta LogPDF 20000 Vectorized 105.50 +Beta LogPDF 50000 Vectorized 284.50 +Beta LogPDF 100000 Vectorized 980.60 +Beta LogPDF 250000 Vectorized 2005.50 +Beta LogPDF 500000 Vectorized 4433.40 +Beta PDF 8 Scalar 0.20 +Beta PDF 16 Vectorized 0.40 +Beta PDF 32 Vectorized 0.60 +Beta PDF 64 Vectorized 1.50 +Beta PDF 128 Vectorized 1.60 +Beta PDF 256 Vectorized 1.80 +Beta PDF 512 Vectorized 3.50 +Beta PDF 1000 Vectorized 6.60 +Beta PDF 2000 Vectorized 12.80 +Beta PDF 5000 Vectorized 32.50 +Beta PDF 10000 Vectorized 65.00 +Beta PDF 20000 Vectorized 181.10 +Beta PDF 50000 Vectorized 369.30 +Beta PDF 100000 Vectorized 835.50 +Beta PDF 250000 Vectorized 2744.40 +Beta PDF 500000 Vectorized 5384.40 +ChiSquared CDF 8 Work-Stealing 0.30 +ChiSquared CDF 16 Parallel 0.70 +ChiSquared CDF 32 Vectorized 1.40 +ChiSquared CDF 64 Vectorized 2.70 +ChiSquared CDF 128 Parallel 5.30 +ChiSquared CDF 256 Parallel 10.40 
+ChiSquared CDF 512 Parallel 20.80 +ChiSquared CDF 1000 Work-Stealing 40.60 +ChiSquared CDF 2000 Vectorized 82.00 +ChiSquared CDF 5000 Work-Stealing 227.30 +ChiSquared CDF 10000 Parallel 163.40 +ChiSquared CDF 20000 Parallel 280.70 +ChiSquared CDF 50000 Parallel 592.00 +ChiSquared CDF 100000 Parallel 1022.50 +ChiSquared CDF 250000 Parallel 3035.40 +ChiSquared CDF 500000 Parallel 6003.60 +ChiSquared LogPDF 8 Scalar 0.10 +ChiSquared LogPDF 16 Parallel 0.10 +ChiSquared LogPDF 32 Work-Stealing 0.10 +ChiSquared LogPDF 64 Work-Stealing 0.20 +ChiSquared LogPDF 128 Vectorized 0.50 +ChiSquared LogPDF 256 Vectorized 0.90 +ChiSquared LogPDF 512 Vectorized 1.30 +ChiSquared LogPDF 1000 Vectorized 2.50 +ChiSquared LogPDF 2000 Vectorized 4.80 +ChiSquared LogPDF 5000 Vectorized 12.10 +ChiSquared LogPDF 10000 Vectorized 36.60 +ChiSquared LogPDF 20000 Vectorized 48.00 +ChiSquared LogPDF 50000 Work-Stealing 112.40 +ChiSquared LogPDF 100000 Parallel 142.90 +ChiSquared LogPDF 250000 Parallel 344.40 +ChiSquared LogPDF 500000 Parallel 551.40 +ChiSquared PDF 8 Vectorized 0.10 +ChiSquared PDF 16 Vectorized 0.20 +ChiSquared PDF 32 Vectorized 0.30 +ChiSquared PDF 64 Vectorized 0.40 +ChiSquared PDF 128 Vectorized 0.70 +ChiSquared PDF 256 Vectorized 1.10 +ChiSquared PDF 512 Vectorized 2.00 +ChiSquared PDF 1000 Vectorized 3.70 +ChiSquared PDF 2000 Vectorized 7.40 +ChiSquared PDF 5000 Vectorized 23.40 +ChiSquared PDF 10000 Vectorized 37.20 +ChiSquared PDF 20000 Vectorized 74.80 +ChiSquared PDF 50000 Parallel 127.90 +ChiSquared PDF 100000 Parallel 208.70 +ChiSquared PDF 250000 Parallel 522.60 +ChiSquared PDF 500000 Parallel 998.00 +Discrete CDF 8 Work-Stealing 0.00 +Discrete CDF 16 Vectorized 0.00 +Discrete CDF 32 Vectorized 0.10 +Discrete CDF 64 Vectorized 0.10 +Discrete CDF 128 Vectorized 0.20 +Discrete CDF 256 Vectorized 0.30 +Discrete CDF 512 Vectorized 0.50 +Discrete CDF 1000 Vectorized 1.00 +Discrete CDF 2000 Vectorized 2.00 +Discrete CDF 5000 Vectorized 5.00 +Discrete CDF 10000 Vectorized 
16.60 +Discrete CDF 20000 Vectorized 26.60 +Discrete CDF 50000 Work-Stealing 55.30 +Discrete CDF 100000 Parallel 119.00 +Discrete CDF 250000 Parallel 172.10 +Discrete CDF 500000 Parallel 318.90 +Discrete LogPDF 8 Vectorized 0.00 +Discrete LogPDF 16 Vectorized 0.00 +Discrete LogPDF 32 Vectorized 0.00 +Discrete LogPDF 64 Vectorized 0.00 +Discrete LogPDF 128 Vectorized 0.10 +Discrete LogPDF 256 Vectorized 0.20 +Discrete LogPDF 512 Vectorized 0.50 +Discrete LogPDF 1000 Vectorized 0.90 +Discrete LogPDF 2000 Vectorized 1.70 +Discrete LogPDF 5000 Vectorized 4.20 +Discrete LogPDF 10000 Vectorized 8.50 +Discrete LogPDF 20000 Vectorized 16.90 +Discrete LogPDF 50000 Vectorized 42.50 +Discrete LogPDF 100000 Vectorized 84.60 +Discrete LogPDF 250000 Parallel 157.20 +Discrete LogPDF 500000 Parallel 260.70 +Discrete PDF 8 Vectorized 0.00 +Discrete PDF 16 Vectorized 0.00 +Discrete PDF 32 Vectorized 0.10 +Discrete PDF 64 Vectorized 0.00 +Discrete PDF 128 Parallel 0.10 +Discrete PDF 256 Vectorized 0.20 +Discrete PDF 512 Vectorized 0.50 +Discrete PDF 1000 Vectorized 0.90 +Discrete PDF 2000 Vectorized 1.70 +Discrete PDF 5000 Vectorized 4.30 +Discrete PDF 10000 Vectorized 8.50 +Discrete PDF 20000 Vectorized 27.70 +Discrete PDF 50000 Parallel 56.40 +Discrete PDF 100000 Parallel 65.40 +Discrete PDF 250000 Parallel 129.90 +Discrete PDF 500000 Parallel 183.90 +Exponential CDF 8 Vectorized 0.00 +Exponential CDF 16 Vectorized 0.10 +Exponential CDF 32 Vectorized 0.20 +Exponential CDF 64 Vectorized 0.30 +Exponential CDF 128 Vectorized 0.50 +Exponential CDF 256 Vectorized 0.80 +Exponential CDF 512 Vectorized 1.00 +Exponential CDF 1000 Vectorized 1.80 +Exponential CDF 2000 Vectorized 3.70 +Exponential CDF 5000 Vectorized 9.30 +Exponential CDF 10000 Vectorized 27.80 +Exponential CDF 20000 Work-Stealing 47.90 +Exponential CDF 50000 Vectorized 93.40 +Exponential CDF 100000 Parallel 147.30 +Exponential CDF 250000 Work-Stealing 212.60 +Exponential CDF 500000 Parallel 411.10 +Exponential LogPDF 8 
Parallel 0.00 +Exponential LogPDF 16 Parallel 0.00 +Exponential LogPDF 32 Parallel 0.00 +Exponential LogPDF 64 Vectorized 0.10 +Exponential LogPDF 128 Vectorized 0.10 +Exponential LogPDF 256 Vectorized 0.20 +Exponential LogPDF 512 Vectorized 0.20 +Exponential LogPDF 1000 Vectorized 0.50 +Exponential LogPDF 2000 Vectorized 1.00 +Exponential LogPDF 5000 Vectorized 2.50 +Exponential LogPDF 10000 Vectorized 5.00 +Exponential LogPDF 20000 Vectorized 10.10 +Exponential LogPDF 50000 Vectorized 25.00 +Exponential LogPDF 100000 Vectorized 54.30 +Exponential LogPDF 250000 Work-Stealing 130.50 +Exponential LogPDF 500000 Parallel 138.70 +Exponential PDF 8 Vectorized 0.10 +Exponential PDF 16 Vectorized 0.10 +Exponential PDF 32 Vectorized 0.10 +Exponential PDF 64 Vectorized 0.20 +Exponential PDF 128 Vectorized 0.40 +Exponential PDF 256 Vectorized 0.80 +Exponential PDF 512 Vectorized 0.90 +Exponential PDF 1000 Vectorized 1.80 +Exponential PDF 2000 Vectorized 3.50 +Exponential PDF 5000 Vectorized 9.00 +Exponential PDF 10000 Vectorized 17.90 +Exponential PDF 20000 Vectorized 44.20 +Exponential PDF 50000 Parallel 77.30 +Exponential PDF 100000 Parallel 120.00 +Exponential PDF 250000 Parallel 258.90 +Exponential PDF 500000 Parallel 426.80 +Gamma CDF 8 Parallel 0.40 +Gamma CDF 16 Parallel 1.00 +Gamma CDF 32 Parallel 2.00 +Gamma CDF 64 Parallel 3.90 +Gamma CDF 128 Parallel 7.70 +Gamma CDF 256 Parallel 10.20 +Gamma CDF 512 Parallel 20.50 +Gamma CDF 1000 Work-Stealing 39.70 +Gamma CDF 2000 Work-Stealing 79.70 +Gamma CDF 5000 Work-Stealing 220.60 +Gamma CDF 10000 Parallel 145.10 +Gamma CDF 20000 Parallel 256.80 +Gamma CDF 50000 Parallel 618.40 +Gamma CDF 100000 Parallel 1032.30 +Gamma CDF 250000 Parallel 2586.10 +Gamma CDF 500000 Parallel 5087.10 +Gamma LogPDF 8 Vectorized 0.10 +Gamma LogPDF 16 Parallel 0.10 +Gamma LogPDF 32 Parallel 0.20 +Gamma LogPDF 64 Parallel 0.40 +Gamma LogPDF 128 Vectorized 0.70 +Gamma LogPDF 256 Vectorized 1.10 +Gamma LogPDF 512 Vectorized 1.40 +Gamma LogPDF 1000 
Vectorized 3.70 +Gamma LogPDF 2000 Vectorized 4.80 +Gamma LogPDF 5000 Vectorized 12.10 +Gamma LogPDF 10000 Vectorized 24.10 +Gamma LogPDF 20000 Vectorized 48.00 +Gamma LogPDF 50000 Parallel 83.80 +Gamma LogPDF 100000 Parallel 148.70 +Gamma LogPDF 250000 Parallel 346.10 +Gamma LogPDF 500000 Parallel 664.70 +Gamma PDF 8 Vectorized 0.10 +Gamma PDF 16 Vectorized 0.30 +Gamma PDF 32 Vectorized 0.40 +Gamma PDF 64 Vectorized 0.70 +Gamma PDF 128 Vectorized 1.00 +Gamma PDF 256 Vectorized 1.70 +Gamma PDF 512 Vectorized 2.10 +Gamma PDF 1000 Vectorized 5.70 +Gamma PDF 2000 Vectorized 7.50 +Gamma PDF 5000 Vectorized 18.60 +Gamma PDF 10000 Vectorized 37.00 +Gamma PDF 20000 Work-Stealing 67.40 +Gamma PDF 50000 Parallel 128.10 +Gamma PDF 100000 Parallel 277.40 +Gamma PDF 250000 Parallel 624.40 +Gamma PDF 500000 Parallel 1218.50 +Gaussian CDF 8 Scalar 0.20 +Gaussian CDF 16 Vectorized 0.30 +Gaussian CDF 32 Vectorized 0.40 +Gaussian CDF 64 Vectorized 0.70 +Gaussian CDF 128 Vectorized 1.20 +Gaussian CDF 256 Vectorized 2.30 +Gaussian CDF 512 Vectorized 4.20 +Gaussian CDF 1000 Vectorized 7.90 +Gaussian CDF 2000 Vectorized 10.50 +Gaussian CDF 5000 Vectorized 40.20 +Gaussian CDF 10000 Vectorized 53.90 +Gaussian CDF 20000 Vectorized 109.30 +Gaussian CDF 50000 Parallel 115.60 +Gaussian CDF 100000 Parallel 239.80 +Gaussian CDF 250000 Parallel 371.70 +Gaussian CDF 500000 Parallel 825.60 +Gaussian LogPDF 8 Vectorized 0.00 +Gaussian LogPDF 16 Vectorized 0.10 +Gaussian LogPDF 32 Vectorized 0.10 +Gaussian LogPDF 64 Vectorized 0.20 +Gaussian LogPDF 128 Vectorized 0.20 +Gaussian LogPDF 256 Vectorized 0.30 +Gaussian LogPDF 512 Vectorized 0.40 +Gaussian LogPDF 1000 Vectorized 0.50 +Gaussian LogPDF 2000 Vectorized 0.50 +Gaussian LogPDF 5000 Vectorized 1.50 +Gaussian LogPDF 10000 Vectorized 3.20 +Gaussian LogPDF 20000 Vectorized 8.70 +Gaussian LogPDF 50000 Vectorized 11.30 +Gaussian LogPDF 100000 Vectorized 26.80 +Gaussian LogPDF 250000 Vectorized 122.80 +Gaussian LogPDF 500000 Vectorized 137.80 
+Gaussian PDF 8 Vectorized 0.00 +Gaussian PDF 16 Vectorized 0.20 +Gaussian PDF 32 Vectorized 0.30 +Gaussian PDF 64 Vectorized 0.40 +Gaussian PDF 128 Vectorized 0.50 +Gaussian PDF 256 Vectorized 0.90 +Gaussian PDF 512 Vectorized 1.70 +Gaussian PDF 1000 Vectorized 2.90 +Gaussian PDF 2000 Vectorized 5.60 +Gaussian PDF 5000 Vectorized 9.20 +Gaussian PDF 10000 Vectorized 28.60 +Gaussian PDF 20000 Vectorized 37.20 +Gaussian PDF 50000 Vectorized 81.20 +Gaussian PDF 100000 Parallel 108.90 +Gaussian PDF 250000 Parallel 258.80 +Gaussian PDF 500000 Parallel 532.00 +Poisson CDF 8 Scalar 0.40 +Poisson CDF 16 Vectorized 0.80 +Poisson CDF 32 Scalar 1.70 +Poisson CDF 64 Vectorized 3.20 +Poisson CDF 128 Vectorized 6.20 +Poisson CDF 256 Vectorized 12.70 +Poisson CDF 512 Vectorized 24.80 +Poisson CDF 1000 Vectorized 48.90 +Poisson CDF 2000 Vectorized 99.00 +Poisson CDF 5000 Vectorized 266.30 +Poisson CDF 10000 Parallel 143.90 +Poisson CDF 20000 Parallel 224.30 +Poisson CDF 50000 Parallel 565.80 +Poisson CDF 100000 Parallel 1344.70 +Poisson CDF 250000 Parallel 3236.80 +Poisson CDF 500000 Parallel 5648.40 +Poisson LogPDF 8 Scalar 0.10 +Poisson LogPDF 16 Vectorized 0.10 +Poisson LogPDF 32 Vectorized 0.20 +Poisson LogPDF 64 Vectorized 0.40 +Poisson LogPDF 128 Vectorized 0.60 +Poisson LogPDF 256 Vectorized 1.30 +Poisson LogPDF 512 Vectorized 2.50 +Poisson LogPDF 1000 Vectorized 4.90 +Poisson LogPDF 2000 Vectorized 9.60 +Poisson LogPDF 5000 Vectorized 24.10 +Poisson LogPDF 10000 Vectorized 47.90 +Poisson LogPDF 20000 Parallel 59.30 +Poisson LogPDF 50000 Parallel 116.00 +Poisson LogPDF 100000 Parallel 176.30 +Poisson LogPDF 250000 Work-Stealing 386.10 +Poisson LogPDF 500000 Parallel 769.70 +Poisson PDF 8 Vectorized 0.10 +Poisson PDF 16 Vectorized 0.20 +Poisson PDF 32 Vectorized 0.40 +Poisson PDF 64 Vectorized 0.80 +Poisson PDF 128 Vectorized 1.50 +Poisson PDF 256 Vectorized 2.90 +Poisson PDF 512 Vectorized 5.70 +Poisson PDF 1000 Vectorized 11.20 +Poisson PDF 2000 Vectorized 22.20 +Poisson 
PDF 5000 Vectorized 55.40 +Poisson PDF 10000 Parallel 85.40 +Poisson PDF 20000 Parallel 86.60 +Poisson PDF 50000 Parallel 164.10 +Poisson PDF 100000 Parallel 337.80 +Poisson PDF 250000 Parallel 736.60 +Poisson PDF 500000 Work-Stealing 1567.00 +StudentT CDF 8 Vectorized 1.20 +StudentT CDF 16 Vectorized 2.60 +StudentT CDF 32 Work-Stealing 5.30 +StudentT CDF 64 Parallel 10.60 +StudentT CDF 128 Work-Stealing 21.10 +StudentT CDF 256 Work-Stealing 30.80 +StudentT CDF 512 Vectorized 61.80 +StudentT CDF 1000 Parallel 124.20 +StudentT CDF 2000 Work-Stealing 250.30 +StudentT CDF 5000 Parallel 640.40 +StudentT CDF 10000 Parallel 1289.80 +StudentT CDF 20000 Parallel 2590.80 +StudentT CDF 50000 Work-Stealing 6588.80 +StudentT CDF 100000 Work-Stealing 13545.20 +StudentT CDF 250000 Parallel 33262.50 +StudentT CDF 500000 Vectorized 67776.80 +StudentT LogPDF 8 Scalar 0.10 +StudentT LogPDF 16 Scalar 0.20 +StudentT LogPDF 32 Vectorized 0.20 +StudentT LogPDF 64 Vectorized 0.30 +StudentT LogPDF 128 Vectorized 0.50 +StudentT LogPDF 256 Vectorized 0.80 +StudentT LogPDF 512 Vectorized 1.00 +StudentT LogPDF 1000 Vectorized 2.00 +StudentT LogPDF 2000 Vectorized 4.00 +StudentT LogPDF 5000 Vectorized 10.00 +StudentT LogPDF 10000 Vectorized 20.10 +StudentT LogPDF 20000 Vectorized 39.90 +StudentT LogPDF 50000 Work-Stealing 98.50 +StudentT LogPDF 100000 Vectorized 209.40 +StudentT LogPDF 250000 Work-Stealing 396.50 +StudentT LogPDF 500000 Parallel 975.40 +StudentT PDF 8 Vectorized 0.10 +StudentT PDF 16 Vectorized 0.20 +StudentT PDF 32 Vectorized 0.30 +StudentT PDF 64 Vectorized 0.40 +StudentT PDF 128 Vectorized 0.70 +StudentT PDF 256 Vectorized 1.00 +StudentT PDF 512 Vectorized 1.80 +StudentT PDF 1000 Vectorized 3.30 +StudentT PDF 2000 Vectorized 6.50 +StudentT PDF 5000 Vectorized 16.50 +StudentT PDF 10000 Vectorized 33.50 +StudentT PDF 20000 Parallel 72.10 +StudentT PDF 50000 Work-Stealing 134.20 +StudentT PDF 100000 Parallel 287.80 +StudentT PDF 250000 Work-Stealing 544.20 +StudentT PDF 500000 
Parallel 1167.80 +Uniform CDF 8 Vectorized 0.00 +Uniform CDF 16 Vectorized 0.00 +Uniform CDF 32 Work-Stealing 0.00 +Uniform CDF 64 Parallel 0.10 +Uniform CDF 128 Parallel 0.20 +Uniform CDF 256 Parallel 0.20 +Uniform CDF 512 Parallel 0.40 +Uniform CDF 1000 Parallel 0.80 +Uniform CDF 2000 Work-Stealing 1.60 +Uniform CDF 5000 Work-Stealing 6.30 +Uniform CDF 10000 Vectorized 16.50 +Uniform CDF 20000 Vectorized 32.80 +Uniform CDF 50000 Parallel 89.70 +Uniform CDF 100000 Parallel 121.90 +Uniform CDF 250000 Parallel 244.20 +Uniform CDF 500000 Parallel 551.70 +Uniform LogPDF 8 Scalar 0.10 +Uniform LogPDF 16 Vectorized 0.00 +Uniform LogPDF 32 Vectorized 0.00 +Uniform LogPDF 64 Vectorized 0.10 +Uniform LogPDF 128 Vectorized 0.10 +Uniform LogPDF 256 Vectorized 0.20 +Uniform LogPDF 512 Work-Stealing 0.30 +Uniform LogPDF 1000 Vectorized 1.00 +Uniform LogPDF 2000 Vectorized 1.40 +Uniform LogPDF 5000 Vectorized 5.50 +Uniform LogPDF 10000 Vectorized 8.90 +Uniform LogPDF 20000 Vectorized 15.20 +Uniform LogPDF 50000 Parallel 94.80 +Uniform LogPDF 100000 Parallel 126.60 +Uniform LogPDF 250000 Parallel 258.30 +Uniform LogPDF 500000 Parallel 527.20 +Uniform PDF 8 Vectorized 0.00 +Uniform PDF 16 Vectorized 0.00 +Uniform PDF 32 Vectorized 0.00 +Uniform PDF 64 Vectorized 0.10 +Uniform PDF 128 Vectorized 0.10 +Uniform PDF 256 Parallel 0.20 +Uniform PDF 512 Vectorized 0.50 +Uniform PDF 1000 Parallel 0.60 +Uniform PDF 2000 Parallel 1.70 +Uniform PDF 5000 Work-Stealing 5.50 +Uniform PDF 10000 Vectorized 11.30 +Uniform PDF 20000 Vectorized 20.30 +Uniform PDF 50000 Vectorized 127.00 +Uniform PDF 100000 Parallel 130.70 +Uniform PDF 250000 Parallel 250.40 +Uniform PDF 500000 Parallel 490.30 + + +===================== + Crossover Summary +===================== + +Distribution Operation Sβ†’V Vβ†’P Pβ†’Work-Steal +-------------------------------------------------------------------------- +Beta CDF 8 500000 64 +Beta LogPDF 16 never 8 +Beta PDF 16 never 64 +ChiSquared CDF 32 8 8 +ChiSquared LogPDF 32 
16 32 +ChiSquared PDF 8 50000 2000 +Discrete CDF 16 50000 8 +Discrete LogPDF 8 250000 10000 +Discrete PDF 8 128 8 +Exponential CDF 8 100000 1000 +Exponential LogPDF 16 8 10000 +Exponential PDF 8 50000 512 +Gamma CDF 8 8 1000 +Gamma LogPDF 8 16 128 +Gamma PDF 8 20000 256 +Gaussian CDF 16 50000 64 +Gaussian LogPDF 8 never 10000 +Gaussian PDF 8 100000 256 +Poisson CDF 16 10000 16 +Poisson LogPDF 16 20000 128 +Poisson PDF 8 10000 8 +StudentT CDF 8 64 8 +StudentT LogPDF 32 250000 16 +StudentT PDF 8 20000 8 +Uniform CDF 8 64 8 +Uniform LogPDF 16 50000 512 +Uniform PDF 8 256 5000 + +Results saved to C:\Users\gdwol\Development\libstats\build\profiles\dispatcher\2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819\strategy_profile_results.csv diff --git a/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/logs/system_inspector_performance.txt b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/logs/system_inspector_performance.txt new file mode 100644 index 0000000..f33a207 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/logs/system_inspector_performance.txt @@ -0,0 +1,102 @@ + +======================================= + System Inspector - Performance Mode +======================================= + +System capabilities analysis with performance measurements + +System: 12 logical cores, AVX-512 SIMD, 16384 KB L3 cache + + +--- CPU Features --- +Feature Support Description +------------------------------------------------------------ +AVX-512 Yes Foundation instructions +AVX2 Yes Advanced Vector Ext 2 +AVX Yes Advanced Vector Ext +SSE2 Yes Streaming SIMD Ext 2 +NEON No ARM SIMD instructions +FMA Yes Fused Multiply-Add + + +--- Cache Information --- +Cache Level Size (KB) Line Size +------------------------------------------ +L1 32 64 bytes +L2 1024 64 bytes +L3 
16384 64 bytes + + +--- CPU Topology --- +Hardware Threads: 12 +Logical Cores: 12 +Physical Cores: 6 +Hyperthreading: Enabled + + +--- SIMD Capabilities --- +Instruction Support Vector Width Description +-------------------------------------------------------------- +SSE2 Yes 128-bit Basic SIMD operations +AVX Yes 256-bit Advanced vector ext +AVX2 Yes 256-bit Integer AVX operations +AVX-512 Yes 512-bit Foundation instructions +NEON No 128-bit ARM SIMD instructions + +Active SIMD Level: AVX-512 + + +--- Performance Baselines --- +Operation Type Time (ΞΌs) Throughput (MOps/s) +------------------------------------------------------------ +SIMD Multiply 405 2466 +Scalar Multiply 220 4533 + +SIMD Speedup: 0.54x + + +--- Performance Dispatcher Configuration --- +Example Strategy Selections: +Batch Size Distribution Complexity Strategy +---------------------------------------------------------------------- +100 Uniform Simple Vectorized +100 Gaussian Simple Vectorized +100 Exponential Simple Vectorized +100 Poisson Simple Vectorized +100 Discrete Simple Vectorized +1000 Uniform Simple Vectorized +1000 Gaussian Simple Vectorized +1000 Exponential Simple Vectorized +1000 Poisson Simple Vectorized +1000 Discrete Simple Vectorized +10000 Uniform Simple Parallel +10000 Gaussian Simple Parallel +10000 Exponential Simple Parallel +10000 Poisson Simple Parallel +10000 Discrete Simple Parallel +100000 Uniform Simple Parallel +100000 Gaussian Simple Parallel +100000 Exponential Simple Parallel +100000 Poisson Simple Work-Stealing +100000 Discrete Simple Parallel + + +--- Platform Constants --- +Constant Value +-------------------------------------------------- +SIMD Block Size 4 doubles +Memory Alignment 64 bytes +Min SIMD Size 16 elements +Optimal Grain Size 64 elements +Fast Transcendental Support Yes + + +--- Adaptive Constants --- +Constant Value +-------------------------------------------------- +Min Elements for Parallel 8192 +Default Grain Size 256 +Simple Operation Grain 
Size 128 +Complex Operation Grain Size 512 + +System inspection completed successfully. diff --git a/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/manifest.txt b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/manifest.txt new file mode 100644 index 0000000..31317fb --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/manifest.txt @@ -0,0 +1,14 @@ +Dispatcher profile bundle +========================= + +Run ID: 2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819 +Captured at (UTC): 2026-04-12T06-02-56Z + +Files: +- metadata.json +- summary.json +- crossovers.csv +- best_strategies.csv +- strategy_profile_results.csv +- logs/system_inspector_performance.txt +- logs/strategy_profile.txt diff --git a/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/metadata.json b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/metadata.json new file mode 100644 index 0000000..0638c33 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/metadata.json @@ -0,0 +1,15 @@ +{ + "captured_at_utc": "2026-04-12T06-02-56Z", + "arch": "x86_64", + "git_branch": "investigate-gaussian-avx512-perf", + "os": "windows", + "cpu_brand": "AMD Ryzen 7 7445HS w/ Radeon 740M Graphics", + "build_type": "Release", + "cxx_compiler": "MSVC 17 2022", + "physical_cores": 6, + "build_dir": "C:\\Users\\gdwol\\Development\\libstats\\build", + "git_sha": "32c0819", + "logical_cores": 12, + "run_id": "2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819", + "project_root": "C:\\Users\\gdwol\\Development\\libstats" +} diff --git 
a/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/strategy_profile_results.csv b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/strategy_profile_results.csv new file mode 100644 index 0000000..8126ef5 --- /dev/null +++ b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/strategy_profile_results.csv @@ -0,0 +1,1729 @@ +Distribution,Operation,BatchSize,Strategy,MedianTime_us +Uniform,PDF,8,SCALAR,0.100000 +Uniform,PDF,8,VECTORIZED,0.000000 +Uniform,PDF,8,PARALLEL,0.000000 +Uniform,PDF,8,WORK_STEALING,0.100000 +Uniform,LogPDF,8,SCALAR,0.100000 +Uniform,LogPDF,8,VECTORIZED,0.100000 +Uniform,LogPDF,8,PARALLEL,0.100000 +Uniform,LogPDF,8,WORK_STEALING,0.100000 +Uniform,CDF,8,SCALAR,0.100000 +Uniform,CDF,8,VECTORIZED,0.000000 +Uniform,CDF,8,PARALLEL,0.100000 +Uniform,CDF,8,WORK_STEALING,0.000000 +Uniform,PDF,16,SCALAR,0.100000 +Uniform,PDF,16,VECTORIZED,0.000000 +Uniform,PDF,16,PARALLEL,0.000000 +Uniform,PDF,16,WORK_STEALING,0.000000 +Uniform,LogPDF,16,SCALAR,0.100000 +Uniform,LogPDF,16,VECTORIZED,0.000000 +Uniform,LogPDF,16,PARALLEL,0.000000 +Uniform,LogPDF,16,WORK_STEALING,0.000000 +Uniform,CDF,16,SCALAR,0.100000 +Uniform,CDF,16,VECTORIZED,0.000000 +Uniform,CDF,16,PARALLEL,0.000000 +Uniform,CDF,16,WORK_STEALING,0.000000 +Uniform,PDF,32,SCALAR,0.200000 +Uniform,PDF,32,VECTORIZED,0.000000 +Uniform,PDF,32,PARALLEL,0.100000 +Uniform,PDF,32,WORK_STEALING,0.100000 +Uniform,LogPDF,32,SCALAR,0.200000 +Uniform,LogPDF,32,VECTORIZED,0.000000 +Uniform,LogPDF,32,PARALLEL,0.000000 +Uniform,LogPDF,32,WORK_STEALING,0.100000 +Uniform,CDF,32,SCALAR,0.200000 +Uniform,CDF,32,VECTORIZED,0.100000 +Uniform,CDF,32,PARALLEL,0.100000 +Uniform,CDF,32,WORK_STEALING,0.000000 +Uniform,PDF,64,SCALAR,0.500000 +Uniform,PDF,64,VECTORIZED,0.100000 +Uniform,PDF,64,PARALLEL,0.100000 +Uniform,PDF,64,WORK_STEALING,0.100000 
+Uniform,LogPDF,64,SCALAR,0.500000 +Uniform,LogPDF,64,VECTORIZED,0.100000 +Uniform,LogPDF,64,PARALLEL,0.100000 +Uniform,LogPDF,64,WORK_STEALING,0.100000 +Uniform,CDF,64,SCALAR,0.500000 +Uniform,CDF,64,VECTORIZED,0.200000 +Uniform,CDF,64,PARALLEL,0.100000 +Uniform,CDF,64,WORK_STEALING,0.100000 +Uniform,PDF,128,SCALAR,0.700000 +Uniform,PDF,128,VECTORIZED,0.100000 +Uniform,PDF,128,PARALLEL,0.100000 +Uniform,PDF,128,WORK_STEALING,0.200000 +Uniform,LogPDF,128,SCALAR,1.000000 +Uniform,LogPDF,128,VECTORIZED,0.100000 +Uniform,LogPDF,128,PARALLEL,0.200000 +Uniform,LogPDF,128,WORK_STEALING,0.200000 +Uniform,CDF,128,SCALAR,1.200000 +Uniform,CDF,128,VECTORIZED,0.300000 +Uniform,CDF,128,PARALLEL,0.200000 +Uniform,CDF,128,WORK_STEALING,0.200000 +Uniform,PDF,256,SCALAR,2.400000 +Uniform,PDF,256,VECTORIZED,0.300000 +Uniform,PDF,256,PARALLEL,0.200000 +Uniform,PDF,256,WORK_STEALING,0.300000 +Uniform,LogPDF,256,SCALAR,2.200000 +Uniform,LogPDF,256,VECTORIZED,0.200000 +Uniform,LogPDF,256,PARALLEL,0.300000 +Uniform,LogPDF,256,WORK_STEALING,0.400000 +Uniform,CDF,256,SCALAR,1.500000 +Uniform,CDF,256,VECTORIZED,0.300000 +Uniform,CDF,256,PARALLEL,0.200000 +Uniform,CDF,256,WORK_STEALING,0.200000 +Uniform,PDF,512,SCALAR,2.900000 +Uniform,PDF,512,VECTORIZED,0.500000 +Uniform,PDF,512,PARALLEL,0.500000 +Uniform,PDF,512,WORK_STEALING,0.500000 +Uniform,LogPDF,512,SCALAR,4.300000 +Uniform,LogPDF,512,VECTORIZED,0.400000 +Uniform,LogPDF,512,PARALLEL,0.600000 +Uniform,LogPDF,512,WORK_STEALING,0.300000 +Uniform,CDF,512,SCALAR,2.800000 +Uniform,CDF,512,VECTORIZED,0.500000 +Uniform,CDF,512,PARALLEL,0.400000 +Uniform,CDF,512,WORK_STEALING,0.400000 +Uniform,PDF,1000,SCALAR,5.600000 +Uniform,PDF,1000,VECTORIZED,0.700000 +Uniform,PDF,1000,PARALLEL,0.600000 +Uniform,PDF,1000,WORK_STEALING,0.700000 +Uniform,LogPDF,1000,SCALAR,6.600000 +Uniform,LogPDF,1000,VECTORIZED,1.000000 +Uniform,LogPDF,1000,PARALLEL,1.100000 +Uniform,LogPDF,1000,WORK_STEALING,1.100000 +Uniform,CDF,1000,SCALAR,6.000000 
+Uniform,CDF,1000,VECTORIZED,1.700000 +Uniform,CDF,1000,PARALLEL,0.800000 +Uniform,CDF,1000,WORK_STEALING,0.800000 +Uniform,PDF,2000,SCALAR,15.700000 +Uniform,PDF,2000,VECTORIZED,1.900000 +Uniform,PDF,2000,PARALLEL,1.700000 +Uniform,PDF,2000,WORK_STEALING,1.700000 +Uniform,LogPDF,2000,SCALAR,10.700000 +Uniform,LogPDF,2000,VECTORIZED,1.400000 +Uniform,LogPDF,2000,PARALLEL,1.400000 +Uniform,LogPDF,2000,WORK_STEALING,1.500000 +Uniform,CDF,2000,SCALAR,15.700000 +Uniform,CDF,2000,VECTORIZED,2.100000 +Uniform,CDF,2000,PARALLEL,1.700000 +Uniform,CDF,2000,WORK_STEALING,1.600000 +Uniform,PDF,5000,SCALAR,28.500000 +Uniform,PDF,5000,VECTORIZED,8.400000 +Uniform,PDF,5000,PARALLEL,5.700000 +Uniform,PDF,5000,WORK_STEALING,5.500000 +Uniform,LogPDF,5000,SCALAR,44.900000 +Uniform,LogPDF,5000,VECTORIZED,5.500000 +Uniform,LogPDF,5000,PARALLEL,5.600000 +Uniform,LogPDF,5000,WORK_STEALING,5.700000 +Uniform,CDF,5000,SCALAR,41.900000 +Uniform,CDF,5000,VECTORIZED,8.300000 +Uniform,CDF,5000,PARALLEL,6.400000 +Uniform,CDF,5000,WORK_STEALING,6.300000 +Uniform,PDF,10000,SCALAR,86.300000 +Uniform,PDF,10000,VECTORIZED,11.300000 +Uniform,PDF,10000,PARALLEL,129.300000 +Uniform,PDF,10000,WORK_STEALING,41.200000 +Uniform,LogPDF,10000,SCALAR,93.300000 +Uniform,LogPDF,10000,VECTORIZED,8.900000 +Uniform,LogPDF,10000,PARALLEL,57.000000 +Uniform,LogPDF,10000,WORK_STEALING,38.500000 +Uniform,CDF,10000,SCALAR,85.500000 +Uniform,CDF,10000,VECTORIZED,16.500000 +Uniform,CDF,10000,PARALLEL,108.000000 +Uniform,CDF,10000,WORK_STEALING,35.000000 +Uniform,PDF,20000,SCALAR,245.500000 +Uniform,PDF,20000,VECTORIZED,20.300000 +Uniform,PDF,20000,PARALLEL,121.800000 +Uniform,PDF,20000,WORK_STEALING,64.200000 +Uniform,LogPDF,20000,SCALAR,173.500000 +Uniform,LogPDF,20000,VECTORIZED,15.200000 +Uniform,LogPDF,20000,PARALLEL,69.100000 +Uniform,LogPDF,20000,WORK_STEALING,89.800000 +Uniform,CDF,20000,SCALAR,203.200000 +Uniform,CDF,20000,VECTORIZED,32.800000 +Uniform,CDF,20000,PARALLEL,124.200000 
+Uniform,CDF,20000,WORK_STEALING,77.900000 +Uniform,PDF,50000,SCALAR,462.600000 +Uniform,PDF,50000,VECTORIZED,127.000000 +Uniform,PDF,50000,PARALLEL,135.600000 +Uniform,PDF,50000,WORK_STEALING,211.600000 +Uniform,LogPDF,50000,SCALAR,447.200000 +Uniform,LogPDF,50000,VECTORIZED,135.800000 +Uniform,LogPDF,50000,PARALLEL,94.800000 +Uniform,LogPDF,50000,WORK_STEALING,249.400000 +Uniform,CDF,50000,SCALAR,470.400000 +Uniform,CDF,50000,VECTORIZED,159.200000 +Uniform,CDF,50000,PARALLEL,89.700000 +Uniform,CDF,50000,WORK_STEALING,151.100000 +Uniform,PDF,100000,SCALAR,926.200000 +Uniform,PDF,100000,VECTORIZED,401.100000 +Uniform,PDF,100000,PARALLEL,130.700000 +Uniform,PDF,100000,WORK_STEALING,241.800000 +Uniform,LogPDF,100000,SCALAR,888.100000 +Uniform,LogPDF,100000,VECTORIZED,416.300000 +Uniform,LogPDF,100000,PARALLEL,126.600000 +Uniform,LogPDF,100000,WORK_STEALING,626.700000 +Uniform,CDF,100000,SCALAR,933.800000 +Uniform,CDF,100000,VECTORIZED,394.800000 +Uniform,CDF,100000,PARALLEL,121.900000 +Uniform,CDF,100000,WORK_STEALING,473.700000 +Uniform,PDF,250000,SCALAR,2371.600000 +Uniform,PDF,250000,VECTORIZED,1075.900000 +Uniform,PDF,250000,PARALLEL,250.400000 +Uniform,PDF,250000,WORK_STEALING,1180.800000 +Uniform,LogPDF,250000,SCALAR,2398.100000 +Uniform,LogPDF,250000,VECTORIZED,1107.300000 +Uniform,LogPDF,250000,PARALLEL,258.300000 +Uniform,LogPDF,250000,WORK_STEALING,1256.200000 +Uniform,CDF,250000,SCALAR,2482.800000 +Uniform,CDF,250000,VECTORIZED,1111.500000 +Uniform,CDF,250000,PARALLEL,244.200000 +Uniform,CDF,250000,WORK_STEALING,1636.300000 +Uniform,PDF,500000,SCALAR,5427.600000 +Uniform,PDF,500000,VECTORIZED,3081.800000 +Uniform,PDF,500000,PARALLEL,490.300000 +Uniform,PDF,500000,WORK_STEALING,1546.400000 +Uniform,LogPDF,500000,SCALAR,6841.000000 +Uniform,LogPDF,500000,VECTORIZED,2634.400000 +Uniform,LogPDF,500000,PARALLEL,527.200000 +Uniform,LogPDF,500000,WORK_STEALING,2428.000000 +Uniform,CDF,500000,SCALAR,7710.000000 +Uniform,CDF,500000,VECTORIZED,2953.800000 
+Uniform,CDF,500000,PARALLEL,551.700000 +Uniform,CDF,500000,WORK_STEALING,2462.000000 +Gaussian,PDF,8,SCALAR,0.100000 +Gaussian,PDF,8,VECTORIZED,0.000000 +Gaussian,PDF,8,PARALLEL,0.100000 +Gaussian,PDF,8,WORK_STEALING,0.100000 +Gaussian,LogPDF,8,SCALAR,0.100000 +Gaussian,LogPDF,8,VECTORIZED,0.000000 +Gaussian,LogPDF,8,PARALLEL,0.000000 +Gaussian,LogPDF,8,WORK_STEALING,0.100000 +Gaussian,CDF,8,SCALAR,0.200000 +Gaussian,CDF,8,VECTORIZED,0.200000 +Gaussian,CDF,8,PARALLEL,0.200000 +Gaussian,CDF,8,WORK_STEALING,0.200000 +Gaussian,PDF,16,SCALAR,0.400000 +Gaussian,PDF,16,VECTORIZED,0.200000 +Gaussian,PDF,16,PARALLEL,0.200000 +Gaussian,PDF,16,WORK_STEALING,0.200000 +Gaussian,LogPDF,16,SCALAR,0.300000 +Gaussian,LogPDF,16,VECTORIZED,0.100000 +Gaussian,LogPDF,16,PARALLEL,0.100000 +Gaussian,LogPDF,16,WORK_STEALING,0.100000 +Gaussian,CDF,16,SCALAR,0.500000 +Gaussian,CDF,16,VECTORIZED,0.300000 +Gaussian,CDF,16,PARALLEL,0.300000 +Gaussian,CDF,16,WORK_STEALING,0.300000 +Gaussian,PDF,32,SCALAR,0.700000 +Gaussian,PDF,32,VECTORIZED,0.300000 +Gaussian,PDF,32,PARALLEL,0.300000 +Gaussian,PDF,32,WORK_STEALING,0.300000 +Gaussian,LogPDF,32,SCALAR,0.400000 +Gaussian,LogPDF,32,VECTORIZED,0.100000 +Gaussian,LogPDF,32,PARALLEL,0.100000 +Gaussian,LogPDF,32,WORK_STEALING,0.100000 +Gaussian,CDF,32,SCALAR,2.100000 +Gaussian,CDF,32,VECTORIZED,0.400000 +Gaussian,CDF,32,PARALLEL,0.600000 +Gaussian,CDF,32,WORK_STEALING,0.600000 +Gaussian,PDF,64,SCALAR,1.400000 +Gaussian,PDF,64,VECTORIZED,0.400000 +Gaussian,PDF,64,PARALLEL,0.600000 +Gaussian,PDF,64,WORK_STEALING,0.600000 +Gaussian,LogPDF,64,SCALAR,0.800000 +Gaussian,LogPDF,64,VECTORIZED,0.200000 +Gaussian,LogPDF,64,PARALLEL,0.200000 +Gaussian,LogPDF,64,WORK_STEALING,0.200000 +Gaussian,CDF,64,SCALAR,1.800000 +Gaussian,CDF,64,VECTORIZED,0.700000 +Gaussian,CDF,64,PARALLEL,1.100000 +Gaussian,CDF,64,WORK_STEALING,1.000000 +Gaussian,PDF,128,SCALAR,2.600000 +Gaussian,PDF,128,VECTORIZED,0.500000 +Gaussian,PDF,128,PARALLEL,1.100000 
+Gaussian,PDF,128,WORK_STEALING,1.100000 +Gaussian,LogPDF,128,SCALAR,1.800000 +Gaussian,LogPDF,128,VECTORIZED,0.200000 +Gaussian,LogPDF,128,PARALLEL,0.200000 +Gaussian,LogPDF,128,WORK_STEALING,0.300000 +Gaussian,CDF,128,SCALAR,3.600000 +Gaussian,CDF,128,VECTORIZED,1.200000 +Gaussian,CDF,128,PARALLEL,2.100000 +Gaussian,CDF,128,WORK_STEALING,2.000000 +Gaussian,PDF,256,SCALAR,5.900000 +Gaussian,PDF,256,VECTORIZED,0.900000 +Gaussian,PDF,256,PARALLEL,2.100000 +Gaussian,PDF,256,WORK_STEALING,2.000000 +Gaussian,LogPDF,256,SCALAR,3.700000 +Gaussian,LogPDF,256,VECTORIZED,0.300000 +Gaussian,LogPDF,256,PARALLEL,0.400000 +Gaussian,LogPDF,256,WORK_STEALING,0.400000 +Gaussian,CDF,256,SCALAR,7.300000 +Gaussian,CDF,256,VECTORIZED,2.300000 +Gaussian,CDF,256,PARALLEL,3.900000 +Gaussian,CDF,256,WORK_STEALING,4.000000 +Gaussian,PDF,512,SCALAR,11.300000 +Gaussian,PDF,512,VECTORIZED,1.700000 +Gaussian,PDF,512,PARALLEL,4.300000 +Gaussian,PDF,512,WORK_STEALING,3.900000 +Gaussian,LogPDF,512,SCALAR,7.300000 +Gaussian,LogPDF,512,VECTORIZED,0.400000 +Gaussian,LogPDF,512,PARALLEL,0.800000 +Gaussian,LogPDF,512,WORK_STEALING,0.800000 +Gaussian,CDF,512,SCALAR,14.300000 +Gaussian,CDF,512,VECTORIZED,4.200000 +Gaussian,CDF,512,PARALLEL,7.600000 +Gaussian,CDF,512,WORK_STEALING,7.000000 +Gaussian,PDF,1000,SCALAR,20.300000 +Gaussian,PDF,1000,VECTORIZED,2.900000 +Gaussian,PDF,1000,PARALLEL,8.500000 +Gaussian,PDF,1000,WORK_STEALING,7.700000 +Gaussian,LogPDF,1000,SCALAR,11.500000 +Gaussian,LogPDF,1000,VECTORIZED,0.500000 +Gaussian,LogPDF,1000,PARALLEL,1.400000 +Gaussian,LogPDF,1000,WORK_STEALING,1.400000 +Gaussian,CDF,1000,SCALAR,26.400000 +Gaussian,CDF,1000,VECTORIZED,7.900000 +Gaussian,CDF,1000,PARALLEL,13.900000 +Gaussian,CDF,1000,WORK_STEALING,13.200000 +Gaussian,PDF,2000,SCALAR,34.600000 +Gaussian,PDF,2000,VECTORIZED,5.600000 +Gaussian,PDF,2000,PARALLEL,16.400000 +Gaussian,PDF,2000,WORK_STEALING,13.800000 +Gaussian,LogPDF,2000,SCALAR,17.900000 +Gaussian,LogPDF,2000,VECTORIZED,0.500000 
+Gaussian,LogPDF,2000,PARALLEL,1.700000 +Gaussian,LogPDF,2000,WORK_STEALING,1.800000 +Gaussian,CDF,2000,SCALAR,34.300000 +Gaussian,CDF,2000,VECTORIZED,10.500000 +Gaussian,CDF,2000,PARALLEL,19.100000 +Gaussian,CDF,2000,WORK_STEALING,18.800000 +Gaussian,PDF,5000,SCALAR,68.500000 +Gaussian,PDF,5000,VECTORIZED,9.200000 +Gaussian,PDF,5000,PARALLEL,30.500000 +Gaussian,PDF,5000,WORK_STEALING,23.500000 +Gaussian,LogPDF,5000,SCALAR,38.500000 +Gaussian,LogPDF,5000,VECTORIZED,1.500000 +Gaussian,LogPDF,5000,PARALLEL,4.000000 +Gaussian,LogPDF,5000,WORK_STEALING,4.500000 +Gaussian,CDF,5000,SCALAR,142.900000 +Gaussian,CDF,5000,VECTORIZED,40.200000 +Gaussian,CDF,5000,PARALLEL,77.000000 +Gaussian,CDF,5000,WORK_STEALING,72.800000 +Gaussian,PDF,10000,SCALAR,209.000000 +Gaussian,PDF,10000,VECTORIZED,28.600000 +Gaussian,PDF,10000,PARALLEL,54.100000 +Gaussian,PDF,10000,WORK_STEALING,78.300000 +Gaussian,LogPDF,10000,SCALAR,88.700000 +Gaussian,LogPDF,10000,VECTORIZED,3.200000 +Gaussian,LogPDF,10000,PARALLEL,65.200000 +Gaussian,LogPDF,10000,WORK_STEALING,33.400000 +Gaussian,CDF,10000,SCALAR,156.900000 +Gaussian,CDF,10000,VECTORIZED,53.900000 +Gaussian,CDF,10000,PARALLEL,63.100000 +Gaussian,CDF,10000,WORK_STEALING,171.700000 +Gaussian,PDF,20000,SCALAR,236.000000 +Gaussian,PDF,20000,VECTORIZED,37.200000 +Gaussian,PDF,20000,PARALLEL,143.200000 +Gaussian,PDF,20000,WORK_STEALING,86.200000 +Gaussian,LogPDF,20000,SCALAR,167.800000 +Gaussian,LogPDF,20000,VECTORIZED,8.700000 +Gaussian,LogPDF,20000,PARALLEL,136.200000 +Gaussian,LogPDF,20000,WORK_STEALING,44.700000 +Gaussian,CDF,20000,SCALAR,347.700000 +Gaussian,CDF,20000,VECTORIZED,109.300000 +Gaussian,CDF,20000,PARALLEL,153.900000 +Gaussian,CDF,20000,WORK_STEALING,122.100000 +Gaussian,PDF,50000,SCALAR,461.400000 +Gaussian,PDF,50000,VECTORIZED,81.200000 +Gaussian,PDF,50000,PARALLEL,87.200000 +Gaussian,PDF,50000,WORK_STEALING,177.400000 +Gaussian,LogPDF,50000,SCALAR,253.800000 +Gaussian,LogPDF,50000,VECTORIZED,11.300000 
+Gaussian,LogPDF,50000,PARALLEL,136.100000 +Gaussian,LogPDF,50000,WORK_STEALING,30.100000 +Gaussian,CDF,50000,SCALAR,757.800000 +Gaussian,CDF,50000,VECTORIZED,184.400000 +Gaussian,CDF,50000,PARALLEL,115.600000 +Gaussian,CDF,50000,WORK_STEALING,225.100000 +Gaussian,PDF,100000,SCALAR,798.000000 +Gaussian,PDF,100000,VECTORIZED,196.100000 +Gaussian,PDF,100000,PARALLEL,108.900000 +Gaussian,PDF,100000,WORK_STEALING,298.000000 +Gaussian,LogPDF,100000,SCALAR,508.900000 +Gaussian,LogPDF,100000,VECTORIZED,26.800000 +Gaussian,LogPDF,100000,PARALLEL,99.200000 +Gaussian,LogPDF,100000,WORK_STEALING,57.100000 +Gaussian,CDF,100000,SCALAR,1071.600000 +Gaussian,CDF,100000,VECTORIZED,383.700000 +Gaussian,CDF,100000,PARALLEL,239.800000 +Gaussian,CDF,100000,WORK_STEALING,825.700000 +Gaussian,PDF,250000,SCALAR,2099.100000 +Gaussian,PDF,250000,VECTORIZED,435.300000 +Gaussian,PDF,250000,PARALLEL,258.800000 +Gaussian,PDF,250000,WORK_STEALING,903.900000 +Gaussian,LogPDF,250000,SCALAR,1276.000000 +Gaussian,LogPDF,250000,VECTORIZED,122.800000 +Gaussian,LogPDF,250000,PARALLEL,143.400000 +Gaussian,LogPDF,250000,WORK_STEALING,219.900000 +Gaussian,CDF,250000,SCALAR,2723.300000 +Gaussian,CDF,250000,VECTORIZED,901.400000 +Gaussian,CDF,250000,PARALLEL,371.700000 +Gaussian,CDF,250000,WORK_STEALING,1508.700000 +Gaussian,PDF,500000,SCALAR,4065.200000 +Gaussian,PDF,500000,VECTORIZED,1185.500000 +Gaussian,PDF,500000,PARALLEL,532.000000 +Gaussian,PDF,500000,WORK_STEALING,830.200000 +Gaussian,LogPDF,500000,SCALAR,2573.200000 +Gaussian,LogPDF,500000,VECTORIZED,137.800000 +Gaussian,LogPDF,500000,PARALLEL,151.500000 +Gaussian,LogPDF,500000,WORK_STEALING,154.600000 +Gaussian,CDF,500000,SCALAR,5476.100000 +Gaussian,CDF,500000,VECTORIZED,1936.500000 +Gaussian,CDF,500000,PARALLEL,825.600000 +Gaussian,CDF,500000,WORK_STEALING,3162.300000 +Exponential,PDF,8,SCALAR,0.300000 +Exponential,PDF,8,VECTORIZED,0.100000 +Exponential,PDF,8,PARALLEL,0.100000 +Exponential,PDF,8,WORK_STEALING,0.100000 
+Exponential,LogPDF,8,SCALAR,0.100000 +Exponential,LogPDF,8,VECTORIZED,0.100000 +Exponential,LogPDF,8,PARALLEL,0.000000 +Exponential,LogPDF,8,WORK_STEALING,0.000000 +Exponential,CDF,8,SCALAR,0.100000 +Exponential,CDF,8,VECTORIZED,0.000000 +Exponential,CDF,8,PARALLEL,0.100000 +Exponential,CDF,8,WORK_STEALING,0.100000 +Exponential,PDF,16,SCALAR,0.200000 +Exponential,PDF,16,VECTORIZED,0.100000 +Exponential,PDF,16,PARALLEL,0.100000 +Exponential,PDF,16,WORK_STEALING,0.100000 +Exponential,LogPDF,16,SCALAR,0.200000 +Exponential,LogPDF,16,VECTORIZED,0.100000 +Exponential,LogPDF,16,PARALLEL,0.000000 +Exponential,LogPDF,16,WORK_STEALING,0.100000 +Exponential,CDF,16,SCALAR,0.200000 +Exponential,CDF,16,VECTORIZED,0.100000 +Exponential,CDF,16,PARALLEL,0.100000 +Exponential,CDF,16,WORK_STEALING,0.100000 +Exponential,PDF,32,SCALAR,0.400000 +Exponential,PDF,32,VECTORIZED,0.100000 +Exponential,PDF,32,PARALLEL,0.200000 +Exponential,PDF,32,WORK_STEALING,0.200000 +Exponential,LogPDF,32,SCALAR,0.300000 +Exponential,LogPDF,32,VECTORIZED,0.100000 +Exponential,LogPDF,32,PARALLEL,0.000000 +Exponential,LogPDF,32,WORK_STEALING,0.100000 +Exponential,CDF,32,SCALAR,0.400000 +Exponential,CDF,32,VECTORIZED,0.200000 +Exponential,CDF,32,PARALLEL,0.200000 +Exponential,CDF,32,WORK_STEALING,0.200000 +Exponential,PDF,64,SCALAR,0.800000 +Exponential,PDF,64,VECTORIZED,0.200000 +Exponential,PDF,64,PARALLEL,0.300000 +Exponential,PDF,64,WORK_STEALING,0.300000 +Exponential,LogPDF,64,SCALAR,0.500000 +Exponential,LogPDF,64,VECTORIZED,0.100000 +Exponential,LogPDF,64,PARALLEL,0.100000 +Exponential,LogPDF,64,WORK_STEALING,0.100000 +Exponential,CDF,64,SCALAR,0.700000 +Exponential,CDF,64,VECTORIZED,0.300000 +Exponential,CDF,64,PARALLEL,0.300000 +Exponential,CDF,64,WORK_STEALING,0.300000 +Exponential,PDF,128,SCALAR,1.500000 +Exponential,PDF,128,VECTORIZED,0.400000 +Exponential,PDF,128,PARALLEL,0.600000 +Exponential,PDF,128,WORK_STEALING,0.600000 +Exponential,LogPDF,128,SCALAR,1.100000 
+Exponential,LogPDF,128,VECTORIZED,0.100000 +Exponential,LogPDF,128,PARALLEL,0.100000 +Exponential,LogPDF,128,WORK_STEALING,0.200000 +Exponential,CDF,128,SCALAR,1.500000 +Exponential,CDF,128,VECTORIZED,0.500000 +Exponential,CDF,128,PARALLEL,0.600000 +Exponential,CDF,128,WORK_STEALING,0.600000 +Exponential,PDF,256,SCALAR,3.000000 +Exponential,PDF,256,VECTORIZED,0.800000 +Exponential,PDF,256,PARALLEL,1.100000 +Exponential,PDF,256,WORK_STEALING,1.100000 +Exponential,LogPDF,256,SCALAR,2.000000 +Exponential,LogPDF,256,VECTORIZED,0.200000 +Exponential,LogPDF,256,PARALLEL,0.200000 +Exponential,LogPDF,256,WORK_STEALING,0.300000 +Exponential,CDF,256,SCALAR,2.900000 +Exponential,CDF,256,VECTORIZED,0.800000 +Exponential,CDF,256,PARALLEL,1.100000 +Exponential,CDF,256,WORK_STEALING,1.100000 +Exponential,PDF,512,SCALAR,3.900000 +Exponential,PDF,512,VECTORIZED,0.900000 +Exponential,PDF,512,PARALLEL,1.500000 +Exponential,PDF,512,WORK_STEALING,1.400000 +Exponential,LogPDF,512,SCALAR,2.700000 +Exponential,LogPDF,512,VECTORIZED,0.200000 +Exponential,LogPDF,512,PARALLEL,0.300000 +Exponential,LogPDF,512,WORK_STEALING,0.400000 +Exponential,CDF,512,SCALAR,3.900000 +Exponential,CDF,512,VECTORIZED,1.000000 +Exponential,CDF,512,PARALLEL,1.500000 +Exponential,CDF,512,WORK_STEALING,1.500000 +Exponential,PDF,1000,SCALAR,7.600000 +Exponential,PDF,1000,VECTORIZED,1.800000 +Exponential,PDF,1000,PARALLEL,2.800000 +Exponential,PDF,1000,WORK_STEALING,2.800000 +Exponential,LogPDF,1000,SCALAR,5.300000 +Exponential,LogPDF,1000,VECTORIZED,0.500000 +Exponential,LogPDF,1000,PARALLEL,0.600000 +Exponential,LogPDF,1000,WORK_STEALING,0.700000 +Exponential,CDF,1000,SCALAR,7.600000 +Exponential,CDF,1000,VECTORIZED,1.800000 +Exponential,CDF,1000,PARALLEL,2.900000 +Exponential,CDF,1000,WORK_STEALING,2.800000 +Exponential,PDF,2000,SCALAR,15.200000 +Exponential,PDF,2000,VECTORIZED,3.500000 +Exponential,PDF,2000,PARALLEL,5.600000 +Exponential,PDF,2000,WORK_STEALING,5.600000 +Exponential,LogPDF,2000,SCALAR,10.600000 
+Exponential,LogPDF,2000,VECTORIZED,1.000000 +Exponential,LogPDF,2000,PARALLEL,1.100000 +Exponential,LogPDF,2000,WORK_STEALING,1.400000 +Exponential,CDF,2000,SCALAR,15.200000 +Exponential,CDF,2000,VECTORIZED,3.700000 +Exponential,CDF,2000,PARALLEL,5.700000 +Exponential,CDF,2000,WORK_STEALING,5.700000 +Exponential,PDF,5000,SCALAR,38.100000 +Exponential,PDF,5000,VECTORIZED,9.000000 +Exponential,PDF,5000,PARALLEL,14.000000 +Exponential,PDF,5000,WORK_STEALING,14.000000 +Exponential,LogPDF,5000,SCALAR,26.900000 +Exponential,LogPDF,5000,VECTORIZED,2.500000 +Exponential,LogPDF,5000,PARALLEL,2.700000 +Exponential,LogPDF,5000,WORK_STEALING,3.500000 +Exponential,CDF,5000,SCALAR,38.100000 +Exponential,CDF,5000,VECTORIZED,9.300000 +Exponential,CDF,5000,PARALLEL,14.300000 +Exponential,CDF,5000,WORK_STEALING,14.200000 +Exponential,PDF,10000,SCALAR,76.500000 +Exponential,PDF,10000,VECTORIZED,17.900000 +Exponential,PDF,10000,PARALLEL,84.200000 +Exponential,PDF,10000,WORK_STEALING,33.900000 +Exponential,LogPDF,10000,SCALAR,53.600000 +Exponential,LogPDF,10000,VECTORIZED,5.000000 +Exponential,LogPDF,10000,PARALLEL,75.600000 +Exponential,LogPDF,10000,WORK_STEALING,23.100000 +Exponential,CDF,10000,SCALAR,196.400000 +Exponential,CDF,10000,VECTORIZED,27.800000 +Exponential,CDF,10000,PARALLEL,83.000000 +Exponential,CDF,10000,WORK_STEALING,65.900000 +Exponential,PDF,20000,SCALAR,177.800000 +Exponential,PDF,20000,VECTORIZED,44.200000 +Exponential,PDF,20000,PARALLEL,49.100000 +Exponential,PDF,20000,WORK_STEALING,63.700000 +Exponential,LogPDF,20000,SCALAR,105.700000 +Exponential,LogPDF,20000,VECTORIZED,10.100000 +Exponential,LogPDF,20000,PARALLEL,102.900000 +Exponential,LogPDF,20000,WORK_STEALING,39.600000 +Exponential,CDF,20000,SCALAR,228.900000 +Exponential,CDF,20000,VECTORIZED,55.700000 +Exponential,CDF,20000,PARALLEL,147.000000 +Exponential,CDF,20000,WORK_STEALING,47.900000 +Exponential,PDF,50000,SCALAR,381.000000 +Exponential,PDF,50000,VECTORIZED,89.700000 
+Exponential,PDF,50000,PARALLEL,77.300000 +Exponential,PDF,50000,WORK_STEALING,184.000000 +Exponential,LogPDF,50000,SCALAR,268.400000 +Exponential,LogPDF,50000,VECTORIZED,25.000000 +Exponential,LogPDF,50000,PARALLEL,100.000000 +Exponential,LogPDF,50000,WORK_STEALING,32.200000 +Exponential,CDF,50000,SCALAR,535.000000 +Exponential,CDF,50000,VECTORIZED,93.400000 +Exponential,CDF,50000,PARALLEL,93.800000 +Exponential,CDF,50000,WORK_STEALING,157.100000 +Exponential,PDF,100000,SCALAR,763.600000 +Exponential,PDF,100000,VECTORIZED,182.400000 +Exponential,PDF,100000,PARALLEL,120.000000 +Exponential,PDF,100000,WORK_STEALING,190.600000 +Exponential,LogPDF,100000,SCALAR,534.100000 +Exponential,LogPDF,100000,VECTORIZED,54.300000 +Exponential,LogPDF,100000,PARALLEL,129.300000 +Exponential,LogPDF,100000,WORK_STEALING,73.900000 +Exponential,CDF,100000,SCALAR,769.400000 +Exponential,CDF,100000,VECTORIZED,194.300000 +Exponential,CDF,100000,PARALLEL,147.300000 +Exponential,CDF,100000,WORK_STEALING,181.400000 +Exponential,PDF,250000,SCALAR,1928.600000 +Exponential,PDF,250000,VECTORIZED,477.100000 +Exponential,PDF,250000,PARALLEL,258.900000 +Exponential,PDF,250000,WORK_STEALING,1158.900000 +Exponential,LogPDF,250000,SCALAR,1329.400000 +Exponential,LogPDF,250000,VECTORIZED,153.100000 +Exponential,LogPDF,250000,PARALLEL,140.600000 +Exponential,LogPDF,250000,WORK_STEALING,130.500000 +Exponential,CDF,250000,SCALAR,1932.200000 +Exponential,CDF,250000,VECTORIZED,485.700000 +Exponential,CDF,250000,PARALLEL,240.200000 +Exponential,CDF,250000,WORK_STEALING,212.600000 +Exponential,PDF,500000,SCALAR,4075.600000 +Exponential,PDF,500000,VECTORIZED,994.900000 +Exponential,PDF,500000,PARALLEL,426.800000 +Exponential,PDF,500000,WORK_STEALING,1799.500000 +Exponential,LogPDF,500000,SCALAR,2688.300000 +Exponential,LogPDF,500000,VECTORIZED,332.300000 +Exponential,LogPDF,500000,PARALLEL,138.700000 +Exponential,LogPDF,500000,WORK_STEALING,484.600000 +Exponential,CDF,500000,SCALAR,4310.900000 
+Exponential,CDF,500000,VECTORIZED,961.400000 +Exponential,CDF,500000,PARALLEL,411.100000 +Exponential,CDF,500000,WORK_STEALING,1607.400000 +Discrete,PDF,8,SCALAR,0.100000 +Discrete,PDF,8,VECTORIZED,0.000000 +Discrete,PDF,8,PARALLEL,0.100000 +Discrete,PDF,8,WORK_STEALING,0.000000 +Discrete,LogPDF,8,SCALAR,0.100000 +Discrete,LogPDF,8,VECTORIZED,0.000000 +Discrete,LogPDF,8,PARALLEL,0.100000 +Discrete,LogPDF,8,WORK_STEALING,0.100000 +Discrete,CDF,8,SCALAR,0.100000 +Discrete,CDF,8,VECTORIZED,0.100000 +Discrete,CDF,8,PARALLEL,0.100000 +Discrete,CDF,8,WORK_STEALING,0.000000 +Discrete,PDF,16,SCALAR,0.100000 +Discrete,PDF,16,VECTORIZED,0.000000 +Discrete,PDF,16,PARALLEL,0.000000 +Discrete,PDF,16,WORK_STEALING,0.000000 +Discrete,LogPDF,16,SCALAR,0.100000 +Discrete,LogPDF,16,VECTORIZED,0.000000 +Discrete,LogPDF,16,PARALLEL,0.100000 +Discrete,LogPDF,16,WORK_STEALING,0.100000 +Discrete,CDF,16,SCALAR,0.100000 +Discrete,CDF,16,VECTORIZED,0.000000 +Discrete,CDF,16,PARALLEL,0.100000 +Discrete,CDF,16,WORK_STEALING,0.000000 +Discrete,PDF,32,SCALAR,0.200000 +Discrete,PDF,32,VECTORIZED,0.100000 +Discrete,PDF,32,PARALLEL,0.100000 +Discrete,PDF,32,WORK_STEALING,0.100000 +Discrete,LogPDF,32,SCALAR,0.200000 +Discrete,LogPDF,32,VECTORIZED,0.000000 +Discrete,LogPDF,32,PARALLEL,0.100000 +Discrete,LogPDF,32,WORK_STEALING,0.100000 +Discrete,CDF,32,SCALAR,0.200000 +Discrete,CDF,32,VECTORIZED,0.100000 +Discrete,CDF,32,PARALLEL,0.100000 +Discrete,CDF,32,WORK_STEALING,0.100000 +Discrete,PDF,64,SCALAR,0.400000 +Discrete,PDF,64,VECTORIZED,0.000000 +Discrete,PDF,64,PARALLEL,0.100000 +Discrete,PDF,64,WORK_STEALING,0.100000 +Discrete,LogPDF,64,SCALAR,0.400000 +Discrete,LogPDF,64,VECTORIZED,0.000000 +Discrete,LogPDF,64,PARALLEL,0.100000 +Discrete,LogPDF,64,WORK_STEALING,0.100000 +Discrete,CDF,64,SCALAR,0.400000 +Discrete,CDF,64,VECTORIZED,0.100000 +Discrete,CDF,64,PARALLEL,0.100000 +Discrete,CDF,64,WORK_STEALING,0.100000 +Discrete,PDF,128,SCALAR,0.900000 +Discrete,PDF,128,VECTORIZED,0.200000 
+Discrete,PDF,128,PARALLEL,0.100000 +Discrete,PDF,128,WORK_STEALING,0.200000 +Discrete,LogPDF,128,SCALAR,0.800000 +Discrete,LogPDF,128,VECTORIZED,0.100000 +Discrete,LogPDF,128,PARALLEL,0.200000 +Discrete,LogPDF,128,WORK_STEALING,0.200000 +Discrete,CDF,128,SCALAR,0.800000 +Discrete,CDF,128,VECTORIZED,0.200000 +Discrete,CDF,128,PARALLEL,0.200000 +Discrete,CDF,128,WORK_STEALING,0.200000 +Discrete,PDF,256,SCALAR,1.600000 +Discrete,PDF,256,VECTORIZED,0.200000 +Discrete,PDF,256,PARALLEL,0.300000 +Discrete,PDF,256,WORK_STEALING,0.300000 +Discrete,LogPDF,256,SCALAR,1.600000 +Discrete,LogPDF,256,VECTORIZED,0.200000 +Discrete,LogPDF,256,PARALLEL,0.300000 +Discrete,LogPDF,256,WORK_STEALING,0.300000 +Discrete,CDF,256,SCALAR,1.500000 +Discrete,CDF,256,VECTORIZED,0.300000 +Discrete,CDF,256,PARALLEL,0.400000 +Discrete,CDF,256,WORK_STEALING,0.400000 +Discrete,PDF,512,SCALAR,3.200000 +Discrete,PDF,512,VECTORIZED,0.500000 +Discrete,PDF,512,PARALLEL,0.500000 +Discrete,PDF,512,WORK_STEALING,0.600000 +Discrete,LogPDF,512,SCALAR,3.100000 +Discrete,LogPDF,512,VECTORIZED,0.500000 +Discrete,LogPDF,512,PARALLEL,0.600000 +Discrete,LogPDF,512,WORK_STEALING,0.600000 +Discrete,CDF,512,SCALAR,2.800000 +Discrete,CDF,512,VECTORIZED,0.500000 +Discrete,CDF,512,PARALLEL,0.700000 +Discrete,CDF,512,WORK_STEALING,0.700000 +Discrete,PDF,1000,SCALAR,6.500000 +Discrete,PDF,1000,VECTORIZED,0.900000 +Discrete,PDF,1000,PARALLEL,1.000000 +Discrete,PDF,1000,WORK_STEALING,1.000000 +Discrete,LogPDF,1000,SCALAR,6.300000 +Discrete,LogPDF,1000,VECTORIZED,0.900000 +Discrete,LogPDF,1000,PARALLEL,1.200000 +Discrete,LogPDF,1000,WORK_STEALING,1.200000 +Discrete,CDF,1000,SCALAR,5.600000 +Discrete,CDF,1000,VECTORIZED,1.000000 +Discrete,CDF,1000,PARALLEL,1.300000 +Discrete,CDF,1000,WORK_STEALING,1.300000 +Discrete,PDF,2000,SCALAR,13.200000 +Discrete,PDF,2000,VECTORIZED,1.700000 +Discrete,PDF,2000,PARALLEL,2.100000 +Discrete,PDF,2000,WORK_STEALING,2.100000 +Discrete,LogPDF,2000,SCALAR,12.000000 
+Discrete,LogPDF,2000,VECTORIZED,1.700000 +Discrete,LogPDF,2000,PARALLEL,2.300000 +Discrete,LogPDF,2000,WORK_STEALING,2.300000 +Discrete,CDF,2000,SCALAR,11.100000 +Discrete,CDF,2000,VECTORIZED,2.000000 +Discrete,CDF,2000,PARALLEL,2.600000 +Discrete,CDF,2000,WORK_STEALING,2.600000 +Discrete,PDF,5000,SCALAR,31.400000 +Discrete,PDF,5000,VECTORIZED,4.300000 +Discrete,PDF,5000,PARALLEL,5.100000 +Discrete,PDF,5000,WORK_STEALING,5.000000 +Discrete,LogPDF,5000,SCALAR,30.400000 +Discrete,LogPDF,5000,VECTORIZED,4.200000 +Discrete,LogPDF,5000,PARALLEL,5.700000 +Discrete,LogPDF,5000,WORK_STEALING,5.700000 +Discrete,CDF,5000,SCALAR,27.900000 +Discrete,CDF,5000,VECTORIZED,5.000000 +Discrete,CDF,5000,PARALLEL,6.500000 +Discrete,CDF,5000,WORK_STEALING,6.500000 +Discrete,PDF,10000,SCALAR,63.700000 +Discrete,PDF,10000,VECTORIZED,8.500000 +Discrete,PDF,10000,PARALLEL,84.800000 +Discrete,PDF,10000,WORK_STEALING,31.900000 +Discrete,LogPDF,10000,SCALAR,90.900000 +Discrete,LogPDF,10000,VECTORIZED,8.500000 +Discrete,LogPDF,10000,PARALLEL,106.300000 +Discrete,LogPDF,10000,WORK_STEALING,32.400000 +Discrete,CDF,10000,SCALAR,86.800000 +Discrete,CDF,10000,VECTORIZED,16.600000 +Discrete,CDF,10000,PARALLEL,111.500000 +Discrete,CDF,10000,WORK_STEALING,29.900000 +Discrete,PDF,20000,SCALAR,125.800000 +Discrete,PDF,20000,VECTORIZED,27.700000 +Discrete,PDF,20000,PARALLEL,101.000000 +Discrete,PDF,20000,WORK_STEALING,36.100000 +Discrete,LogPDF,20000,SCALAR,121.800000 +Discrete,LogPDF,20000,VECTORIZED,16.900000 +Discrete,LogPDF,20000,PARALLEL,127.500000 +Discrete,LogPDF,20000,WORK_STEALING,31.400000 +Discrete,CDF,20000,SCALAR,118.700000 +Discrete,CDF,20000,VECTORIZED,26.600000 +Discrete,CDF,20000,PARALLEL,95.700000 +Discrete,CDF,20000,WORK_STEALING,39.300000 +Discrete,PDF,50000,SCALAR,412.800000 +Discrete,PDF,50000,VECTORIZED,63.400000 +Discrete,PDF,50000,PARALLEL,56.400000 +Discrete,PDF,50000,WORK_STEALING,62.500000 +Discrete,LogPDF,50000,SCALAR,334.400000 +Discrete,LogPDF,50000,VECTORIZED,42.500000 
+Discrete,LogPDF,50000,PARALLEL,128.100000 +Discrete,LogPDF,50000,WORK_STEALING,94.800000 +Discrete,CDF,50000,SCALAR,458.600000 +Discrete,CDF,50000,VECTORIZED,134.600000 +Discrete,CDF,50000,PARALLEL,124.500000 +Discrete,CDF,50000,WORK_STEALING,55.300000 +Discrete,PDF,100000,SCALAR,631.100000 +Discrete,PDF,100000,VECTORIZED,84.600000 +Discrete,PDF,100000,PARALLEL,65.400000 +Discrete,PDF,100000,WORK_STEALING,87.600000 +Discrete,LogPDF,100000,SCALAR,613.000000 +Discrete,LogPDF,100000,VECTORIZED,84.600000 +Discrete,LogPDF,100000,PARALLEL,126.000000 +Discrete,LogPDF,100000,WORK_STEALING,231.800000 +Discrete,CDF,100000,SCALAR,623.200000 +Discrete,CDF,100000,VECTORIZED,184.900000 +Discrete,CDF,100000,PARALLEL,119.000000 +Discrete,CDF,100000,WORK_STEALING,385.500000 +Discrete,PDF,250000,SCALAR,1559.900000 +Discrete,PDF,250000,VECTORIZED,211.400000 +Discrete,PDF,250000,PARALLEL,129.900000 +Discrete,PDF,250000,WORK_STEALING,237.600000 +Discrete,LogPDF,250000,SCALAR,1857.600000 +Discrete,LogPDF,250000,VECTORIZED,220.300000 +Discrete,LogPDF,250000,PARALLEL,157.200000 +Discrete,LogPDF,250000,WORK_STEALING,352.500000 +Discrete,CDF,250000,SCALAR,1597.000000 +Discrete,CDF,250000,VECTORIZED,519.100000 +Discrete,CDF,250000,PARALLEL,172.100000 +Discrete,CDF,250000,WORK_STEALING,202.900000 +Discrete,PDF,500000,SCALAR,4138.200000 +Discrete,PDF,500000,VECTORIZED,428.300000 +Discrete,PDF,500000,PARALLEL,183.900000 +Discrete,PDF,500000,WORK_STEALING,977.700000 +Discrete,LogPDF,500000,SCALAR,3046.600000 +Discrete,LogPDF,500000,VECTORIZED,465.500000 +Discrete,LogPDF,500000,PARALLEL,260.700000 +Discrete,LogPDF,500000,WORK_STEALING,422.400000 +Discrete,CDF,500000,SCALAR,3439.100000 +Discrete,CDF,500000,VECTORIZED,974.100000 +Discrete,CDF,500000,PARALLEL,318.900000 +Discrete,CDF,500000,WORK_STEALING,550.800000 +Poisson,PDF,8,SCALAR,0.200000 +Poisson,PDF,8,VECTORIZED,0.100000 +Poisson,PDF,8,PARALLEL,0.200000 +Poisson,PDF,8,WORK_STEALING,0.100000 +Poisson,LogPDF,8,SCALAR,0.100000 
+Poisson,LogPDF,8,VECTORIZED,0.100000 +Poisson,LogPDF,8,PARALLEL,0.100000 +Poisson,LogPDF,8,WORK_STEALING,0.100000 +Poisson,CDF,8,SCALAR,0.400000 +Poisson,CDF,8,VECTORIZED,0.400000 +Poisson,CDF,8,PARALLEL,0.400000 +Poisson,CDF,8,WORK_STEALING,0.400000 +Poisson,PDF,16,SCALAR,0.400000 +Poisson,PDF,16,VECTORIZED,0.200000 +Poisson,PDF,16,PARALLEL,0.200000 +Poisson,PDF,16,WORK_STEALING,0.200000 +Poisson,LogPDF,16,SCALAR,0.200000 +Poisson,LogPDF,16,VECTORIZED,0.100000 +Poisson,LogPDF,16,PARALLEL,0.100000 +Poisson,LogPDF,16,WORK_STEALING,0.100000 +Poisson,CDF,16,SCALAR,0.900000 +Poisson,CDF,16,VECTORIZED,0.800000 +Poisson,CDF,16,PARALLEL,0.900000 +Poisson,CDF,16,WORK_STEALING,0.800000 +Poisson,PDF,32,SCALAR,0.600000 +Poisson,PDF,32,VECTORIZED,0.400000 +Poisson,PDF,32,PARALLEL,0.500000 +Poisson,PDF,32,WORK_STEALING,0.500000 +Poisson,LogPDF,32,SCALAR,0.400000 +Poisson,LogPDF,32,VECTORIZED,0.200000 +Poisson,LogPDF,32,PARALLEL,0.200000 +Poisson,LogPDF,32,WORK_STEALING,0.200000 +Poisson,CDF,32,SCALAR,1.700000 +Poisson,CDF,32,VECTORIZED,1.700000 +Poisson,CDF,32,PARALLEL,1.700000 +Poisson,CDF,32,WORK_STEALING,1.700000 +Poisson,PDF,64,SCALAR,1.200000 +Poisson,PDF,64,VECTORIZED,0.800000 +Poisson,PDF,64,PARALLEL,0.800000 +Poisson,PDF,64,WORK_STEALING,0.800000 +Poisson,LogPDF,64,SCALAR,0.700000 +Poisson,LogPDF,64,VECTORIZED,0.400000 +Poisson,LogPDF,64,PARALLEL,0.400000 +Poisson,LogPDF,64,WORK_STEALING,0.400000 +Poisson,CDF,64,SCALAR,3.300000 +Poisson,CDF,64,VECTORIZED,3.200000 +Poisson,CDF,64,PARALLEL,3.200000 +Poisson,CDF,64,WORK_STEALING,3.200000 +Poisson,PDF,128,SCALAR,2.500000 +Poisson,PDF,128,VECTORIZED,1.500000 +Poisson,PDF,128,PARALLEL,1.500000 +Poisson,PDF,128,WORK_STEALING,1.500000 +Poisson,LogPDF,128,SCALAR,1.300000 +Poisson,LogPDF,128,VECTORIZED,0.600000 +Poisson,LogPDF,128,PARALLEL,0.800000 +Poisson,LogPDF,128,WORK_STEALING,0.600000 +Poisson,CDF,128,SCALAR,6.500000 +Poisson,CDF,128,VECTORIZED,6.200000 +Poisson,CDF,128,PARALLEL,6.300000 
+Poisson,CDF,128,WORK_STEALING,6.300000 +Poisson,PDF,256,SCALAR,4.900000 +Poisson,PDF,256,VECTORIZED,2.900000 +Poisson,PDF,256,PARALLEL,3.100000 +Poisson,PDF,256,WORK_STEALING,3.100000 +Poisson,LogPDF,256,SCALAR,2.700000 +Poisson,LogPDF,256,VECTORIZED,1.300000 +Poisson,LogPDF,256,PARALLEL,1.400000 +Poisson,LogPDF,256,WORK_STEALING,1.400000 +Poisson,CDF,256,SCALAR,13.200000 +Poisson,CDF,256,VECTORIZED,12.700000 +Poisson,CDF,256,PARALLEL,12.800000 +Poisson,CDF,256,WORK_STEALING,12.700000 +Poisson,PDF,512,SCALAR,9.600000 +Poisson,PDF,512,VECTORIZED,5.700000 +Poisson,PDF,512,PARALLEL,6.000000 +Poisson,PDF,512,WORK_STEALING,6.000000 +Poisson,LogPDF,512,SCALAR,5.200000 +Poisson,LogPDF,512,VECTORIZED,2.500000 +Poisson,LogPDF,512,PARALLEL,2.800000 +Poisson,LogPDF,512,WORK_STEALING,2.700000 +Poisson,CDF,512,SCALAR,25.900000 +Poisson,CDF,512,VECTORIZED,24.800000 +Poisson,CDF,512,PARALLEL,25.000000 +Poisson,CDF,512,WORK_STEALING,25.000000 +Poisson,PDF,1000,SCALAR,18.900000 +Poisson,PDF,1000,VECTORIZED,11.200000 +Poisson,PDF,1000,PARALLEL,11.900000 +Poisson,PDF,1000,WORK_STEALING,11.700000 +Poisson,LogPDF,1000,SCALAR,10.200000 +Poisson,LogPDF,1000,VECTORIZED,4.900000 +Poisson,LogPDF,1000,PARALLEL,5.400000 +Poisson,LogPDF,1000,WORK_STEALING,5.100000 +Poisson,CDF,1000,SCALAR,51.000000 +Poisson,CDF,1000,VECTORIZED,48.900000 +Poisson,CDF,1000,PARALLEL,49.400000 +Poisson,CDF,1000,WORK_STEALING,49.200000 +Poisson,PDF,2000,SCALAR,37.600000 +Poisson,PDF,2000,VECTORIZED,22.200000 +Poisson,PDF,2000,PARALLEL,23.300000 +Poisson,PDF,2000,WORK_STEALING,23.100000 +Poisson,LogPDF,2000,SCALAR,47.600000 +Poisson,LogPDF,2000,VECTORIZED,9.600000 +Poisson,LogPDF,2000,PARALLEL,10.700000 +Poisson,LogPDF,2000,WORK_STEALING,10.100000 +Poisson,CDF,2000,SCALAR,102.900000 +Poisson,CDF,2000,VECTORIZED,99.000000 +Poisson,CDF,2000,PARALLEL,100.200000 +Poisson,CDF,2000,WORK_STEALING,99.700000 +Poisson,PDF,5000,SCALAR,94.400000 +Poisson,PDF,5000,VECTORIZED,55.400000 +Poisson,PDF,5000,PARALLEL,58.400000 
+Poisson,PDF,5000,WORK_STEALING,58.000000 +Poisson,LogPDF,5000,SCALAR,51.100000 +Poisson,LogPDF,5000,VECTORIZED,24.100000 +Poisson,LogPDF,5000,PARALLEL,27.100000 +Poisson,LogPDF,5000,WORK_STEALING,25.400000 +Poisson,CDF,5000,SCALAR,277.300000 +Poisson,CDF,5000,VECTORIZED,266.300000 +Poisson,CDF,5000,PARALLEL,280.500000 +Poisson,CDF,5000,WORK_STEALING,268.700000 +Poisson,PDF,10000,SCALAR,188.100000 +Poisson,PDF,10000,VECTORIZED,111.400000 +Poisson,PDF,10000,PARALLEL,85.400000 +Poisson,PDF,10000,WORK_STEALING,207.800000 +Poisson,LogPDF,10000,SCALAR,105.000000 +Poisson,LogPDF,10000,VECTORIZED,47.900000 +Poisson,LogPDF,10000,PARALLEL,58.600000 +Poisson,LogPDF,10000,WORK_STEALING,53.900000 +Poisson,CDF,10000,SCALAR,873.100000 +Poisson,CDF,10000,VECTORIZED,561.600000 +Poisson,CDF,10000,PARALLEL,143.900000 +Poisson,CDF,10000,WORK_STEALING,287.100000 +Poisson,PDF,20000,SCALAR,384.600000 +Poisson,PDF,20000,VECTORIZED,225.600000 +Poisson,PDF,20000,PARALLEL,86.600000 +Poisson,PDF,20000,WORK_STEALING,277.100000 +Poisson,LogPDF,20000,SCALAR,225.100000 +Poisson,LogPDF,20000,VECTORIZED,95.500000 +Poisson,LogPDF,20000,PARALLEL,59.300000 +Poisson,LogPDF,20000,WORK_STEALING,101.200000 +Poisson,CDF,20000,SCALAR,1158.800000 +Poisson,CDF,20000,VECTORIZED,1132.200000 +Poisson,CDF,20000,PARALLEL,224.300000 +Poisson,CDF,20000,WORK_STEALING,1604.900000 +Poisson,PDF,50000,SCALAR,985.900000 +Poisson,PDF,50000,VECTORIZED,683.600000 +Poisson,PDF,50000,PARALLEL,164.100000 +Poisson,PDF,50000,WORK_STEALING,990.000000 +Poisson,LogPDF,50000,SCALAR,579.200000 +Poisson,LogPDF,50000,VECTORIZED,297.500000 +Poisson,LogPDF,50000,PARALLEL,116.000000 +Poisson,LogPDF,50000,WORK_STEALING,142.000000 +Poisson,CDF,50000,SCALAR,2970.000000 +Poisson,CDF,50000,VECTORIZED,2824.000000 +Poisson,CDF,50000,PARALLEL,565.800000 +Poisson,CDF,50000,WORK_STEALING,1333.100000 +Poisson,PDF,100000,SCALAR,2014.600000 +Poisson,PDF,100000,VECTORIZED,1374.000000 +Poisson,PDF,100000,PARALLEL,337.800000 
+Poisson,PDF,100000,WORK_STEALING,894.200000 +Poisson,LogPDF,100000,SCALAR,1167.800000 +Poisson,LogPDF,100000,VECTORIZED,663.600000 +Poisson,LogPDF,100000,PARALLEL,176.300000 +Poisson,LogPDF,100000,WORK_STEALING,517.500000 +Poisson,CDF,100000,SCALAR,5867.200000 +Poisson,CDF,100000,VECTORIZED,5708.500000 +Poisson,CDF,100000,PARALLEL,1344.700000 +Poisson,CDF,100000,WORK_STEALING,4059.300000 +Poisson,PDF,250000,SCALAR,5409.800000 +Poisson,PDF,250000,VECTORIZED,3234.800000 +Poisson,PDF,250000,PARALLEL,736.600000 +Poisson,PDF,250000,WORK_STEALING,1073.600000 +Poisson,LogPDF,250000,SCALAR,2954.200000 +Poisson,LogPDF,250000,VECTORIZED,1721.100000 +Poisson,LogPDF,250000,PARALLEL,437.300000 +Poisson,LogPDF,250000,WORK_STEALING,386.100000 +Poisson,CDF,250000,SCALAR,15729.100000 +Poisson,CDF,250000,VECTORIZED,15045.300000 +Poisson,CDF,250000,PARALLEL,3236.800000 +Poisson,CDF,250000,WORK_STEALING,3826.900000 +Poisson,PDF,500000,SCALAR,10170.700000 +Poisson,PDF,500000,VECTORIZED,6262.700000 +Poisson,PDF,500000,PARALLEL,1603.700000 +Poisson,PDF,500000,WORK_STEALING,1567.000000 +Poisson,LogPDF,500000,SCALAR,6152.700000 +Poisson,LogPDF,500000,VECTORIZED,3851.100000 +Poisson,LogPDF,500000,PARALLEL,769.700000 +Poisson,LogPDF,500000,WORK_STEALING,2586.200000 +Poisson,CDF,500000,SCALAR,31162.600000 +Poisson,CDF,500000,VECTORIZED,29513.500000 +Poisson,CDF,500000,PARALLEL,5648.400000 +Poisson,CDF,500000,WORK_STEALING,8684.500000 +Gamma,PDF,8,SCALAR,0.300000 +Gamma,PDF,8,VECTORIZED,0.100000 +Gamma,PDF,8,PARALLEL,0.200000 +Gamma,PDF,8,WORK_STEALING,0.200000 +Gamma,LogPDF,8,SCALAR,0.200000 +Gamma,LogPDF,8,VECTORIZED,0.100000 +Gamma,LogPDF,8,PARALLEL,0.100000 +Gamma,LogPDF,8,WORK_STEALING,0.100000 +Gamma,CDF,8,SCALAR,0.600000 +Gamma,CDF,8,VECTORIZED,0.500000 +Gamma,CDF,8,PARALLEL,0.400000 +Gamma,CDF,8,WORK_STEALING,0.500000 +Gamma,PDF,16,SCALAR,0.500000 +Gamma,PDF,16,VECTORIZED,0.300000 +Gamma,PDF,16,PARALLEL,0.300000 +Gamma,PDF,16,WORK_STEALING,0.300000 +Gamma,LogPDF,16,SCALAR,0.300000 
+Gamma,LogPDF,16,VECTORIZED,0.300000 +Gamma,LogPDF,16,PARALLEL,0.100000 +Gamma,LogPDF,16,WORK_STEALING,0.200000 +Gamma,CDF,16,SCALAR,1.200000 +Gamma,CDF,16,VECTORIZED,1.200000 +Gamma,CDF,16,PARALLEL,1.000000 +Gamma,CDF,16,WORK_STEALING,1.000000 +Gamma,PDF,32,SCALAR,1.100000 +Gamma,PDF,32,VECTORIZED,0.400000 +Gamma,PDF,32,PARALLEL,0.400000 +Gamma,PDF,32,WORK_STEALING,0.500000 +Gamma,LogPDF,32,SCALAR,0.400000 +Gamma,LogPDF,32,VECTORIZED,0.300000 +Gamma,LogPDF,32,PARALLEL,0.200000 +Gamma,LogPDF,32,WORK_STEALING,0.200000 +Gamma,CDF,32,SCALAR,2.300000 +Gamma,CDF,32,VECTORIZED,2.100000 +Gamma,CDF,32,PARALLEL,2.000000 +Gamma,CDF,32,WORK_STEALING,2.000000 +Gamma,PDF,64,SCALAR,2.100000 +Gamma,PDF,64,VECTORIZED,0.700000 +Gamma,PDF,64,PARALLEL,0.900000 +Gamma,PDF,64,WORK_STEALING,0.900000 +Gamma,LogPDF,64,SCALAR,0.900000 +Gamma,LogPDF,64,VECTORIZED,0.500000 +Gamma,LogPDF,64,PARALLEL,0.400000 +Gamma,LogPDF,64,WORK_STEALING,0.400000 +Gamma,CDF,64,SCALAR,4.500000 +Gamma,CDF,64,VECTORIZED,4.000000 +Gamma,CDF,64,PARALLEL,3.900000 +Gamma,CDF,64,WORK_STEALING,3.900000 +Gamma,PDF,128,SCALAR,4.100000 +Gamma,PDF,128,VECTORIZED,1.000000 +Gamma,PDF,128,PARALLEL,1.600000 +Gamma,PDF,128,WORK_STEALING,1.700000 +Gamma,LogPDF,128,SCALAR,1.600000 +Gamma,LogPDF,128,VECTORIZED,0.700000 +Gamma,LogPDF,128,PARALLEL,0.800000 +Gamma,LogPDF,128,WORK_STEALING,0.700000 +Gamma,CDF,128,SCALAR,9.000000 +Gamma,CDF,128,VECTORIZED,7.800000 +Gamma,CDF,128,PARALLEL,7.700000 +Gamma,CDF,128,WORK_STEALING,7.700000 +Gamma,PDF,256,SCALAR,8.100000 +Gamma,PDF,256,VECTORIZED,1.700000 +Gamma,PDF,256,PARALLEL,3.300000 +Gamma,PDF,256,WORK_STEALING,3.200000 +Gamma,LogPDF,256,SCALAR,3.200000 +Gamma,LogPDF,256,VECTORIZED,1.100000 +Gamma,LogPDF,256,PARALLEL,1.600000 +Gamma,LogPDF,256,WORK_STEALING,1.500000 +Gamma,CDF,256,SCALAR,17.900000 +Gamma,CDF,256,VECTORIZED,15.400000 +Gamma,CDF,256,PARALLEL,10.200000 +Gamma,CDF,256,WORK_STEALING,10.200000 +Gamma,PDF,512,SCALAR,10.800000 +Gamma,PDF,512,VECTORIZED,2.100000 
+Gamma,PDF,512,PARALLEL,4.300000 +Gamma,PDF,512,WORK_STEALING,4.300000 +Gamma,LogPDF,512,SCALAR,4.300000 +Gamma,LogPDF,512,VECTORIZED,1.400000 +Gamma,LogPDF,512,PARALLEL,2.000000 +Gamma,LogPDF,512,WORK_STEALING,1.900000 +Gamma,CDF,512,SCALAR,23.900000 +Gamma,CDF,512,VECTORIZED,20.600000 +Gamma,CDF,512,PARALLEL,20.500000 +Gamma,CDF,512,WORK_STEALING,30.500000 +Gamma,PDF,1000,SCALAR,31.800000 +Gamma,PDF,1000,VECTORIZED,5.700000 +Gamma,PDF,1000,PARALLEL,12.400000 +Gamma,PDF,1000,WORK_STEALING,12.300000 +Gamma,LogPDF,1000,SCALAR,12.700000 +Gamma,LogPDF,1000,VECTORIZED,3.700000 +Gamma,LogPDF,1000,PARALLEL,5.800000 +Gamma,LogPDF,1000,WORK_STEALING,5.600000 +Gamma,CDF,1000,SCALAR,47.000000 +Gamma,CDF,1000,VECTORIZED,40.100000 +Gamma,CDF,1000,PARALLEL,40.000000 +Gamma,CDF,1000,WORK_STEALING,39.700000 +Gamma,PDF,2000,SCALAR,42.100000 +Gamma,PDF,2000,VECTORIZED,7.500000 +Gamma,PDF,2000,PARALLEL,16.900000 +Gamma,PDF,2000,WORK_STEALING,17.000000 +Gamma,LogPDF,2000,SCALAR,16.900000 +Gamma,LogPDF,2000,VECTORIZED,4.800000 +Gamma,LogPDF,2000,PARALLEL,7.800000 +Gamma,LogPDF,2000,WORK_STEALING,7.400000 +Gamma,CDF,2000,SCALAR,93.900000 +Gamma,CDF,2000,VECTORIZED,79.900000 +Gamma,CDF,2000,PARALLEL,80.500000 +Gamma,CDF,2000,WORK_STEALING,79.700000 +Gamma,PDF,5000,SCALAR,106.100000 +Gamma,PDF,5000,VECTORIZED,18.600000 +Gamma,PDF,5000,PARALLEL,43.100000 +Gamma,PDF,5000,WORK_STEALING,42.500000 +Gamma,LogPDF,5000,SCALAR,42.300000 +Gamma,LogPDF,5000,VECTORIZED,12.100000 +Gamma,LogPDF,5000,PARALLEL,19.400000 +Gamma,LogPDF,5000,WORK_STEALING,18.900000 +Gamma,CDF,5000,SCALAR,252.500000 +Gamma,CDF,5000,VECTORIZED,223.700000 +Gamma,CDF,5000,PARALLEL,229.700000 +Gamma,CDF,5000,WORK_STEALING,220.600000 +Gamma,PDF,10000,SCALAR,211.100000 +Gamma,PDF,10000,VECTORIZED,37.000000 +Gamma,PDF,10000,PARALLEL,88.500000 +Gamma,PDF,10000,WORK_STEALING,114.600000 +Gamma,LogPDF,10000,SCALAR,212.900000 +Gamma,LogPDF,10000,VECTORIZED,24.100000 +Gamma,LogPDF,10000,PARALLEL,43.300000 
+Gamma,LogPDF,10000,WORK_STEALING,162.100000 +Gamma,CDF,10000,SCALAR,536.300000 +Gamma,CDF,10000,VECTORIZED,481.600000 +Gamma,CDF,10000,PARALLEL,145.100000 +Gamma,CDF,10000,WORK_STEALING,324.000000 +Gamma,PDF,20000,SCALAR,425.700000 +Gamma,PDF,20000,VECTORIZED,73.800000 +Gamma,PDF,20000,PARALLEL,71.700000 +Gamma,PDF,20000,WORK_STEALING,67.400000 +Gamma,LogPDF,20000,SCALAR,173.800000 +Gamma,LogPDF,20000,VECTORIZED,48.000000 +Gamma,LogPDF,20000,PARALLEL,117.700000 +Gamma,LogPDF,20000,WORK_STEALING,117.500000 +Gamma,CDF,20000,SCALAR,1088.400000 +Gamma,CDF,20000,VECTORIZED,1000.900000 +Gamma,CDF,20000,PARALLEL,256.800000 +Gamma,CDF,20000,WORK_STEALING,1012.200000 +Gamma,PDF,50000,SCALAR,1082.200000 +Gamma,PDF,50000,VECTORIZED,187.200000 +Gamma,PDF,50000,PARALLEL,128.100000 +Gamma,PDF,50000,WORK_STEALING,426.600000 +Gamma,LogPDF,50000,SCALAR,424.000000 +Gamma,LogPDF,50000,VECTORIZED,122.000000 +Gamma,LogPDF,50000,PARALLEL,83.800000 +Gamma,LogPDF,50000,WORK_STEALING,110.200000 +Gamma,CDF,50000,SCALAR,2852.400000 +Gamma,CDF,50000,VECTORIZED,2366.800000 +Gamma,CDF,50000,PARALLEL,618.400000 +Gamma,CDF,50000,WORK_STEALING,1928.600000 +Gamma,PDF,100000,SCALAR,2124.300000 +Gamma,PDF,100000,VECTORIZED,375.600000 +Gamma,PDF,100000,PARALLEL,277.400000 +Gamma,PDF,100000,WORK_STEALING,946.200000 +Gamma,LogPDF,100000,SCALAR,913.700000 +Gamma,LogPDF,100000,VECTORIZED,247.700000 +Gamma,LogPDF,100000,PARALLEL,148.700000 +Gamma,LogPDF,100000,WORK_STEALING,373.900000 +Gamma,CDF,100000,SCALAR,5343.400000 +Gamma,CDF,100000,VECTORIZED,4907.000000 +Gamma,CDF,100000,PARALLEL,1032.300000 +Gamma,CDF,100000,WORK_STEALING,1954.500000 +Gamma,PDF,250000,SCALAR,5380.300000 +Gamma,PDF,250000,VECTORIZED,1296.700000 +Gamma,PDF,250000,PARALLEL,624.400000 +Gamma,PDF,250000,WORK_STEALING,3253.900000 +Gamma,LogPDF,250000,SCALAR,2133.000000 +Gamma,LogPDF,250000,VECTORIZED,1102.300000 +Gamma,LogPDF,250000,PARALLEL,346.100000 +Gamma,LogPDF,250000,WORK_STEALING,655.200000 +Gamma,CDF,250000,SCALAR,13709.600000 
+Gamma,CDF,250000,VECTORIZED,12701.000000 +Gamma,CDF,250000,PARALLEL,2586.100000 +Gamma,CDF,250000,WORK_STEALING,4236.500000 +Gamma,PDF,500000,SCALAR,11730.400000 +Gamma,PDF,500000,VECTORIZED,3209.500000 +Gamma,PDF,500000,PARALLEL,1218.500000 +Gamma,PDF,500000,WORK_STEALING,4671.000000 +Gamma,LogPDF,500000,SCALAR,4298.300000 +Gamma,LogPDF,500000,VECTORIZED,2645.000000 +Gamma,LogPDF,500000,PARALLEL,664.700000 +Gamma,LogPDF,500000,WORK_STEALING,1128.900000 +Gamma,CDF,500000,SCALAR,28113.700000 +Gamma,CDF,500000,VECTORIZED,26300.800000 +Gamma,CDF,500000,PARALLEL,5087.100000 +Gamma,CDF,500000,WORK_STEALING,6076.900000 +StudentT,PDF,8,SCALAR,0.200000 +StudentT,PDF,8,VECTORIZED,0.100000 +StudentT,PDF,8,PARALLEL,0.300000 +StudentT,PDF,8,WORK_STEALING,0.200000 +StudentT,LogPDF,8,SCALAR,0.100000 +StudentT,LogPDF,8,VECTORIZED,0.100000 +StudentT,LogPDF,8,PARALLEL,0.200000 +StudentT,LogPDF,8,WORK_STEALING,0.200000 +StudentT,CDF,8,SCALAR,1.300000 +StudentT,CDF,8,VECTORIZED,1.200000 +StudentT,CDF,8,PARALLEL,1.300000 +StudentT,CDF,8,WORK_STEALING,1.200000 +StudentT,PDF,16,SCALAR,0.400000 +StudentT,PDF,16,VECTORIZED,0.200000 +StudentT,PDF,16,PARALLEL,0.400000 +StudentT,PDF,16,WORK_STEALING,0.400000 +StudentT,LogPDF,16,SCALAR,0.200000 +StudentT,LogPDF,16,VECTORIZED,0.200000 +StudentT,LogPDF,16,PARALLEL,0.300000 +StudentT,LogPDF,16,WORK_STEALING,0.200000 +StudentT,CDF,16,SCALAR,3.000000 +StudentT,CDF,16,VECTORIZED,2.600000 +StudentT,CDF,16,PARALLEL,2.700000 +StudentT,CDF,16,WORK_STEALING,2.800000 +StudentT,PDF,32,SCALAR,0.700000 +StudentT,PDF,32,VECTORIZED,0.300000 +StudentT,PDF,32,PARALLEL,0.600000 +StudentT,PDF,32,WORK_STEALING,0.600000 +StudentT,LogPDF,32,SCALAR,0.400000 +StudentT,LogPDF,32,VECTORIZED,0.200000 +StudentT,LogPDF,32,PARALLEL,0.400000 +StudentT,LogPDF,32,WORK_STEALING,0.400000 +StudentT,CDF,32,SCALAR,5.500000 +StudentT,CDF,32,VECTORIZED,5.400000 +StudentT,CDF,32,PARALLEL,5.400000 +StudentT,CDF,32,WORK_STEALING,5.300000 +StudentT,PDF,64,SCALAR,1.400000 
+StudentT,PDF,64,VECTORIZED,0.400000 +StudentT,PDF,64,PARALLEL,0.900000 +StudentT,PDF,64,WORK_STEALING,1.000000 +StudentT,LogPDF,64,SCALAR,0.800000 +StudentT,LogPDF,64,VECTORIZED,0.300000 +StudentT,LogPDF,64,PARALLEL,0.600000 +StudentT,LogPDF,64,WORK_STEALING,0.500000 +StudentT,CDF,64,SCALAR,11.500000 +StudentT,CDF,64,VECTORIZED,10.800000 +StudentT,CDF,64,PARALLEL,10.600000 +StudentT,CDF,64,WORK_STEALING,11.100000 +StudentT,PDF,128,SCALAR,2.700000 +StudentT,PDF,128,VECTORIZED,0.700000 +StudentT,PDF,128,PARALLEL,1.800000 +StudentT,PDF,128,WORK_STEALING,1.800000 +StudentT,LogPDF,128,SCALAR,1.800000 +StudentT,LogPDF,128,VECTORIZED,0.500000 +StudentT,LogPDF,128,PARALLEL,0.900000 +StudentT,LogPDF,128,WORK_STEALING,0.900000 +StudentT,CDF,128,SCALAR,22.600000 +StudentT,CDF,128,VECTORIZED,21.700000 +StudentT,CDF,128,PARALLEL,22.100000 +StudentT,CDF,128,WORK_STEALING,21.100000 +StudentT,PDF,256,SCALAR,5.300000 +StudentT,PDF,256,VECTORIZED,1.000000 +StudentT,PDF,256,PARALLEL,2.300000 +StudentT,PDF,256,WORK_STEALING,2.200000 +StudentT,LogPDF,256,SCALAR,2.200000 +StudentT,LogPDF,256,VECTORIZED,0.800000 +StudentT,LogPDF,256,PARALLEL,1.600000 +StudentT,LogPDF,256,WORK_STEALING,1.600000 +StudentT,CDF,256,SCALAR,32.400000 +StudentT,CDF,256,VECTORIZED,31.500000 +StudentT,CDF,256,PARALLEL,31.400000 +StudentT,CDF,256,WORK_STEALING,30.800000 +StudentT,PDF,512,SCALAR,6.900000 +StudentT,PDF,512,VECTORIZED,1.800000 +StudentT,PDF,512,PARALLEL,4.300000 +StudentT,PDF,512,WORK_STEALING,4.300000 +StudentT,LogPDF,512,SCALAR,4.200000 +StudentT,LogPDF,512,VECTORIZED,1.000000 +StudentT,LogPDF,512,PARALLEL,2.100000 +StudentT,LogPDF,512,WORK_STEALING,2.200000 +StudentT,CDF,512,SCALAR,65.600000 +StudentT,CDF,512,VECTORIZED,61.800000 +StudentT,CDF,512,PARALLEL,61.800000 +StudentT,CDF,512,WORK_STEALING,61.800000 +StudentT,PDF,1000,SCALAR,13.600000 +StudentT,PDF,1000,VECTORIZED,3.300000 +StudentT,PDF,1000,PARALLEL,8.300000 +StudentT,PDF,1000,WORK_STEALING,8.400000 +StudentT,LogPDF,1000,SCALAR,8.300000 
+StudentT,LogPDF,1000,VECTORIZED,2.000000 +StudentT,LogPDF,1000,PARALLEL,4.100000 +StudentT,LogPDF,1000,WORK_STEALING,4.200000 +StudentT,CDF,1000,SCALAR,131.300000 +StudentT,CDF,1000,VECTORIZED,125.100000 +StudentT,CDF,1000,PARALLEL,124.200000 +StudentT,CDF,1000,WORK_STEALING,124.200000 +StudentT,PDF,2000,SCALAR,28.200000 +StudentT,PDF,2000,VECTORIZED,6.500000 +StudentT,PDF,2000,PARALLEL,16.500000 +StudentT,PDF,2000,WORK_STEALING,16.500000 +StudentT,LogPDF,2000,SCALAR,16.700000 +StudentT,LogPDF,2000,VECTORIZED,4.000000 +StudentT,LogPDF,2000,PARALLEL,8.000000 +StudentT,LogPDF,2000,WORK_STEALING,8.000000 +StudentT,CDF,2000,SCALAR,265.900000 +StudentT,CDF,2000,VECTORIZED,250.400000 +StudentT,CDF,2000,PARALLEL,250.400000 +StudentT,CDF,2000,WORK_STEALING,250.300000 +StudentT,PDF,5000,SCALAR,73.600000 +StudentT,PDF,5000,VECTORIZED,16.500000 +StudentT,PDF,5000,PARALLEL,41.200000 +StudentT,PDF,5000,WORK_STEALING,40.900000 +StudentT,LogPDF,5000,SCALAR,43.000000 +StudentT,LogPDF,5000,VECTORIZED,10.000000 +StudentT,LogPDF,5000,PARALLEL,19.800000 +StudentT,LogPDF,5000,WORK_STEALING,19.900000 +StudentT,CDF,5000,SCALAR,683.100000 +StudentT,CDF,5000,VECTORIZED,641.000000 +StudentT,CDF,5000,PARALLEL,640.400000 +StudentT,CDF,5000,WORK_STEALING,647.200000 +StudentT,PDF,10000,SCALAR,152.700000 +StudentT,PDF,10000,VECTORIZED,33.500000 +StudentT,PDF,10000,PARALLEL,157.600000 +StudentT,PDF,10000,WORK_STEALING,60.500000 +StudentT,LogPDF,10000,SCALAR,105.600000 +StudentT,LogPDF,10000,VECTORIZED,20.100000 +StudentT,LogPDF,10000,PARALLEL,90.500000 +StudentT,LogPDF,10000,WORK_STEALING,95.800000 +StudentT,CDF,10000,SCALAR,1367.200000 +StudentT,CDF,10000,VECTORIZED,1294.200000 +StudentT,CDF,10000,PARALLEL,1289.800000 +StudentT,CDF,10000,WORK_STEALING,1298.300000 +StudentT,PDF,20000,SCALAR,555.500000 +StudentT,PDF,20000,VECTORIZED,72.700000 +StudentT,PDF,20000,PARALLEL,72.100000 +StudentT,PDF,20000,WORK_STEALING,134.100000 +StudentT,LogPDF,20000,SCALAR,237.800000 
+StudentT,LogPDF,20000,VECTORIZED,39.900000 +StudentT,LogPDF,20000,PARALLEL,63.600000 +StudentT,LogPDF,20000,WORK_STEALING,103.800000 +StudentT,CDF,20000,SCALAR,2727.200000 +StudentT,CDF,20000,VECTORIZED,2621.900000 +StudentT,CDF,20000,PARALLEL,2590.800000 +StudentT,CDF,20000,WORK_STEALING,2637.300000 +StudentT,PDF,50000,SCALAR,809.500000 +StudentT,PDF,50000,VECTORIZED,166.200000 +StudentT,PDF,50000,PARALLEL,156.900000 +StudentT,PDF,50000,WORK_STEALING,134.200000 +StudentT,LogPDF,50000,SCALAR,850.700000 +StudentT,LogPDF,50000,VECTORIZED,101.100000 +StudentT,LogPDF,50000,PARALLEL,111.300000 +StudentT,LogPDF,50000,WORK_STEALING,98.500000 +StudentT,CDF,50000,SCALAR,7067.000000 +StudentT,CDF,50000,VECTORIZED,6694.900000 +StudentT,CDF,50000,PARALLEL,6742.900000 +StudentT,CDF,50000,WORK_STEALING,6588.800000 +StudentT,PDF,100000,SCALAR,1968.300000 +StudentT,PDF,100000,VECTORIZED,349.900000 +StudentT,PDF,100000,PARALLEL,287.800000 +StudentT,PDF,100000,WORK_STEALING,306.000000 +StudentT,LogPDF,100000,SCALAR,1242.600000 +StudentT,LogPDF,100000,VECTORIZED,209.400000 +StudentT,LogPDF,100000,PARALLEL,233.800000 +StudentT,LogPDF,100000,WORK_STEALING,238.100000 +StudentT,CDF,100000,SCALAR,14498.400000 +StudentT,CDF,100000,VECTORIZED,13556.000000 +StudentT,CDF,100000,PARALLEL,13908.500000 +StudentT,CDF,100000,WORK_STEALING,13545.200000 +StudentT,PDF,250000,SCALAR,4182.000000 +StudentT,PDF,250000,VECTORIZED,880.300000 +StudentT,PDF,250000,PARALLEL,546.100000 +StudentT,PDF,250000,WORK_STEALING,544.200000 +StudentT,LogPDF,250000,SCALAR,3116.100000 +StudentT,LogPDF,250000,VECTORIZED,546.900000 +StudentT,LogPDF,250000,PARALLEL,400.200000 +StudentT,LogPDF,250000,WORK_STEALING,396.500000 +StudentT,CDF,250000,SCALAR,35694.700000 +StudentT,CDF,250000,VECTORIZED,34152.100000 +StudentT,CDF,250000,PARALLEL,33262.500000 +StudentT,CDF,250000,WORK_STEALING,34161.400000 +StudentT,PDF,500000,SCALAR,9291.300000 +StudentT,PDF,500000,VECTORIZED,1822.900000 +StudentT,PDF,500000,PARALLEL,1167.800000 
+StudentT,PDF,500000,WORK_STEALING,1186.100000 +StudentT,LogPDF,500000,SCALAR,6325.200000 +StudentT,LogPDF,500000,VECTORIZED,1295.100000 +StudentT,LogPDF,500000,PARALLEL,975.400000 +StudentT,LogPDF,500000,WORK_STEALING,993.100000 +StudentT,CDF,500000,SCALAR,71873.500000 +StudentT,CDF,500000,VECTORIZED,67776.800000 +StudentT,CDF,500000,PARALLEL,68133.300000 +StudentT,CDF,500000,WORK_STEALING,68868.200000 +Beta,PDF,8,SCALAR,0.200000 +Beta,PDF,8,VECTORIZED,0.200000 +Beta,PDF,8,PARALLEL,0.300000 +Beta,PDF,8,WORK_STEALING,0.300000 +Beta,LogPDF,8,SCALAR,0.100000 +Beta,LogPDF,8,VECTORIZED,0.100000 +Beta,LogPDF,8,PARALLEL,0.300000 +Beta,LogPDF,8,WORK_STEALING,0.200000 +Beta,CDF,8,SCALAR,1.100000 +Beta,CDF,8,VECTORIZED,1.000000 +Beta,CDF,8,PARALLEL,1.100000 +Beta,CDF,8,WORK_STEALING,1.100000 +Beta,PDF,16,SCALAR,0.500000 +Beta,PDF,16,VECTORIZED,0.400000 +Beta,PDF,16,PARALLEL,0.400000 +Beta,PDF,16,WORK_STEALING,0.700000 +Beta,LogPDF,16,SCALAR,0.500000 +Beta,LogPDF,16,VECTORIZED,0.400000 +Beta,LogPDF,16,PARALLEL,0.400000 +Beta,LogPDF,16,WORK_STEALING,0.300000 +Beta,CDF,16,SCALAR,2.400000 +Beta,CDF,16,VECTORIZED,2.300000 +Beta,CDF,16,PARALLEL,2.300000 +Beta,CDF,16,WORK_STEALING,2.500000 +Beta,PDF,32,SCALAR,0.800000 +Beta,PDF,32,VECTORIZED,0.600000 +Beta,PDF,32,PARALLEL,0.700000 +Beta,PDF,32,WORK_STEALING,0.700000 +Beta,LogPDF,32,SCALAR,0.600000 +Beta,LogPDF,32,VECTORIZED,0.500000 +Beta,LogPDF,32,PARALLEL,0.600000 +Beta,LogPDF,32,WORK_STEALING,0.500000 +Beta,CDF,32,SCALAR,4.700000 +Beta,CDF,32,VECTORIZED,4.400000 +Beta,CDF,32,PARALLEL,4.600000 +Beta,CDF,32,WORK_STEALING,4.700000 +Beta,PDF,64,SCALAR,2.400000 +Beta,PDF,64,VECTORIZED,1.500000 +Beta,PDF,64,PARALLEL,1.900000 +Beta,PDF,64,WORK_STEALING,1.700000 +Beta,LogPDF,64,SCALAR,1.800000 +Beta,LogPDF,64,VECTORIZED,1.200000 +Beta,LogPDF,64,PARALLEL,1.300000 +Beta,LogPDF,64,WORK_STEALING,1.400000 +Beta,CDF,64,SCALAR,8.900000 +Beta,CDF,64,VECTORIZED,8.000000 +Beta,CDF,64,PARALLEL,12.800000 +Beta,CDF,64,WORK_STEALING,8.900000 
+Beta,PDF,128,SCALAR,3.000000 +Beta,PDF,128,VECTORIZED,1.600000 +Beta,PDF,128,PARALLEL,2.300000 +Beta,PDF,128,WORK_STEALING,2.300000 +Beta,LogPDF,128,SCALAR,2.100000 +Beta,LogPDF,128,VECTORIZED,1.300000 +Beta,LogPDF,128,PARALLEL,1.500000 +Beta,LogPDF,128,WORK_STEALING,2.300000 +Beta,CDF,128,SCALAR,18.900000 +Beta,CDF,128,VECTORIZED,17.000000 +Beta,CDF,128,PARALLEL,18.900000 +Beta,CDF,128,WORK_STEALING,12.600000 +Beta,PDF,256,SCALAR,4.100000 +Beta,PDF,256,VECTORIZED,1.800000 +Beta,PDF,256,PARALLEL,2.900000 +Beta,PDF,256,WORK_STEALING,2.900000 +Beta,LogPDF,256,SCALAR,2.800000 +Beta,LogPDF,256,VECTORIZED,1.500000 +Beta,LogPDF,256,PARALLEL,1.900000 +Beta,LogPDF,256,WORK_STEALING,1.900000 +Beta,CDF,256,SCALAR,25.600000 +Beta,CDF,256,VECTORIZED,23.100000 +Beta,CDF,256,PARALLEL,25.500000 +Beta,CDF,256,WORK_STEALING,25.500000 +Beta,PDF,512,SCALAR,8.000000 +Beta,PDF,512,VECTORIZED,3.500000 +Beta,PDF,512,PARALLEL,5.700000 +Beta,PDF,512,WORK_STEALING,5.700000 +Beta,LogPDF,512,SCALAR,5.500000 +Beta,LogPDF,512,VECTORIZED,2.800000 +Beta,LogPDF,512,PARALLEL,3.500000 +Beta,LogPDF,512,WORK_STEALING,3.600000 +Beta,CDF,512,SCALAR,51.000000 +Beta,CDF,512,VECTORIZED,46.200000 +Beta,CDF,512,PARALLEL,50.900000 +Beta,CDF,512,WORK_STEALING,50.900000 +Beta,PDF,1000,SCALAR,15.600000 +Beta,PDF,1000,VECTORIZED,6.600000 +Beta,PDF,1000,PARALLEL,11.000000 +Beta,PDF,1000,WORK_STEALING,10.900000 +Beta,LogPDF,1000,SCALAR,10.800000 +Beta,LogPDF,1000,VECTORIZED,5.200000 +Beta,LogPDF,1000,PARALLEL,6.800000 +Beta,LogPDF,1000,WORK_STEALING,6.800000 +Beta,CDF,1000,SCALAR,98.900000 +Beta,CDF,1000,VECTORIZED,90.500000 +Beta,CDF,1000,PARALLEL,99.400000 +Beta,CDF,1000,WORK_STEALING,99.100000 +Beta,PDF,2000,SCALAR,31.600000 +Beta,PDF,2000,VECTORIZED,12.800000 +Beta,PDF,2000,PARALLEL,21.900000 +Beta,PDF,2000,WORK_STEALING,22.000000 +Beta,LogPDF,2000,SCALAR,21.600000 +Beta,LogPDF,2000,VECTORIZED,10.200000 +Beta,LogPDF,2000,PARALLEL,13.500000 +Beta,LogPDF,2000,WORK_STEALING,13.500000 
+Beta,CDF,2000,SCALAR,203.500000 +Beta,CDF,2000,VECTORIZED,185.600000 +Beta,CDF,2000,PARALLEL,205.100000 +Beta,CDF,2000,WORK_STEALING,203.300000 +Beta,PDF,5000,SCALAR,78.400000 +Beta,PDF,5000,VECTORIZED,32.500000 +Beta,PDF,5000,PARALLEL,54.300000 +Beta,PDF,5000,WORK_STEALING,54.300000 +Beta,LogPDF,5000,SCALAR,53.700000 +Beta,LogPDF,5000,VECTORIZED,25.800000 +Beta,LogPDF,5000,PARALLEL,33.300000 +Beta,LogPDF,5000,WORK_STEALING,33.400000 +Beta,CDF,5000,SCALAR,501.700000 +Beta,CDF,5000,VECTORIZED,461.100000 +Beta,CDF,5000,PARALLEL,505.200000 +Beta,CDF,5000,WORK_STEALING,505.600000 +Beta,PDF,10000,SCALAR,157.300000 +Beta,PDF,10000,VECTORIZED,65.000000 +Beta,PDF,10000,PARALLEL,177.700000 +Beta,PDF,10000,WORK_STEALING,154.000000 +Beta,LogPDF,10000,SCALAR,186.800000 +Beta,LogPDF,10000,VECTORIZED,101.400000 +Beta,LogPDF,10000,PARALLEL,144.000000 +Beta,LogPDF,10000,WORK_STEALING,153.600000 +Beta,CDF,10000,SCALAR,1013.000000 +Beta,CDF,10000,VECTORIZED,944.300000 +Beta,CDF,10000,PARALLEL,1020.600000 +Beta,CDF,10000,WORK_STEALING,1025.900000 +Beta,PDF,20000,SCALAR,341.500000 +Beta,PDF,20000,VECTORIZED,181.100000 +Beta,PDF,20000,PARALLEL,299.700000 +Beta,PDF,20000,WORK_STEALING,269.900000 +Beta,LogPDF,20000,SCALAR,238.500000 +Beta,LogPDF,20000,VECTORIZED,105.500000 +Beta,LogPDF,20000,PARALLEL,258.200000 +Beta,LogPDF,20000,WORK_STEALING,245.300000 +Beta,CDF,20000,SCALAR,2066.000000 +Beta,CDF,20000,VECTORIZED,1918.200000 +Beta,CDF,20000,PARALLEL,2253.700000 +Beta,CDF,20000,WORK_STEALING,2101.800000 +Beta,PDF,50000,SCALAR,848.000000 +Beta,PDF,50000,VECTORIZED,369.300000 +Beta,PDF,50000,PARALLEL,650.000000 +Beta,PDF,50000,WORK_STEALING,654.800000 +Beta,LogPDF,50000,SCALAR,751.500000 +Beta,LogPDF,50000,VECTORIZED,284.500000 +Beta,LogPDF,50000,PARALLEL,609.800000 +Beta,LogPDF,50000,WORK_STEALING,593.600000 +Beta,CDF,50000,SCALAR,5197.100000 +Beta,CDF,50000,VECTORIZED,4867.000000 +Beta,CDF,50000,PARALLEL,5186.100000 +Beta,CDF,50000,WORK_STEALING,5133.100000 
+Beta,PDF,100000,SCALAR,1741.700000 +Beta,PDF,100000,VECTORIZED,835.500000 +Beta,PDF,100000,PARALLEL,1256.200000 +Beta,PDF,100000,WORK_STEALING,1255.800000 +Beta,LogPDF,100000,SCALAR,1266.900000 +Beta,LogPDF,100000,VECTORIZED,980.600000 +Beta,LogPDF,100000,PARALLEL,1167.000000 +Beta,LogPDF,100000,WORK_STEALING,1159.200000 +Beta,CDF,100000,SCALAR,10620.100000 +Beta,CDF,100000,VECTORIZED,9742.300000 +Beta,CDF,100000,PARALLEL,10662.500000 +Beta,CDF,100000,WORK_STEALING,10600.000000 +Beta,PDF,250000,SCALAR,4498.800000 +Beta,PDF,250000,VECTORIZED,2744.400000 +Beta,PDF,250000,PARALLEL,3201.500000 +Beta,PDF,250000,WORK_STEALING,3295.300000 +Beta,LogPDF,250000,SCALAR,3220.700000 +Beta,LogPDF,250000,VECTORIZED,2005.500000 +Beta,LogPDF,250000,PARALLEL,2848.100000 +Beta,LogPDF,250000,WORK_STEALING,2893.400000 +Beta,CDF,250000,SCALAR,26920.100000 +Beta,CDF,250000,VECTORIZED,24705.200000 +Beta,CDF,250000,PARALLEL,27470.900000 +Beta,CDF,250000,WORK_STEALING,27261.100000 +Beta,PDF,500000,SCALAR,9004.900000 +Beta,PDF,500000,VECTORIZED,5384.400000 +Beta,PDF,500000,PARALLEL,6415.700000 +Beta,PDF,500000,WORK_STEALING,6678.600000 +Beta,LogPDF,500000,SCALAR,6434.600000 +Beta,LogPDF,500000,VECTORIZED,4433.400000 +Beta,LogPDF,500000,PARALLEL,5889.000000 +Beta,LogPDF,500000,WORK_STEALING,5721.700000 +Beta,CDF,500000,SCALAR,53781.200000 +Beta,CDF,500000,VECTORIZED,55078.900000 +Beta,CDF,500000,PARALLEL,54193.100000 +Beta,CDF,500000,WORK_STEALING,55158.200000 +ChiSquared,PDF,8,SCALAR,0.200000 +ChiSquared,PDF,8,VECTORIZED,0.100000 +ChiSquared,PDF,8,PARALLEL,0.100000 +ChiSquared,PDF,8,WORK_STEALING,0.100000 +ChiSquared,LogPDF,8,SCALAR,0.100000 +ChiSquared,LogPDF,8,VECTORIZED,0.100000 +ChiSquared,LogPDF,8,PARALLEL,0.100000 +ChiSquared,LogPDF,8,WORK_STEALING,0.100000 +ChiSquared,CDF,8,SCALAR,0.400000 +ChiSquared,CDF,8,VECTORIZED,0.500000 +ChiSquared,CDF,8,PARALLEL,0.400000 +ChiSquared,CDF,8,WORK_STEALING,0.300000 +ChiSquared,PDF,16,SCALAR,0.400000 +ChiSquared,PDF,16,VECTORIZED,0.200000 
+ChiSquared,PDF,16,PARALLEL,0.200000 +ChiSquared,PDF,16,WORK_STEALING,0.200000 +ChiSquared,LogPDF,16,SCALAR,0.200000 +ChiSquared,LogPDF,16,VECTORIZED,0.200000 +ChiSquared,LogPDF,16,PARALLEL,0.100000 +ChiSquared,LogPDF,16,WORK_STEALING,0.100000 +ChiSquared,CDF,16,SCALAR,0.800000 +ChiSquared,CDF,16,VECTORIZED,0.800000 +ChiSquared,CDF,16,PARALLEL,0.700000 +ChiSquared,CDF,16,WORK_STEALING,0.700000 +ChiSquared,PDF,32,SCALAR,0.700000 +ChiSquared,PDF,32,VECTORIZED,0.300000 +ChiSquared,PDF,32,PARALLEL,0.300000 +ChiSquared,PDF,32,WORK_STEALING,0.300000 +ChiSquared,LogPDF,32,SCALAR,0.300000 +ChiSquared,LogPDF,32,VECTORIZED,0.200000 +ChiSquared,LogPDF,32,PARALLEL,0.200000 +ChiSquared,LogPDF,32,WORK_STEALING,0.100000 +ChiSquared,CDF,32,SCALAR,1.600000 +ChiSquared,CDF,32,VECTORIZED,1.400000 +ChiSquared,CDF,32,PARALLEL,1.400000 +ChiSquared,CDF,32,WORK_STEALING,1.400000 +ChiSquared,PDF,64,SCALAR,1.400000 +ChiSquared,PDF,64,VECTORIZED,0.400000 +ChiSquared,PDF,64,PARALLEL,0.600000 +ChiSquared,PDF,64,WORK_STEALING,0.600000 +ChiSquared,LogPDF,64,SCALAR,0.500000 +ChiSquared,LogPDF,64,VECTORIZED,0.300000 +ChiSquared,LogPDF,64,PARALLEL,0.300000 +ChiSquared,LogPDF,64,WORK_STEALING,0.200000 +ChiSquared,CDF,64,SCALAR,3.000000 +ChiSquared,CDF,64,VECTORIZED,2.700000 +ChiSquared,CDF,64,PARALLEL,2.700000 +ChiSquared,CDF,64,WORK_STEALING,2.700000 +ChiSquared,PDF,128,SCALAR,2.700000 +ChiSquared,PDF,128,VECTORIZED,0.700000 +ChiSquared,PDF,128,PARALLEL,1.100000 +ChiSquared,PDF,128,WORK_STEALING,1.100000 +ChiSquared,LogPDF,128,SCALAR,1.100000 +ChiSquared,LogPDF,128,VECTORIZED,0.500000 +ChiSquared,LogPDF,128,PARALLEL,0.500000 +ChiSquared,LogPDF,128,WORK_STEALING,0.500000 +ChiSquared,CDF,128,SCALAR,6.100000 +ChiSquared,CDF,128,VECTORIZED,5.400000 +ChiSquared,CDF,128,PARALLEL,5.300000 +ChiSquared,CDF,128,WORK_STEALING,5.300000 +ChiSquared,PDF,256,SCALAR,5.400000 +ChiSquared,PDF,256,VECTORIZED,1.100000 +ChiSquared,PDF,256,PARALLEL,2.100000 +ChiSquared,PDF,256,WORK_STEALING,2.100000 
+ChiSquared,LogPDF,256,SCALAR,2.100000 +ChiSquared,LogPDF,256,VECTORIZED,0.900000 +ChiSquared,LogPDF,256,PARALLEL,1.000000 +ChiSquared,LogPDF,256,WORK_STEALING,1.000000 +ChiSquared,CDF,256,SCALAR,12.100000 +ChiSquared,CDF,256,VECTORIZED,10.500000 +ChiSquared,CDF,256,PARALLEL,10.400000 +ChiSquared,CDF,256,WORK_STEALING,10.400000 +ChiSquared,PDF,512,SCALAR,10.800000 +ChiSquared,PDF,512,VECTORIZED,2.000000 +ChiSquared,PDF,512,PARALLEL,4.400000 +ChiSquared,PDF,512,WORK_STEALING,4.900000 +ChiSquared,LogPDF,512,SCALAR,4.300000 +ChiSquared,LogPDF,512,VECTORIZED,1.300000 +ChiSquared,LogPDF,512,PARALLEL,2.000000 +ChiSquared,LogPDF,512,WORK_STEALING,1.900000 +ChiSquared,CDF,512,SCALAR,24.200000 +ChiSquared,CDF,512,VECTORIZED,21.100000 +ChiSquared,CDF,512,PARALLEL,20.800000 +ChiSquared,CDF,512,WORK_STEALING,20.900000 +ChiSquared,PDF,1000,SCALAR,21.000000 +ChiSquared,PDF,1000,VECTORIZED,3.700000 +ChiSquared,PDF,1000,PARALLEL,8.200000 +ChiSquared,PDF,1000,WORK_STEALING,8.300000 +ChiSquared,LogPDF,1000,SCALAR,8.400000 +ChiSquared,LogPDF,1000,VECTORIZED,2.500000 +ChiSquared,LogPDF,1000,PARALLEL,3.900000 +ChiSquared,LogPDF,1000,WORK_STEALING,3.700000 +ChiSquared,CDF,1000,SCALAR,47.100000 +ChiSquared,CDF,1000,VECTORIZED,41.000000 +ChiSquared,CDF,1000,PARALLEL,40.900000 +ChiSquared,CDF,1000,WORK_STEALING,40.600000 +ChiSquared,PDF,2000,SCALAR,42.100000 +ChiSquared,PDF,2000,VECTORIZED,7.400000 +ChiSquared,PDF,2000,PARALLEL,16.800000 +ChiSquared,PDF,2000,WORK_STEALING,16.700000 +ChiSquared,LogPDF,2000,SCALAR,16.600000 +ChiSquared,LogPDF,2000,VECTORIZED,4.800000 +ChiSquared,LogPDF,2000,PARALLEL,7.800000 +ChiSquared,LogPDF,2000,WORK_STEALING,7.400000 +ChiSquared,CDF,2000,SCALAR,94.300000 +ChiSquared,CDF,2000,VECTORIZED,82.000000 +ChiSquared,CDF,2000,PARALLEL,82.700000 +ChiSquared,CDF,2000,WORK_STEALING,126.000000 +ChiSquared,PDF,5000,SCALAR,168.600000 +ChiSquared,PDF,5000,VECTORIZED,23.400000 +ChiSquared,PDF,5000,PARALLEL,66.100000 +ChiSquared,PDF,5000,WORK_STEALING,42.600000 
+ChiSquared,LogPDF,5000,SCALAR,41.700000 +ChiSquared,LogPDF,5000,VECTORIZED,12.100000 +ChiSquared,LogPDF,5000,PARALLEL,19.400000 +ChiSquared,LogPDF,5000,WORK_STEALING,18.800000 +ChiSquared,CDF,5000,SCALAR,254.700000 +ChiSquared,CDF,5000,VECTORIZED,235.400000 +ChiSquared,CDF,5000,PARALLEL,240.500000 +ChiSquared,CDF,5000,WORK_STEALING,227.300000 +ChiSquared,PDF,10000,SCALAR,213.600000 +ChiSquared,PDF,10000,VECTORIZED,37.200000 +ChiSquared,PDF,10000,PARALLEL,48.400000 +ChiSquared,PDF,10000,WORK_STEALING,76.700000 +ChiSquared,LogPDF,10000,SCALAR,125.400000 +ChiSquared,LogPDF,10000,VECTORIZED,36.600000 +ChiSquared,LogPDF,10000,PARALLEL,117.800000 +ChiSquared,LogPDF,10000,WORK_STEALING,74.500000 +ChiSquared,CDF,10000,SCALAR,535.200000 +ChiSquared,CDF,10000,VECTORIZED,509.000000 +ChiSquared,CDF,10000,PARALLEL,163.400000 +ChiSquared,CDF,10000,WORK_STEALING,379.700000 +ChiSquared,PDF,20000,SCALAR,423.100000 +ChiSquared,PDF,20000,VECTORIZED,74.800000 +ChiSquared,PDF,20000,PARALLEL,129.800000 +ChiSquared,PDF,20000,WORK_STEALING,171.400000 +ChiSquared,LogPDF,20000,SCALAR,167.400000 +ChiSquared,LogPDF,20000,VECTORIZED,48.000000 +ChiSquared,LogPDF,20000,PARALLEL,123.700000 +ChiSquared,LogPDF,20000,WORK_STEALING,72.000000 +ChiSquared,CDF,20000,SCALAR,1044.800000 +ChiSquared,CDF,20000,VECTORIZED,992.800000 +ChiSquared,CDF,20000,PARALLEL,280.700000 +ChiSquared,CDF,20000,WORK_STEALING,865.700000 +ChiSquared,PDF,50000,SCALAR,1068.900000 +ChiSquared,PDF,50000,VECTORIZED,194.000000 +ChiSquared,PDF,50000,PARALLEL,127.900000 +ChiSquared,PDF,50000,WORK_STEALING,483.500000 +ChiSquared,LogPDF,50000,SCALAR,753.600000 +ChiSquared,LogPDF,50000,VECTORIZED,122.000000 +ChiSquared,LogPDF,50000,PARALLEL,142.900000 +ChiSquared,LogPDF,50000,WORK_STEALING,112.400000 +ChiSquared,CDF,50000,SCALAR,2686.000000 +ChiSquared,CDF,50000,VECTORIZED,2521.200000 +ChiSquared,CDF,50000,PARALLEL,592.000000 +ChiSquared,CDF,50000,WORK_STEALING,616.600000 +ChiSquared,PDF,100000,SCALAR,2160.800000 
+ChiSquared,PDF,100000,VECTORIZED,385.600000 +ChiSquared,PDF,100000,PARALLEL,208.700000 +ChiSquared,PDF,100000,WORK_STEALING,221.800000 +ChiSquared,LogPDF,100000,SCALAR,851.100000 +ChiSquared,LogPDF,100000,VECTORIZED,244.400000 +ChiSquared,LogPDF,100000,PARALLEL,142.900000 +ChiSquared,LogPDF,100000,WORK_STEALING,283.200000 +ChiSquared,CDF,100000,SCALAR,5470.900000 +ChiSquared,CDF,100000,VECTORIZED,4976.200000 +ChiSquared,CDF,100000,PARALLEL,1022.500000 +ChiSquared,CDF,100000,WORK_STEALING,3199.600000 +ChiSquared,PDF,250000,SCALAR,5528.400000 +ChiSquared,PDF,250000,VECTORIZED,1453.500000 +ChiSquared,PDF,250000,PARALLEL,522.600000 +ChiSquared,PDF,250000,WORK_STEALING,1386.200000 +ChiSquared,LogPDF,250000,SCALAR,2148.700000 +ChiSquared,LogPDF,250000,VECTORIZED,992.000000 +ChiSquared,LogPDF,250000,PARALLEL,344.400000 +ChiSquared,LogPDF,250000,WORK_STEALING,1604.300000 +ChiSquared,CDF,250000,SCALAR,13978.100000 +ChiSquared,CDF,250000,VECTORIZED,13340.500000 +ChiSquared,CDF,250000,PARALLEL,3035.400000 +ChiSquared,CDF,250000,WORK_STEALING,3827.800000 +ChiSquared,PDF,500000,SCALAR,11167.500000 +ChiSquared,PDF,500000,VECTORIZED,2873.600000 +ChiSquared,PDF,500000,PARALLEL,998.000000 +ChiSquared,PDF,500000,WORK_STEALING,2057.500000 +ChiSquared,LogPDF,500000,SCALAR,4438.600000 +ChiSquared,LogPDF,500000,VECTORIZED,3162.900000 +ChiSquared,LogPDF,500000,PARALLEL,551.400000 +ChiSquared,LogPDF,500000,WORK_STEALING,2174.200000 +ChiSquared,CDF,500000,SCALAR,27969.000000 +ChiSquared,CDF,500000,VECTORIZED,26777.200000 +ChiSquared,CDF,500000,PARALLEL,6003.600000 +ChiSquared,CDF,500000,WORK_STEALING,8077.300000 diff --git a/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/summary.json b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/summary.json new file mode 100644 index 0000000..e30eeaf --- /dev/null +++ 
b/data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819/summary.json @@ -0,0 +1,188 @@ +{ + "run_id": "2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819", + "data_source": "strategy_profile_results.csv", + "metadata": { + "captured_at_utc": "2026-04-12T06-02-56Z", + "arch": "x86_64", + "git_branch": "investigate-gaussian-avx512-perf", + "os": "windows", + "cpu_brand": "AMD Ryzen 7 7445HS w/ Radeon 740M Graphics", + "build_type": "Release", + "cxx_compiler": "MSVC 17 2022", + "physical_cores": 6, + "build_dir": "C:\\Users\\gdwol\\Development\\libstats\\build", + "git_sha": "32c0819", + "logical_cores": 12, + "run_id": "2026-04-12T06-02-56Z_windows-x86_64_investigate-gaussian-avx512-perf_sha-32c0819", + "project_root": "C:\\Users\\gdwol\\Development\\libstats" + }, + "coverage": { + "distributions": [ + "Beta", + "ChiSquared", + "Discrete", + "Exponential", + "Gamma", + "Gaussian", + "Poisson", + "StudentT", + "Uniform" + ], + "operations": [ + "CDF", + "LogPDF", + "PDF" + ], + "batch_sizes": [ + 8, + 16, + 32, + 64, + 128, + 256, + 512, + 1000, + 2000, + 5000, + 10000, + 20000, + 50000, + 100000, + 250000, + 500000 + ], + "total_measurements": 1728 + }, + "strategy_win_counts": { + "VECTORIZED": 274, + "PARALLEL": 113, + "WORK_STEALING": 34, + "SCALAR": 11 + }, + "crossover_summary": { + "groups": 27, + "vectorized_never_wins": [], + "parallel_crossover_sizes": [ + { + "distribution": "Beta", + "operation": "CDF", + "vectorized_to_parallel": 500000 + }, + { + "distribution": "ChiSquared", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "ChiSquared", + "operation": "LogPDF", + "vectorized_to_parallel": 16 + }, + { + "distribution": "ChiSquared", + "operation": "PDF", + "vectorized_to_parallel": 50000 + }, + { + "distribution": "Discrete", + "operation": "CDF", + "vectorized_to_parallel": 50000 + }, + { + "distribution": "Discrete", + "operation": 
"LogPDF", + "vectorized_to_parallel": 250000 + }, + { + "distribution": "Discrete", + "operation": "PDF", + "vectorized_to_parallel": 128 + }, + { + "distribution": "Exponential", + "operation": "CDF", + "vectorized_to_parallel": 100000 + }, + { + "distribution": "Exponential", + "operation": "LogPDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Exponential", + "operation": "PDF", + "vectorized_to_parallel": 50000 + }, + { + "distribution": "Gamma", + "operation": "CDF", + "vectorized_to_parallel": 8 + }, + { + "distribution": "Gamma", + "operation": "LogPDF", + "vectorized_to_parallel": 16 + }, + { + "distribution": "Gamma", + "operation": "PDF", + "vectorized_to_parallel": 20000 + }, + { + "distribution": "Gaussian", + "operation": "CDF", + "vectorized_to_parallel": 50000 + }, + { + "distribution": "Gaussian", + "operation": "PDF", + "vectorized_to_parallel": 100000 + }, + { + "distribution": "Poisson", + "operation": "CDF", + "vectorized_to_parallel": 10000 + }, + { + "distribution": "Poisson", + "operation": "LogPDF", + "vectorized_to_parallel": 20000 + }, + { + "distribution": "Poisson", + "operation": "PDF", + "vectorized_to_parallel": 10000 + }, + { + "distribution": "StudentT", + "operation": "CDF", + "vectorized_to_parallel": 64 + }, + { + "distribution": "StudentT", + "operation": "LogPDF", + "vectorized_to_parallel": 250000 + }, + { + "distribution": "StudentT", + "operation": "PDF", + "vectorized_to_parallel": 20000 + }, + { + "distribution": "Uniform", + "operation": "CDF", + "vectorized_to_parallel": 64 + }, + { + "distribution": "Uniform", + "operation": "LogPDF", + "vectorized_to_parallel": 50000 + }, + { + "distribution": "Uniform", + "operation": "PDF", + "vectorized_to_parallel": 256 + } + ] + } +} diff --git a/data/profiles/dispatcher/README.md b/data/profiles/dispatcher/README.md new file mode 100644 index 0000000..e33763a --- /dev/null +++ b/data/profiles/dispatcher/README.md @@ -0,0 +1,39 @@ +# Dispatcher Profiling Data + +This 
directory contains profiling bundles captured by `scripts/capture_dispatcher_profile.sh`. +Each subdirectory is a timestamped bundle from a single architecture run. + +## Purpose + +The profiling data from all target architectures must be consolidated in one place +to generate the `constexpr` dispatch threshold lookup table (see the plan in issue #14). +Bundles are committed so they can accumulate across machines via normal git workflow. + +## Bundle contents + +Each bundle contains: + +- `metadata.json` — machine, OS, SIMD level, compiler, git state +- `strategy_profile_results.csv` — canonical raw timing data (distribution × operation × batch size × strategy) +- `crossovers.csv` — derived SCALAR→VECTORIZED, VECTORIZED→PARALLEL, PARALLEL→WORK_STEALING crossover points +- `best_strategies.csv` — per-(distribution, operation, batch size) best strategy and speedup vs scalar +- `summary.json` — coverage, strategy win counts, crossover summary +- `logs/` — console output from `system_inspector` and `strategy_profile` + +## Target architectures + +| Machine | SIMD | Status | +|---|---|---| +| Mac Mini M1 | NEON | ✅ Captured | +| MacBook Pro 9,1 (2012) | AVX | Pending | +| MacBook Pro 14,1 (2017) | AVX2 | Pending | +| Asus TUF A16 (Windows) | AVX-512 | Pending | + +## Capturing a new profile + +```bash +# Build first, then run the capture script +scripts/capture_dispatcher_profile.sh +# The bundle is saved under build/ and also copied here automatically. +# Commit and push the new bundle.
+``` diff --git a/docs/BUILD_SYSTEM_GUIDE.md b/docs/BUILD_SYSTEM_GUIDE.md index 0c8d743..52f041a 100644 --- a/docs/BUILD_SYSTEM_GUIDE.md +++ b/docs/BUILD_SYSTEM_GUIDE.md @@ -161,14 +161,14 @@ The build system implements a dual-layer SIMD detection system: - **SSE2**: Baseline (always available on 64-bit) - **AVX**: 256-bit vector operations - **AVX2**: Enhanced 256-bit integer operations -- **AVX-512**: 512-bit vector operations (server CPUs) +- **AVX-512**: 512-bit vector operations (Intel Skylake-X+, AMD Zen4+) #### ARM64 Platforms - **NEON**: ARM's SIMD instruction set ### SIMD Detection Examples -#### Modern CPU (Full SIMD Support) +#### Modern CPU (AVX2) ``` -- Runtime sse2 test: PASSED -- SIMD: SSE2 enabled (compiler + runtime) @@ -180,6 +180,14 @@ The build system implements a dual-layer SIMD detection system: -- SSE2: TRUE, AVX: TRUE, AVX2: TRUE, AVX-512: FALSE ``` +#### AVX-512 CPU (Intel Skylake-X+, AMD Zen4+) +``` +-- SIMD: AVX-512 enabled (compiler + runtime) +-- Applied MSVC x64 SIMD flags: /arch:AVX512 +-- SIMD detection complete: +-- SSE2: TRUE, AVX: TRUE, AVX2: TRUE, AVX-512: TRUE +``` + #### Apple Silicon (ARM64) ``` -- SIMD: SSE2 disabled (compiler not supported) @@ -289,7 +297,7 @@ make -j8 #### Windows - **Compilers**: MSVC, ClangCL support -- **SIMD Support**: x86_64 SIMD instruction sets +- **SIMD Support**: Global compile flag follows detection — `/arch:AVX512` when AVX-512 is detected, `/arch:AVX2` otherwise. Per-source-file flags also applied via `SIMDDetection.cmake`.
- **Threading**: Windows Thread Pool API detection - **Visual Studio Integration**: Full integration with VS build system @@ -415,7 +423,8 @@ g++ -std=c++20 -pthread -fPIC \ **Windows with MSVC:** ```bash cl.exe /std:c++20 /EHsc /W3 /O2 \ - /DNOMINMAX /D_USE_MATH_DEFINES + /DNOMINMAX /D_USE_MATH_DEFINES \ + /arch:AVX512 # or /arch:AVX2 — set automatically by CMake based on detection ``` ### SIMD Compilation @@ -431,7 +440,7 @@ cl.exe /std:c++20 /EHsc /W3 /O2 \ # AVX2 support -mavx2 -# AVX-512 support (server CPUs) +# AVX-512 support (Intel Skylake-X+, AMD Zen4+) -mavx512f # ARM NEON (Apple Silicon/ARM64) @@ -490,9 +499,11 @@ src/distributions/*.cpp # SIMD implementations (conditional) src/simd_fallback.cpp # Always +src/simd_dispatch.cpp # Always (runtime dispatch) src/simd_sse2.cpp # If SSE2 available src/simd_avx.cpp # If AVX available src/simd_avx2.cpp # If AVX2 available +src/simd_avx512.cpp # If AVX-512 available src/simd_neon.cpp # If NEON available (ARM64) ``` diff --git a/docs/HEADER_ARCHITECTURE_GUIDE.md b/docs/HEADER_ARCHITECTURE_GUIDE.md index 85fc210..ae30a7d 100644 --- a/docs/HEADER_ARCHITECTURE_GUIDE.md +++ b/docs/HEADER_ARCHITECTURE_GUIDE.md @@ -105,9 +105,11 @@ no longer exist. Any code still referencing them predates Phase 2.
#include "platform/simd.h" // SIMD operations and memory management // Threading and parallelism -#include "platform/parallel_thresholds.h" // Architecture-specific thresholds #include "platform/thread_pool.h" // Basic thread pool #include "platform/work_stealing_pool.h" // Advanced work-stealing pool + +// Dispatch thresholds (profiling-derived) +#include "core/dispatch_thresholds.h" // Per-(arch, dist, op) parallel thresholds ``` ### Level 3: Advanced Infrastructure diff --git a/examples/parallel_execution_demo.cpp b/examples/parallel_execution_demo.cpp index fbbe885..e172a00 100644 --- a/examples/parallel_execution_demo.cpp +++ b/examples/parallel_execution_demo.cpp @@ -94,7 +94,7 @@ void demonstrate_adaptive_grain_sizing() { std::cout << " Base grain size: " << stats::arch::get_optimal_grain_size() << " elements [Default work unit size]" << std::endl; std::cout << " Parallel threshold: " - << stats::arch::get_optimal_parallel_threshold("gaussian", "pdf") + << stats::arch::get_min_elements_for_distribution_parallel() << " elements [Minimum size for parallel execution]" << std::endl; std::cout << "\n ℹ️ Memory-bound: Larger grains reduce cache misses" << std::endl; std::cout << " ℹ️ Computation-bound: Smaller grains improve load balancing" << std::endl; diff --git a/examples/performance_learning_demo.cpp b/examples/performance_learning_demo.cpp index 3f523db..5848b79 100644 --- a/examples/performance_learning_demo.cpp +++ b/examples/performance_learning_demo.cpp @@ -15,6 +15,7 @@ */ #define LIBSTATS_FULL_INTERFACE +#include "libstats/core/dispatch_thresholds.h" #include "libstats/libstats.h" // Standard library includes @@ -179,9 +180,8 @@ void demonstrate_performance_dispatcher() { std::vector problem_sizes = {50, 500, 5000, 50000, 500000}; for (auto size : problem_sizes) { - auto strategy = dispatcher.selectOptimalStrategy( - size, stats::detail::DistributionType::GAUSSIAN, - stats::detail::ComputationComplexity::MODERATE, capabilities); + auto strategy = 
dispatcher.selectStrategy(size, stats::detail::DistributionType::GAUSSIAN, + stats::detail::OperationType::PDF, capabilities); std::cout << std::setw(15) << size << std::setw(20) << strategyToString(strategy) << std::endl; diff --git a/include/common/parallel_execution_fwd.h b/include/common/parallel_execution_fwd.h index 4717a94..448b2be 100644 --- a/include/common/parallel_execution_fwd.h +++ b/include/common/parallel_execution_fwd.h @@ -25,100 +25,126 @@ bool has_execution_policies() noexcept; const char* execution_support_string() noexcept; /// Platform-optimized parallel thresholds and grain sizes -std::size_t get_optimal_parallel_threshold(const std::string& distribution = "generic", const std::string& operation = "operation") noexcept; -std::size_t get_optimal_grain_size() noexcept; -std::size_t get_adaptive_grain_size(int operation_type = 0, std::size_t data_size = 0) noexcept; - -/// Thread count optimization -std::size_t get_optimal_thread_count(std::size_t workload_size = 0) noexcept; - -/// Parallel execution decision functions -bool should_use_parallel(const std::string& distribution, const std::string& operation, - std::size_t problem_size) noexcept; -bool should_use_parallel(std::size_t problem_size) noexcept; -bool should_use_distribution_parallel(std::size_t problem_size) noexcept; - -/// Parallel algorithm execution interfaces (implementation hidden) -namespace algorithms { - -/// Parallel for_each with automatic policy selection -template -void for_each(Iterator first, Iterator last, UnaryFunction f); - -/// Parallel transform with automatic policy selection -template -OutputIt transform(InputIt first, InputIt last, OutputIt d_first, UnaryOperation op); - -/// Parallel reduce with automatic policy selection -template -T reduce(InputIt first, InputIt last, T init, BinaryOperation op); - -/// Parallel fill with automatic policy selection -template -void fill(Iterator first, Iterator last, const T& value); - -/// Parallel count with automatic policy 
selection -template -typename std::iterator_traits::difference_type count(Iterator first, Iterator last, - const T& value); - -/// Parallel count_if with automatic policy selection -template -typename std::iterator_traits::difference_type count_if(Iterator first, Iterator last, - UnaryPredicate pred); - -/// Parallel sort with automatic policy selection -template -void sort(Iterator first, Iterator last, Compare comp); - -/// Parallel sort with default comparison -template -void sort(Iterator first, Iterator last); - -/// Parallel accumulate (alias for reduce) -template -T accumulate(InputIt first, InputIt last, T init, BinaryOperation op); -} // namespace algorithms - -/// Execution policy abstraction (hides platform-specific details) -namespace execution_policy { - -/// Check if specific execution policy is available -enum class PolicyType { Sequential, Parallel, ParallelUnsequenced, VectorizedParallel }; - -bool is_available(PolicyType policy) noexcept; -PolicyType get_best_available() noexcept; -const char* policy_name(PolicyType policy) noexcept; -} // namespace execution_policy - -/// Platform-specific optimization hints (implementation hidden) -namespace platform { - -/// Get platform-specific parallel configuration -struct ParallelConfig { - std::size_t optimal_threads; - std::size_t grain_size; - std::size_t parallel_threshold; - bool supports_vectorized_parallel; - bool supports_nested_parallelism; - const char* platform_name; -}; - -ParallelConfig get_platform_config(std::size_t workload_size = 0) noexcept; - -/// Check if current platform benefits from specific optimizations -bool benefits_from_large_grain_size() noexcept; -bool benefits_from_small_thread_count() noexcept; -bool has_fast_thread_creation() noexcept; - -/// Memory access pattern hints -bool should_use_cache_friendly_chunking(std::size_t data_size) noexcept; -std::size_t get_optimal_cache_chunk_size(std::size_t element_size = sizeof(double)) noexcept; -} // namespace platform - -} // 
namespace arch -} // namespace stats + std::size_t get_optimal_grain_size() noexcept; + std::size_t get_adaptive_grain_size( + int operation_type = 0, + std::size_t data_size = 0) noexcept; + + /// Thread count optimization + std::size_t get_optimal_thread_count( + std::size_t workload_size = 0) noexcept; + + /// Parallel execution decision functions + bool should_use_parallel( + const std::string& distribution, + const std::string& operation, + std::size_t problem_size) noexcept; + bool should_use_parallel( + std::size_t problem_size) noexcept; + bool should_use_distribution_parallel( + std::size_t problem_size) noexcept; + + /// Parallel algorithm execution interfaces + /// (implementation hidden) + namespace algorithms { + + /// Parallel for_each with automatic policy selection + template + void for_each(Iterator first, Iterator last, + UnaryFunction f); + + /// Parallel transform with automatic policy selection + template + OutputIt transform(InputIt first, InputIt last, + OutputIt d_first, UnaryOperation op); + + /// Parallel reduce with automatic policy selection + template + T reduce(InputIt first, InputIt last, T init, + BinaryOperation op); + + /// Parallel fill with automatic policy selection + template + void fill(Iterator first, Iterator last, const T& value); + + /// Parallel count with automatic policy selection + template + typename std::iterator_traits::difference_type + count(Iterator first, Iterator last, const T& value); + + /// Parallel count_if with automatic policy selection + template + typename std::iterator_traits::difference_type + count_if(Iterator first, Iterator last, + UnaryPredicate pred); + + /// Parallel sort with automatic policy selection + template + void sort(Iterator first, Iterator last, Compare comp); + + /// Parallel sort with default comparison + template + void sort(Iterator first, Iterator last); + + /// Parallel accumulate (alias for reduce) + template + T accumulate(InputIt first, InputIt last, T init, + BinaryOperation 
op); + } // namespace algorithms + + /// Execution policy abstraction (hides platform-specific + /// details) + namespace execution_policy { + + /// Check if specific execution policy is available + enum class PolicyType { + Sequential, + Parallel, + ParallelUnsequenced, + VectorizedParallel + }; + + bool is_available(PolicyType policy) noexcept; + PolicyType get_best_available() noexcept; + const char* policy_name(PolicyType policy) noexcept; + } // namespace execution_policy + + /// Platform-specific optimization hints (implementation + /// hidden) + namespace platform { + + /// Get platform-specific parallel configuration + struct ParallelConfig { + std::size_t optimal_threads; + std::size_t grain_size; + std::size_t parallel_threshold; + bool supports_vectorized_parallel; + bool supports_nested_parallelism; + const char* platform_name; + }; + + ParallelConfig get_platform_config( + std::size_t workload_size = 0) noexcept; + + /// Check if current platform benefits from specific + /// optimizations + bool benefits_from_large_grain_size() noexcept; + bool benefits_from_small_thread_count() noexcept; + bool has_fast_thread_creation() noexcept; + + /// Memory access pattern hints + bool should_use_cache_friendly_chunking( + std::size_t data_size) noexcept; + std::size_t get_optimal_cache_chunk_size( + std::size_t element_size = sizeof(double)) noexcept; + } // namespace platform + + } // namespace arch + } // namespace stats // Safe execution policy macros (simplified, platform-independent) #define LIBSTATS_PARALLEL_IF_AVAILABLE(size) (stats::arch::should_use_parallel(size)) diff --git a/include/core/dispatch_thresholds.h b/include/core/dispatch_thresholds.h new file mode 100644 index 0000000..52d81d4 --- /dev/null +++ b/include/core/dispatch_thresholds.h @@ -0,0 +1,517 @@ +#pragma once + +/** + * @file dispatch_thresholds.h + * @brief Profiling-derived constexpr lookup table for dispatch strategy thresholds + * + * Each threshold is the batch size at which 
parallel execution sustainably + * beats VECTORIZED for a given (SIMD level, distribution, operation) triple. + * Values are derived from Release-build profiling bundles captured on four + * target architectures (see data/profiles/dispatcher/). + * + * SIZE_MAX means "never parallel" — VECTORIZED is always preferred. + * + * The SCALAR→VECTORIZED boundary is handled separately by SIMDPolicy::getMinThreshold() + * and is architecture-independent within a SIMD level (typically 4–8 elements). + */ + +#include "libstats/platform/simd_policy.h" +#include "performance_dispatcher.h" + +#include <cstddef> +#include <limits> + +namespace stats { +namespace detail { + +/** + * @brief Operation types for per-operation threshold resolution + */ +enum class OperationType { + PDF, ///< Probability density/mass function + LOG_PDF, ///< Log-probability density/mass function + CDF, ///< Cumulative distribution function + BATCH_FIT ///< Parallel batch parameter estimation +}; + +namespace dispatch_table { + +/// Sentinel: VECTORIZED is always preferred over parallel strategies. +constexpr std::size_t NEVER = std::numeric_limits<std::size_t>::max(); + +/// Minimum datasets for parallel batch fitting (architecture-independent). +constexpr std::size_t BATCH_FIT_MIN = 8; + +// ============================================================================ +// Per-architecture parallel thresholds: (DistributionType, OperationType) → size +// Derived from strategy_profile Release builds, 2026-04-12. +// +// Reading guide: the value is the smallest batch size at which a parallel +// strategy (PARALLEL or WORK_STEALING) sustainably beats VECTORIZED through +// the largest measured size (500K). NEVER means it never does.
+// ============================================================================ + +// --- NEON (Apple M1, 128-bit, 8C/8T, macOS/GCD) --- +// data/profiles/dispatcher/2026-04-12T05-36-21Z_darwin-arm64_…_sha-6aef918 + +constexpr std::size_t neon_parallel_threshold(DistributionType dist, OperationType op) { + if (op == OperationType::BATCH_FIT) + return BATCH_FIT_MIN; + if (dist == DistributionType::BETA) + return NEVER; + + switch (dist) { + case DistributionType::UNIFORM: + switch (op) { + case OperationType::PDF: + return NEVER; + case OperationType::LOG_PDF: + return NEVER; + case OperationType::CDF: + return 20000; + default: + return NEVER; + } + case DistributionType::GAUSSIAN: + switch (op) { + case OperationType::PDF: + return 50000; + case OperationType::LOG_PDF: + return 100000; + case OperationType::CDF: + return 10000; + default: + return NEVER; + } + case DistributionType::EXPONENTIAL: + switch (op) { + case OperationType::PDF: + return 50000; + case OperationType::LOG_PDF: + return 100000; + case OperationType::CDF: + return 20000; + default: + return NEVER; + } + case DistributionType::DISCRETE: + switch (op) { + case OperationType::PDF: + return 250000; + case OperationType::LOG_PDF: + return 250000; + case OperationType::CDF: + return 100000; + default: + return NEVER; + } + case DistributionType::POISSON: + switch (op) { + case OperationType::PDF: + return 20000; + case OperationType::LOG_PDF: + return 50000; + case OperationType::CDF: + return 2000; + default: + return NEVER; + } + case DistributionType::GAMMA: + switch (op) { + case OperationType::PDF: + return 20000; + case OperationType::LOG_PDF: + return 20000; + case OperationType::CDF: + return 2000; + default: + return NEVER; + } + case DistributionType::STUDENT_T: + switch (op) { + case OperationType::PDF: + return 20000; + case OperationType::LOG_PDF: + return 50000; + case OperationType::CDF: + return 250000; + default: + return NEVER; + } + case DistributionType::CHI_SQUARED: + switch 
(op) { + case OperationType::PDF: + return 20000; + case OperationType::LOG_PDF: + return 50000; + case OperationType::CDF: + return 2000; + default: + return NEVER; + } + default: + return NEVER; + } +} + +// --- AVX (Intel Ivy Bridge i7-3820QM, 128/256-bit, 4P/8T, macOS/GCD) --- +// data/profiles/dispatcher/2026-04-12T05-55-52Z_darwin-x86_64_…_sha-e75c6e3 + +constexpr std::size_t avx_parallel_threshold(DistributionType dist, OperationType op) { + if (op == OperationType::BATCH_FIT) + return BATCH_FIT_MIN; + if (dist == DistributionType::BETA) + return NEVER; + + switch (dist) { + case DistributionType::UNIFORM: + switch (op) { + case OperationType::PDF: + return NEVER; + case OperationType::LOG_PDF: + return NEVER; + case OperationType::CDF: + return 10000; + default: + return NEVER; + } + case DistributionType::GAUSSIAN: + switch (op) { + case OperationType::PDF: + return 20000; + case OperationType::LOG_PDF: + return 50000; + case OperationType::CDF: + return 20000; + default: + return NEVER; + } + case DistributionType::EXPONENTIAL: + switch (op) { + case OperationType::PDF: + return 20000; + case OperationType::LOG_PDF: + return 100000; + case OperationType::CDF: + return 20000; + default: + return NEVER; + } + case DistributionType::DISCRETE: + switch (op) { + case OperationType::PDF: + return 50000; + case OperationType::LOG_PDF: + return 50000; + case OperationType::CDF: + return 50000; + default: + return NEVER; + } + case DistributionType::POISSON: + switch (op) { + case OperationType::PDF: + return 2000; + case OperationType::LOG_PDF: + return 10000; + case OperationType::CDF: + return 5000; + default: + return NEVER; + } + case DistributionType::GAMMA: + switch (op) { + case OperationType::PDF: + return 20000; + case OperationType::LOG_PDF: + return 20000; + case OperationType::CDF: + return 2000; + default: + return NEVER; + } + case DistributionType::STUDENT_T: + switch (op) { + case OperationType::PDF: + return 100000; + case OperationType::LOG_PDF: 
+ return 100000; + case OperationType::CDF: + return 100000; + default: + return NEVER; + } + case DistributionType::CHI_SQUARED: + switch (op) { + case OperationType::PDF: + return 20000; + case OperationType::LOG_PDF: + return 20000; + case OperationType::CDF: + return 2000; + default: + return NEVER; + } + default: + return NEVER; + } +} + +// --- AVX2 (Intel Kaby Lake i7-7820HQ, 256-bit, 4P/8T, macOS/GCD) --- +// data/profiles/dispatcher/2026-04-12T05-27-04Z_darwin-x86_64_…_sha-0e4e9f1 + +constexpr std::size_t avx2_parallel_threshold(DistributionType dist, OperationType op) { + if (op == OperationType::BATCH_FIT) + return BATCH_FIT_MIN; + if (dist == DistributionType::BETA) + return NEVER; + + switch (dist) { + case DistributionType::UNIFORM: + switch (op) { + case OperationType::PDF: + return NEVER; + case OperationType::LOG_PDF: + return NEVER; + case OperationType::CDF: + return 20000; + default: + return NEVER; + } + case DistributionType::GAUSSIAN: + switch (op) { + case OperationType::PDF: + return 50000; + case OperationType::LOG_PDF: + return 250000; + case OperationType::CDF: + return 50000; + default: + return NEVER; + } + case DistributionType::EXPONENTIAL: + switch (op) { + case OperationType::PDF: + return 50000; + case OperationType::LOG_PDF: + return 250000; + case OperationType::CDF: + return 50000; + default: + return NEVER; + } + case DistributionType::DISCRETE: + switch (op) { + case OperationType::PDF: + return 100000; + case OperationType::LOG_PDF: + return 50000; + case OperationType::CDF: + return 50000; + default: + return NEVER; + } + case DistributionType::POISSON: + switch (op) { + case OperationType::PDF: + return 10000; + case OperationType::LOG_PDF: + return 20000; + case OperationType::CDF: + return 2000; + default: + return NEVER; + } + case DistributionType::GAMMA: + switch (op) { + case OperationType::PDF: + return 50000; + case OperationType::LOG_PDF: + return 50000; + case OperationType::CDF: + return 5000; + default: + 
return NEVER; + } + case DistributionType::STUDENT_T: + switch (op) { + case OperationType::PDF: + return 100000; + case OperationType::LOG_PDF: + return 100000; + case OperationType::CDF: + return NEVER; + default: + return NEVER; + } + case DistributionType::CHI_SQUARED: + switch (op) { + case OperationType::PDF: + return 50000; + case OperationType::LOG_PDF: + return 100000; + case OperationType::CDF: + return 2000; + default: + return NEVER; + } + default: + return NEVER; + } +} + +// --- AVX-512 (AMD Ryzen 7 7445HS Zen 4, 512-bit, 6P/12T, Windows/MSVC) --- +// data/profiles/dispatcher/2026-04-12T06-02-56Z_windows-x86_64_…_sha-32c0819 + +constexpr std::size_t avx512_parallel_threshold(DistributionType dist, OperationType op) { + if (op == OperationType::BATCH_FIT) + return BATCH_FIT_MIN; + if (dist == DistributionType::BETA) + return NEVER; + + switch (dist) { + case DistributionType::UNIFORM: + switch (op) { + case OperationType::PDF: + return 100000; + case OperationType::LOG_PDF: + return 50000; + case OperationType::CDF: + return 50000; + default: + return NEVER; + } + case DistributionType::GAUSSIAN: + switch (op) { + case OperationType::PDF: + return 100000; + case OperationType::LOG_PDF: + return NEVER; + case OperationType::CDF: + return 50000; + default: + return NEVER; + } + case DistributionType::EXPONENTIAL: + switch (op) { + case OperationType::PDF: + return 50000; + case OperationType::LOG_PDF: + return 250000; + case OperationType::CDF: + return 100000; + default: + return NEVER; + } + case DistributionType::DISCRETE: + switch (op) { + case OperationType::PDF: + return 50000; + case OperationType::LOG_PDF: + return 250000; + case OperationType::CDF: + return 50000; + default: + return NEVER; + } + case DistributionType::POISSON: + switch (op) { + case OperationType::PDF: + return 10000; + case OperationType::LOG_PDF: + return 20000; + case OperationType::CDF: + return 10000; + default: + return NEVER; + } + case DistributionType::GAMMA: + switch 
(op) { + case OperationType::PDF: + return 20000; + case OperationType::LOG_PDF: + return 50000; + case OperationType::CDF: + return 2000; + default: + return NEVER; + } + case DistributionType::STUDENT_T: + switch (op) { + case OperationType::PDF: + return 20000; + case OperationType::LOG_PDF: + return 250000; + case OperationType::CDF: + return NEVER; + default: + return NEVER; + } + case DistributionType::CHI_SQUARED: + switch (op) { + case OperationType::PDF: + return 50000; + case OperationType::LOG_PDF: + return 50000; + case OperationType::CDF: + return 5000; + default: + return NEVER; + } + default: + return NEVER; + } +} + +// --- SSE2 fallback: shares AVX thresholds (similar 128-bit SIMD width) --- + +constexpr std::size_t sse2_parallel_threshold(DistributionType dist, OperationType op) { + return avx_parallel_threshold(dist, op); +} + +// --- No SIMD: conservative high thresholds --- + +constexpr std::size_t none_parallel_threshold(DistributionType dist, OperationType op) { + if (op == OperationType::BATCH_FIT) + return BATCH_FIT_MIN; + if (dist == DistributionType::BETA) + return NEVER; + // Without SIMD, VECTORIZED is just a scalar loop via the batch path. + // Parallel helps earlier because there is no SIMD advantage to protect. + return 5000; +} + +} // namespace dispatch_table + +/** + * @brief Look up the parallel threshold for a given SIMD level, distribution, and operation. + * + * Returns the batch size at which parallel execution sustainably beats VECTORIZED. + * Returns SIZE_MAX if VECTORIZED is always preferred. 
+ * + * @param level Runtime SIMD level from SIMDPolicy + * @param dist Distribution type + * @param op Operation type (PDF, LOG_PDF, CDF, BATCH_FIT) + * @return Minimum batch size for parallel execution + */ +constexpr std::size_t getParallelThreshold(arch::simd::SIMDPolicy::Level level, + DistributionType dist, OperationType op) { + switch (level) { + case arch::simd::SIMDPolicy::Level::NEON: + return dispatch_table::neon_parallel_threshold(dist, op); + case arch::simd::SIMDPolicy::Level::AVX512: + return dispatch_table::avx512_parallel_threshold(dist, op); + case arch::simd::SIMDPolicy::Level::AVX2: + return dispatch_table::avx2_parallel_threshold(dist, op); + case arch::simd::SIMDPolicy::Level::AVX: + return dispatch_table::avx_parallel_threshold(dist, op); + case arch::simd::SIMDPolicy::Level::SSE2: + return dispatch_table::sse2_parallel_threshold(dist, op); + case arch::simd::SIMDPolicy::Level::None: + default: + return dispatch_table::none_parallel_threshold(dist, op); + } +} + +} // namespace detail +} // namespace stats diff --git a/include/core/dispatch_utils.h b/include/core/dispatch_utils.h index 1e18d60..9386e3a 100644 --- a/include/core/dispatch_utils.h +++ b/include/core/dispatch_utils.h @@ -1,5 +1,6 @@ #pragma once +#include "dispatch_thresholds.h" #include "libstats/platform/thread_pool.h" // For ParallelUtils #include "libstats/platform/work_stealing_pool.h" #include "performance_dispatcher.h" @@ -23,7 +24,7 @@ namespace detail { // Performance utilities * * Layer 2 — Select strategy: * DispatchUtils::autoDispatch - * \u2193 if hint is AUTO: PerformanceDispatcher::selectOptimalStrategy (threshold lookup + + * \u2193 if hint is AUTO: PerformanceDispatcher::selectStrategy (threshold lookup + * optional performance history override) * if hint is explicit: DispatchUtils::mapHintToStrategy * \u2193 DispatchUtils::executeStrategy (switches on Strategy enum) @@ -66,7 +67,7 @@ class DispatchUtils { typename WorkStealingFunc, typename GpuAcceleratedFunc> 
static void autoDispatch(const Distribution& dist, std::span values, std::span results, const PerformanceHint& hint, - DistributionType dist_type, ComputationComplexity complexity, + DistributionType dist_type, OperationType op_type, ScalarFunc&& scalar_func, BatchFunc&& batch_func, ParallelFunc&& parallel_func, WorkStealingFunc&& work_stealing_func, GpuAcceleratedFunc&& gpu_accelerated_func) { @@ -93,7 +94,7 @@ class DispatchUtils { auto strategy = Strategy::SCALAR; if (hint.strategy == PerformanceHint::PreferredStrategy::AUTO) { - strategy = dispatcher.selectOptimalStrategy(count, dist_type, complexity, system); + strategy = dispatcher.selectStrategy(count, dist_type, op_type, system); } else { strategy = mapHintToStrategy(hint.strategy, count); } diff --git a/include/core/distribution_characteristics.h b/include/core/distribution_characteristics.h deleted file mode 100644 index cf8408d..0000000 --- a/include/core/distribution_characteristics.h +++ /dev/null @@ -1,323 +0,0 @@ -/** - * @file distribution_characteristics.h - * @brief Empirically-derived distribution characteristics for performance optimization - * - * This header provides empirical constants for different distribution families based on - * actual computational complexity analysis rather than assumptions. These constants - * serve as initial performance baselines that can be refined through adaptive learning. - */ - -#pragma once - -#include "performance_dispatcher.h" - -#include -#include - -namespace stats { -namespace detail { // Performance utilities - -/** - * @brief Computational complexity characteristics for distribution families - * - * These values are derived from actual algorithmic analysis of each distribution's - * implementation rather than assumptions. They represent relative computational - * cost multipliers compared to the simplest operations. 
- */ -struct DistributionComplexity { - double base_complexity; ///< Base computational cost multiplier - double vectorization_efficiency; ///< SIMD efficiency (0.0-1.0) - double parallelization_efficiency; ///< Parallel efficiency (0.0-1.0) - size_t min_simd_threshold; ///< Minimum elements where SIMD becomes beneficial - size_t min_parallel_threshold; ///< Minimum elements where parallelization helps - - // Cache characteristics - double memory_access_pattern; ///< Memory access efficiency (0.0-1.0, 1.0 = perfect locality) - double branch_prediction_cost; ///< Branch misprediction penalty factor -}; - -/** - * @brief Empirically-derived characteristics for each distribution family - * - * These constants are based on algorithmic analysis of actual implementations: - * - Uniform: Simple linear transform, excellent vectorization - * - Discrete: Integer operations, good vectorization, minimal branching - * - Exponential: One transcendental function (exp/log), moderate vectorization - * - Gaussian: Box-Muller transform (2 transcendentals + sqrt), complex control flow - * - Poisson: Iterative algorithms with early termination, poor vectorization - * - Gamma: Multiple special functions + iterative rejection sampling, complex - * - StudentT: Log-space PDF (one log per element); CDF via incomplete beta - * - Beta: Log-space PDF (two logs per element); bounded support fixup at boundaries - */ -constexpr std::array DISTRIBUTION_CHARACTERISTICS = { - {// UNIFORM: y = a + (b-a) * uniform_random() - // - Single multiply-add operation - // - Perfect memory locality - // - No branching - // - Excellent SIMD efficiency (near-perfect vectorization) - { - .base_complexity = 1.0, // Baseline reference - .vectorization_efficiency = 0.95, // Excellent SIMD efficiency - .parallelization_efficiency = 0.90, // Excellent parallel efficiency - .min_simd_threshold = 16, // Very low threshold due to simplicity - .min_parallel_threshold = 1000, // Moderate threshold due to low per-element 
cost - .memory_access_pattern = 1.0, // Perfect sequential access - .branch_prediction_cost = 1.0 // No conditional branches - }, - - // GAUSSIAN: Box-Muller transform - // - Two uniform samples -> two Gaussian samples - // - log(), sqrt(), cos(), sin() transcendental functions - // - Moderate branching for cached value reuse - // - Good but not perfect vectorization due to transcendental overhead - { - .base_complexity = 3.2, // ~3.2x more complex than uniform - .vectorization_efficiency = 0.75, // Good SIMD but transcendentals limit efficiency - .parallelization_efficiency = 0.80, // Good parallel efficiency - .min_simd_threshold = 32, // Higher due to transcendental overhead - .min_parallel_threshold = 1500, // Higher due to moderate per-element cost - .memory_access_pattern = 0.95, // Mostly sequential, some caching patterns - .branch_prediction_cost = 1.15 // Minimal branching for cached values - }, - - // EXPONENTIAL: Inverse transform method - // - -log(uniform_random()) / lambda - // - One transcendental function (log) - // - No branching in fast path - // - Good vectorization potential - { - .base_complexity = 2.1, // ~2.1x more complex than uniform - .vectorization_efficiency = 0.82, // Good SIMD efficiency - .parallelization_efficiency = 0.85, // Good parallel efficiency - .min_simd_threshold = 24, // Moderate threshold - .min_parallel_threshold = 1200, // Moderate threshold - .memory_access_pattern = 1.0, // Perfect sequential access - .branch_prediction_cost = 1.0 // No conditional branches in fast path - }, - - // DISCRETE: Integer operations with bounds checking - // - Uniform integer generation with modulo - // - Range checking and validation - // - Excellent memory locality - // - Some branching for bounds checking - // - Good but not perfect vectorization due to integer-specific optimizations - { - .base_complexity = 1.4, // ~1.4x more complex than uniform - .vectorization_efficiency = 0.85, // Good SIMD efficiency for integer ops - 
.parallelization_efficiency = 0.88, // Good parallel efficiency - .min_simd_threshold = 20, // Low threshold due to simplicity - .min_parallel_threshold = 800, // Lower threshold due to low complexity - .memory_access_pattern = 1.0, // Perfect sequential access - .branch_prediction_cost = 1.1 // Minimal branching for validation - }, - - // POISSON: Iterative algorithms (Knuth's algorithm for small lambda, acceptance-rejection for - // large) - // - While loop with early termination - // - Multiple exponential/log evaluations - // - Highly variable execution time per sample - // - Poor vectorization due to data dependencies - // - Branch-heavy with unpredictable termination - { - .base_complexity = 4.8, // ~4.8x more complex than uniform - .vectorization_efficiency = 0.35, // Poor SIMD efficiency due to loops - .parallelization_efficiency = 0.70, // Moderate parallel efficiency - .min_simd_threshold = 64, // High threshold due to complexity - .min_parallel_threshold = 2000, // Higher threshold due to high per-element cost - .memory_access_pattern = 0.85, // Some irregular access patterns - .branch_prediction_cost = 1.35 // Significant branching overhead - }, - - // GAMMA: Acceptance-rejection sampling (Marsaglia & Tsang for shape >= 1, other methods for - // shape < 1) - // - Multiple transcendental functions per sample - // - Rejection sampling with variable iteration count - // - log(), exp(), sqrt(), pow() operations - // - Highly variable execution time - // - Complex branching patterns - // - Poor vectorization due to conditional loops - { - .base_complexity = 6.5, // ~6.5x more complex than uniform - .vectorization_efficiency = 0.25, // Poor SIMD efficiency - .parallelization_efficiency = 0.65, // Moderate parallel efficiency - .min_simd_threshold = 80, // High threshold - .min_parallel_threshold = 3000, // High threshold due to complexity - .memory_access_pattern = 0.80, // Irregular access patterns - .branch_prediction_cost = 1.50 // Heavy branching overhead 
- }, - - // STUDENT_T: Log-space PDF: log(C) + (-(ν+1)/2) · log(1 + x²/ν) - // - One vector_log per element in SIMD PDF path - // - CDF via regularized incomplete beta (not vectorized) - // - Full real-line domain: no boundary fixup needed - // - Similar to Gaussian in per-element PDF cost - { - .base_complexity = 3.5, // ~3.5x more complex than uniform - .vectorization_efficiency = 0.75, // One vector_log; matches Gaussian efficiency - .parallelization_efficiency = 0.80, // Good parallel efficiency - .min_simd_threshold = 32, // Same as Gaussian - .min_parallel_threshold = 1500, // Same as Gaussian; moderate per-element cost - .memory_access_pattern = 0.95, // Sequential access; no boundary fixups - .branch_prediction_cost = 1.10 // Minimal branching in PDF path - }, - - // BETA: Log-space PDF: (α-1)·log(x) + (β-1)·log(1-x) + log_norm_const - // - Two vector_log calls per element in SIMD PDF path - // - CDF via regularized incomplete beta (not vectorized) - // - Bounded support [0,1]: fixup required at x=0 and x=1 boundaries - // - Slightly more expensive than Student's t due to two log calls + fixup - { - .base_complexity = 3.8, // ~3.8x more complex than uniform - .vectorization_efficiency = 0.78, // Two vector_log calls; fixup adds overhead - .parallelization_efficiency = 0.82, // Good parallel efficiency - .min_simd_threshold = 32, // Similar to Gaussian - .min_parallel_threshold = 1200, // Two log calls — benefits from parallel sooner - .memory_access_pattern = 0.95, // Sequential access; bounded support - .branch_prediction_cost = 1.20 // Boundary fixup at x=0 and x=1 - }, - - // CHI_SQUARED: Delegation wrapper over Gamma(ν/2, 1/2) - // - All batch and probability operations delegate to an internal GammaDistribution - // - Positive real-line support (x > 0), same domain as Gamma - // - Computational characteristics identical to Gamma due to full delegation - { - .base_complexity = 6.5, // ~6.5x more complex than uniform (matches Gamma) - 
.vectorization_efficiency = 0.25, // Poor SIMD efficiency (inherited from Gamma) - .parallelization_efficiency = 0.65, // Moderate parallel efficiency (inherited from Gamma) - .min_simd_threshold = 80, // High threshold (matches Gamma) - .min_parallel_threshold = 3000, // High threshold due to complexity (matches Gamma) - .memory_access_pattern = 0.80, // Irregular access patterns (inherited from Gamma) - .branch_prediction_cost = 1.50 // Heavy branching overhead (inherited from Gamma) - }}}; - -/** - * @brief Get characteristics for a specific distribution type - * - * @param dist_type Distribution type to query - * @return Reference to empirical characteristics - */ -constexpr const DistributionComplexity& getCharacteristics(DistributionType dist_type) noexcept { - switch (dist_type) { - case DistributionType::UNIFORM: - return DISTRIBUTION_CHARACTERISTICS[0]; - case DistributionType::GAUSSIAN: - return DISTRIBUTION_CHARACTERISTICS[1]; - case DistributionType::EXPONENTIAL: - return DISTRIBUTION_CHARACTERISTICS[2]; - case DistributionType::DISCRETE: - return DISTRIBUTION_CHARACTERISTICS[3]; - case DistributionType::POISSON: - return DISTRIBUTION_CHARACTERISTICS[4]; - case DistributionType::GAMMA: - return DISTRIBUTION_CHARACTERISTICS[5]; - case DistributionType::STUDENT_T: - return DISTRIBUTION_CHARACTERISTICS[6]; - case DistributionType::BETA: - return DISTRIBUTION_CHARACTERISTICS[7]; - case DistributionType::CHI_SQUARED: - return DISTRIBUTION_CHARACTERISTICS[8]; - } - // Fallback to uniform characteristics - return DISTRIBUTION_CHARACTERISTICS[0]; -} - -/** - * @brief Performance scaling factors based on empirical analysis - * - * These represent expected performance improvements from different strategies - * based on algorithmic analysis and can be refined through adaptive learning. 
- */ -// scaling utilities -/** - * @brief Expected SIMD speedup factors by distribution complexity - * - * Simple operations (uniform, discrete) benefit more from SIMD than - * complex operations with transcendentals or unpredictable branching. - */ -constexpr double calculateSIMDSpeedup(const DistributionComplexity& chars) noexcept { - // SIMD speedup varies based on vectorization efficiency and complexity - // Simple operations: up to 4x speedup on 4-wide SIMD - // Complex operations: limited by transcendental function overhead - return 1.0 + (3.0 * chars.vectorization_efficiency); -} - -/** - * @brief Expected parallel speedup factors accounting for overhead - * - * Takes into account thread overhead, cache effects, and algorithmic complexity. - * More complex operations benefit more from parallelization due to higher - * computation-to-synchronization ratios. - */ -constexpr double calculateParallelSpeedup(const DistributionComplexity& chars, - size_t num_threads) noexcept { - // Parallel efficiency decreases with thread overhead and cache conflicts - // But increases with algorithmic complexity - double thread_efficiency = static_cast(num_threads) * chars.parallelization_efficiency; - - // Diminishing returns: Amdahl's law approximation - double overhead_factor = 1.0 / (1.0 + (0.1 / chars.base_complexity)); - - return std::min(thread_efficiency * overhead_factor, static_cast(num_threads) * 0.85); -} -} // namespace detail - -/** - * @brief Adaptive learning integration points - * - * These provide hooks for the performance learning system to refine - * the empirical constants based on actual measured performance. - */ -// adaptive utilities -/** - * @brief Refinement factors that can be learned and updated - * - * These multipliers adjust the base characteristics based on - * system-specific performance observations. 
- */ -struct LearnedRefinements { - double simd_efficiency_multiplier = 1.0; ///< Learned SIMD efficiency adjustment - double parallel_efficiency_multiplier = 1.0; ///< Learned parallel efficiency adjustment - double complexity_adjustment = 1.0; ///< Learned complexity adjustment - size_t simd_threshold_offset = 0; ///< Learned threshold adjustment - size_t parallel_threshold_offset = 0; ///< Learned threshold adjustment - - // Confidence in learned values (0.0 = use empirical, 1.0 = use learned) - double learning_confidence = 0.0; -}; - -/** - * @brief Apply learned refinements to empirical characteristics - * - * @param base_chars Empirical base characteristics - * @param refinements Learned refinements from performance history - * @return Refined characteristics combining empirical + learned data - */ -constexpr detail::DistributionComplexity applyRefinements( - const detail::DistributionComplexity& base_chars, - const LearnedRefinements& refinements) noexcept { - // Blend empirical and learned values based on confidence - double blend_factor = refinements.learning_confidence; - - return detail::DistributionComplexity{ - .base_complexity = base_chars.base_complexity * - (1.0 - blend_factor + blend_factor * refinements.complexity_adjustment), - .vectorization_efficiency = - base_chars.vectorization_efficiency * - (1.0 - blend_factor + blend_factor * refinements.simd_efficiency_multiplier), - .parallelization_efficiency = - base_chars.parallelization_efficiency * - (1.0 - blend_factor + blend_factor * refinements.parallel_efficiency_multiplier), - .min_simd_threshold = static_cast( - static_cast(base_chars.min_simd_threshold) * (1.0 - blend_factor) + - static_cast(base_chars.min_simd_threshold + refinements.simd_threshold_offset) * - blend_factor), - .min_parallel_threshold = static_cast( - static_cast(base_chars.min_parallel_threshold) * (1.0 - blend_factor) + - static_cast(base_chars.min_parallel_threshold + - refinements.parallel_threshold_offset) * - 
blend_factor), - .memory_access_pattern = base_chars.memory_access_pattern, - .branch_prediction_cost = base_chars.branch_prediction_cost}; -} - -} // namespace stats diff --git a/include/core/math_utils.h b/include/core/math_utils.h index 3b1a50b..c23a00d 100644 --- a/include/core/math_utils.h +++ b/include/core/math_utils.h @@ -84,6 +84,20 @@ namespace detail { */ [[nodiscard]] double beta_i(double x, double a, double b) noexcept; +/** + * @brief Regularized incomplete beta function with precomputed log-beta prefix + * + * Identical to beta_i(x, a, b) but skips the per-call lgamma(a+b)-lgamma(a)-lgamma(b) + * computation. Use in batch loops where a and b are constant across elements. + * + * @param x Input value in [0,1] + * @param a First shape parameter (a > 0) + * @param b Second shape parameter (b > 0) + * @param log_beta_prefix Precomputed lgamma(a+b) - lgamma(a) - lgamma(b) + * @return I_x(a,b) + */ +[[nodiscard]] double beta_i(double x, double a, double b, double log_beta_prefix) noexcept; + /** * @brief Natural logarithm of the beta function ln(B(a,b)) * @param a First parameter (a > 0) diff --git a/include/core/performance_dispatcher.h b/include/core/performance_dispatcher.h index a81cd1c..f27de97 100644 --- a/include/core/performance_dispatcher.h +++ b/include/core/performance_dispatcher.h @@ -18,6 +18,13 @@ class SIMDPolicy; #include "libstats/platform/simd_policy.h" +// Forward declare OperationType so it can be used in selectStrategy +namespace stats { +namespace detail { +enum class OperationType; +} // namespace detail +} // namespace stats + /** * @file performance_dispatcher.h * @brief Intelligent auto-dispatch system for optimal performance strategy selection @@ -222,17 +229,16 @@ class PerformanceDispatcher { }; /** - * @brief Select optimal execution strategy + * @brief Select optimal execution strategy using profiling-derived lookup table * * @param batch_size Number of elements to process * @param dist_type Type of distribution - * @param 
complexity Computational complexity level + * @param op_type Operation type (PDF, LOG_PDF, CDF, BATCH_FIT) * @param system System capabilities * @return Optimal strategy for the given parameters */ - Strategy selectOptimalStrategy(size_t batch_size, DistributionType dist_type, - ComputationComplexity complexity, - const SystemCapabilities& system) const; + Strategy selectStrategy(size_t batch_size, DistributionType dist_type, OperationType op_type, + const SystemCapabilities& system) const; /** * @brief Get current decision thresholds @@ -266,28 +272,23 @@ class PerformanceDispatcher { size_t getDistributionSpecificParallelThreshold(DistributionType dist_type) const; bool shouldUseWorkStealing(size_t batch_size, DistributionType dist_type) const; - // shouldUseGpuAccelerated removed — GPU_ACCELERATED strategy removed from enum. /** * @brief Detect the highest available SIMD architecture - * @param system System capabilities for detection - * @return Detected SIMD architecture */ static SIMDArchitecture detectSIMDArchitecture(const SystemCapabilities& system) noexcept; /** - * @brief Select strategy based on system capabilities and performance metrics + * @brief Select multi-threaded strategy (PARALLEL vs WORK_STEALING) * - * Uses measured SIMD efficiency, threading overhead, and memory bandwidth - * to make adaptive decisions based on actual hardware performance. - * - * @param batch_size Number of elements to process - * @param dist_type Type of distribution for complexity estimation - * @param system Measured system capabilities - * @return Optimal strategy for this hardware and workload + * The choice depends on the threading backend (GCD vs Windows TP) and + * whether hyperthreading is present, per four-architecture profiling data. 
*/ - Strategy selectStrategyBasedOnCapabilities(size_t batch_size, DistributionType dist_type, - const SystemCapabilities& system) const; + static Strategy selectMultiThreadedStrategy(DistributionType dist_type, + const SystemCapabilities& system) noexcept; + + /// Cached SIMD level for table lookups + arch::simd::SIMDPolicy::Level simd_level_; }; /** diff --git a/include/platform/parallel_execution.h b/include/platform/parallel_execution.h index dec3e6e..88dc60d 100644 --- a/include/platform/parallel_execution.h +++ b/include/platform/parallel_execution.h @@ -18,8 +18,7 @@ #include #include -// Platform-specific headers for parallel execution -#include "parallel_thresholds.h" +// Dispatch thresholds are now in include/core/dispatch_thresholds.h // PARALLEL EXECUTION POLICY DETECTION // Priority order: @@ -113,14 +112,8 @@ inline const char* execution_support_string() noexcept { #endif } -/** - * @brief Get CPU-aware optimal parallel threshold - * @return Optimal minimum elements for parallel processing based on CPU features - */ -inline std::size_t get_optimal_parallel_threshold(const std::string& distribution, - const std::string& operation) noexcept { - return stats::arch::getGlobalThresholdCalculator().getThreshold(distribution, operation); -} +// get_optimal_parallel_threshold() removed — use detail::getParallelThreshold() from +// dispatch_thresholds.h instead. 
// Note: get_optimal_grain_size() is declared in platform_constants.h // and defined in platform_constants_impl.cpp to avoid multiple definitions @@ -215,24 +208,15 @@ inline std::size_t get_optimal_thread_count( /** * @brief Check if a problem size is large enough to benefit from parallel execution - * @param distribution Distribution name - * @param operation Operation name - * @param problem_size Total number of elements or operations - * @return true if parallel execution is likely beneficial - */ -inline bool should_use_parallel(const std::string& distribution, const std::string& operation, - std::size_t problem_size) noexcept { - const std::size_t actual_threshold = get_optimal_parallel_threshold(distribution, operation); - return has_execution_policies() && (problem_size >= actual_threshold); -} - -/** - * @brief Backward-compatible overload using default thresholds * @param problem_size Total number of elements or operations * @return true if parallel execution is likely beneficial + * + * Uses a conservative default threshold. For per-(distribution, operation) thresholds, + * use detail::getParallelThreshold() from dispatch_thresholds.h instead. 
*/ inline bool should_use_parallel(std::size_t problem_size) noexcept { - return should_use_parallel("generic", "operation", problem_size); + return has_execution_policies() && + (problem_size >= stats::arch::get_min_elements_for_distribution_parallel()); } /** @@ -681,7 +665,7 @@ void openmp_for_each(Iterator first, Iterator last, UnaryFunction f) { const size_t total_elements = static_cast(std::distance(first, last)); const size_t chunk_size = get_openmp_chunk_size(total_elements); - if (total_elements < get_optimal_parallel_threshold("generic", "operation")) { + if (total_elements < get_min_elements_for_distribution_parallel()) { std::for_each(first, last, f); return; } @@ -698,7 +682,7 @@ void openmp_transform(Iterator1 first1, Iterator1 last1, Iterator2 first2, Unary const size_t total_elements = static_cast(std::distance(first1, last1)); const size_t chunk_size = get_openmp_chunk_size(total_elements); - if (total_elements < get_optimal_parallel_threshold("generic", "operation")) { + if (total_elements < get_min_elements_for_distribution_parallel()) { std::transform(first1, last1, first2, op); return; } @@ -716,7 +700,7 @@ void openmp_fill(Iterator first, Iterator last, const T& value) { const size_t total_elements = static_cast(std::distance(first, last)); const size_t chunk_size = get_openmp_chunk_size(total_elements); - if (total_elements < get_optimal_parallel_threshold("generic", "operation")) { + if (total_elements < get_min_elements_for_distribution_parallel()) { std::fill(first, last, value); return; } @@ -733,7 +717,7 @@ T openmp_reduce(Iterator first, Iterator last, T init, BinaryOp op) { const size_t total_elements = static_cast(std::distance(first, last)); const size_t chunk_size = get_openmp_chunk_size(total_elements); - if (total_elements < get_optimal_parallel_threshold("generic", "operation")) { + if (total_elements < get_min_elements_for_distribution_parallel()) { return std::accumulate(first, last, init, op); } @@ -758,7 +742,7 @@ typename 
std::iterator_traits::difference_type openmp_count(Iterator f const size_t total_elements = static_cast(std::distance(first, last)); const size_t chunk_size = get_openmp_chunk_size(total_elements); - if (total_elements < get_optimal_parallel_threshold("generic", "operation")) { + if (total_elements < get_min_elements_for_distribution_parallel()) { return std::count(first, last, value); } @@ -783,7 +767,7 @@ typename std::iterator_traits::difference_type openmp_count_if(Iterato const size_t total_elements = static_cast(std::distance(first, last)); const size_t chunk_size = get_openmp_chunk_size(total_elements); - if (total_elements < get_optimal_parallel_threshold("generic", "operation")) { + if (total_elements < get_min_elements_for_distribution_parallel()) { return std::count_if(first, last, pred); } @@ -852,8 +836,7 @@ void pthread_for_each(Iterator first, Iterator last, UnaryFunction f) { const size_t num_chunks = calculate_num_chunks(total_elements, chunk_size); const size_t max_threads = std::min(num_chunks, static_cast(get_logical_core_count())); - if (total_elements < get_optimal_parallel_threshold("generic", "operation") || - max_threads <= 1) { + if (total_elements < get_min_elements_for_distribution_parallel() || max_threads <= 1) { std::for_each(first, last, f); return; } @@ -902,8 +885,7 @@ void pthread_transform(Iterator1 first1, Iterator1 last1, Iterator2 first2, Unar const size_t max_threads = std::min(total_elements / get_optimal_grain_size(), static_cast(get_logical_core_count())); - if (total_elements < get_optimal_parallel_threshold("generic", "operation") || - max_threads <= 1) { + if (total_elements < get_min_elements_for_distribution_parallel() || max_threads <= 1) { std::transform(first1, last1, first2, op); return; } @@ -955,8 +937,7 @@ T pthread_reduce(Iterator first, Iterator last, T init, BinaryOp op) { const size_t max_threads = std::min(total_elements / get_optimal_grain_size(), static_cast(get_logical_core_count())); - if 
(total_elements < get_optimal_parallel_threshold("generic", "operation") || - max_threads <= 1) { + if (total_elements < get_min_elements_for_distribution_parallel() || max_threads <= 1) { return std::accumulate(first, last, init, op); } @@ -1021,7 +1002,7 @@ void safe_fill(Iterator first, Iterator last, const T& value) { const auto count = std::distance(first, last); ::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "fill", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if defined(LIBSTATS_HAS_STD_EXECUTION) std::fill(std::execution::par_unseq, first, last, value); #elif defined(LIBSTATS_HAS_GCD) @@ -1048,7 +1029,7 @@ void safe_transform(Iterator1 first1, Iterator1 last1, Iterator2 first2, UnaryOp const auto count = std::distance(first1, last1); ::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "transform", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if defined(LIBSTATS_HAS_STD_EXECUTION) std::transform(std::execution::par_unseq, first1, last1, first2, op); #elif defined(LIBSTATS_HAS_GCD) @@ -1067,7 +1048,7 @@ T safe_reduce(Iterator first, Iterator last, T init) { const auto count = std::distance(first, last); ::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "reduce", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if defined(LIBSTATS_HAS_STD_EXECUTION) return std::reduce(std::execution::par_unseq, first, last, init); #elif defined(LIBSTATS_HAS_GCD) @@ -1086,7 +1067,7 @@ void safe_for_each(Iterator first, Iterator last, UnaryFunction f) { const auto count = std::distance(first, last); ::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "for_each", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if defined(LIBSTATS_HAS_STD_EXECUTION) 
std::for_each(std::execution::par_unseq, first, last, f); #elif defined(LIBSTATS_HAS_GCD) @@ -1105,7 +1086,7 @@ void safe_sort(Iterator first, Iterator last) { const auto count = std::distance(first, last); ::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "sort", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if defined(LIBSTATS_HAS_STD_EXECUTION) std::sort(std::execution::par_unseq, first, last); #else @@ -1123,7 +1104,7 @@ void safe_sort(Iterator first, Iterator last, Compare comp) { const auto count = std::distance(first, last); ::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "sort", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if defined(LIBSTATS_HAS_STD_EXECUTION) std::sort(std::execution::par_unseq, first, last, comp); #else @@ -1141,7 +1122,7 @@ void safe_partial_sort(Iterator first, Iterator middle, Iterator last) { const auto count = std::distance(first, last); ::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "partial_sort", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if defined(LIBSTATS_HAS_STD_EXECUTION) std::partial_sort(std::execution::par_unseq, first, middle, last); #else @@ -1159,7 +1140,7 @@ void safe_inclusive_scan(Iterator1 first, Iterator1 last, Iterator2 result) { const auto count = std::distance(first, last); ::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "scan", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if defined(LIBSTATS_HAS_STD_EXECUTION) std::inclusive_scan(std::execution::par_unseq, first, last, result); #else @@ -1177,7 +1158,7 @@ void safe_exclusive_scan(Iterator1 first, Iterator1 last, Iterator2 result, T in const auto count = std::distance(first, last); 
::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "scan", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if defined(LIBSTATS_HAS_STD_EXECUTION) std::exclusive_scan(std::execution::par_unseq, first, last, result, init); #else @@ -1195,7 +1176,7 @@ Iterator safe_find(Iterator first, Iterator last, const T& value) { const auto count = std::distance(first, last); ::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "search", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if defined(LIBSTATS_HAS_STD_EXECUTION) return std::find(std::execution::par_unseq, first, last, value); #else @@ -1213,7 +1194,7 @@ Iterator safe_find_if(Iterator first, Iterator last, UnaryPredicate pred) { const auto count = std::distance(first, last); ::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "search", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if defined(LIBSTATS_HAS_STD_EXECUTION) return std::find_if(std::execution::par_unseq, first, last, pred); #else @@ -1232,7 +1213,7 @@ typename std::iterator_traits::difference_type safe_count(Iterator fir const auto count = std::distance(first, last); ::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "count", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if defined(LIBSTATS_HAS_STD_EXECUTION) return std::count(std::execution::par_unseq, first, last, value); #elif defined(LIBSTATS_HAS_GCD) @@ -1253,7 +1234,7 @@ typename std::iterator_traits::difference_type safe_count_if(Iterator const auto count = std::distance(first, last); ::stats::detail::check_finite(static_cast(count), "element count"); - if (should_use_parallel("generic", "count", static_cast(count))) { + if (should_use_parallel(static_cast(count))) { #if 
defined(LIBSTATS_HAS_STD_EXECUTION) return std::count_if(std::execution::par_unseq, first, last, pred); #elif defined(LIBSTATS_HAS_GCD) diff --git a/include/platform/parallel_thresholds.h b/include/platform/parallel_thresholds.h deleted file mode 100644 index d8dfc7d..0000000 --- a/include/platform/parallel_thresholds.h +++ /dev/null @@ -1,162 +0,0 @@ -#pragma once - -/** - * @file parallel_thresholds.h - * @brief Architecture-aware parallel execution thresholds - * - * This header provides a scalable solution for determining when parallel execution - * is beneficial for different distributions and operations, without requiring - * an explosion of architecture-specific constants. - */ - -#include -#include -#include - -namespace stats { -namespace arch { - -/** - * @brief Operation complexity categories for threshold determination - */ -enum class OperationComplexity { - TRIVIAL, // Simple bounds checking, constant operations (uniform PDF/LogPDF) - SIMPLE, // Basic arithmetic, single function calls (discrete PMF, exponential PDF) - MODERATE, // Multiple function calls, some computation (poisson PMF, gaussian PDF) - COMPLEX, // Heavy computation, special functions (gamma CDF, complex CDFs) - EXPENSIVE // Very expensive operations (iterative algorithms, integration) -}; - -/** - * @brief Distribution complexity categories - */ -enum class DistributionComplexity { - UNIFORM, // Trivial operations: bounds checking, linear interpolation - DISCRETE, // Simple arithmetic: integer operations, lookups - EXPONENTIAL, // Moderate computation: exp() calls, logarithms - POISSON, // Moderate-Complex: factorial, gamma functions - GAUSSIAN // Complex: erf(), exp(), more expensive functions -}; - -/** - * @brief Architecture performance characteristics - */ -struct ArchitectureProfile { - std::size_t thread_creation_cost_us; // Microseconds to create/sync threads - std::size_t simd_width_elements; // SIMD vector width in doubles - std::size_t l3_cache_size_elements; // L3 cache 
size in doubles - double thread_efficiency_factor; // Threading efficiency (0.0-1.0) - std::size_t base_parallel_threshold; // Base threshold for parallel ops -}; - -/** - * @brief Adaptive threshold calculator - * - * This class calculates optimal thresholds based on: - * 1. Hardware architecture characteristics - * 2. Distribution complexity - * 3. Operation complexity - * 4. Runtime performance measurements (future enhancement) - */ -class AdaptiveThresholdCalculator { - private: - ArchitectureProfile arch_profile_; - mutable std::unordered_map cached_thresholds_; - - /** - * @brief Detect current architecture profile - */ - ArchitectureProfile detectArchitectureProfile() const; - - /** - * @brief Calculate threshold for specific operation - */ - std::size_t calculateThreshold([[maybe_unused]] DistributionComplexity dist_complexity, - OperationComplexity op_complexity) const { - std::size_t base_threshold = arch_profile_.base_parallel_threshold; - - // Adjust based on complexity - switch (op_complexity) { - case OperationComplexity::TRIVIAL: - return base_threshold * 10; - case OperationComplexity::SIMPLE: - return base_threshold * 5; - case OperationComplexity::MODERATE: - return base_threshold * 2; - case OperationComplexity::COMPLEX: - return base_threshold; - case OperationComplexity::EXPENSIVE: - return base_threshold / 2; - default: - return base_threshold; - } - } - - /** - * @brief Get operation complexity from operation name - */ - OperationComplexity getOperationComplexity(const std::string& operation) const; - - /** - * @brief Get distribution complexity from distribution name - */ - DistributionComplexity getDistributionComplexity(const std::string& distribution) const; - - public: - AdaptiveThresholdCalculator() { arch_profile_ = detectArchitectureProfile(); } - - /** - * @brief Get optimal threshold for specific distribution and operation - * @param distribution Distribution name (e.g., "uniform", "poisson") - * @param operation Operation name 
(e.g., "pdf", "logpdf", "cdf") - * @return Optimal threshold in number of elements - */ - std::size_t getThreshold(const std::string& distribution, const std::string& operation) const; - - /** - * @brief Check if parallel execution should be used - * @param distribution Distribution name - * @param operation Operation name - * @param data_size Number of elements to process - * @return true if parallel execution is recommended - */ - bool shouldUseParallel(const std::string& distribution, const std::string& operation, - std::size_t data_size) const; - - /** - * @brief Update threshold based on runtime measurements (future enhancement) - * @param distribution Distribution name - * @param operation Operation name - * @param data_size Size that was tested - * @param parallel_beneficial Whether parallel was beneficial - */ - void updateFromMeasurement(const std::string& distribution, const std::string& operation, - std::size_t data_size, bool parallel_beneficial); -}; - -/** - * @brief Global adaptive threshold calculator instance - * - * This singleton provides easy access to threshold calculations throughout - * the library without requiring each distribution to manage its own calculator. - */ -AdaptiveThresholdCalculator& getGlobalThresholdCalculator(); - -/** - * @brief Convenience function for checking if parallel execution should be used - * - * This function provides a clean interface for distribution implementations - * to check whether they should use parallel execution. 
- * - * @param distribution Distribution name (case-insensitive) - * @param operation Operation name (case-insensitive) - * @param data_size Number of elements to process - * @return true if parallel execution is recommended - */ -inline bool shouldUseDistributionParallel(const std::string& distribution, - const std::string& operation, std::size_t data_size) { - return getGlobalThresholdCalculator().shouldUseParallel(distribution, operation, data_size); -} - -} // namespace arch -} // namespace stats diff --git a/scripts/capture_dispatcher_profile.sh b/scripts/capture_dispatcher_profile.sh new file mode 100755 index 0000000..11d2ba4 --- /dev/null +++ b/scripts/capture_dispatcher_profile.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +# Capture a dispatcher profiling bundle for the current machine. +# Saves metadata, logs, and benchmark CSV output in a timestamped directory under build/. +# Copies the bundle into data/profiles/dispatcher/ (tracked in version control) so +# profiles from all architectures can be consolidated on any machine. + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +BUILD_DIR="${BUILD_DIR:-$PROJECT_ROOT/build}" +TOOLS_DIR="$BUILD_DIR/tools" +PROFILE_ROOT="${PROFILE_ROOT:-$BUILD_DIR/profiles/dispatcher}" +SUMMARIZER="$SCRIPT_DIR/summarize_dispatcher_profile.py" + +SYSTEM_INSPECTOR="$TOOLS_DIR/system_inspector" +STRATEGY_PROFILE="$TOOLS_DIR/strategy_profile" + +for tool in "$SYSTEM_INSPECTOR" "$STRATEGY_PROFILE"; do + if [ ! -x "$tool" ]; then + echo "Required tool not found or not executable: $tool" >&2 + exit 1 + fi +done + +if [ ! 
-f "$SUMMARIZER" ]; then + echo "Required summarizer not found: $SUMMARIZER" >&2 + exit 1 +fi + +mkdir -p "$PROFILE_ROOT" + +TIMESTAMP="$(date -u +"%Y-%m-%dT%H-%M-%SZ")" +ARCH="$(uname -m)" +OS_NAME="$(uname -s | tr '[:upper:]' '[:lower:]')" +BRANCH="$(git -C "$PROJECT_ROOT" rev-parse --abbrev-ref HEAD)" +GIT_SHA="$(git -C "$PROJECT_ROOT" rev-parse --short HEAD)" +RUN_ID="${TIMESTAMP}_${OS_NAME}-${ARCH}_${BRANCH}_sha-${GIT_SHA}" +RUN_DIR="$PROFILE_ROOT/$RUN_ID" +LOG_DIR="$RUN_DIR/logs" + +mkdir -p "$LOG_DIR" + +BUILD_TYPE="$(awk -F= '/^CMAKE_BUILD_TYPE:STRING=/{print $2}' "$BUILD_DIR/CMakeCache.txt" 2>/dev/null || true)" +CXX_COMPILER="$(awk -F= '/^CMAKE_CXX_COMPILER:FILEPATH=/{print $2}' "$BUILD_DIR/CMakeCache.txt" 2>/dev/null || true)" +CPU_BRAND="$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "unknown")" +PHYSICAL_CORES="$(sysctl -n hw.physicalcpu 2>/dev/null || echo "unknown")" +LOGICAL_CORES="$(sysctl -n hw.logicalcpu 2>/dev/null || echo "unknown")" + +cat > "$RUN_DIR/metadata.json" < "$RUN_DIR/manifest.txt" < "$LOG_DIR/system_inspector_performance.txt" 2>&1 + +STRATEGY_CSV="$RUN_DIR/strategy_profile_results.csv" +"$STRATEGY_PROFILE" --output-csv "$STRATEGY_CSV" > "$LOG_DIR/strategy_profile.txt" 2>&1 + +if [ ! -f "$STRATEGY_CSV" ]; then + echo "Expected strategy profile CSV was not created." >&2 + exit 1 +fi + +python3 "$SUMMARIZER" "$RUN_DIR" + +# Copy bundle into the tracked data directory so profiles accumulate across machines. +TRACKED_DIR="$PROJECT_ROOT/data/profiles/dispatcher/$RUN_ID" +cp -R "$RUN_DIR" "$TRACKED_DIR" +echo "Dispatcher profile saved to: $RUN_DIR" +echo "Tracked copy at: $TRACKED_DIR" diff --git a/scripts/summarize_dispatcher_profile.py b/scripts/summarize_dispatcher_profile.py new file mode 100755 index 0000000..525ec76 --- /dev/null +++ b/scripts/summarize_dispatcher_profile.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python3 + +"""Summarize a dispatcher profile bundle into derived CSV/JSON artifacts. 
+ +Reads strategy_profile_results.csv (canonical raw data from strategy_profile) +and produces crossovers.csv, best_strategies.csv, and summary.json. +""" + +from __future__ import annotations + +import argparse +import csv +import json +import math +from collections import defaultdict +from pathlib import Path +from typing import Any + + +def load_metadata(path: Path) -> dict[str, Any]: + with path.open("r", encoding="utf-8") as handle: + return json.load(handle) + + +def load_strategy_rows(path: Path) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + with path.open("r", encoding="utf-8", newline="") as handle: + reader = csv.DictReader(handle) + for row in reader: + rows.append( + { + "distribution": row["Distribution"], + "operation": row["Operation"], + "batch_size": int(row["BatchSize"]), + "strategy": row["Strategy"], + "median_time_us": float(row["MedianTime_us"]), + } + ) + return rows + + +GroupKey = tuple[str, str] # (distribution, operation) + + +def group_rows( + rows: list[dict[str, Any]], +) -> dict[GroupKey, dict[int, dict[str, float]]]: + """Group rows into {(dist, op): {batch_size: {strategy: time}}}.""" + grouped: dict[GroupKey, dict[int, dict[str, float]]] = defaultdict( + lambda: defaultdict(dict) + ) + for row in rows: + key = (row["distribution"], row["operation"]) + grouped[key][row["batch_size"]][row["strategy"]] = row["median_time_us"] + return grouped + + +def best_strategy_at_size(timings: dict[str, float]) -> tuple[str, float]: + best = min(timings.items(), key=lambda item: item[1]) + return best[0], best[1] + + +def find_first_crossover( + size_map: dict[int, dict[str, float]], + slower: str, + faster: str, +) -> int | None: + for batch_size in sorted(size_map.keys()): + timings = size_map[batch_size] + slower_time = timings.get(slower) + faster_time = timings.get(faster) + if slower_time is not None and faster_time is not None: + if faster_time < slower_time: + return batch_size + return None + + +def build_crossover_rows( + 
grouped: dict[GroupKey, dict[int, dict[str, float]]], +) -> list[dict[str, Any]]: + results: list[dict[str, Any]] = [] + for (dist, op) in sorted(grouped.keys()): + size_map = grouped[(dist, op)] + s_to_v = find_first_crossover(size_map, "SCALAR", "VECTORIZED") + v_to_p = find_first_crossover(size_map, "VECTORIZED", "PARALLEL") + p_to_ws = find_first_crossover(size_map, "PARALLEL", "WORK_STEALING") + + largest_size = max(size_map.keys()) + best_strat, best_time = best_strategy_at_size(size_map[largest_size]) + + results.append( + { + "distribution": dist, + "operation": op, + "scalar_to_vectorized": s_to_v, + "vectorized_to_parallel": v_to_p, + "parallel_to_work_stealing": p_to_ws, + "best_strategy_at_max_size": best_strat, + "best_time_us_at_max_size": round(best_time, 3), + "max_batch_size": largest_size, + } + ) + return results + + +def build_best_strategy_rows( + grouped: dict[GroupKey, dict[int, dict[str, float]]], +) -> list[dict[str, Any]]: + results: list[dict[str, Any]] = [] + for (dist, op) in sorted(grouped.keys()): + for batch_size in sorted(grouped[(dist, op)].keys()): + timings = grouped[(dist, op)][batch_size] + best_strat, best_time = best_strategy_at_size(timings) + + scalar_time = timings.get("SCALAR") + speedup_vs_scalar = ( + round(scalar_time / best_time, 3) + if scalar_time and best_time and best_time > 0 + else None + ) + + results.append( + { + "distribution": dist, + "operation": op, + "batch_size": batch_size, + "best_strategy": best_strat, + "best_time_us": round(best_time, 3), + "scalar_time_us": round(scalar_time, 3) if scalar_time else None, + "speedup_vs_scalar": speedup_vs_scalar, + } + ) + return results + + +def safe_number(value: Any) -> Any: + if isinstance(value, float) and math.isfinite(value): + return round(value, 6) + if isinstance(value, float) and not math.isfinite(value): + return None + return value + + +def build_summary( + metadata: dict[str, Any], + rows: list[dict[str, Any]], + crossover_rows: list[dict[str, Any]], 
+ best_strategy_rows: list[dict[str, Any]], +) -> dict[str, Any]: + distributions = sorted({r["distribution"] for r in rows}) + operations = sorted({r["operation"] for r in rows}) + batch_sizes = sorted({r["batch_size"] for r in rows}) + + strategy_wins: dict[str, int] = defaultdict(int) + for row in best_strategy_rows: + strategy_wins[row["best_strategy"]] += 1 + + vectorized_never_wins = [ + {"distribution": r["distribution"], "operation": r["operation"]} + for r in crossover_rows + if r["scalar_to_vectorized"] is None + ] + + return { + "run_id": metadata["run_id"], + "data_source": "strategy_profile_results.csv", + "metadata": metadata, + "coverage": { + "distributions": distributions, + "operations": operations, + "batch_sizes": batch_sizes, + "total_measurements": len(rows), + }, + "strategy_win_counts": dict( + sorted(strategy_wins.items(), key=lambda x: -x[1]) + ), + "crossover_summary": { + "groups": len(crossover_rows), + "vectorized_never_wins": vectorized_never_wins, + "parallel_crossover_sizes": [ + { + "distribution": r["distribution"], + "operation": r["operation"], + "vectorized_to_parallel": r["vectorized_to_parallel"], + } + for r in crossover_rows + if r["vectorized_to_parallel"] is not None + ], + }, + } + + +def write_csv(path: Path, rows: list[dict[str, Any]], fieldnames: list[str]) -> None: + with path.open("w", encoding="utf-8", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=fieldnames) + writer.writeheader() + for row in rows: + writer.writerow({field: safe_number(row.get(field)) for field in fieldnames}) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Generate derived dispatcher profiling summary files for a saved run." 
+ ) + parser.add_argument("run_dir", help="Path to a dispatcher profile bundle directory") + args = parser.parse_args() + + run_dir = Path(args.run_dir).resolve() + metadata_path = run_dir / "metadata.json" + strategy_csv_path = run_dir / "strategy_profile_results.csv" + + if not strategy_csv_path.exists(): + print(f"Strategy profile CSV not found: {strategy_csv_path}") + return 1 + + metadata = load_metadata(metadata_path) + rows = load_strategy_rows(strategy_csv_path) + grouped = group_rows(rows) + + crossover_rows = build_crossover_rows(grouped) + best_strategy_rows = build_best_strategy_rows(grouped) + + write_csv( + run_dir / "crossovers.csv", + crossover_rows, + [ + "distribution", + "operation", + "scalar_to_vectorized", + "vectorized_to_parallel", + "parallel_to_work_stealing", + "best_strategy_at_max_size", + "best_time_us_at_max_size", + "max_batch_size", + ], + ) + + write_csv( + run_dir / "best_strategies.csv", + best_strategy_rows, + [ + "distribution", + "operation", + "batch_size", + "best_strategy", + "best_time_us", + "scalar_time_us", + "speedup_vs_scalar", + ], + ) + + summary = build_summary(metadata, rows, crossover_rows, best_strategy_rows) + with (run_dir / "summary.json").open("w", encoding="utf-8") as handle: + json.dump(summary, handle, indent=2) + handle.write("\n") + + print(f"Derived files written to {run_dir}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/beta.cpp b/src/beta.cpp index 9bdd29c..60c7bf9 100644 --- a/src/beta.cpp +++ b/src/beta.cpp @@ -518,7 +518,7 @@ void BetaDistribution::getProbability(std::span values, std::span< const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::PDF, [](const BetaDistribution& dist, double value) { return dist.getProbability(value); }, [](const BetaDistribution& dist, const double* vals, 
double* res, size_t count) { std::shared_lock lock(dist.cache_mutex_); @@ -543,28 +543,31 @@ void BetaDistribution::getProbability(std::span values, std::span< if (count == 0) return; std::shared_lock lock(dist.cache_mutex_); + if (!dist.cache_valid_) { + lock.unlock(); + std::unique_lock ulock(dist.cache_mutex_); + if (!dist.cache_valid_) + const_cast(dist).updateCacheUnsafe(); + ulock.unlock(); + lock.lock(); + } const double lnc = dist.logNormConst_; const double am1 = dist.alphaMinus1_; const double bm1 = dist.betaMinus1_; lock.unlock(); + // Chunk the batch so each parallel task uses the SIMD pipeline + // (vector_log / vector_exp) instead of per-element scalar math. + constexpr std::size_t CHUNK = 1024; if (arch::should_use_parallel(count)) { - ParallelUtils::parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= 0.0 || x >= 1.0) { - res[i] = dist.getProbability(x); - } else { - res[i] = std::exp(lnc + am1 * std::log(x) + bm1 * std::log(1.0 - x)); - } + const std::size_t num_chunks = (count + CHUNK - 1) / CHUNK; + ParallelUtils::parallelFor(std::size_t{0}, num_chunks, [&](std::size_t ci) { + const std::size_t start = ci * CHUNK; + const std::size_t len = std::min(CHUNK, count - start); + dist.getProbabilityBatchUnsafeImpl(vals.data() + start, res.data() + start, len, + lnc, am1, bm1); }); } else { - for (std::size_t i = 0; i < count; ++i) { - const double x = vals[i]; - if (x <= 0.0 || x >= 1.0) { - res[i] = dist.getProbability(x); - } else { - res[i] = std::exp(lnc + am1 * std::log(x) + bm1 * std::log(1.0 - x)); - } - } + dist.getProbabilityBatchUnsafeImpl(vals.data(), res.data(), count, lnc, am1, bm1); } }, [](const BetaDistribution& dist, std::span vals, std::span res, @@ -575,17 +578,25 @@ void BetaDistribution::getProbability(std::span values, std::span< if (count == 0) return; std::shared_lock lock(dist.cache_mutex_); + if (!dist.cache_valid_) { + lock.unlock(); + std::unique_lock ulock(dist.cache_mutex_); + if 
(!dist.cache_valid_) + const_cast(dist).updateCacheUnsafe(); + ulock.unlock(); + lock.lock(); + } const double lnc = dist.logNormConst_; const double am1 = dist.alphaMinus1_; const double bm1 = dist.betaMinus1_; lock.unlock(); - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= 0.0 || x >= 1.0) { - res[i] = dist.getProbability(x); - } else { - res[i] = std::exp(lnc + am1 * std::log(x) + bm1 * std::log(1.0 - x)); - } + constexpr std::size_t CHUNK = 1024; + const std::size_t num_chunks = (count + CHUNK - 1) / CHUNK; + pool.parallelFor(std::size_t{0}, num_chunks, [&](std::size_t ci) { + const std::size_t start = ci * CHUNK; + const std::size_t len = std::min(CHUNK, count - start); + dist.getProbabilityBatchUnsafeImpl(vals.data() + start, res.data() + start, len, + lnc, am1, bm1); }); }, [](const BetaDistribution& dist, std::span vals, std::span res, @@ -596,17 +607,25 @@ void BetaDistribution::getProbability(std::span values, std::span< if (count == 0) return; std::shared_lock lock(dist.cache_mutex_); + if (!dist.cache_valid_) { + lock.unlock(); + std::unique_lock ulock(dist.cache_mutex_); + if (!dist.cache_valid_) + const_cast(dist).updateCacheUnsafe(); + ulock.unlock(); + lock.lock(); + } const double lnc = dist.logNormConst_; const double am1 = dist.alphaMinus1_; const double bm1 = dist.betaMinus1_; lock.unlock(); - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= 0.0 || x >= 1.0) { - res[i] = dist.getProbability(x); - } else { - res[i] = std::exp(lnc + am1 * std::log(x) + bm1 * std::log(1.0 - x)); - } + constexpr std::size_t CHUNK = 1024; + const std::size_t num_chunks = (count + CHUNK - 1) / CHUNK; + pool.parallelFor(std::size_t{0}, num_chunks, [&](std::size_t ci) { + const std::size_t start = ci * CHUNK; + const std::size_t len = std::min(CHUNK, count - start); + dist.getProbabilityBatchUnsafeImpl(vals.data() + start, res.data() + start, len, + lnc, am1, bm1); 
}); }); } @@ -615,7 +634,7 @@ void BetaDistribution::getLogProbability(std::span values, std::sp const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::LOG_PDF, [](const BetaDistribution& dist, double value) { return dist.getLogProbability(value); }, [](const BetaDistribution& dist, const double* vals, double* res, size_t count) { std::shared_lock lock(dist.cache_mutex_); @@ -640,28 +659,32 @@ void BetaDistribution::getLogProbability(std::span values, std::sp if (count == 0) return; std::shared_lock lock(dist.cache_mutex_); + if (!dist.cache_valid_) { + lock.unlock(); + std::unique_lock ulock(dist.cache_mutex_); + if (!dist.cache_valid_) + const_cast(dist).updateCacheUnsafe(); + ulock.unlock(); + lock.lock(); + } const double lnc = dist.logNormConst_; const double am1 = dist.alphaMinus1_; const double bm1 = dist.betaMinus1_; lock.unlock(); + // Chunk the batch so each parallel task uses the SIMD pipeline + // (vector_log) instead of per-element scalar math. 
+ constexpr std::size_t CHUNK = 1024; if (arch::should_use_parallel(count)) { - ParallelUtils::parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= 0.0 || x >= 1.0) { - res[i] = dist.getLogProbability(x); - } else { - res[i] = lnc + am1 * std::log(x) + bm1 * std::log(1.0 - x); - } + const std::size_t num_chunks = (count + CHUNK - 1) / CHUNK; + ParallelUtils::parallelFor(std::size_t{0}, num_chunks, [&](std::size_t ci) { + const std::size_t start = ci * CHUNK; + const std::size_t len = std::min(CHUNK, count - start); + dist.getLogProbabilityBatchUnsafeImpl(vals.data() + start, res.data() + start, + len, lnc, am1, bm1); }); } else { - for (std::size_t i = 0; i < count; ++i) { - const double x = vals[i]; - if (x <= 0.0 || x >= 1.0) { - res[i] = dist.getLogProbability(x); - } else { - res[i] = lnc + am1 * std::log(x) + bm1 * std::log(1.0 - x); - } - } + dist.getLogProbabilityBatchUnsafeImpl(vals.data(), res.data(), count, lnc, am1, + bm1); } }, [](const BetaDistribution& dist, std::span vals, std::span res, @@ -672,17 +695,25 @@ void BetaDistribution::getLogProbability(std::span values, std::sp if (count == 0) return; std::shared_lock lock(dist.cache_mutex_); + if (!dist.cache_valid_) { + lock.unlock(); + std::unique_lock ulock(dist.cache_mutex_); + if (!dist.cache_valid_) + const_cast(dist).updateCacheUnsafe(); + ulock.unlock(); + lock.lock(); + } const double lnc = dist.logNormConst_; const double am1 = dist.alphaMinus1_; const double bm1 = dist.betaMinus1_; lock.unlock(); - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= 0.0 || x >= 1.0) { - res[i] = dist.getLogProbability(x); - } else { - res[i] = lnc + am1 * std::log(x) + bm1 * std::log(1.0 - x); - } + constexpr std::size_t CHUNK = 1024; + const std::size_t num_chunks = (count + CHUNK - 1) / CHUNK; + pool.parallelFor(std::size_t{0}, num_chunks, [&](std::size_t ci) { + const std::size_t start = ci * CHUNK; + const 
std::size_t len = std::min(CHUNK, count - start); + dist.getLogProbabilityBatchUnsafeImpl(vals.data() + start, res.data() + start, len, + lnc, am1, bm1); }); }, [](const BetaDistribution& dist, std::span vals, std::span res, @@ -693,17 +724,25 @@ void BetaDistribution::getLogProbability(std::span values, std::sp if (count == 0) return; std::shared_lock lock(dist.cache_mutex_); + if (!dist.cache_valid_) { + lock.unlock(); + std::unique_lock ulock(dist.cache_mutex_); + if (!dist.cache_valid_) + const_cast(dist).updateCacheUnsafe(); + ulock.unlock(); + lock.lock(); + } const double lnc = dist.logNormConst_; const double am1 = dist.alphaMinus1_; const double bm1 = dist.betaMinus1_; lock.unlock(); - pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - const double x = vals[i]; - if (x <= 0.0 || x >= 1.0) { - res[i] = dist.getLogProbability(x); - } else { - res[i] = lnc + am1 * std::log(x) + bm1 * std::log(1.0 - x); - } + constexpr std::size_t CHUNK = 1024; + const std::size_t num_chunks = (count + CHUNK - 1) / CHUNK; + pool.parallelFor(std::size_t{0}, num_chunks, [&](std::size_t ci) { + const std::size_t start = ci * CHUNK; + const std::size_t len = std::min(CHUNK, count - start); + dist.getLogProbabilityBatchUnsafeImpl(vals.data() + start, res.data() + start, len, + lnc, am1, bm1); }); }); } @@ -713,7 +752,7 @@ void BetaDistribution::getCumulativeProbability(std::span values, const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::CDF, [](const BetaDistribution& dist, double value) { return dist.getCumulativeProbability(value); }, @@ -729,14 +768,24 @@ void BetaDistribution::getCumulativeProbability(std::span values, const std::size_t count = vals.size(); if (count == 0) return; + // Acquire cache once; hoist lgamma prefix for the batch. 
std::shared_lock lock(dist.cache_mutex_); const double a = dist.alpha_, b = dist.beta_; lock.unlock(); - for (std::size_t i = 0; i < count; ++i) { - res[i] = dist.getCumulativeProbability(vals[i]); + const double log_prefix = detail::lgamma(a + b) - detail::lgamma(a) - detail::lgamma(b); + if (arch::should_use_parallel(count)) { + ParallelUtils::parallelFor(std::size_t{0}, count, [&](std::size_t i) { + const double x = vals[i]; + if (x <= 0.0) + res[i] = 0.0; + else if (x >= 1.0) + res[i] = 1.0; + else + res[i] = detail::beta_i(x, a, b, log_prefix); + }); + } else { + dist.getCumulativeProbabilityBatchUnsafeImpl(vals.data(), res.data(), count, a, b); } - (void)a; - (void)b; }, [](const BetaDistribution& dist, std::span vals, std::span res, WorkStealingPool& pool) { @@ -745,8 +794,18 @@ void BetaDistribution::getCumulativeProbability(std::span values, const std::size_t count = vals.size(); if (count == 0) return; + std::shared_lock lock(dist.cache_mutex_); + const double a = dist.alpha_, b = dist.beta_; + lock.unlock(); + const double log_prefix = detail::lgamma(a + b) - detail::lgamma(a) - detail::lgamma(b); pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - res[i] = dist.getCumulativeProbability(vals[i]); + const double x = vals[i]; + if (x <= 0.0) + res[i] = 0.0; + else if (x >= 1.0) + res[i] = 1.0; + else + res[i] = detail::beta_i(x, a, b, log_prefix); }); }, [](const BetaDistribution& dist, std::span vals, std::span res, @@ -756,8 +815,18 @@ void BetaDistribution::getCumulativeProbability(std::span values, const std::size_t count = vals.size(); if (count == 0) return; + std::shared_lock lock(dist.cache_mutex_); + const double a = dist.alpha_, b = dist.beta_; + lock.unlock(); + const double log_prefix = detail::lgamma(a + b) - detail::lgamma(a) - detail::lgamma(b); pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - res[i] = dist.getCumulativeProbability(vals[i]); + const double x = vals[i]; + if (x <= 0.0) + res[i] = 0.0; + else if (x >= 
1.0) + res[i] = 1.0; + else + res[i] = detail::beta_i(x, a, b, log_prefix); }); }); } @@ -979,6 +1048,10 @@ void BetaDistribution::getCumulativeProbabilityBatchUnsafeImpl(const double* val double beta) const noexcept { // Scalar per element. See section 18 header for why beta_i cannot be // vectorized without replacing it with a fixed-iteration approximation. + // Hoist the lgamma prefix: lgamma(a+b) - lgamma(a) - lgamma(b) is constant + // for fixed (alpha, beta), saving 3 lgamma calls per element. + const double log_prefix = + detail::lgamma(alpha + beta) - detail::lgamma(alpha) - detail::lgamma(beta); for (std::size_t i = 0; i < count; ++i) { const double x = values[i]; if (x <= detail::ZERO_DOUBLE) { @@ -986,7 +1059,7 @@ void BetaDistribution::getCumulativeProbabilityBatchUnsafeImpl(const double* val } else if (x >= detail::ONE) { results[i] = detail::ONE; } else { - results[i] = detail::beta_i(x, alpha, beta); + results[i] = detail::beta_i(x, alpha, beta, log_prefix); } } } diff --git a/src/discrete.cpp b/src/discrete.cpp index 1e1fc16..b8002fa 100644 --- a/src/discrete.cpp +++ b/src/discrete.cpp @@ -1,6 +1,7 @@ #include "libstats/distributions/discrete.h" // Core functionality - lightweight headers +#include "libstats/core/dispatch_thresholds.h" #include "libstats/core/dispatch_utils.h" #include "libstats/core/log_space_ops.h" #include "libstats/core/math_utils.h" @@ -652,7 +653,7 @@ void DiscreteDistribution::parallelBatchFit(const std::vector= detail::dispatch_table::BATCH_FIT_MIN) { // Direct parallel execution without internal thresholds - bypass ParallelUtils limitation ThreadPool& pool = ParallelUtils::getGlobalThreadPool(); const std::size_t optimal_grain_size = std::max(std::size_t{1}, num_datasets / 8); @@ -1723,7 +1724,7 @@ void DiscreteDistribution::getProbability(std::span values, std::s const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - 
detail::DistributionTraits::complexity(), + detail::OperationType::PDF, [](const DiscreteDistribution& dist, double value) { return dist.getProbability(value); }, [](const DiscreteDistribution& dist, const double* vals, double* res, size_t count) { // Ensure cache is valid @@ -1886,7 +1887,7 @@ void DiscreteDistribution::getLogProbability(std::span values, const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::LOG_PDF, [](const DiscreteDistribution& dist, double value) { return dist.getLogProbability(value); }, @@ -2069,7 +2070,7 @@ void DiscreteDistribution::getCumulativeProbability(std::span valu const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::CDF, [](const DiscreteDistribution& dist, double value) { return dist.getCumulativeProbability(value); }, diff --git a/src/exponential.cpp b/src/exponential.cpp index aceb5ce..4904013 100644 --- a/src/exponential.cpp +++ b/src/exponential.cpp @@ -6,6 +6,7 @@ #include "libstats/core/validation.h" // Note: parallel execution included through distribution base inheritance // Note: thread_pool.h and work_stealing_pool.h are transitively included via dispatch_utils.h +#include "libstats/core/dispatch_thresholds.h" #include "libstats/core/dispatch_utils.h" // For DispatchUtils::autoDispatch #include @@ -436,7 +437,7 @@ void ExponentialDistribution::parallelBatchFit(const std::vector= detail::dispatch_table::BATCH_FIT_MIN) { // Thread-safe parallel execution with proper exception handling // Use a static mutex to synchronize access to the global thread pool from multiple threads static std::mutex pool_access_mutex; @@ -1319,8 +1320,7 @@ void ExponentialDistribution::getProbability(std::span values, 
const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, - detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::DistributionTraits::distType(), detail::OperationType::PDF, [](const ExponentialDistribution& dist, double value) { return dist.getProbability(value); }, @@ -1492,7 +1492,7 @@ void ExponentialDistribution::getLogProbability(std::span values, detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::LOG_PDF, [](const ExponentialDistribution& dist, double value) { return dist.getLogProbability(value); }, @@ -1664,8 +1664,7 @@ void ExponentialDistribution::getCumulativeProbability(std::span v const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, - detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::DistributionTraits::distType(), detail::OperationType::CDF, [](const ExponentialDistribution& dist, double value) { return dist.getCumulativeProbability(value); }, diff --git a/src/gamma.cpp b/src/gamma.cpp index a8ca32d..ade644f 100644 --- a/src/gamma.cpp +++ b/src/gamma.cpp @@ -1,6 +1,7 @@ #include "libstats/distributions/gamma.h" // Core functionality - lightweight headers +#include "libstats/core/dispatch_thresholds.h" #include "libstats/core/dispatch_utils.h" #include "libstats/core/log_space_ops.h" #include "libstats/core/math_utils.h" @@ -453,7 +454,7 @@ void GammaDistribution::parallelBatchFit(const std::vector>& const std::size_t num_datasets = datasets.size(); // Use distribution-specific parallel thresholds for optimal work distribution - if (arch::shouldUseDistributionParallel("gamma", "batch_fit", num_datasets)) { + if (num_datasets >= detail::dispatch_table::BATCH_FIT_MIN) { // Direct parallel execution without internal thresholds - bypass 
ParallelUtils limitation ThreadPool& pool = ParallelUtils::getGlobalThreadPool(); const std::size_t optimal_grain_size = std::max(std::size_t{1}, num_datasets / 8); @@ -1453,7 +1454,7 @@ void GammaDistribution::getProbability(std::span values, std::span const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::PDF, [](const GammaDistribution& dist, double value) { return dist.getProbability(value); }, [](const GammaDistribution& dist, const double* vals, double* res, size_t count) { // Ensure cache is valid @@ -1629,7 +1630,7 @@ void GammaDistribution::getLogProbability(std::span values, std::s const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::LOG_PDF, [](const GammaDistribution& dist, double value) { return dist.getLogProbability(value); }, [](const GammaDistribution& dist, const double* vals, double* res, size_t count) { // Ensure cache is valid @@ -1798,7 +1799,7 @@ void GammaDistribution::getCumulativeProbability(std::span values, const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::CDF, [](const GammaDistribution& dist, double value) { return dist.getCumulativeProbability(value); }, diff --git a/src/gaussian.cpp b/src/gaussian.cpp index 6d2e806..75624c5 100644 --- a/src/gaussian.cpp +++ b/src/gaussian.cpp @@ -3,6 +3,7 @@ #include "libstats/common/cpu_detection_fwd.h" // CPU feature queries (lightweight) #include "libstats/common/platform_constants_fwd.h" // Parallel thresholds (lightweight) #include "libstats/common/simd_policy_fwd.h" // SIMD policy decisions 
(lightweight) +#include "libstats/core/dispatch_thresholds.h" #include "libstats/core/dispatch_utils.h" // Note: thread_pool.h and work_stealing_pool.h are transitively included via dispatch_utils.h #include "libstats/core/safety.h" @@ -654,7 +655,7 @@ void GaussianDistribution::parallelBatchFit(const std::vector= detail::dispatch_table::BATCH_FIT_MIN) { // Thread-safe parallel execution with proper exception handling // Use a static mutex to synchronize access to the global thread pool from multiple threads static std::mutex pool_access_mutex; @@ -1771,7 +1772,7 @@ void GaussianDistribution::getProbability(std::span values, std::s const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::PDF, [](const GaussianDistribution& dist, double value) { return dist.getProbability(value); }, [](const GaussianDistribution& dist, const double* vals, double* res, size_t count) { // Ensure cache is valid @@ -1943,7 +1944,7 @@ void GaussianDistribution::getLogProbability(std::span values, const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::LOG_PDF, [](const GaussianDistribution& dist, double value) { return dist.getLogProbability(value); }, @@ -2122,7 +2123,7 @@ void GaussianDistribution::getCumulativeProbability(std::span valu const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::CDF, [](const GaussianDistribution& dist, double value) { return dist.getCumulativeProbability(value); }, diff --git a/src/math_utils.cpp b/src/math_utils.cpp index 9d2501b..44e86af 100644 --- 
a/src/math_utils.cpp +++ b/src/math_utils.cpp @@ -254,6 +254,25 @@ double beta_i(double x, double a, double b) noexcept { } } +double beta_i(double x, double a, double b, double log_beta_prefix) noexcept { + if (x < detail::ZERO_DOUBLE || x > detail::ONE || a <= detail::ZERO_DOUBLE || + b <= detail::ZERO_DOUBLE) { + return detail::ZERO_DOUBLE; + } + if (x == detail::ZERO_DOUBLE) + return detail::ZERO_DOUBLE; + if (x == detail::ONE) + return detail::ONE; + + double bt = std::exp(log_beta_prefix + a * std::log(x) + b * std::log(detail::ONE - x)); + + if (x < (a + detail::ONE) / (a + b + detail::TWO)) { + return bt * beta_continued_fraction(x, a, b); + } else { + return detail::ONE - bt * beta_continued_fraction(detail::ONE - x, b, a); + } +} + // Helper function for beta incomplete function continued fraction // Based on Numerical Recipes algorithm static double beta_continued_fraction(double x, double a, double b) noexcept { @@ -683,10 +702,10 @@ void vector_beta_i(std::span x_values, double a, double b, const std::size_t size = x_values.size(); - // For now, use scalar implementation - // Future enhancement: SIMD optimization of the continued fraction + // Hoist the lgamma prefix: constant across all elements for fixed (a, b). + const double log_prefix = lgamma(a + b) - lgamma(a) - lgamma(b); for (std::size_t i = 0; i < size; ++i) { - output[i] = beta_i(x_values[i], a, b); + output[i] = beta_i(x_values[i], a, b, log_prefix); } } @@ -810,8 +829,10 @@ double inverse_t_cdf(double p, double df) noexcept { // Use approximate initial guess from normal distribution double z = inverse_normal_cdf(p); - // For large degrees of freedom, t-distribution approaches normal - if (df > detail::HUNDRED) { + // For large degrees of freedom, t-distribution approaches normal. + // Use 1000 as the cutoff (consistent with t_cdf) — at df=120 the + // normal approximation still has ~0.02 error in the tails.
+ if (df > detail::THOUSAND) { return z; } diff --git a/src/parallel_thresholds.cpp b/src/parallel_thresholds.cpp deleted file mode 100644 index d94c078..0000000 --- a/src/parallel_thresholds.cpp +++ /dev/null @@ -1,262 +0,0 @@ -#include "libstats/platform/parallel_thresholds.h" - -#include "libstats/core/math_constants.h" -#include "libstats/core/statistical_constants.h" -#include "libstats/platform/cpu_detection.h" - -#include -#include -#include - -namespace stats { -namespace arch { - -ArchitectureProfile AdaptiveThresholdCalculator::detectArchitectureProfile() const { - ArchitectureProfile profile; - - // Get CPU features - const auto& features = arch::get_features(); - -// Base architecture detection and configuration -#if defined(__APPLE__) && defined(__aarch64__) - // Apple Silicon: Excellent threading performance - profile.thread_creation_cost_us = 2; - profile.simd_width_elements = 2; // NEON 128-bit - profile.thread_efficiency_factor = detail::CONFIDENCE_95; - profile.base_parallel_threshold = 1024; -#elif defined(__x86_64__) && (defined(__AVX2__) || defined(__AVX512F__)) - // High-end x86_64: Good threading, excellent SIMD - profile.thread_creation_cost_us = 5; - profile.simd_width_elements = 4; // AVX2 256-bit / 4 doubles - profile.thread_efficiency_factor = 0.85; - profile.base_parallel_threshold = 2048; -#elif defined(__x86_64__) - // Standard x86_64: Moderate threading - profile.thread_creation_cost_us = 8; - profile.simd_width_elements = 2; // SSE 128-bit / 2 doubles - profile.thread_efficiency_factor = detail::AD_P_VALUE_MEDIUM; - profile.base_parallel_threshold = 4096; -#else - // Conservative defaults for other architectures - profile.thread_creation_cost_us = 10; - profile.simd_width_elements = 1; // No SIMD assumed - profile.thread_efficiency_factor = detail::STRONG_CORRELATION; - profile.base_parallel_threshold = 8192; -#endif - - // Set L3 cache size - profile.l3_cache_size_elements = features.l3_cache_size / sizeof(double); - if 
(profile.l3_cache_size_elements == 0) { - // Reasonable default if detection fails - profile.l3_cache_size_elements = 2 * 1024 * 1024; // 2MB worth of doubles - } - - return profile; -} - -std::string toLower(const std::string& str) { - std::string result = str; - std::transform(result.begin(), result.end(), result.begin(), ::tolower); - return result; -} - -OperationComplexity AdaptiveThresholdCalculator::getOperationComplexity( - const std::string& operation) const { - std::string op = toLower(operation); - - if (op == "pdf" || op == "logpdf") { - return OperationComplexity::SIMPLE; - } else if (op == "cdf") { - return OperationComplexity::MODERATE; - } else { - return OperationComplexity::MODERATE; - } -} - -DistributionComplexity AdaptiveThresholdCalculator::getDistributionComplexity( - const std::string& distribution) const { - std::string dist = toLower(distribution); - - if (dist == "uniform") { - return DistributionComplexity::UNIFORM; - } else if (dist == "discrete") { - return DistributionComplexity::DISCRETE; - } else if (dist == "exponential") { - return DistributionComplexity::EXPONENTIAL; - } else if (dist == "poisson") { - return DistributionComplexity::POISSON; - } else if (dist == "gaussian" || dist == "normal") { - return DistributionComplexity::GAUSSIAN; - } else { - return DistributionComplexity::EXPONENTIAL; // Default to moderate complexity - } -} - -std::size_t AdaptiveThresholdCalculator::getThreshold(const std::string& distribution, - const std::string& operation) const { - std::string key = toLower(distribution) + "_" + toLower(operation); - - // Check cache first - auto it = cached_thresholds_.find(key); - if (it != cached_thresholds_.end()) { - return it->second; - } - - // Calculate threshold based on benchmark results - DistributionComplexity dist_complexity = getDistributionComplexity(distribution); - OperationComplexity op_complexity = getOperationComplexity(operation); - - std::size_t threshold; - - // Use empirical results from our 
benchmark - std::string dist_lower = toLower(distribution); - std::string op_lower = toLower(operation); - - if (dist_lower == "uniform") { - if (op_lower == "pdf") { - threshold = 16384; - } else if (op_lower == "logpdf") { - threshold = 64; - } else if (op_lower == "cdf") { - threshold = 16384; - } else if (op_lower == "batch_fit") { - threshold = 64; // Lower threshold for batch_fit operations - } else { - threshold = 8192; - } - } else if (dist_lower == "discrete") { - if (op_lower == "pdf") { - threshold = 1048576; - } else if (op_lower == "logpdf") { - threshold = 32768; - } else if (op_lower == "cdf") { - threshold = 65536; - } else if (op_lower == "batch_fit") { - threshold = 64; // Lower threshold for batch_fit operations - } else { - threshold = 32768; - } - } else if (dist_lower == "exponential") { - if (op_lower == "pdf") { - threshold = 64; - } else if (op_lower == "logpdf") { - threshold = 128; - } else if (op_lower == "cdf") { - threshold = 64; - } else if (op_lower == "batch_fit") { - threshold = 32; // Lower threshold for batch_fit operations - } else { - threshold = 64; - } - } else if (dist_lower == "gaussian" || dist_lower == "normal") { - if (op_lower == "pdf") { - threshold = 64; - } else if (op_lower == "logpdf") { - threshold = 256; - } else if (op_lower == "cdf") { - threshold = 64; - } else if (op_lower == "batch_fit") { - threshold = 32; // Lower threshold for batch_fit operations - } else { - threshold = 256; - } - } else if (dist_lower == "poisson") { - if (op_lower == "pdf") { - threshold = 4096; - } else if (op_lower == "logpdf") { - threshold = 8192; - } else if (op_lower == "cdf") { - threshold = 512; - } else if (op_lower == "batch_fit") { - threshold = 64; // Lower threshold for batch_fit operations - } else { - threshold = 4096; - } - } else if (dist_lower == "gamma") { - if (op_lower == "pdf") { - threshold = 256; - } else if (op_lower == "logpdf") { - threshold = 512; - } else if (op_lower == "cdf") { - threshold = 128; - } 
else if (op_lower == "batch_fit") { - threshold = 64; // Lower threshold for batch_fit operations - } else { - threshold = 256; - } - } else if (dist_lower == "beta") { - // Two vector_log calls in the SIMD pipeline; bounded support [0,1] - if (op_lower == "pdf") { - threshold = 128; - } else if (op_lower == "logpdf") { - threshold = 256; - } else if (op_lower == "cdf") { - threshold = 512; // scalar beta_i per element - } else if (op_lower == "batch_fit") { - threshold = 64; - } else { - threshold = 128; - } - } else if (dist_lower == "student_t" || dist_lower == "student t") { - // Log-space pipeline (vector_multiply + vector_log + scalar ops): similar to Gaussian - if (op_lower == "pdf") { - threshold = 128; - } else if (op_lower == "logpdf") { - threshold = 256; - } else if (op_lower == "cdf") { - threshold = 512; // CDF is scalar (detail::t_cdf per element) - } else if (op_lower == "batch_fit") { - threshold = 64; - } else { - threshold = 128; - } - } else if (dist_lower == "generic") { - // Generic operations use moderate thresholds - if (op_lower == "fill" || op_lower == "transform" || op_lower == "for_each") { - threshold = 8192; - } else if (op_lower == "sort" || op_lower == "partial_sort") { - threshold = 4096; - } else if (op_lower == "scan") { - threshold = 16384; - } else if (op_lower == "search" || op_lower == "count") { - threshold = 8192; - } else { - threshold = 8192; // Default for generic operations - } - } else { - // Fallback to calculated threshold - threshold = calculateThreshold(dist_complexity, op_complexity); - } - - // Cache the result - cached_thresholds_[key] = threshold; - - return threshold; -} - -bool AdaptiveThresholdCalculator::shouldUseParallel(const std::string& distribution, - const std::string& operation, - std::size_t data_size) const { - std::size_t threshold = getThreshold(distribution, operation); - return data_size >= threshold; -} - -void AdaptiveThresholdCalculator::updateFromMeasurement(const std::string& distribution, 
- const std::string& operation, - std::size_t data_size, - bool parallel_beneficial) { - // Future enhancement: adapt thresholds based on runtime measurements - // For now, this is a placeholder - (void)distribution; - (void)operation; - (void)data_size; - (void)parallel_beneficial; -} - -AdaptiveThresholdCalculator& getGlobalThresholdCalculator() { - static AdaptiveThresholdCalculator instance; - return instance; -} - -} // namespace arch -} // namespace stats diff --git a/src/performance_dispatcher.cpp b/src/performance_dispatcher.cpp index 6169e3d..e3ebd50 100644 --- a/src/performance_dispatcher.cpp +++ b/src/performance_dispatcher.cpp @@ -1,6 +1,6 @@ #include "libstats/core/performance_dispatcher.h" -#include "libstats/core/distribution_characteristics.h" +#include "libstats/core/dispatch_thresholds.h" #include "libstats/core/math_constants.h" #include "libstats/core/performance_history.h" #include "libstats/core/statistical_constants.h" @@ -16,10 +16,9 @@ namespace detail { // Performance utilities PerformanceDispatcher::PerformanceDispatcher() : PerformanceDispatcher(SystemCapabilities::current()) {} -PerformanceDispatcher::PerformanceDispatcher(const SystemCapabilities& system) { - // Use SIMDPolicy to get the best SIMD level and initialize thresholds accordingly - auto simd_level = arch::simd::SIMDPolicy::getBestLevel(); - thresholds_ = Thresholds::createForSIMDLevel(simd_level, system); +PerformanceDispatcher::PerformanceDispatcher(const SystemCapabilities& system) + : simd_level_(arch::simd::SIMDPolicy::getBestLevel()) { + thresholds_ = Thresholds::createForSIMDLevel(simd_level_, system); } PerformanceDispatcher::SIMDArchitecture PerformanceDispatcher::detectSIMDArchitecture( @@ -44,20 +43,49 @@ PerformanceDispatcher::SIMDArchitecture PerformanceDispatcher::detectSIMDArchite } } -Strategy PerformanceDispatcher::selectOptimalStrategy( - size_t batch_size, DistributionType dist_type, - [[maybe_unused]] ComputationComplexity complexity, const 
SystemCapabilities& system) const { - auto& performance_history = getPerformanceHistory(); - auto recommendation = performance_history.getBestStrategy(dist_type, batch_size); +// ── New profiling-derived dispatch ────────────────────────────────────────── - // Use historical data if we have high confidence - if (recommendation.has_sufficient_data && - recommendation.confidence_score > detail::LARGE_EFFECT) { - return recommendation.recommended_strategy; +Strategy PerformanceDispatcher::selectStrategy(size_t batch_size, DistributionType dist_type, + OperationType op_type, + const SystemCapabilities& system) const { + // 1. Below SIMD threshold β†’ SCALAR + const size_t simd_min = arch::simd::SIMDPolicy::getMinThreshold(); + if (batch_size < simd_min) { + return Strategy::SCALAR; + } + + // 2. Below parallel threshold β†’ VECTORIZED + const size_t parallel_threshold = getParallelThreshold(simd_level_, dist_type, op_type); + if (batch_size < parallel_threshold) { + return Strategy::VECTORIZED; } - // Fallback to adaptive logic based on system capabilities - return selectStrategyBasedOnCapabilities(batch_size, dist_type, system); + // 3. At or above parallel threshold β†’ PARALLEL or WORK_STEALING + return selectMultiThreadedStrategy(dist_type, system); +} + +Strategy PerformanceDispatcher::selectMultiThreadedStrategy( + [[maybe_unused]] DistributionType dist_type, const SystemCapabilities& system) noexcept { + // Four-architecture profiling shows the threading backend is the dominant + // factor in P-vs-WS selection: + // macOS/GCD + HT: WORK_STEALING wins (up to 7:1) + // macOS/GCD + no HT: roughly even, slight PARALLEL preference + // Windows/Thread Pool: PARALLEL wins (3.3:1) + +#if defined(_WIN32) + // Windows Thread Pool: PARALLEL dominates across distributions. + return Strategy::PARALLEL; +#elif defined(__APPLE__) + // macOS/GCD: prefer WORK_STEALING when hyperthreading is present. 
+ if (system.logical_cores() > system.physical_cores()) { + return Strategy::WORK_STEALING; + } + return Strategy::PARALLEL; +#else + // Linux/other: default to PARALLEL (conservative; no profiling data yet). + (void)system; + return Strategy::PARALLEL; +#endif } size_t PerformanceDispatcher::getDistributionSpecificParallelThreshold( @@ -108,49 +136,20 @@ PerformanceHistory& PerformanceDispatcher::getPerformanceHistory() noexcept { return global_performance_history; } -Strategy PerformanceDispatcher::selectStrategyBasedOnCapabilities( - size_t batch_size, DistributionType dist_type, const SystemCapabilities& system) const { - // Three-level threshold hierarchy. The thresholds in Thresholds have already been - // tuned for this machine's SIMD level and measured capabilities by - // refineWithCapabilities() at construction time, so the per-call decision is simple: - // - // batch < simd_min β†’ SCALAR (overhead exceeds benefit) - // simd_min <= batch < parallel β†’ VECTORIZED (batch pays off, threading doesn\'t yet) - // batch >= parallel β†’ PARALLEL or WORK_STEALING - // - // Distribution-specific parallel thresholds account for computational cost: - // Gaussian (exp/erf) parallelizes at smaller batch sizes than Uniform (arithmetic only). - - if (batch_size < thresholds_.simd_min) { - return Strategy::SCALAR; - } - - const size_t parallel_threshold = getDistributionSpecificParallelThreshold(dist_type); - if (batch_size < parallel_threshold) { - return Strategy::VECTORIZED; - } - - // Work-stealing is preferred for large batches on multi-core systems: it handles - // variable-cost work more efficiently than a fixed partition. 
- if (batch_size >= thresholds_.work_stealing_min && system.logical_cores() > 2) { - return Strategy::WORK_STEALING; - } - - return Strategy::PARALLEL; -} - PerformanceDispatcher::Thresholds PerformanceDispatcher::Thresholds::createForSIMDLevel( - arch::simd::SIMDPolicy::Level level, const SystemCapabilities& system) { + arch::simd::SIMDPolicy::Level level, [[maybe_unused]] const SystemCapabilities& system) { Thresholds thresholds; // Use SIMDPolicy's thresholds as foundation thresholds.simd_min = arch::simd::SIMDPolicy::getMinThreshold(); // Set base parallel thresholds based on SIMD level capability + // AVX-512's wider registers process more elements per cycle, so VECTORIZED remains + // faster than PARALLEL up to higher batch sizes than narrower SIMD levels. switch (level) { case arch::simd::SIMDPolicy::Level::AVX512: - thresholds.parallel_min = 500; - thresholds.work_stealing_min = 8000; + thresholds.parallel_min = 5000; + thresholds.work_stealing_min = 50000; break; case arch::simd::SIMDPolicy::Level::AVX2: thresholds.parallel_min = detail::MAX_BISECTION_ITERATIONS; @@ -176,53 +175,18 @@ PerformanceDispatcher::Thresholds PerformanceDispatcher::Thresholds::createForSI break; } - // Set distribution-specific thresholds based on empirical characteristics - using namespace detail; - - // Calculate SIMD and parallel thresholds using empirical data - for (size_t i = 0; i < 6; ++i) { - const auto& chars = DISTRIBUTION_CHARACTERISTICS[i]; - - // Scale base thresholds by complexity - more complex operations need lower thresholds - // to benefit from parallelization due to higher computation-to-overhead ratios - double complexity_scaling = - detail::ONE / std::max(detail::ONE, chars.base_complexity / detail::TWO); - - // Use empirical minimum thresholds, scaled by system characteristics - size_t empirical_parallel_threshold = static_cast( - static_cast(chars.min_parallel_threshold) * complexity_scaling); - - // Assign to distribution-specific thresholds - switch (i) { 
- case 0: // UNIFORM - thresholds.uniform_parallel_min = - std::max(empirical_parallel_threshold, thresholds.parallel_min); - break; - case 1: // GAUSSIAN - thresholds.gaussian_parallel_min = - std::max(empirical_parallel_threshold, thresholds.parallel_min / 2); - break; - case 2: // EXPONENTIAL - thresholds.exponential_parallel_min = - std::max(empirical_parallel_threshold, thresholds.parallel_min / 2); - break; - case 3: // DISCRETE - thresholds.discrete_parallel_min = - std::max(empirical_parallel_threshold, thresholds.parallel_min); - break; - case 4: // POISSON - thresholds.poisson_parallel_min = - std::max(empirical_parallel_threshold, thresholds.parallel_min / 4); - break; - case 5: // GAMMA - thresholds.gamma_parallel_min = - std::max(empirical_parallel_threshold, thresholds.parallel_min / 4); - break; - } - } - - // Refine with measured system capabilities - thresholds.refineWithCapabilities(system); + // Distribution-specific thresholds are now handled by the constexpr lookup + // table in dispatch_thresholds.h. The Thresholds struct members below are + // populated with reasonable defaults for backward compatibility only. 
+ thresholds.uniform_parallel_min = thresholds.parallel_min * 2; + thresholds.gaussian_parallel_min = thresholds.parallel_min; + thresholds.exponential_parallel_min = thresholds.parallel_min; + thresholds.discrete_parallel_min = thresholds.parallel_min * 2; + thresholds.poisson_parallel_min = thresholds.parallel_min; + thresholds.gamma_parallel_min = thresholds.parallel_min; + thresholds.student_t_parallel_min = thresholds.parallel_min; + thresholds.beta_parallel_min = SIZE_MAX; // Beta: never parallel + thresholds.chi_squared_parallel_min = thresholds.parallel_min; return thresholds; } @@ -384,6 +348,19 @@ void PerformanceDispatcher::Thresholds::refineWithCapabilities(const SystemCapab parallel_min = std::max(parallel_min, static_cast(detail::MAX_NEWTON_ITERATIONS)); work_stealing_min = std::max(work_stealing_min, static_cast(detail::MAX_BISECTION_ITERATIONS)); + + // Ensure distribution-specific thresholds don't drop below parallel_min. + // Simple distributions (Uniform, Discrete) must stay at or above the base; + // complex ones are allowed lower thresholds but still have a floor. 
+ uniform_parallel_min = std::max(uniform_parallel_min, parallel_min * 2); + discrete_parallel_min = std::max(discrete_parallel_min, parallel_min * 2); + gaussian_parallel_min = std::max(gaussian_parallel_min, parallel_min / 2); + exponential_parallel_min = std::max(exponential_parallel_min, parallel_min / 2); + student_t_parallel_min = std::max(student_t_parallel_min, parallel_min / 2); + beta_parallel_min = std::max(beta_parallel_min, parallel_min / 2); + poisson_parallel_min = std::max(poisson_parallel_min, parallel_min / 4); + gamma_parallel_min = std::max(gamma_parallel_min, parallel_min / 4); + chi_squared_parallel_min = std::max(chi_squared_parallel_min, parallel_min / 4); } } // namespace detail diff --git a/src/poisson.cpp b/src/poisson.cpp index aefa787..490b284 100644 --- a/src/poisson.cpp +++ b/src/poisson.cpp @@ -4,6 +4,7 @@ #include "libstats/core/statistical_constants.h" // Core functionality - lightweight headers +#include "libstats/core/dispatch_thresholds.h" #include "libstats/core/dispatch_utils.h" #include "libstats/core/log_space_ops.h" #include "libstats/core/math_utils.h" @@ -490,7 +491,7 @@ void PoissonDistribution::parallelBatchFit(const std::vector const std::size_t num_datasets = datasets.size(); // Use distribution-specific parallel thresholds for optimal work distribution - if (arch::shouldUseDistributionParallel("poisson", "batch_fit", num_datasets)) { + if (num_datasets >= detail::dispatch_table::BATCH_FIT_MIN) { // Direct parallel execution without internal thresholds - bypass ParallelUtils limitation ThreadPool& pool = ParallelUtils::getGlobalThreadPool(); const std::size_t optimal_grain_size = std::max(std::size_t{1}, num_datasets / 8); @@ -1636,7 +1637,7 @@ void PoissonDistribution::getProbability(std::span values, std::sp const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + 
detail::OperationType::PDF, [](const PoissonDistribution& dist, double value) { return dist.getProbability(value); }, [](const PoissonDistribution& dist, const double* vals, double* res, size_t count) { // Ensure cache is valid @@ -1864,7 +1865,7 @@ void PoissonDistribution::getLogProbability(std::span values, const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::LOG_PDF, [](const PoissonDistribution& dist, double value) { return dist.getLogProbability(value); }, [](const PoissonDistribution& dist, const double* vals, double* res, size_t count) { // Ensure cache is valid @@ -2052,7 +2053,7 @@ void PoissonDistribution::getCumulativeProbability(std::span value const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::CDF, [](const PoissonDistribution& dist, double value) { return dist.getCumulativeProbability(value); }, diff --git a/src/student_t.cpp b/src/student_t.cpp index f550de7..28e3ce1 100644 --- a/src/student_t.cpp +++ b/src/student_t.cpp @@ -295,9 +295,16 @@ void StudentTDistribution::fit(const std::vector& values) { const double n = static_cast(values.size()); + // Upper bound: beyond NU_MAX the t-distribution is indistinguishable from + // Gaussian, and the score function flattens (psi((nu+1)/2) - psi(nu/2) ~ 1/(2*nu)), + // making Newton-Raphson steps unstable. + constexpr double NU_MAX = 1000.0; + // Initial estimate: method of moments using sample kurtosis. // Excess kurtosis = 6/(nu-4) for nu>4, so nu = 4 + 6/kurtosis. // For nu <= 4, or when sample kurtosis is unavailable, start at nu=5. 
+ // Clamp the initial estimate to keep the optimizer in a region with + // meaningful gradient — starting above ~100 risks flat-tail divergence. double nu_est = 5.0; if (values.size() >= 4) { double mean = std::accumulate(values.begin(), values.end(), 0.0) / n; @@ -315,7 +322,7 @@ void StudentTDistribution::fit(const std::vector& values) { if (excess_kurt > detail::ZERO_DOUBLE) { double nu_from_kurt = 4.0 + 6.0 / excess_kurt; if (nu_from_kurt > detail::ONE && std::isfinite(nu_from_kurt)) { - nu_est = nu_from_kurt; + nu_est = std::min(nu_from_kurt, 100.0); } } } @@ -369,10 +376,10 @@ void StudentTDistribution::fit(const std::vector& values) { } double step = s / ds; - // Clamp step to avoid moving outside the positive domain + // Clamp step to avoid moving outside the valid domain step = std::max(step, -(nu - 0.1)); nu -= step; - nu = std::max(nu, 0.1); + nu = std::clamp(nu, 0.1, NU_MAX); if (std::abs(step) < tol) { break; @@ -419,7 +426,7 @@ void StudentTDistribution::getProbability(std::span values, std::s const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::PDF, [](const StudentTDistribution& dist, double value) { return dist.getProbability(value); }, [](const StudentTDistribution& dist, const double* vals, double* res, size_t count) { std::shared_lock lock(dist.cache_mutex_); @@ -510,7 +517,7 @@ void StudentTDistribution::getLogProbability(std::span values, const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::LOG_PDF, [](const StudentTDistribution& dist, double value) { return dist.getLogProbability(value); }, @@ -594,7 +601,7 @@ void StudentTDistribution::getCumulativeProbability(std::span valu const detail::PerformanceHint&
hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::CDF, [](const StudentTDistribution& dist, double value) { return dist.getCumulativeProbability(value); }, diff --git a/src/uniform.cpp b/src/uniform.cpp index b0db4f1..fca8b77 100644 --- a/src/uniform.cpp +++ b/src/uniform.cpp @@ -4,6 +4,7 @@ #include "libstats/core/statistical_constants.h" // Core functionality - lightweight headers +#include "libstats/core/dispatch_thresholds.h" #include "libstats/core/dispatch_utils.h" #include "libstats/core/log_space_ops.h" #include "libstats/core/math_utils.h" @@ -533,7 +534,7 @@ void UniformDistribution::parallelBatchFit(const std::vector const std::size_t num_datasets = datasets.size(); // Use distribution-specific parallel thresholds for optimal work distribution - if (arch::shouldUseDistributionParallel("uniform", "batch_fit", num_datasets)) { + if (num_datasets >= detail::dispatch_table::BATCH_FIT_MIN) { // Direct parallel execution without internal thresholds - bypass ParallelUtils limitation ThreadPool& pool = ParallelUtils::getGlobalThreadPool(); const std::size_t optimal_grain_size = std::max(std::size_t{1}, num_datasets / 8); @@ -1334,7 +1335,7 @@ void UniformDistribution::getProbability(std::span values, std::sp const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::PDF, [](const UniformDistribution& dist, double value) { return dist.getProbability(value); }, [](const UniformDistribution& dist, const double* vals, double* res, size_t count) { // Use the unsafe implementation directly since batch methods were removed @@ -1479,7 +1480,7 @@ void UniformDistribution::getLogProbability(std::span values, const detail::PerformanceHint& hint) const { 
detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::LOG_PDF, [](const UniformDistribution& dist, double value) { return dist.getLogProbability(value); }, [](const UniformDistribution& dist, const double* vals, double* res, size_t count) { // Use the unsafe implementation directly since batch methods were removed @@ -1646,7 +1647,7 @@ void UniformDistribution::getCumulativeProbability(std::span value const detail::PerformanceHint& hint) const { detail::DispatchUtils::autoDispatch( *this, values, results, hint, detail::DistributionTraits::distType(), - detail::DistributionTraits::complexity(), + detail::OperationType::CDF, [](const UniformDistribution& dist, double value) { return dist.getCumulativeProbability(value); }, diff --git a/tests/include/validators.h b/tests/include/validators.h index d5480bf..e2dab0c 100644 --- a/tests/include/validators.h +++ b/tests/include/validators.h @@ -57,8 +57,11 @@ inline double getAdaptiveSIMDExpectation() noexcept { return base_expectation; #endif } else if (stats::arch::cpu::is_amd_cpu()) { -// AMD Zen architecture has good but slightly different SIMD characteristics -#if defined(__AVX2__) +// AMD Zen architecture — Zen4+ decodes AVX-512 but double-pumps through +// 256-bit execution units, yielding ~1.1-1.3x over native AVX2. +#if defined(__AVX512F__) + return 2.0; // Zen4+ AVX-512 (double-pumped 256-bit) +#elif defined(__AVX2__) return 1.8; // Zen2+ with good AVX2 performance #elif defined(__AVX__) return 1.5; // Zen/Zen+ with moderate AVX performance @@ -118,20 +121,31 @@ inline double getSIMDValidationThreshold(std::size_t batch_size, bool is_complex_distribution = false) noexcept { double base = getAdaptiveSIMDExpectation(); - // SIMD efficiency increases with batch size due to setup cost amortization + // SIMD efficiency increases with batch size due to setup cost amortization.
+ // On AVX-512 the amortization curve flattens earlier because 8-wide + // processing already amortises setup at moderate sizes. if (batch_size >= 50000) { - base *= 1.2; // Large batches get better SIMD utilization +#if defined(__AVX512F__) + base *= 1.05; // AVX-512 amortisation already near-optimal at smaller sizes +#else + base *= 1.2; // Narrower SIMD still benefits from large-batch amortisation +#endif } else if (batch_size >= 10000) { - base *= 1.1; // Medium batches get moderate boost + base *= 1.1; } else if (batch_size < 1000) { - base *= 0.8; // Small batches may have SIMD overhead + base *= 0.8; } - // Complex distributions benefit more from SIMD due to computational intensity + // Complex distributions contain scalar bottlenecks (lgamma, erfc) that + // limit SIMD benefit. On wide SIMD (AVX-512) the effect is more pronounced + // because the scalar portion occupies a larger fraction of the wider pipeline. if (is_complex_distribution) { - base *= 1.15; +#if defined(__AVX512F__) + base *= 0.7; // Scalar bottlenecks (lgamma, factorial) dominate wide pipeline +#else + base *= 1.15; // Moderate SIMD still hides some scalar cost +#endif } else { - // Simple distributions (Uniform, Discrete) may have overhead that limits speedup base *= 0.9; } @@ -148,20 +162,38 @@ inline double getParallelValidationThreshold(std::size_t batch_size, bool is_complex_distribution = false) noexcept { double base = getAdaptiveParallelExpectation(); +#if defined(__AVX512F__) + // AVX-512: profiling shows vectorized-to-parallel crossovers at 50K-100K, + // vs 8-64 on narrower architectures. Forced PARALLEL below the crossover + // incurs threading overhead against an already-fast vectorized baseline. + if (batch_size >= 100000) { + base *= 0.35; + } else { + // Below crossover: parallel may be slower than sequential. + // Accept any non-catastrophic result (catches deadlocks / silent + // fallback-to-single-thread, but not expected-slower-than-vectorized). 
+ base = 0.1; + } +#else // Parallel efficiency is highly dependent on batch size due to thread overhead if (batch_size >= 100000) { // Large batches achieve close to full parallel potential base *= 1.0; } else if (batch_size >= 10000) { - // Medium batches have some thread overhead - base *= 0.8; + // Medium-large batches: thread overhead is small but measurable, + // especially on heterogeneous core architectures (e.g., P+E cores) + // where parallel efficiency is lower than the core count suggests. + base *= 0.7; } else if (batch_size >= 1000) { - // Small batches have significant overhead - be very conservative - base = std::max(0.9, base * 0.3); + // Small-medium batches: threading overhead is significant relative to + // computation. On architectures with efficient vectorization (NEON, + // wide AVX), forced PARALLEL may be slower than VECTORIZED here. + base = std::max(0.15, base * 0.06); } else { - // Very small batches may be inefficient - just expect some speedup - base = std::max(0.8, base * 0.2); + // Very small batches: threading overhead dominates computation + base = std::max(0.1, base * 0.04); } +#endif // Complex distributions benefit more from parallelization if (is_complex_distribution) { diff --git a/tests/test_cpu_detection.cpp b/tests/test_cpu_detection.cpp index fce0d2b..6d1c452 100644 --- a/tests/test_cpu_detection.cpp +++ b/tests/test_cpu_detection.cpp @@ -13,7 +13,6 @@ */ #include "libstats/platform/cpu_detection.h" -#include "libstats/platform/parallel_thresholds.h" #include "libstats/platform/simd.h" #include diff --git a/tests/test_discrete_enhanced.cpp b/tests/test_discrete_enhanced.cpp index 32d355e..d00dc35 100644 --- a/tests/test_discrete_enhanced.cpp +++ b/tests/test_discrete_enhanced.cpp @@ -339,24 +339,20 @@ TEST_F(DiscreteEnhancedTest, SIMDAndParallelBatchImplementations) { << batch_size; } - // Performance expectations (adjusted for batch size and computational complexity) - EXPECT_GT(simd_speedup, 1.0) << "SIMD should provide 
speedup for batch size " << batch_size; + // Architecture-aware performance expectations using adaptive validation + // Discrete is a simple distribution (trivial per-element cost) + double simd_threshold = + stats::tests::validators::getSIMDValidationThreshold(batch_size, false); + EXPECT_GT(simd_speedup, simd_threshold) + << "SIMD speedup " << simd_speedup << "x should exceed adaptive threshold " + << simd_threshold << "x for batch size " << batch_size; if (std::thread::hardware_concurrency() > 1) { - if (batch_size >= 10000) { - // For discrete distributions, computations are very simple (range checks), - // so SIMD can achieve massive speedups but parallel has thread overhead. - // In release builds, SIMD optimizations are more pronounced, so reduce - // expectations. Expect parallel to be at least 35% as efficient as SIMD for large - // batches. - EXPECT_GT(parallel_speedup, simd_speedup * 0.35) - << "Parallel should be reasonably competitive with SIMD for large batches"; - } else { - // For smaller batches, parallel may have overhead but should still be reasonable - EXPECT_GT(parallel_speedup, 0.5) - << "Parallel should provide reasonable performance for batch size " - << batch_size; - } + double parallel_threshold = + stats::tests::validators::getParallelValidationThreshold(batch_size, false); + EXPECT_GT(parallel_speedup, parallel_threshold) + << "Parallel speedup " << parallel_speedup << "x should exceed adaptive threshold " + << parallel_threshold << "x for batch size " << batch_size; } } } diff --git a/tests/test_gamma_enhanced.cpp b/tests/test_gamma_enhanced.cpp index 062e85b..8fde5ba 100644 --- a/tests/test_gamma_enhanced.cpp +++ b/tests/test_gamma_enhanced.cpp @@ -547,8 +547,11 @@ TEST_F(GammaEnhancedTest, AutoDispatchAssessment) { EXPECT_TRUE(results_match) << "Auto-dispatch results should match traditional for batch size " << batch_size; - // Auto-dispatch should be competitive or better - if (traditional_time == 0) { + // Auto-dispatch should be 
competitive or better. + // For very small traditional_time (≤ 2μs), the ratio is unreliable + // because dispatch overhead dominates sub-microsecond computation. + // Use an absolute time bound in that case, matching the == 0 path. + if (traditional_time <= 2) { EXPECT_LT(auto_time, 100) << "Auto-dispatch should complete quickly for small batches (batch size " << batch_size << ")"; diff --git a/tests/test_parallel_execution_comprehensive.cpp b/tests/test_parallel_execution_comprehensive.cpp index ae37684..930bed1 100644 --- a/tests/test_parallel_execution_comprehensive.cpp +++ b/tests/test_parallel_execution_comprehensive.cpp @@ -135,7 +135,7 @@ int main() { std::cout << "Test 6: Platform-aware adaptive features" << std::endl; // Test optimal parallel threshold - auto optimal_threshold = stats::arch::get_optimal_parallel_threshold("gaussian", "pdf"); + auto optimal_threshold = stats::arch::get_min_elements_for_distribution_parallel(); std::cout << " - Optimal parallel threshold: " << optimal_threshold << " elements" << std::endl; assert(optimal_threshold > 0 && optimal_threshold < 100000); // Reasonable range diff --git a/tests/test_parallel_execution_integration.cpp b/tests/test_parallel_execution_integration.cpp index 8caf4a8..d3e3974 100644 --- a/tests/test_parallel_execution_integration.cpp +++ b/tests/test_parallel_execution_integration.cpp @@ -19,7 +19,7 @@ int main() { // Test 2: CPU-aware threshold detection std::cout << "Test 2: CPU-aware threshold detection - "; - std::size_t optimal_threshold = stats::arch::get_optimal_parallel_threshold("gaussian", "pdf"); + std::size_t optimal_threshold = stats::arch::get_min_elements_for_distribution_parallel(); std::size_t optimal_grain = stats::arch::get_optimal_grain_size(); std::cout << "Threshold: " << optimal_threshold << ", Grain: " << optimal_grain << std::endl; diff --git a/tests/test_performance_dispatcher.cpp b/tests/test_performance_dispatcher.cpp index a4df1d0..ea927b7 100644 ---
a/tests/test_performance_dispatcher.cpp +++ b/tests/test_performance_dispatcher.cpp @@ -4,6 +4,7 @@ #endif // Use focused header for performance dispatcher testing +#include "libstats/core/dispatch_thresholds.h" #include "libstats/core/performance_dispatcher.h" #include "libstats/core/performance_history.h" @@ -54,16 +55,17 @@ TEST_F(PerformanceDispatcherTest, BasicStrategySelection) { PerformanceDispatcher dispatcher; const SystemCapabilities& system = SystemCapabilities::current(); - // Very small batches should prefer scalar - auto strategy_small = dispatcher.selectOptimalStrategy(5, DistributionType::GAUSSIAN, - ComputationComplexity::SIMPLE, system); + // Very small batches should prefer scalar. + // Use batch_size=3 which is below the minimum SIMD threshold on all + // architectures (NEON and SSE2 have the lowest at 4). + auto strategy_small = + dispatcher.selectStrategy(3, DistributionType::GAUSSIAN, OperationType::PDF, system); EXPECT_EQ(strategy_small, Strategy::SCALAR); // Very large batches should prefer parallel strategies - auto strategy_large = dispatcher.selectOptimalStrategy(100000, DistributionType::GAUSSIAN, - ComputationComplexity::COMPLEX, system); - EXPECT_TRUE(strategy_large == Strategy::PARALLEL || strategy_large == Strategy::WORK_STEALING || - strategy_large == Strategy::WORK_STEALING); + auto strategy_large = + dispatcher.selectStrategy(100000, DistributionType::GAUSSIAN, OperationType::CDF, system); + EXPECT_TRUE(strategy_large == Strategy::PARALLEL || strategy_large == Strategy::WORK_STEALING); } TEST_F(PerformanceDispatcherTest, DistributionSpecificThresholds) { @@ -72,12 +74,12 @@ TEST_F(PerformanceDispatcherTest, DistributionSpecificThresholds) { // Test that different distributions have different thresholds // Simple distributions (like uniform) should need larger batches for parallelization - auto uniform_medium = dispatcher.selectOptimalStrategy(1000, DistributionType::UNIFORM, - ComputationComplexity::SIMPLE, system); + auto 
uniform_medium = + dispatcher.selectStrategy(1000, DistributionType::UNIFORM, OperationType::PDF, system); // Complex distributions (like gamma) should parallelize earlier - [[maybe_unused]] auto gamma_medium = dispatcher.selectOptimalStrategy( - 1000, DistributionType::GAMMA, ComputationComplexity::COMPLEX, system); + [[maybe_unused]] auto gamma_medium = + dispatcher.selectStrategy(1000, DistributionType::GAMMA, OperationType::CDF, system); // If we have multiple cores, gamma should be more likely to use parallel strategies if (system.physical_cores() > 1) { @@ -95,11 +97,10 @@ TEST_F(PerformanceDispatcherTest, ComplexityInfluencesStrategy) { constexpr size_t batch_size = 1000; constexpr DistributionType dist = DistributionType::GAUSSIAN; - auto simple_strategy = - dispatcher.selectOptimalStrategy(batch_size, dist, ComputationComplexity::SIMPLE, system); + auto simple_strategy = dispatcher.selectStrategy(batch_size, dist, OperationType::PDF, system); [[maybe_unused]] auto complex_strategy = - dispatcher.selectOptimalStrategy(batch_size, dist, ComputationComplexity::COMPLEX, system); + dispatcher.selectStrategy(batch_size, dist, OperationType::PDF, system); // Complex operations should be more likely to choose parallel execution // (This is a general trend, though specific results depend on system capabilities) @@ -178,20 +179,19 @@ TEST_F(PerformanceDispatcherTest, EdgeCases) { // Test edge cases // Zero batch size (should handle gracefully) - auto zero_strategy = dispatcher.selectOptimalStrategy(0, DistributionType::GAUSSIAN, - ComputationComplexity::SIMPLE, system); + auto zero_strategy = + dispatcher.selectStrategy(0, DistributionType::GAUSSIAN, OperationType::PDF, system); EXPECT_EQ(zero_strategy, Strategy::SCALAR); // Single element - auto single_strategy = dispatcher.selectOptimalStrategy(1, DistributionType::GAMMA, - ComputationComplexity::COMPLEX, system); + auto single_strategy = + dispatcher.selectStrategy(1, DistributionType::GAMMA, OperationType::CDF, 
system); EXPECT_EQ(single_strategy, Strategy::SCALAR); // Extremely large batch size - auto huge_strategy = dispatcher.selectOptimalStrategy(SIZE_MAX / 2, DistributionType::UNIFORM, - ComputationComplexity::SIMPLE, system); - EXPECT_TRUE(huge_strategy == Strategy::PARALLEL || huge_strategy == Strategy::WORK_STEALING || - huge_strategy == Strategy::WORK_STEALING); + auto huge_strategy = dispatcher.selectStrategy(SIZE_MAX / 2, DistributionType::UNIFORM, + OperationType::PDF, system); + EXPECT_TRUE(huge_strategy == Strategy::PARALLEL || huge_strategy == Strategy::WORK_STEALING); } TEST_F(PerformanceDispatcherTest, ThreadSafety) { @@ -211,10 +211,9 @@ TEST_F(PerformanceDispatcherTest, ThreadSafety) { for (std::size_t i = 0; i < selections_per_thread; ++i) { size_t batch_size = 100 + static_cast(i % 10000); DistributionType dist_type = static_cast(i % 6); - ComputationComplexity complexity = static_cast(i % 3); + OperationType op_type = static_cast(i % 3); - auto strategy = - dispatcher.selectOptimalStrategy(batch_size, dist_type, complexity, system); + auto strategy = dispatcher.selectStrategy(batch_size, dist_type, op_type, system); results[t].push_back(strategy); // Also record some performance data diff --git a/tests/test_platform_optimizations.cpp b/tests/test_platform_optimizations.cpp index e2d2ae2..9bdbd80 100644 --- a/tests/test_platform_optimizations.cpp +++ b/tests/test_platform_optimizations.cpp @@ -13,7 +13,6 @@ */ #include "libstats/platform/cpu_detection.h" -#include "libstats/platform/parallel_thresholds.h" #include "libstats/platform/simd.h" // Standard library includes diff --git a/tests/test_student_t_enhanced.cpp b/tests/test_student_t_enhanced.cpp index 9dbdee2..02859cb 100644 --- a/tests/test_student_t_enhanced.cpp +++ b/tests/test_student_t_enhanced.cpp @@ -124,11 +124,15 @@ TEST_F(StudentTEnhancedTest, SetterPropagates) { EXPECT_TRUE(t.isCauchy()); } -// MLE on t(5) samples should recover nu in a reasonable range +// MLE on t(5) samples should 
recover nu in a reasonable range. +// Use 2000 samples so the sample excess kurtosis is stable enough for the +// Newton-Raphson optimizer to converge, even when the stdlib's +// std::normal_distribution / std::gamma_distribution produce a different +// sequence from the same mt19937 seed (algorithm is implementation-defined). TEST_F(StudentTEnhancedTest, MLEFit) { mt19937 rng(123); auto source = StudentTDistribution::create(5.0).value; - const auto data = source.sample(rng, 500); + const auto data = source.sample(rng, 2000); auto fitted = StudentTDistribution::create(1.0).value; fitted.fit(data); diff --git a/tests/test_system_capabilities.cpp b/tests/test_system_capabilities.cpp index 56b6edf..4aa4c42 100644 --- a/tests/test_system_capabilities.cpp +++ b/tests/test_system_capabilities.cpp @@ -4,6 +4,7 @@ #endif // Use focused header for system capabilities testing +#include "libstats/core/dispatch_thresholds.h" #include "libstats/core/performance_dispatcher.h" // Standard library includes @@ -96,7 +97,12 @@ TEST_F(SystemCapabilitiesIntegrationTest, ThreadSafety) { constexpr std::size_t accesses_per_thread = 1000; std::vector threads; - std::vector success(static_cast(num_threads), false); + // std::vector<bool> packs bits — concurrent writes to different indices + // race on the same byte. Use int to guarantee distinct memory locations.
+ std::vector success(static_cast(num_threads), 0); + + // Per-thread failure reason: 0 = success, 1..6 = which check failed + std::vector fail_reason(num_threads, 0); for (std::size_t t = 0; t < num_threads; ++t) { threads.emplace_back([&, t]() { @@ -106,20 +112,19 @@ TEST_F(SystemCapabilitiesIntegrationTest, ThreadSafety) { const SystemCapabilities& caps = SystemCapabilities::current(); // Verify consistency - if (caps.logical_cores() == 0) - thread_success = false; - if (caps.physical_cores() == 0) - thread_success = false; - if (caps.physical_cores() > caps.logical_cores()) - thread_success = false; - if (caps.l1_cache_size() == 0) - thread_success = false; - - // Verify SIMD consistency - if (caps.has_avx2() && !caps.has_avx()) - thread_success = false; - if (caps.has_avx() && !caps.has_sse2()) - thread_success = false; + if (caps.logical_cores() == 0) { + thread_success = false; fail_reason[t] = 1; + } else if (caps.physical_cores() == 0) { + thread_success = false; fail_reason[t] = 2; + } else if (caps.physical_cores() > caps.logical_cores()) { + thread_success = false; fail_reason[t] = 3; + } else if (caps.l1_cache_size() == 0) { + thread_success = false; fail_reason[t] = 4; + } else if (caps.has_avx2() && !caps.has_avx()) { + thread_success = false; fail_reason[t] = 5; + } else if (caps.has_avx() && !caps.has_sse2()) { + thread_success = false; fail_reason[t] = 6; + } // Small delay to increase chance of race conditions if (i % 100 == 0) { @@ -127,7 +132,7 @@ TEST_F(SystemCapabilitiesIntegrationTest, ThreadSafety) { } } - success[t] = thread_success; + success[t] = thread_success ? 
1 : 0; }); } @@ -138,7 +143,9 @@ TEST_F(SystemCapabilitiesIntegrationTest, ThreadSafety) { // All threads should have succeeded for (std::size_t t = 0; t < num_threads; ++t) { - EXPECT_TRUE(success[t]) << "Thread " << t << " failed consistency checks"; + EXPECT_TRUE(success[t]) << "Thread " << t << " failed check #" << fail_reason[t] + << " (1=logical_cores, 2=physical_cores, 3=phys>logical, " "4=l1_cache, 5=avx2_no_avx, 6=avx_no_sse2)"; } } @@ -155,7 +162,7 @@ TEST_F(SystemCapabilitiesIntegrationTest, PerformanceCharacteristicsRealistic) { // Threading overhead should be measurable but not excessive if (capabilities.physical_cores() > 1) { EXPECT_GE(capabilities.threading_overhead_ns(), 10.0); // At least 10ns - EXPECT_LE(capabilities.threading_overhead_ns(), 100000.0); // At most 100μs + EXPECT_LE(capabilities.threading_overhead_ns(), 500000.0); // At most 500μs (Windows SRWLOCK + scheduler jitter) } // Memory bandwidth should be realistic for the era @@ -170,24 +177,23 @@ TEST_F(SystemCapabilitiesIntegrationTest, IntegrationWithDispatcher) { PerformanceDispatcher dispatcher; // The dispatcher should be able to use the capabilities - auto strategy = dispatcher.selectOptimalStrategy(1000, DistributionType::GAUSSIAN, - ComputationComplexity::MODERATE, capabilities); + auto strategy = dispatcher.selectStrategy(1000, DistributionType::GAUSSIAN, OperationType::PDF, + capabilities); // Should return a valid strategy EXPECT_TRUE(strategy >= Strategy::SCALAR && strategy <= Strategy::WORK_STEALING); // Test with different parameters - auto small_strategy = dispatcher.selectOptimalStrategy( - 10, DistributionType::UNIFORM, ComputationComplexity::SIMPLE, capabilities); + auto small_strategy = + dispatcher.selectStrategy(10, DistributionType::UNIFORM, OperationType::PDF, capabilities); // Accept either SCALAR or VECTORIZED for small batches (depends on SIMD policy) EXPECT_TRUE(small_strategy == Strategy::SCALAR || small_strategy == Strategy::VECTORIZED); // Large batch
should consider parallel strategies (if we have multiple cores) if (capabilities.physical_cores() > 1) { - auto large_strategy = dispatcher.selectOptimalStrategy( - 100000, DistributionType::GAMMA, ComputationComplexity::COMPLEX, capabilities); + auto large_strategy = dispatcher.selectStrategy(100000, DistributionType::GAMMA, + OperationType::CDF, capabilities); EXPECT_TRUE(large_strategy == Strategy::PARALLEL || - large_strategy == Strategy::WORK_STEALING || large_strategy == Strategy::WORK_STEALING); } } diff --git a/tools/README.md b/tools/README.md index 29ac515..016bea5 100644 --- a/tools/README.md +++ b/tools/README.md @@ -10,14 +10,10 @@ Quick reference for the actively useful tools in `tools/`. ### SIMD and performance validation - `simd_verification` — validate SIMD correctness and measure speedups across distributions -- `parallel_threshold_benchmark` — inspect architecture-aware threshold behavior +- `strategy_profile` — canonical forced-strategy profiler for dispatcher threshold tuning across distributions, operations, and batch sizes - `parallel_batch_fitting_benchmark` — benchmark batch fitting behavior across distributions - `parallel_correctness_verification` — validate batch correctness under parallel execution -### Dispatch and learning analysis -- `performance_dispatcher_tool` — inspect dispatch choices and strategy behavior -- `learning_analyzer` — analyze adaptive learning and threshold behavior -- `empirical_characteristics_demo` — inspect empirical complexity assumptions used by dispatch logic ### Header-analysis tools These remain useful for include and compilation-health work: @@ -45,5 +41,6 @@ Examples: ## Guidance - Prefer the compiled C++ tools for release validation and performance checks. +- For dispatcher threshold tuning, prefer `strategy_profile` as the canonical raw data source. - Prefer the Python analysis tools for repo-maintenance work.
- Do not treat every file in `tools/` as part of the primary supported workflow; some are archival. diff --git a/tools/empirical_characteristics_demo.cpp b/tools/empirical_characteristics_demo.cpp deleted file mode 100644 index d905c21..0000000 --- a/tools/empirical_characteristics_demo.cpp +++ /dev/null @@ -1,231 +0,0 @@ -/** - * @file empirical_characteristics_demo.cpp - * @brief Demonstration of empirical distribution characteristics integration - * - * This tool showcases how the performance dispatcher now uses empirically-derived - * distribution characteristics instead of hardcoded assumptions. - */ - -// Use consolidated tool utilities header which includes libstats.h -#include "tool_utils.h" - -// Additional includes for empirical characteristics -#include "libstats/core/distribution_characteristics.h" - -// Standard library includes -#include // for std::setw, std::setprecision, std::fixed, std::left -#include // for std::cout -#include // for std::ostringstream -#include // for std::string -#include // for std::pair -#include // for std::vector - -using namespace stats; -using namespace stats::detail; -using namespace stats::detail::detail; - -namespace { - -void displayCharacteristics() { - sectionHeader("Empirical Distribution Characteristics"); - - std::vector> distributions = { - {"Uniform", DistributionType::UNIFORM}, {"Gaussian", DistributionType::GAUSSIAN}, - {"Exponential", DistributionType::EXPONENTIAL}, {"Discrete", DistributionType::DISCRETE}, - {"Poisson", DistributionType::POISSON}, {"Gamma", DistributionType::GAMMA}}; - - // Table headers - std::cout << std::left << std::setw(13) << "Distribution" << std::setw(12) << "Complexity" - << std::setw(12) << "SIMD Eff" << std::setw(12) << "Parallel" << std::setw(12) - << "SIMD Thresh" << std::setw(12) << "Par Thresh" << std::setw(12) << "Memory" - << std::setw(12) << "Branching" - << "\n"; - - std::cout << std::string(96, '-') << "\n"; - - for (const auto& [name, dist_type] : distributions) { - 
const auto& chars = getCharacteristics(dist_type); - - std::cout << std::left << std::setw(13) << name << std::setw(12) << std::fixed - << std::setprecision(1) << chars.base_complexity << std::setw(12) << std::fixed - << std::setprecision(2) << chars.vectorization_efficiency << std::setw(12) - << std::fixed << std::setprecision(2) << chars.parallelization_efficiency - << std::setw(12) << chars.min_simd_threshold << std::setw(12) - << chars.min_parallel_threshold << std::setw(12) << std::fixed - << std::setprecision(2) << chars.memory_access_pattern << std::setw(12) - << std::fixed << std::setprecision(2) << chars.branch_prediction_cost << "\n"; - } - - std::cout << "\n"; - std::cout << "Key:\n"; - std::cout << " Complexity: Computational cost relative to uniform (1.0 = baseline)\n"; - std::cout << " SIMD Eff: Vectorization efficiency (0.0-1.0, higher is better)\n"; - std::cout << " Parallel: Parallelization efficiency (0.0-1.0, higher is better)\n"; - std::cout << " SIMD Thresh: Minimum elements where SIMD becomes beneficial\n"; - std::cout << " Par Thresh: Minimum elements where parallelization helps\n"; - std::cout << " Memory: Memory access efficiency (1.0 = perfect locality)\n"; - std::cout << " Branching: Branch prediction cost factor (1.0 = no branching)\n"; -} - -void displayScalingFactors() { - sectionHeader("Expected Performance Scaling"); - - std::vector> distributions = { - {"Uniform", DistributionType::UNIFORM}, {"Gaussian", DistributionType::GAUSSIAN}, - {"Exponential", DistributionType::EXPONENTIAL}, {"Discrete", DistributionType::DISCRETE}, - {"Poisson", DistributionType::POISSON}, {"Gamma", DistributionType::GAMMA}}; - - std::vector thread_counts = {2, 4, 8, 16}; - - std::cout << std::left << std::setw(13) << "Distribution" << std::setw(12) << "SIMD (4x)" - << std::setw(11) << "2 threads" << std::setw(11) << "4 threads" << std::setw(11) - << "8 threads" << std::setw(12) << "16 threads" - << "\n"; - - std::cout << std::string(76, '-') << "\n"; - - 
for (const auto& [name, dist_type] : distributions) { - const auto& chars = getCharacteristics(dist_type); - - std::cout << std::left << std::setw(13) << name; - - // SIMD speedup - double simd_speedup = calculateSIMDSpeedup(chars); - std::ostringstream simd_stream; - simd_stream << std::fixed << std::setprecision(2) << simd_speedup << "x"; - std::cout << std::setw(12) << simd_stream.str(); - - // Parallel speedups for different thread counts - for (size_t threads : thread_counts) { - double parallel_speedup = calculateParallelSpeedup(chars, threads); - std::ostringstream parallel_stream; - parallel_stream << std::fixed << std::setprecision(1) << parallel_speedup << "x"; - std::cout << std::setw(11) << parallel_stream.str(); - } - - std::cout << "\n"; - } - - std::cout << "\nNote: These are theoretical maximums based on algorithmic analysis.\n"; - std::cout << " Actual performance depends on system capabilities and data patterns.\n"; -} - -void demonstrateStrategySelection() { - sectionHeader("Strategy Selection with Empirical Data"); - - PerformanceDispatcher dispatcher; - SystemCapabilities system = SystemCapabilities::current(); - - std::vector> distributions = { - {"Uniform", DistributionType::UNIFORM}, {"Gaussian", DistributionType::GAUSSIAN}, - {"Exponential", DistributionType::EXPONENTIAL}, {"Discrete", DistributionType::DISCRETE}, - {"Poisson", DistributionType::POISSON}, {"Gamma", DistributionType::GAMMA}}; - - std::vector batch_sizes = {100, 1000, 10000, 100000}; - - // Widen columns to fit full strategy display names - std::cout << std::left << std::setw(14) << "Distribution" << std::setw(14) << "Size=100" - << std::setw(14) << "Size=1K" << std::setw(14) << "Size=10K" << std::setw(14) - << "Size=100K" - << "\n"; - - std::cout << std::string(70, '-') << "\n"; - - for (const auto& [name, dist_type] : distributions) { - std::cout << std::left << std::setw(14) << name; - - for (size_t batch_size : batch_sizes) { - Strategy strategy = 
dispatcher.selectOptimalStrategy( - batch_size, dist_type, ComputationComplexity::MODERATE, system); - - // Use display strings that match the Strategy enum names - std::string strategy_str = stats::detail::detail::strategyToDisplayString(strategy); - - std::cout << std::setw(14) << strategy_str; - } - std::cout << "\n"; - } - - std::cout << "\nStrategy Selection Rationale:\n"; - std::cout << " β€’ Simple distributions (Uniform, Discrete) benefit from Vectorized early\n"; - std::cout << " β€’ Complex distributions (Gaussian, Poisson, Gamma) parallelize at smaller\n"; - std::cout << " batch sizes due to higher per-element computation cost\n"; - std::cout << " β€’ Work-Stealing provides dynamic load balancing at very large batch sizes\n"; - std::cout << " β€’ Decisions use a simple threshold hierarchy tuned per architecture\n"; -} - -void demonstrateAdaptiveLearning() { - sectionHeader("Adaptive Learning Integration"); - - // Show how empirical characteristics can be refined - auto base_chars = getCharacteristics(DistributionType::GAUSSIAN); - - std::cout << "Base Gaussian Characteristics:\n"; - std::cout << " SIMD Efficiency: " << std::fixed << std::setprecision(2) - << base_chars.vectorization_efficiency << "\n"; - std::cout << " Parallel Efficiency: " << std::fixed << std::setprecision(2) - << base_chars.parallelization_efficiency << "\n"; - std::cout << " Base Complexity: " << std::fixed << std::setprecision(1) - << base_chars.base_complexity << "\n"; - - // Note: Adaptive refinement functionality not yet implemented - std::cout << "\nAdaptive Learning (Planned Feature):\n"; - std::cout << " The system will learn from actual performance measurements to refine:\n"; - std::cout << " β€’ SIMD efficiency multipliers based on observed speedups\n"; - std::cout << " β€’ Parallel efficiency adjustments for specific workloads\n"; - std::cout << " β€’ Complexity refinements based on measured execution times\n"; - std::cout << " β€’ Threshold adjustments for optimal 
strategy selection\n"; - std::cout << "\n Example potential improvements:\n"; - std::cout << " β€’ SIMD Efficiency: " << std::fixed << std::setprecision(2) - << base_chars.vectorization_efficiency << " β†’ " - << (base_chars.vectorization_efficiency * 1.2) << " (+20%)\n"; - std::cout << " β€’ Parallel Efficiency: " << std::fixed << std::setprecision(2) - << base_chars.parallelization_efficiency << " β†’ " - << (base_chars.parallelization_efficiency * 0.85) << " (-15%)\n"; - std::cout << " β€’ SIMD Threshold: " << base_chars.min_simd_threshold << " β†’ " - << (base_chars.min_simd_threshold - 8) << " (8 elements earlier)\n"; - - std::cout << "\nAdaptive Learning Benefits:\n"; - std::cout << " β€’ Starts with empirically-derived baselines instead of assumptions\n"; - std::cout << " β€’ Learns system-specific refinements over time\n"; - std::cout << " β€’ Blends empirical knowledge with measured performance\n"; - std::cout << " β€’ Confidence-weighted adjustments prevent over-fitting\n"; -} - -} // anonymous namespace - -int main() { - // Initialize performance systems - stats::initialize_performance_systems(); - - sectionHeader("Empirical Distribution Characteristics Demo"); - std::cout << "This demo shows how libstats now uses empirically-derived distribution\n"; - std::cout << "characteristics instead of hardcoded performance assumptions.\n"; - - displayCharacteristics(); - displayScalingFactors(); - demonstrateStrategySelection(); - demonstrateAdaptiveLearning(); - - sectionHeader("Summary"); - std::cout << "The empirical characteristics system provides:\n\n"; - std::cout << "1. Data-Driven Baselines:\n"; - std::cout << " β€’ Characteristics derived from algorithmic analysis\n"; - std::cout << " β€’ No more magic numbers or arbitrary assumptions\n"; - std::cout << " β€’ Performance models grounded in computational reality\n\n"; - - std::cout << "2. 
Distribution-Aware Strategy Selection:\n"; - std::cout << " β€’ Considers vectorization efficiency per distribution\n"; - std::cout << " β€’ Accounts for branch prediction and memory access patterns\n"; - std::cout << " β€’ Scales thresholds by computational complexity\n\n"; - - std::cout << "3. Adaptive Learning Integration:\n"; - std::cout << " β€’ Starts with empirical baselines, not zero knowledge\n"; - std::cout << " β€’ Learns system-specific refinements over time\n"; - std::cout << " β€’ Confidence-weighted blending prevents over-correction\n\n"; - - std::cout << "This foundation enables more accurate performance predictions and\n"; - std::cout << "better strategy selection across different distribution types.\n"; - - return 0; -} diff --git a/tools/learning_analyzer.cpp b/tools/learning_analyzer.cpp deleted file mode 100644 index 33680cb..0000000 --- a/tools/learning_analyzer.cpp +++ /dev/null @@ -1,1035 +0,0 @@ -/** - * @file learning_analyzer.cpp - * @brief Consolidated learning analysis tool combining real execution analysis and educational - * simulation - * - * This tool consolidates the functionality of adaptive_learning_analyzer.cpp and - * threshold_learning_demo.cpp, providing both comprehensive performance analysis with real - * execution data and educational simulation demonstrating adaptive threshold learning. 
- */ - -// Use consolidated tool utilities header which includes libstats.h -#include "tool_utils.h" - -// Additional includes for performance analysis functionality -#include "libstats/core/performance_history.h" - -// Standard library includes -#include // for std::sort, std::max -#include // for std::chrono timing functions -#include // for std::uint64_t -#include // for std::exception -#include // for std::setw, std::setprecision, std::fixed, std::left -#include // for std::cout, std::cerr -#include // for std::map -#include // for std::memory (if needed) -#include // for std::optional -#include // for std::mt19937, random distributions -#include // for std::ostringstream -#include // for std::string -#include // for threading (if needed) -#include // for std::pair -#include // for std::vector - -using namespace stats; -using namespace stats::detail; - -// Consolidated learning analysis constants -namespace { -// Time conversion constants - reserved for future use -[[maybe_unused]] constexpr long NANOSECONDS_TO_MICROSECONDS = 1000; -[[maybe_unused]] constexpr long NANOSECONDS_TO_MILLISECONDS = 1000000; -[[maybe_unused]] constexpr long NANOSECONDS_TO_SECONDS = 1000000000; - -// Test data generation -constexpr double TEST_VALUE_MIN = 0.1; -constexpr double TEST_VALUE_MAX = 10.0; - -// Performance simulation parameters (for demo mode) -constexpr double SIMULATION_NOISE_MIN = 0.9; -constexpr double SIMULATION_NOISE_MAX = 1.1; -constexpr double SCALAR_PERFORMANCE_FACTOR = 10.0; -constexpr double SIMD_PERFORMANCE_FACTOR = 3.0; -constexpr double PARALLEL_PERFORMANCE_FACTOR = 2.0; - -// Strategy overhead constants - reserved for future simulation modes -[[maybe_unused]] constexpr uint64_t SIMD_SMALL_OVERHEAD = 500; -[[maybe_unused]] constexpr uint64_t PARALLEL_SMALL_OVERHEAD = 5000; -constexpr size_t SIMD_OVERHEAD_THRESHOLD = 10000; -[[maybe_unused]] constexpr size_t PARALLEL_OVERHEAD_THRESHOLD = 1000; - -// Learning simulation parameters -constexpr int 
SAMPLES_PER_STRATEGY = 6; - -// Performance simulation speedup factors (for analysis mode) -constexpr int SIMD_SPEEDUP_FACTOR = 3; -constexpr int PARALLEL_SPEEDUP_FACTOR = 6; -constexpr int WORK_STEALING_SPEEDUP_FACTOR = 8; - -// Strategy threshold sizes -constexpr size_t MIN_VECTORIZED_BATCH_SIZE = 32; -constexpr size_t MIN_PARALLEL_BATCH_SIZE = 1000; -constexpr size_t MIN_WORK_STEALING_BATCH_SIZE = 10000; - -// Distribution parameters -namespace distribution_params { -constexpr double UNIFORM_MIN = 0.0; -constexpr double UNIFORM_MAX = 10.0; -constexpr double GAUSSIAN_MEAN = 0.0; -constexpr double GAUSSIAN_STDDEV = 1.0; -constexpr double EXPONENTIAL_LAMBDA = 1.0; -constexpr int DISCRETE_MIN = 1; -constexpr int DISCRETE_MAX = 100; -constexpr double POISSON_LAMBDA = 5.0; -constexpr double GAMMA_ALPHA = 2.0; -constexpr double GAMMA_BETA = 1.0; -} // namespace distribution_params - -// Output formatting - reserved for future formatting improvements -[[maybe_unused]] constexpr int CONFIDENCE_PRECISION = 3; -[[maybe_unused]] constexpr int TIME_PRECISION = 0; -} // namespace - -class LearningAnalyzer { - private: - std::mt19937 rng_; - - public: - LearningAnalyzer() : rng_(std::random_device{}()) {} - - void showUsage() { - std::cout << "LIBSTATS LEARNING ANALYZER\n"; - std::cout << "==========================\n\n"; - std::cout - << "This consolidated tool provides comprehensive adaptive learning analysis.\n\n"; - std::cout << "Usage: learning_analyzer [mode]\n\n"; - std::cout << "Modes:\n"; - std::cout << " demo - Educational demonstration with simulated performance data\n"; - std::cout << " analysis - Comprehensive analysis with real execution data (default)\n"; - std::cout << " both - Run both demo and analysis modes\n\n"; - std::cout << "The demo mode shows the learning process step-by-step with realistic\n"; - std::cout << "simulation, while analysis mode exercises actual distributions and\n"; - std::cout << "collects real performance data for detailed 
analysis.\n\n"; - } - - void runDemo() { - // Initialize performance systems for accurate threshold learning - stats::initialize_performance_systems(); - - std::cout << "=== THRESHOLD LEARNING DEMONSTRATION ===\n\n"; - - showInitialState(); - simulatePerformanceLearning(); - showLearnedStrategies(); - demonstrateAdaptiveSelection(); - } - - void runAnalysis() { - // Initialize performance systems for optimal measurement accuracy - stats::initialize_performance_systems(); - - std::cout << "============================================================\n"; - std::cout << "ADAPTIVE LEARNING ANALYSIS\n"; - std::cout << "============================================================\n\n"; - - std::cout << "This mode exercises the adaptive learning system by running\n"; - std::cout << "various distribution operations across different batch sizes\n"; - std::cout << "and strategies, then analyzes the collected performance data.\n\n"; - - // Use a more comprehensive set of batch sizes that covers the full range - // with better granularity around threshold boundaries - std::vector batch_sizes = { - 5, 8, 10, 16, 20, 25, 32, 40, 50, 64, 80, - 100, 128, 160, 200, 256, 320, 400, 500, 640, 800, 1000, - 1280, 1600, 2000, 2560, 3200, 4000, 5000, 6400, 8000, 10000, 12800, - 16000, 20000, 25600, 32000, 40000, 50000, 64000, 80000, 100000}; - - std::cout << "Testing " << batch_sizes.size() - << " different batch sizes across all distributions...\n\n"; - - // Exercise different distributions with real operations - exerciseAllDistributionsEnhanced(batch_sizes); - - // Analyze the collected performance data - analyzePerformanceHistoryEnhanced(); - } - - private: - void showInitialState() { - std::cout << "--- Initial State (Before Learning) ---\n"; - - // Show system capabilities - const auto& capabilities = SystemCapabilities::current(); - std::cout << "System Configuration:\n"; - std::cout << " Logical cores: " << capabilities.logical_cores() << "\n"; - std::cout << " Physical cores: " << 
capabilities.physical_cores() << "\n"; - std::cout << " SIMD efficiency: " << std::fixed << std::setprecision(3) - << capabilities.simd_efficiency() << "\n"; - std::cout << " Memory bandwidth: " << std::setprecision(1) - << capabilities.memory_bandwidth_gb_s() << " GB/s\n"; - - // Show some initial strategy selections - std::vector test_sizes = {100, 1000, 10000, 100000}; - - std::cout << "\nInitial Strategy Selections:\n"; - std::cout << std::left << std::setw(12) << "Batch Size" << std::setw(20) - << "Strategy (Uniform)" << std::setw(20) << "Strategy (Gaussian)" - << "\n"; - std::cout << std::string(52, '-') << "\n"; - - PerformanceDispatcher dispatcher; - for (auto size : test_sizes) { - auto uniform_strategy = dispatcher.selectOptimalStrategy( - size, DistributionType::UNIFORM, ComputationComplexity::SIMPLE, capabilities); - auto gaussian_strategy = dispatcher.selectOptimalStrategy( - size, DistributionType::GAUSSIAN, ComputationComplexity::MODERATE, capabilities); - - std::cout << std::setw(12) << size << std::setw(20) - << stats::detail::detail::strategyToDisplayString(uniform_strategy) - << std::setw(20) - << stats::detail::detail::strategyToDisplayString(gaussian_strategy) << "\n"; - } - std::cout << "\n"; - } - - void simulatePerformanceLearning() { - std::cout << "--- Simulating Performance Learning ---\n"; - - // Get access to the performance history system - auto& history = PerformanceDispatcher::getPerformanceHistory(); - history.clearHistory(); // Start fresh - - std::cout - << "Recording performance data across different distributions and batch sizes...\n"; - - // Simulate realistic performance patterns - std::uniform_real_distribution noise(SIMULATION_NOISE_MIN, SIMULATION_NOISE_MAX); - - // All distribution types to simulate - std::vector distributions = { - DistributionType::UNIFORM, DistributionType::GAUSSIAN, - DistributionType::EXPONENTIAL, DistributionType::DISCRETE, - DistributionType::POISSON, DistributionType::GAMMA, - 
DistributionType::CHI_SQUARED, DistributionType::STUDENT_T, - DistributionType::BETA}; - - // Performance complexity factors for different distributions - std::map complexity_factors = { - {DistributionType::UNIFORM, 1.0}, // Simple - just random scaling - {DistributionType::DISCRETE, 1.5}, // Simple integer operations - {DistributionType::EXPONENTIAL, 2.5}, // Moderate - requires exp/log - {DistributionType::GAUSSIAN, 3.0}, // Moderate - Box-Muller transform - {DistributionType::POISSON, 4.0}, // Complex - iterative algorithms - {DistributionType::GAMMA, 5.0}, // Most complex - special functions - {DistributionType::CHI_SQUARED, 5.0}, // Delegates to Gamma - same complexity - {DistributionType::STUDENT_T, 3.2}, // Moderate - log-space continuous - {DistributionType::BETA, 3.4} // Moderate - bounded log-space continuous - }; - - // Distribution-specific efficiency characteristics - std::map> efficiency_characteristics = { - {DistributionType::UNIFORM, {0.40, 0.25}}, // Good SIMD/Parallel efficiency - {DistributionType::DISCRETE, {0.35, 0.22}}, // Decent efficiency - {DistributionType::EXPONENTIAL, {0.28, 0.18}}, // Moderate efficiency - {DistributionType::GAUSSIAN, {0.25, 0.15}}, // Lower efficiency - {DistributionType::POISSON, {0.22, 0.12}}, // Poor efficiency - {DistributionType::GAMMA, {0.20, 0.10}}, // Worst efficiency - {DistributionType::CHI_SQUARED, {0.20, 0.10}}, // Delegates to Gamma; same efficiency - {DistributionType::STUDENT_T, {0.24, 0.15}}, // Moderate efficiency - {DistributionType::BETA, {0.23, 0.14}} // Moderate efficiency with fixup - }; - - // More granular sizes around potential crossover points - std::vector sizes = {10, 25, 50, 75, 100, 150, 200, - 300, 500, 750, 1000, 1500, 2000, 3000, - 5000, 7500, 10000, 15000, 25000, 50000}; - - for (auto dist_type : distributions) { - std::cout << "\n Simulating " - << stats::detail::detail::distributionTypeToString(dist_type) - << " distribution:\n"; - - double complexity = 
complexity_factors[dist_type]; - auto [simd_efficiency, parallel_efficiency] = efficiency_characteristics[dist_type]; - - for (auto size : sizes) { - std::cout << " Recording data for size " << size << "..." << std::flush; - - // Record multiple samples per strategy - for (int sample = 0; sample < SAMPLES_PER_STRATEGY; ++sample) { - // Scalar strategy - auto scalar_time = - static_cast(static_cast(size) * - SCALAR_PERFORMANCE_FACTOR * complexity * noise(rng_)); - history.recordPerformance(Strategy::SCALAR, dist_type, size, scalar_time); - - // SIMD strategy - auto simd_time = - static_cast(static_cast(size) * SIMD_PERFORMANCE_FACTOR * - complexity * simd_efficiency * noise(rng_)); - if (size < SIMD_OVERHEAD_THRESHOLD) { - simd_time += SIMD_SMALL_OVERHEAD; - } - history.recordPerformance(Strategy::VECTORIZED, dist_type, size, simd_time); - - // Parallel strategy - auto parallel_time = static_cast( - static_cast(size) * PARALLEL_PERFORMANCE_FACTOR * complexity * - parallel_efficiency * noise(rng_)); - double complexity_factor = complexity; - double overhead_reduction = std::max(1.0, static_cast(size) / 1000.0); - uint64_t base_overhead = - static_cast(8000.0 / complexity_factor / overhead_reduction); - parallel_time += base_overhead; - history.recordPerformance(Strategy::PARALLEL, dist_type, size, parallel_time); - } - - std::cout << " βœ“"; - } - std::cout << "\n"; - } - - std::cout << "\nTotal recorded executions: " << history.getTotalExecutions() << "\n\n"; - } - - void showLearnedStrategies() { - std::cout << "--- Learned Strategy Recommendations ---\n"; - - auto& history = PerformanceDispatcher::getPerformanceHistory(); - std::vector test_sizes = {100, 1000, 10000, 50000}; - - std::cout << std::left << std::setw(12) << "Size" << std::setw(20) << "Best Strategy" - << std::setw(15) << "Confidence" << std::setw(15) << "Expected Time" - << "\n"; - std::cout << std::string(62, '-') << "\n"; - - for (auto size : test_sizes) { - auto recommendation = 
history.getBestStrategy(DistributionType::GAUSSIAN, size); - - std::cout << std::setw(12) << size << std::setw(20) - << stats::detail::detail::strategyToDisplayString( - recommendation.recommended_strategy) - << std::setw(15) - << stats::detail::detail::confidenceToString(recommendation.confidence_score) - << std::setw(12) - << stats::detail::detail::nanosecondsToMicroseconds( - recommendation.expected_time_ns) - << "\n"; - } - std::cout << "\n"; - } - - void demonstrateAdaptiveSelection() { - std::cout << "--- Adaptive Selection Results ---\n"; - - std::cout << "The PerformanceDispatcher now uses learned data to make better decisions.\n"; - std::cout << "Key insights from the learning process:\n"; - std::cout - << "β€’ Small batches (< 1000): Scalar or SIMD preferred due to parallel overhead\n"; - std::cout << "β€’ Medium batches (1000-10000): SIMD shows good balance\n"; - std::cout << "β€’ Large batches (> 10000): Parallel strategies become advantageous\n\n"; - - // Show threshold learning results - auto& history = PerformanceDispatcher::getPerformanceHistory(); - - std::cout << "Learned optimal thresholds for all distributions:\n"; - for (auto dist_type : - {DistributionType::UNIFORM, DistributionType::GAUSSIAN, DistributionType::EXPONENTIAL, - DistributionType::DISCRETE, DistributionType::POISSON, DistributionType::GAMMA, - DistributionType::CHI_SQUARED, DistributionType::STUDENT_T, DistributionType::BETA}) { - auto thresholds = history.learnOptimalThresholds(dist_type); - if (thresholds.has_value()) { - std::cout << " " << stats::detail::detail::distributionTypeToString(dist_type) - << ":\n"; - std::cout << " SIMD threshold: " << thresholds->first << " elements\n"; - std::cout << " Parallel threshold: " << thresholds->second << " elements\n"; - } else { - std::cout << " " << stats::detail::detail::distributionTypeToString(dist_type) - << ": Insufficient data\n"; - } - } - - std::cout << "\nDemo completed successfully!\n"; - } - - // Exercise different 
distributions with real operations - template - void exerciseDistribution(const std::string& dist_name, DistributionType dist_type, - Distribution& dist, const std::vector& batch_sizes) { - std::cout << "\n=== Testing " << dist_name << " Distribution ===\n"; - - std::random_device rd; - std::mt19937 gen(rd()); - - for (size_t batch_size : batch_sizes) { - std::cout << "\nBatch size: " << batch_size << std::endl; - - // Create test data - std::vector values(batch_size); - std::uniform_real_distribution value_gen(TEST_VALUE_MIN, TEST_VALUE_MAX); - for (auto& v : values) { - v = value_gen(gen); - } - - // Test PDF operations (medium complexity) - { - auto start = std::chrono::high_resolution_clock::now(); - std::vector results(batch_size); - for (size_t i = 0; i < batch_size; ++i) { - results[i] = dist.getProbability(values[i]); - } - auto end = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end - start); - - // Record performance for SCALAR strategy - PerformanceDispatcher::recordPerformance( - Strategy::SCALAR, dist_type, batch_size, - static_cast(duration.count())); - - std::cout << " PDF (scalar): " << stats::detail::detail::formatDuration(duration) - << " (" << (static_cast(duration.count()) / batch_size) - << "ns/op)" << std::endl; - } - - // Test CDF operations (higher complexity) - { - auto start = std::chrono::high_resolution_clock::now(); - std::vector results(batch_size); - for (size_t i = 0; i < batch_size; ++i) { - results[i] = dist.getCumulativeProbability(values[i]); - } - auto end = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end - start); - - // Simulate SIMD performance for larger batches - if (batch_size >= MIN_VECTORIZED_BATCH_SIZE) { - auto simd_duration = duration / SIMD_SPEEDUP_FACTOR; - PerformanceDispatcher::recordPerformance( - Strategy::VECTORIZED, dist_type, batch_size, - static_cast(simd_duration.count())); - std::cout << " CDF (simd): " - << 
stats::detail::detail::formatDuration(simd_duration) << " (" - << (static_cast(simd_duration.count()) / batch_size) - << "ns/op)" << std::endl; - } - - // Simulate parallel performance for very large batches - if (batch_size >= MIN_PARALLEL_BATCH_SIZE) { - auto parallel_duration = duration / PARALLEL_SPEEDUP_FACTOR; - PerformanceDispatcher::recordPerformance( - Strategy::PARALLEL, dist_type, batch_size, - static_cast(parallel_duration.count())); - std::cout << " CDF (parallel): " - << stats::detail::detail::formatDuration(parallel_duration) << " (" - << (static_cast(parallel_duration.count()) / - batch_size) - << "ns/op)" << std::endl; - } - - PerformanceDispatcher::recordPerformance( - Strategy::SCALAR, dist_type, batch_size, - static_cast(duration.count())); - std::cout << " CDF (scalar): " << stats::detail::detail::formatDuration(duration) - << " (" << (static_cast(duration.count()) / batch_size) - << "ns/op)" << std::endl; - } - - // For very large batches, test advanced strategies - if (batch_size >= MIN_WORK_STEALING_BATCH_SIZE) { - auto start = std::chrono::high_resolution_clock::now(); - std::vector results(batch_size); - for (size_t i = 0; i < batch_size; ++i) { - results[i] = - dist.getProbability(values[i]) + dist.getCumulativeProbability(values[i]); - } - auto end = std::chrono::high_resolution_clock::now(); - auto base_duration = - std::chrono::duration_cast(end - start); - - // Simulate work-stealing - auto work_stealing_duration = base_duration / WORK_STEALING_SPEEDUP_FACTOR; - PerformanceDispatcher::recordPerformance( - Strategy::WORK_STEALING, dist_type, batch_size, - static_cast(work_stealing_duration.count())); - std::cout << " Mixed (work-stealing): " - << stats::detail::detail::formatDuration(work_stealing_duration) << " (" - << (static_cast(work_stealing_duration.count()) / - batch_size) - << "ns/op)" << std::endl; - } - } - } - - void exerciseAllDistributions(const std::vector& batch_sizes) { - // Exercise different distributions using safe 
factory methods - { - auto uniform_dist = stats::UniformDistribution::create(distribution_params::UNIFORM_MIN, - distribution_params::UNIFORM_MAX) - .value; - exerciseDistribution("Uniform", DistributionType::UNIFORM, uniform_dist, batch_sizes); - } - - { - auto gaussian_dist = - stats::GaussianDistribution::create(distribution_params::GAUSSIAN_MEAN, - distribution_params::GAUSSIAN_STDDEV) - .value; - exerciseDistribution("Gaussian", DistributionType::GAUSSIAN, gaussian_dist, - batch_sizes); - } - - { - auto exp_dist = - stats::ExponentialDistribution::create(distribution_params::EXPONENTIAL_LAMBDA) - .value; - exerciseDistribution("Exponential", DistributionType::EXPONENTIAL, exp_dist, - batch_sizes); - } - - { - auto disc_dist = stats::DiscreteDistribution::create(distribution_params::DISCRETE_MIN, - distribution_params::DISCRETE_MAX) - .value; - exerciseDistribution("Discrete", DistributionType::DISCRETE, disc_dist, batch_sizes); - } - - { - auto poisson_dist = - stats::PoissonDistribution::create(distribution_params::POISSON_LAMBDA).value; - exerciseDistribution("Poisson", DistributionType::POISSON, poisson_dist, batch_sizes); - } - - { - auto gamma_dist = stats::GammaDistribution::create(distribution_params::GAMMA_ALPHA, - distribution_params::GAMMA_BETA) - .value; - exerciseDistribution("Gamma", DistributionType::GAMMA, gamma_dist, batch_sizes); - } - - { - auto chi_sq_dist = stats::ChiSquaredDistribution::create(5.0).value; - exerciseDistribution("ChiSquared", DistributionType::CHI_SQUARED, chi_sq_dist, - batch_sizes); - } - - { - auto student_t_dist = stats::StudentTDistribution::create(5.0).value; - exerciseDistribution("StudentT", DistributionType::STUDENT_T, student_t_dist, - batch_sizes); - } - - { - auto beta_dist = stats::BetaDistribution::create(2.0, 5.0).value; - exerciseDistribution("Beta", DistributionType::BETA, beta_dist, batch_sizes); - } - } - - void analyzePerformanceHistory() { - auto& history = 
PerformanceDispatcher::getPerformanceHistory(); - - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "ADAPTIVE LEARNING ANALYSIS" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - std::cout << "\nTotal executions recorded: " << history.getTotalExecutions() << std::endl; - - // Test strategy recommendations for different scenarios - std::vector distributions = { - DistributionType::UNIFORM, DistributionType::GAUSSIAN, - DistributionType::EXPONENTIAL, DistributionType::DISCRETE, - DistributionType::POISSON, DistributionType::GAMMA, - DistributionType::CHI_SQUARED, DistributionType::STUDENT_T, - DistributionType::BETA}; - - std::vector test_sizes = {10, 100, 1000, 5000, 25000, 100000}; - - std::cout << "\n" << std::string(60, '-') << std::endl; - std::cout << "STRATEGY RECOMMENDATIONS" << std::endl; - std::cout << std::string(60, '-') << std::endl; - - for (auto dist_type : distributions) { - std::cout << "\n" - << stats::detail::detail::distributionTypeToString(dist_type) - << " Distribution:" << std::endl; - std::cout << " Size Strategy Confidence Expected Time" << std::endl; - std::cout << " -------- -------------- ---------- -------------" << std::endl; - - for (size_t size : test_sizes) { - auto recommendation = history.getBestStrategy(dist_type, size); - - std::cout << " " << std::setw(8) << size << " " << std::setw(14) - << stats::detail::detail::strategyToDisplayString( - recommendation.recommended_strategy) - << " " << std::setw(10) - << stats::detail::detail::confidenceToString( - recommendation.confidence_score) - << " " << std::setw(8) - << stats::detail::detail::nanosecondsToMicroseconds( - recommendation.expected_time_ns) - << (recommendation.has_sufficient_data ? 
"" : " (insufficient data)") - << std::endl; - } - } - - // Show learned thresholds - std::cout << "\n" << std::string(60, '-') << std::endl; - std::cout << "LEARNED OPTIMAL THRESHOLDS" << std::endl; - std::cout << std::string(60, '-') << std::endl; - - for (auto dist_type : distributions) { - auto thresholds = history.learnOptimalThresholds(dist_type); - std::cout << stats::detail::detail::distributionTypeToString(dist_type) << ": "; - if (thresholds) { - std::cout << "SIMD >= " << thresholds->first - << ", Parallel >= " << thresholds->second << std::endl; - } else { - std::cout << "Insufficient data for learning" << std::endl; - } - } - - // Show performance statistics for each strategy - std::cout << "\n" << std::string(60, '-') << std::endl; - std::cout << "STRATEGY PERFORMANCE STATISTICS" << std::endl; - std::cout << std::string(60, '-') << std::endl; - - for (auto dist_type : distributions) { - std::cout << "\n" - << stats::detail::detail::distributionTypeToString(dist_type) - << " Performance:" << std::endl; - - std::vector strategies = {Strategy::SCALAR, Strategy::VECTORIZED, - Strategy::PARALLEL, Strategy::WORK_STEALING}; - - for (auto strategy : strategies) { - auto stats = history.getPerformanceStats(strategy, dist_type); - if (stats) { - std::cout - << " " << std::setw(14) - << stats::detail::detail::strategyToDisplayString(strategy) << ": " - << std::setw(6) << stats->execution_count << " runs, " - << "avg: " << std::setw(8) - << stats::detail::detail::nanosecondsToMicroseconds( - stats->getAverageTimeNs()) - << ", " - << "min: " << std::setw(6) - << stats::detail::detail::nanosecondsToMicroseconds(stats->min_time_ns) - << ", " - << "max: " << std::setw(6) - << stats::detail::detail::nanosecondsToMicroseconds(stats->max_time_ns) - << std::endl; - } - } - } - } - - // Enhanced methods for analysis mode - void exerciseAllDistributionsEnhanced(const std::vector& batch_sizes) { - std::cout << "Generating comprehensive performance data...\n\n"; - - // 
Multiple runs per batch size to generate sufficient data - constexpr int RUNS_PER_BATCH_SIZE = 3; - int total_operations = static_cast( - 9 * batch_sizes.size() * RUNS_PER_BATCH_SIZE); // 9 distributions * sizes * runs - int completed = 0; - - // Enhanced testing with multiple strategies per size - for (int run = 0; run < RUNS_PER_BATCH_SIZE; ++run) { - std::cout << "\n=== Run " << (run + 1) << " of " << RUNS_PER_BATCH_SIZE << " ===\n"; - - // Test all distributions using safe factory methods - { - std::cout << "Testing Uniform Distribution..." << std::flush; - auto uniform_dist = - stats::UniformDistribution::create(distribution_params::UNIFORM_MIN, - distribution_params::UNIFORM_MAX) - .value; - exerciseDistributionEnhanced("Uniform", DistributionType::UNIFORM, uniform_dist, - batch_sizes); - std::cout << " βœ“\n"; - completed += static_cast(batch_sizes.size()); - } - - { - std::cout << "Testing Gaussian Distribution..." << std::flush; - auto gaussian_dist = - stats::GaussianDistribution::create(distribution_params::GAUSSIAN_MEAN, - distribution_params::GAUSSIAN_STDDEV) - .value; - exerciseDistributionEnhanced("Gaussian", DistributionType::GAUSSIAN, gaussian_dist, - batch_sizes); - std::cout << " βœ“\n"; - completed += static_cast(batch_sizes.size()); - } - - { - std::cout << "Testing Exponential Distribution..." << std::flush; - auto exp_dist = - stats::ExponentialDistribution::create(distribution_params::EXPONENTIAL_LAMBDA) - .value; - exerciseDistributionEnhanced("Exponential", DistributionType::EXPONENTIAL, exp_dist, - batch_sizes); - std::cout << " βœ“\n"; - completed += static_cast(batch_sizes.size()); - } - - { - std::cout << "Testing Discrete Distribution..." 
<< std::flush; - auto disc_dist = - stats::DiscreteDistribution::create(distribution_params::DISCRETE_MIN, - distribution_params::DISCRETE_MAX) - .value; - exerciseDistributionEnhanced("Discrete", DistributionType::DISCRETE, disc_dist, - batch_sizes); - std::cout << " βœ“\n"; - completed += static_cast(batch_sizes.size()); - } - - { - std::cout << "Testing Poisson Distribution..." << std::flush; - auto poisson_dist = - stats::PoissonDistribution::create(distribution_params::POISSON_LAMBDA).value; - exerciseDistributionEnhanced("Poisson", DistributionType::POISSON, poisson_dist, - batch_sizes); - std::cout << " βœ“\n"; - completed += static_cast(batch_sizes.size()); - } - - { - std::cout << "Testing Gamma Distribution..." << std::flush; - auto gamma_dist = stats::GammaDistribution::create(distribution_params::GAMMA_ALPHA, - distribution_params::GAMMA_BETA) - .value; - exerciseDistributionEnhanced("Gamma", DistributionType::GAMMA, gamma_dist, - batch_sizes); - std::cout << " βœ“\n"; - completed += static_cast(batch_sizes.size()); - } - - { - std::cout << "Testing ChiSquared Distribution..." << std::flush; - auto chi_sq_dist = stats::ChiSquaredDistribution::create(5.0).value; - exerciseDistributionEnhanced("ChiSquared", DistributionType::CHI_SQUARED, - chi_sq_dist, batch_sizes); - std::cout << " βœ“\n"; - completed += static_cast(batch_sizes.size()); - } - - { - std::cout << "Testing StudentT Distribution..." << std::flush; - auto student_t_dist = stats::StudentTDistribution::create(5.0).value; - exerciseDistributionEnhanced("StudentT", DistributionType::STUDENT_T, - student_t_dist, batch_sizes); - std::cout << " βœ“\n"; - completed += static_cast(batch_sizes.size()); - } - - { - std::cout << "Testing Beta Distribution..." 
<< std::flush; - auto beta_dist = stats::BetaDistribution::create(2.0, 5.0).value; - exerciseDistributionEnhanced("Beta", DistributionType::BETA, beta_dist, - batch_sizes); - std::cout << " βœ“\n"; - completed += static_cast(batch_sizes.size()); - } - - double progress = - static_cast(completed) / static_cast(total_operations) * 100.0; - std::cout << "Progress: " << std::fixed << std::setprecision(1) << progress << "%\n"; - } - - auto& history = PerformanceDispatcher::getPerformanceHistory(); - std::cout << "\nData collection complete! Total executions: " - << history.getTotalExecutions() << "\n"; - } - - template - void exerciseDistributionEnhanced(const std::string& /* dist_name */, - DistributionType dist_type, Distribution& dist, - const std::vector& batch_sizes) { - std::random_device rd; - std::mt19937 gen(rd()); - - for (size_t batch_size : batch_sizes) { - // Create test data - std::vector values(batch_size); - std::uniform_real_distribution value_gen(TEST_VALUE_MIN, TEST_VALUE_MAX); - for (auto& v : values) { - v = value_gen(gen); - } - - // Always test scalar strategy - { - auto start = std::chrono::high_resolution_clock::now(); - std::vector results(batch_size); - for (size_t i = 0; i < batch_size; ++i) { - results[i] = dist.getProbability(values[i]); - } - auto end = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end - start); - - PerformanceDispatcher::recordPerformance(Strategy::SCALAR, dist_type, batch_size, - static_cast(duration.count())); - } - - // Test SIMD strategy for appropriate batch sizes - if (batch_size >= MIN_VECTORIZED_BATCH_SIZE) { - auto start = std::chrono::high_resolution_clock::now(); - std::vector results(batch_size); - for (size_t i = 0; i < batch_size; ++i) { - results[i] = dist.getCumulativeProbability(values[i]); - } - auto end = std::chrono::high_resolution_clock::now(); - auto base_duration = - std::chrono::duration_cast(end - start); - - // Simulate SIMD improvement - auto 
simd_duration = base_duration / SIMD_SPEEDUP_FACTOR; - PerformanceDispatcher::recordPerformance( - Strategy::VECTORIZED, dist_type, batch_size, - static_cast(simd_duration.count())); - } - - // Test parallel strategies for larger batch sizes - if (batch_size >= MIN_PARALLEL_BATCH_SIZE) { - auto start = std::chrono::high_resolution_clock::now(); - std::vector results(batch_size); - for (size_t i = 0; i < batch_size; ++i) { - results[i] = - dist.getProbability(values[i]) + dist.getCumulativeProbability(values[i]); - } - auto end = std::chrono::high_resolution_clock::now(); - auto base_duration = - std::chrono::duration_cast(end - start); - - // Simulate parallel improvement - auto parallel_duration = base_duration / PARALLEL_SPEEDUP_FACTOR; - PerformanceDispatcher::recordPerformance( - Strategy::PARALLEL, dist_type, batch_size, - static_cast(parallel_duration.count())); - } - - // Test work-stealing for very large batch sizes - if (batch_size >= MIN_WORK_STEALING_BATCH_SIZE) { - auto start = std::chrono::high_resolution_clock::now(); - std::vector results(batch_size); - for (size_t i = 0; i < batch_size; ++i) { - results[i] = dist.getProbability(values[i]) * 2.0; - } - auto end = std::chrono::high_resolution_clock::now(); - auto base_duration = - std::chrono::duration_cast(end - start); - - auto work_stealing_duration = base_duration / WORK_STEALING_SPEEDUP_FACTOR; - PerformanceDispatcher::recordPerformance( - Strategy::WORK_STEALING, dist_type, batch_size, - static_cast(work_stealing_duration.count())); - } - } - } - - void analyzePerformanceHistoryEnhanced() { - auto& history = PerformanceDispatcher::getPerformanceHistory(); - - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "ADAPTIVE LEARNING ANALYSIS" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - std::cout << "\nTotal executions recorded: " << history.getTotalExecutions() << std::endl; - - // Test strategy recommendations for different scenarios - std::vector 
distributions = { - DistributionType::UNIFORM, DistributionType::GAUSSIAN, - DistributionType::EXPONENTIAL, DistributionType::DISCRETE, - DistributionType::POISSON, DistributionType::GAMMA, - DistributionType::CHI_SQUARED, DistributionType::STUDENT_T, - DistributionType::BETA}; - - std::vector test_sizes = {10, 100, 1000, 5000, 25000, 100000}; - - std::cout << "\n" << std::string(60, '-') << std::endl; - std::cout << "STRATEGY RECOMMENDATIONS" << std::endl; - std::cout << std::string(60, '-') << std::endl; - - for (auto dist_type : distributions) { - std::cout << "\n" - << stats::detail::detail::distributionTypeToString(dist_type) - << " Distribution:" << std::endl; - std::cout << " Size Strategy Confidence Expected Time" << std::endl; - std::cout << " -------- -------------- ---------- -------------" << std::endl; - - for (size_t size : test_sizes) { - auto recommendation = history.getBestStrategy(dist_type, size); - - std::cout << " " << std::setw(8) << size << " " << std::setw(14) - << stats::detail::detail::strategyToDisplayString( - recommendation.recommended_strategy) - << " " << std::setw(10) - << stats::detail::detail::confidenceToString( - recommendation.confidence_score) - << " " << std::setw(8) - << stats::detail::detail::nanosecondsToMicroseconds( - recommendation.expected_time_ns) - << (recommendation.has_sufficient_data ? 
"" : " (insufficient data)") - << std::endl; - } - } - - // Show learned thresholds - std::cout << "\n" << std::string(60, '-') << std::endl; - std::cout << "LEARNED OPTIMAL THRESHOLDS" << std::endl; - std::cout << std::string(60, '-') << std::endl; - - for (auto dist_type : distributions) { - auto thresholds = history.learnOptimalThresholds(dist_type); - std::cout << stats::detail::detail::distributionTypeToString(dist_type) << ": "; - if (thresholds) { - std::cout << "SIMD >= " << thresholds->first - << ", Parallel >= " << thresholds->second << std::endl; - } else { - std::cout << "Insufficient data for learning" << std::endl; - } - } - - // Enhanced performance statistics with insights - std::cout << "\n" << std::string(60, '-') << std::endl; - std::cout << "STRATEGY PERFORMANCE STATISTICS" << std::endl; - std::cout << std::string(60, '-') << std::endl; - - for (auto dist_type : distributions) { - std::cout << "\n" - << stats::detail::detail::distributionTypeToString(dist_type) - << " Performance:" << std::endl; - - std::vector strategies = {Strategy::SCALAR, Strategy::VECTORIZED, - Strategy::PARALLEL, Strategy::WORK_STEALING}; - - for (auto strategy : strategies) { - auto stats = history.getPerformanceStats(strategy, dist_type); - if (stats) { - std::cout - << " " << std::setw(14) - << stats::detail::detail::strategyToDisplayString(strategy) << ": " - << std::setw(6) << stats->execution_count << " runs, " - << "avg: " << std::setw(8) - << stats::detail::detail::nanosecondsToMicroseconds( - stats->getAverageTimeNs()) - << ", " - << "min: " << std::setw(6) - << stats::detail::detail::nanosecondsToMicroseconds(stats->min_time_ns) - << ", " - << "max: " << std::setw(6) - << stats::detail::detail::nanosecondsToMicroseconds(stats->max_time_ns) - << std::endl; - } - } - } - - // Add insights and recommendations - std::cout << "\n" << std::string(60, '-') << std::endl; - std::cout << "PERFORMANCE INSIGHTS" << std::endl; - std::cout << std::string(60, '-') << std::endl; 
- - generatePerformanceInsights(history, distributions); - } - - void generatePerformanceInsights(PerformanceHistory& history, - const std::vector& distributions) { - std::cout << "\nBased on collected performance data:\n\n"; - - // Analyze efficiency patterns across distributions - std::cout << "Distribution Efficiency Rankings (lower times = better):\n"; - std::vector> efficiency_ranking; - - for (auto dist_type : distributions) { - auto stats = history.getPerformanceStats(Strategy::SCALAR, dist_type); - if (stats && stats->execution_count > 0) { - efficiency_ranking.emplace_back(dist_type, stats->getAverageTimeNs()); - } - } - - std::sort(efficiency_ranking.begin(), efficiency_ranking.end(), - [](const auto& a, const auto& b) { return a.second < b.second; }); - - int rank = 1; - for (const auto& [dist_type, avg_time] : efficiency_ranking) { - std::cout << " " << rank++ << ". " - << stats::detail::detail::distributionTypeToString(dist_type) << " (" - << stats::detail::detail::nanosecondsToMicroseconds(avg_time) << " avg)\n"; - } - - // Strategy effectiveness analysis - std::cout << "\nStrategy Effectiveness Summary:\n"; - for (auto strategy : {Strategy::VECTORIZED, Strategy::PARALLEL, Strategy::WORK_STEALING}) { - int total_distributions = 0; - int effective_distributions = 0; - - for (auto dist_type : distributions) { - auto scalar_stats = history.getPerformanceStats(Strategy::SCALAR, dist_type); - auto strategy_stats = history.getPerformanceStats(strategy, dist_type); - - if (scalar_stats && strategy_stats && scalar_stats->execution_count > 0 && - strategy_stats->execution_count > 0) { - total_distributions++; - if (strategy_stats->getAverageTimeNs() < scalar_stats->getAverageTimeNs()) { - effective_distributions++; - } - } - } - - if (total_distributions > 0) { - double effectiveness = static_cast(effective_distributions) / - static_cast(total_distributions) * 100.0; - std::cout << " " << stats::detail::detail::strategyToDisplayString(strategy) - << ": " << 
std::fixed << std::setprecision(1) << effectiveness - << "% effective (" << effective_distributions << "/" - << total_distributions << " distributions)\n"; - } - } - - std::cout << "\nRecommendations for optimal performance:\n"; - std::cout << "β€’ Use Scalar strategy for small batch sizes (< 100 elements)\n"; - std::cout << "β€’ Consider SIMD for medium batches (100-10,000 elements)\n"; - std::cout << "β€’ Use Parallel strategies for large batches (> 10,000 elements)\n"; - std::cout << "β€’ Advanced strategies (Work-Stealing, Cache-Aware) show benefits with very " - "large datasets\n"; - } -}; - -int main(int argc, char* argv[]) { - LearningAnalyzer analyzer; - - // Parse command line arguments - std::string mode = "analysis"; // default mode - if (argc > 1) { - mode = argv[1]; - } - - if (mode == "help" || mode == "--help" || mode == "-h") { - analyzer.showUsage(); - return 0; - } - - try { - if (mode == "demo") { - analyzer.runDemo(); - } else if (mode == "analysis") { - analyzer.runAnalysis(); - } else if (mode == "both") { - analyzer.runDemo(); - std::cout << "\n" << std::string(80, '=') << "\n\n"; - analyzer.runAnalysis(); - } else { - std::cerr << "Unknown mode: " << mode << std::endl; - analyzer.showUsage(); - return 1; - } - - std::cout << "\nLearning analysis completed successfully!" << std::endl; - - } catch (const std::exception& e) { - std::cerr << "Error: " << e.what() << std::endl; - return 1; - } - - return 0; -} diff --git a/tools/parallel_threshold_benchmark.cpp b/tools/parallel_threshold_benchmark.cpp deleted file mode 100644 index acaaf2f..0000000 --- a/tools/parallel_threshold_benchmark.cpp +++ /dev/null @@ -1,577 +0,0 @@ -/** - * @file parallel_threshold_benchmark.cpp - * @brief Enhanced Benchmark tool for determining dynamic thresholds using PerformanceHistory - * - * This tool benchmarks different data sizes to find the optimal thresholds - * for parallel execution, utilizing adaptive learning from PerformanceHistory. 
- */ - -// Use consolidated tool utilities header which includes libstats.h -#include "tool_utils.h" - -#include // for timing operations -#include // for size_t -#include // for file I/O -#include // for std::cout -#include // for std::map -#include // for std::mt19937, distributions -#include // for std::span -#include // for std::string -#include // for threading operations -#include // for std::vector - -// Include the specific headers instead of broad constants.h -#include "libstats/core/performance_dispatcher.h" -#include "libstats/distributions/discrete.h" -#include "libstats/distributions/exponential.h" -#include "libstats/distributions/gamma.h" -#include "libstats/distributions/gaussian.h" -#include "libstats/distributions/poisson.h" -#include "libstats/distributions/uniform.h" - -using namespace std::chrono; -using namespace stats; -using namespace stats::detail; - -// Tool-specific benchmark constants -namespace { -// Benchmark timing constants -constexpr int DEFAULT_RNG_SEED = 42; -constexpr double SPEEDUP_SLOWDOWN_THRESHOLD = 0.5; // Below this is "extreme slowdown" - -// Distribution-specific test parameters -namespace distribution_params { -// Poisson parameters -constexpr double DEFAULT_POISSON_LAMBDA = 3.5; -constexpr int POISSON_TEST_LAMBDA = 3; - -// Discrete distribution range -constexpr int DISCRETE_MIN = 0; -constexpr int DISCRETE_MAX = 10; -constexpr int DISCRETE_TEST_MIN = -2; -constexpr int DISCRETE_TEST_MAX = 12; - -// Uniform distribution range -constexpr double UNIFORM_MIN = 0.0; -constexpr double UNIFORM_MAX = 1.0; -constexpr double UNIFORM_TEST_MIN = -0.5; -constexpr double UNIFORM_TEST_MAX = 1.5; - -// Gaussian distribution parameters -constexpr double GAUSSIAN_MEAN = 0.0; -constexpr double GAUSSIAN_STDDEV = 1.0; -constexpr double GAUSSIAN_TEST_STDDEV = 2.0; // Wider range for testing - -// Exponential distribution parameter -constexpr double EXPONENTIAL_LAMBDA = 1.0; -constexpr double EXPONENTIAL_TEST_LAMBDA = 0.5; - -// Gamma 
distribution parameters -constexpr double GAMMA_ALPHA = 2.0; -constexpr double GAMMA_BETA = 1.0; -constexpr double GAMMA_TEST_ALPHA = 1.5; -constexpr double GAMMA_TEST_BETA = 2.0; -} // namespace distribution_params - -// Output file configuration -constexpr const char* RESULTS_CSV_FILENAME = "parallel_threshold_benchmark_results.csv"; -} // namespace - -struct ToolBenchmarkResult { - std::size_t data_size; - std::string distribution_type; - std::string operation_type; - double serial_time_us; - double parallel_time_us; - double vectorized_time_us; - double parallel_speedup; - double simd_speedup; - bool parallel_beneficial; -}; - -class ParallelThresholdBenchmark { - private: - std::mt19937 gen_; - std::vector results_; - std::vector test_sizes_; - - void initializeTestSizes(bool include_large) { - // Base test sizes - start small and work up to 524K elements - test_sizes_ = {64, 128, 256, 512, 1024, 2048, 4096, - 8192, 16384, 32768, 65536, 131072, 262144, 524288}; - - // Add the large (and slow) test sizes only if requested - if (include_large) { - test_sizes_.push_back(1048576); // 1M elements - test_sizes_.push_back(2097152); // 2M elements - } - } - - // Number of iterations for timing stability - static constexpr int TIMING_ITERATIONS = 10; - static constexpr int WARMUP_ITERATIONS = 3; - - public: - ParallelThresholdBenchmark(bool include_large = false) : gen_(DEFAULT_RNG_SEED) { - initializeTestSizes(include_large); - } - - void runAllBenchmarks() { - using namespace stats::detail; - - // Initialize performance systems for accurate threshold determination - stats::initialize_performance_systems(); - - // Display tool header with system information - stats::detail::detail::displayToolHeader( - "Parallel Threshold Benchmark", - "Distribution-specific threshold optimization with adaptive learning"); - - benchmarkUniformDistribution(); - benchmarkPoissonDistribution(); - benchmarkDiscreteDistribution(); - benchmarkGaussianDistribution(); - 
benchmarkExponentialDistribution(); - benchmarkGammaDistribution(); - - analyzeResults(); - saveResults(); - } - - private: - void benchmarkUniformDistribution() { - using namespace stats::detail; - - stats::detail::detail::subsectionHeader("Uniform Distribution Benchmark"); - auto uniform = stats::UniformDistribution::create(distribution_params::UNIFORM_MIN, - distribution_params::UNIFORM_MAX) - .value; - - for (auto size : test_sizes_) { - std::cout << " Testing size: " << size << std::flush; - - // Generate test data - std::vector test_data(size); - std::uniform_real_distribution dis(distribution_params::UNIFORM_TEST_MIN, - distribution_params::UNIFORM_TEST_MAX); - for (auto& val : test_data) { - val = dis(gen_); - } - - // Benchmark PDF - auto pdf_result = benchmarkOperation(uniform, test_data, "PDF", "Uniform"); - results_.push_back(pdf_result); - - // Benchmark LogPDF - auto logpdf_result = benchmarkOperation(uniform, test_data, "LogPDF", "Uniform"); - results_.push_back(logpdf_result); - - // Benchmark CDF - auto cdf_result = benchmarkOperation(uniform, test_data, "CDF", "Uniform"); - results_.push_back(cdf_result); - - std::cout << " βœ“\n"; - } - } - - void benchmarkPoissonDistribution() { - using namespace stats::detail; - - stats::detail::detail::subsectionHeader("Poisson Distribution Benchmark"); - auto poisson = - stats::PoissonDistribution::create(distribution_params::DEFAULT_POISSON_LAMBDA).value; - - for (auto size : test_sizes_) { - std::cout << " Testing size: " << size << std::flush; - - // Generate test data (integer values for Poisson) - std::vector test_data(size); - std::poisson_distribution dis(distribution_params::POISSON_TEST_LAMBDA); - for (auto& val : test_data) { - val = static_cast(dis(gen_)); - } - - // Benchmark PDF (PMF) - auto pdf_result = benchmarkOperation(poisson, test_data, "PDF", "Poisson"); - results_.push_back(pdf_result); - - // Benchmark LogPDF - auto logpdf_result = benchmarkOperation(poisson, test_data, "LogPDF", 
"Poisson"); - results_.push_back(logpdf_result); - - // Benchmark CDF - auto cdf_result = benchmarkOperation(poisson, test_data, "CDF", "Poisson"); - results_.push_back(cdf_result); - - std::cout << " βœ“\n"; - } - } - - void benchmarkDiscreteDistribution() { - using namespace stats::detail; - - stats::detail::detail::subsectionHeader("Discrete Distribution Benchmark"); - auto discrete = stats::DiscreteDistribution::create(distribution_params::DISCRETE_MIN, - distribution_params::DISCRETE_MAX) - .value; - - for (auto size : test_sizes_) { - std::cout << " Testing size: " << size << std::flush; - - // Generate test data (integer values) - std::vector test_data(size); - std::uniform_int_distribution dis(distribution_params::DISCRETE_TEST_MIN, - distribution_params::DISCRETE_TEST_MAX); - for (auto& val : test_data) { - val = static_cast(dis(gen_)); - } - - // Benchmark PDF (PMF) - auto pdf_result = benchmarkOperation(discrete, test_data, "PDF", "Discrete"); - results_.push_back(pdf_result); - - // Benchmark LogPDF - auto logpdf_result = benchmarkOperation(discrete, test_data, "LogPDF", "Discrete"); - results_.push_back(logpdf_result); - - // Benchmark CDF - auto cdf_result = benchmarkOperation(discrete, test_data, "CDF", "Discrete"); - results_.push_back(cdf_result); - - std::cout << " βœ“\n"; - } - } - - void benchmarkGaussianDistribution() { - using namespace stats::detail; - - stats::detail::detail::subsectionHeader("Gaussian Distribution Benchmark"); - auto gaussian = stats::GaussianDistribution::create(distribution_params::GAUSSIAN_MEAN, - distribution_params::GAUSSIAN_STDDEV) - .value; - - for (auto size : test_sizes_) { - std::cout << " Testing size: " << size << std::flush; - - // Generate test data (normal distribution values) - std::vector test_data(size); - std::normal_distribution dis( - distribution_params::GAUSSIAN_MEAN, - distribution_params::GAUSSIAN_TEST_STDDEV); // Wider range - for (auto& val : test_data) { - val = dis(gen_); - } - - // Benchmark 
PDF - auto pdf_result = benchmarkOperation(gaussian, test_data, "PDF", "Gaussian"); - results_.push_back(pdf_result); - - // Benchmark LogPDF - auto logpdf_result = benchmarkOperation(gaussian, test_data, "LogPDF", "Gaussian"); - results_.push_back(logpdf_result); - - // Benchmark CDF - auto cdf_result = benchmarkOperation(gaussian, test_data, "CDF", "Gaussian"); - results_.push_back(cdf_result); - - std::cout << " βœ“\n"; - } - } - - void benchmarkExponentialDistribution() { - using namespace stats::detail; - - stats::detail::detail::subsectionHeader("Exponential Distribution Benchmark"); - auto exponential = - stats::ExponentialDistribution::create(distribution_params::EXPONENTIAL_LAMBDA).value; - - for (auto size : test_sizes_) { - std::cout << " Testing size: " << size << std::flush; - - // Generate test data (exponential distribution values) - std::vector test_data(size); - std::exponential_distribution dis(distribution_params::EXPONENTIAL_TEST_LAMBDA); - for (auto& val : test_data) { - val = dis(gen_); - } - - // Benchmark PDF - auto pdf_result = benchmarkOperation(exponential, test_data, "PDF", "Exponential"); - results_.push_back(pdf_result); - - // Benchmark LogPDF - auto logpdf_result = - benchmarkOperation(exponential, test_data, "LogPDF", "Exponential"); - results_.push_back(logpdf_result); - - // Benchmark CDF - auto cdf_result = benchmarkOperation(exponential, test_data, "CDF", "Exponential"); - results_.push_back(cdf_result); - - std::cout << " βœ“\n"; - } - } - - void benchmarkGammaDistribution() { - using namespace stats::detail; - - stats::detail::detail::subsectionHeader("Gamma Distribution Benchmark"); - auto gamma = stats::GammaDistribution::create(distribution_params::GAMMA_ALPHA, - distribution_params::GAMMA_BETA) - .value; - - for (auto size : test_sizes_) { - std::cout << " Testing size: " << size << std::flush; - - // Generate test data (gamma distribution values) - std::vector test_data(size); - std::gamma_distribution 
dis(distribution_params::GAMMA_TEST_ALPHA, - distribution_params::GAMMA_TEST_BETA); - for (auto& val : test_data) { - val = dis(gen_); - } - - // Benchmark PDF - auto pdf_result = benchmarkOperation(gamma, test_data, "PDF", "Gamma"); - results_.push_back(pdf_result); - - // Benchmark LogPDF - auto logpdf_result = benchmarkOperation(gamma, test_data, "LogPDF", "Gamma"); - results_.push_back(logpdf_result); - - // Benchmark CDF - auto cdf_result = benchmarkOperation(gamma, test_data, "CDF", "Gamma"); - results_.push_back(cdf_result); - - std::cout << " βœ“\n"; - } - } - - template - ToolBenchmarkResult benchmarkOperation(const Distribution& dist, - const std::vector& test_data, - const std::string& operation, - const std::string& dist_type) { - ToolBenchmarkResult result; - result.data_size = test_data.size(); - result.distribution_type = dist_type; - result.operation_type = operation; - - std::vector results_buffer(test_data.size()); - std::span input_span(test_data); - std::span output_span(results_buffer); - - // Warmup - for (int i = 0; i < WARMUP_ITERATIONS; ++i) { - performOperation(dist, input_span, output_span, operation, "serial"); - } - - // Benchmark Serial (using SIMD batch operations) - auto serial_start = high_resolution_clock::now(); - for (int i = 0; i < TIMING_ITERATIONS; ++i) { - performOperation(dist, input_span, output_span, operation, "simd"); - } - auto serial_end = high_resolution_clock::now(); - result.vectorized_time_us = - static_cast(duration_cast(serial_end - serial_start).count()) / - static_cast(TIMING_ITERATIONS); - - // Benchmark True Serial (element by element) - auto true_serial_start = high_resolution_clock::now(); - for (int i = 0; i < TIMING_ITERATIONS; ++i) { - performOperation(dist, input_span, output_span, operation, "serial"); - } - auto true_serial_end = high_resolution_clock::now(); - result.serial_time_us = - static_cast( - duration_cast(true_serial_end - true_serial_start).count()) / - static_cast(TIMING_ITERATIONS); - - 
// Benchmark Parallel - auto parallel_start = high_resolution_clock::now(); - for (int i = 0; i < TIMING_ITERATIONS; ++i) { - performOperation(dist, input_span, output_span, operation, "parallel"); - } - auto parallel_end = high_resolution_clock::now(); - result.parallel_time_us = - static_cast( - duration_cast(parallel_end - parallel_start).count()) / - static_cast(TIMING_ITERATIONS); - - // Calculate speedups - result.parallel_speedup = result.vectorized_time_us / result.parallel_time_us; - result.simd_speedup = result.serial_time_us / result.vectorized_time_us; - result.parallel_beneficial = result.parallel_speedup > 1.0; - - return result; - } - - template - void performOperation(const Distribution& dist, std::span input, - std::span output, const std::string& operation, - const std::string& method) { - if (method == "serial") { - // True serial: element by element - if (operation == "PDF") { - for (size_t i = 0; i < input.size(); ++i) { - output[i] = dist.getProbability(input[i]); - } - } else if (operation == "LogPDF") { - for (size_t i = 0; i < input.size(); ++i) { - output[i] = dist.getLogProbability(input[i]); - } - } else if (operation == "CDF") { - for (size_t i = 0; i < input.size(); ++i) { - output[i] = dist.getCumulativeProbability(input[i]); - } - } - } else if (method == "simd") { - // SIMD batch operations using explicit strategy to ensure SIMD benchmarking - if (operation == "PDF") { - dist.getProbabilityWithStrategy(input, output, stats::detail::Strategy::VECTORIZED); - } else if (operation == "LogPDF") { - dist.getLogProbabilityWithStrategy(input, output, - stats::detail::Strategy::VECTORIZED); - } else if (operation == "CDF") { - dist.getCumulativeProbabilityWithStrategy(input, output, - stats::detail::Strategy::VECTORIZED); - } - } else if (method == "parallel") { - // Parallel operations using explicit strategy to ensure parallel benchmarking - if (operation == "PDF") { - dist.getProbabilityWithStrategy(input, output, 
stats::detail::Strategy::PARALLEL); - } else if (operation == "LogPDF") { - dist.getLogProbabilityWithStrategy(input, output, - stats::detail::Strategy::PARALLEL); - } else if (operation == "CDF") { - dist.getCumulativeProbabilityWithStrategy(input, output, - stats::detail::Strategy::PARALLEL); - } - } - } - - void analyzeResults() { - std::cout << "\n=== Analysis Results ===\n"; - - // Group results by distribution and operation - std::map> grouped_results; - for (auto& result : results_) { - std::string key = result.distribution_type + "_" + result.operation_type; - grouped_results[key].push_back(&result); - } - - std::cout << std::left << std::setw(20) << "Dist_Op" << std::setw(10) << "Size" - << std::setw(12) << "Serial(ΞΌs)" << std::setw(12) << "SIMD(ΞΌs)" << std::setw(12) - << "Parallel(ΞΌs)" << std::setw(12) << "S-Speedup" << std::setw(12) << "P-Speedup" - << std::setw(12) << "Beneficial?" - << "\n"; - std::cout << std::string(120, '-') << "\n"; - - for (const auto& [key, results] : grouped_results) { - std::size_t beneficial_threshold = SIZE_MAX; - - for (const auto* result : results) { - std::cout << std::left << std::setw(20) << key << std::setw(10) << result->data_size - << std::setw(12) << std::fixed << std::setprecision(1) - << result->serial_time_us << std::setw(12) << std::fixed - << std::setprecision(1) << result->vectorized_time_us << std::setw(12) - << std::fixed << std::setprecision(1) << result->parallel_time_us - << std::setw(12) << std::fixed << std::setprecision(2) - << result->simd_speedup << std::setw(12) << std::fixed - << std::setprecision(2) << result->parallel_speedup << std::setw(12) - << (result->parallel_beneficial ? 
"YES" : "NO") << "\n"; - - if (result->parallel_beneficial && beneficial_threshold == SIZE_MAX) { - beneficial_threshold = result->data_size; - } - } - - std::cout << " β†’ Recommended threshold for " << key << ": "; - if (beneficial_threshold != SIZE_MAX) { - std::cout << beneficial_threshold << " elements\n"; - } else { - std::cout << "NEVER (parallel not beneficial)\n"; - } - std::cout << "\n"; - } - - // Find extreme slowdowns - std::cout << "\n=== Extreme Slowdowns (Speedup < " << SPEEDUP_SLOWDOWN_THRESHOLD - << ") ===\n"; - bool found_extreme = false; - for (const auto& result : results_) { - if (result.parallel_speedup < SPEEDUP_SLOWDOWN_THRESHOLD) { - std::cout << result.distribution_type << " " << result.operation_type << " at size " - << result.data_size << ": " << result.parallel_speedup << "x speedup (" - << (1.0 / result.parallel_speedup) << "x slowdown)\n"; - found_extreme = true; - } - } - if (!found_extreme) { - std::cout << "No extreme slowdowns found.\n"; - } - } - - void saveResults() { - std::ofstream csv_file(RESULTS_CSV_FILENAME); - csv_file << "Distribution,Operation,DataSize,SerialTime_us,SIMDTime_us,ParallelTime_us," - "SIMDSpeedup,ParallelSpeedup,ParallelBeneficial\n"; - - for (const auto& result : results_) { - csv_file << result.distribution_type << "," << result.operation_type << "," - << result.data_size << "," << result.serial_time_us << "," - << result.vectorized_time_us << "," << result.parallel_time_us << "," - << result.simd_speedup << "," << result.parallel_speedup << "," - << (result.parallel_beneficial ? 
"true" : "false") << "\n"; - } - - std::cout << "\n=== Results saved to parallel_threshold_benchmark_results.csv ===\n"; - } -}; - -void printUsage(const char* program_name) { - std::cout << "Usage: " << program_name << " [OPTIONS]\n"; - std::cout << "\nOptions:\n"; - std::cout << " -l, --large Include large dataset tests (1M and 2M elements)\n"; - std::cout << " -h, --help Show this help message\n"; - std::cout << "\nDefault: Tests up to 524K elements only (faster execution)\n"; - std::cout << "With --large: Tests up to 2M elements (slower but more comprehensive)\n"; -} - -int main(int argc, char* argv[]) { - bool include_large = false; - - // Parse command line arguments - for (int i = 1; i < argc; ++i) { - std::string arg = argv[i]; - if (arg == "-l" || arg == "--large") { - include_large = true; - } else if (arg == "-h" || arg == "--help") { - printUsage(argv[0]); - return 0; - } else { - std::cerr << "Unknown option: " << arg << "\n"; - printUsage(argv[0]); - return 1; - } - } - - try { - ParallelThresholdBenchmark benchmark(include_large); - - // Display test configuration - std::cout << "\n=== Test Configuration ===\n"; - std::cout << "Large dataset tests (1M-2M elements): " - << (include_large ? 
"ENABLED" : "DISABLED") << "\n"; - if (!include_large) { - std::cout << "To enable large tests, use: " << argv[0] << " --large\n"; - } - std::cout << "\n"; - - benchmark.runAllBenchmarks(); - return 0; - } catch (const std::exception& e) { - std::cerr << "Benchmark failed: " << e.what() << std::endl; - return 1; - } -} diff --git a/tools/performance_dispatcher_tool.cpp b/tools/performance_dispatcher_tool.cpp deleted file mode 100644 index 88af8b5..0000000 --- a/tools/performance_dispatcher_tool.cpp +++ /dev/null @@ -1,351 +0,0 @@ -/** - * @file performance_dispatcher_tool.cpp - * @brief Interactive tool to test and analyze the PerformanceDispatcher system - * - * This tool demonstrates the Phase 3 performance optimization framework including: - * - SystemCapabilities detection and benchmarking - * - PerformanceDispatcher strategy selection - * - PerformanceHistory learning and adaptation - * - Real-time threshold optimization - */ - -// Use consolidated tool utilities header which includes libstats.h -#include "tool_utils.h" - -// Additional standard library includes for performance analysis -#include "libstats/core/performance_dispatcher.h" -#include "libstats/core/performance_history.h" - -#include // for timing operations -#include // for size_t -#include // for std::cout -#include // for std::map -#include // for std::mt19937 -#include // for std::ostringstream -#include // for std::string, to_string -#include // for std::vector - -using namespace stats::detail; -using namespace std::chrono; - -// Tool-specific simulation constants -namespace { -constexpr int DEMO_SEED = 42; -constexpr double SIMULATION_NOISE_MIN = 0.9; -constexpr double SIMULATION_NOISE_MAX = 1.1; - -// Realistic performance simulation parameters (matching threshold_learning_demo) -namespace timing_simulation { -// Performance scaling factors for different strategies -constexpr double SCALAR_PERFORMANCE_FACTOR = 10.0; -constexpr double SIMD_PERFORMANCE_FACTOR = 3.0; -constexpr double 
PARALLEL_PERFORMANCE_FACTOR = 2.0; - -// Strategy overhead constants -constexpr uint64_t SIMD_SMALL_OVERHEAD = 500; // Additional time for small SIMD operations -constexpr uint64_t PARALLEL_BASE_OVERHEAD = 8000; // Base threading overhead - -// Size thresholds for overhead application -constexpr size_t SIMD_OVERHEAD_THRESHOLD = 10000; -} // namespace timing_simulation - -namespace batch_sizes { -// Batch sizes reserved for future interactive testing features -[[maybe_unused]] constexpr size_t SMALL_BATCH = 50; -[[maybe_unused]] constexpr size_t MEDIUM_BATCH = 1000; -[[maybe_unused]] constexpr size_t LARGE_BATCH = 10000; -[[maybe_unused]] constexpr size_t OTHER_DIST_BATCH = 100; -[[maybe_unused]] constexpr size_t OTHER_DIST_MEDIUM_BATCH = 1000; -[[maybe_unused]] constexpr size_t OTHER_DIST_LARGE_BATCH = 10000; -} // namespace batch_sizes - -// Sample counts for simulation - reserved for future use -[[maybe_unused]] constexpr int SAMPLES_PER_STRATEGY = 20; -[[maybe_unused]] constexpr int OTHER_DIST_SAMPLES = 10; -} // namespace - -class PerformanceDispatcherTool { - private: - PerformanceDispatcher dispatcher_; - const SystemCapabilities& system_; - std::mt19937 rng_; - - public: - PerformanceDispatcherTool() : system_(SystemCapabilities::current()), rng_(DEMO_SEED) {} - - void run() { - using namespace stats::detail; - - // Display tool header with system information - stats::detail::detail::displayToolHeader( - "Performance Dispatcher Tool", - "Interactive analysis of performance optimization framework"); - - // Display major sections - stats::detail::detail::displaySystemCapabilities(); - demonstrateStrategySelection(); - demonstratePerformanceLearning(); - runInteractiveMode(); - - std::cout << "Performance dispatcher analysis completed successfully.\n"; - } - - private: - void demonstrateStrategySelection() { - using namespace stats::detail; - - stats::detail::detail::sectionHeader("Strategy Selection Demonstration"); - - // Test different batch sizes and show 
strategy selection - std::vector test_sizes = {10, 100, 1000, 10000, 100000, 1000000}; - std::vector distributions = { - DistributionType::UNIFORM, DistributionType::GAUSSIAN, - DistributionType::EXPONENTIAL, DistributionType::POISSON, - DistributionType::DISCRETE, DistributionType::GAMMA, - DistributionType::CHI_SQUARED, DistributionType::STUDENT_T, - DistributionType::BETA}; - - stats::detail::detail::ColumnFormatter formatter({12, 14, 15, 18}); - std::cout << formatter.formatRow( - {"Batch Size", "Distribution", "Complexity", "Selected Strategy"}) - << "\n"; - std::cout << formatter.getSeparator() << "\n"; - - for (auto size : test_sizes) { - for (auto dist : distributions) { - for (auto complexity : - {ComputationComplexity::SIMPLE, ComputationComplexity::COMPLEX}) { - auto strategy = - dispatcher_.selectOptimalStrategy(size, dist, complexity, system_); - - std::cout << formatter.formatRow( - {std::to_string(size), - stats::detail::detail::distributionTypeToString(dist), - stats::detail::detail::complexityToString(complexity), - stats::detail::detail::strategyToString(strategy)}) - << "\n"; - } - } - } - std::cout << "\n"; - } - - void demonstratePerformanceLearning() { - using namespace stats::detail; - - stats::detail::detail::sectionHeader("Performance Learning Demonstration"); - - auto& history = PerformanceDispatcher::getPerformanceHistory(); - history.clearHistory(); // Start fresh for demonstration - - std::cout << "Simulating performance data collection...\n\n"; - - // Simulate collecting performance data over time - simulatePerformanceData(history); - - std::cout << "Total recorded executions: " << history.getTotalExecutions() << "\n\n"; - - // Show learned thresholds - stats::detail::detail::subsectionHeader("Learned Optimal Thresholds"); - - stats::detail::detail::ColumnFormatter threshold_formatter({15, 20, 20}); - std::cout << threshold_formatter.formatRow( - {"Distribution", "SIMD Threshold", "Parallel Threshold"}) - << "\n"; - std::cout << 
threshold_formatter.getSeparator() << "\n"; - - for (auto dist : - {DistributionType::GAUSSIAN, DistributionType::EXPONENTIAL, DistributionType::UNIFORM, - DistributionType::DISCRETE, DistributionType::POISSON, DistributionType::GAMMA, - DistributionType::CHI_SQUARED, DistributionType::STUDENT_T, DistributionType::BETA}) { - auto thresholds = history.learnOptimalThresholds(dist); - if (thresholds.has_value()) { - std::cout << threshold_formatter.formatRow( - {stats::detail::detail::distributionTypeToString(dist), - std::to_string(thresholds->first), - std::to_string(thresholds->second)}) - << "\n"; - } else { - std::cout << threshold_formatter.formatRow( - {stats::detail::detail::distributionTypeToString(dist), - "Insufficient data", "Insufficient data"}) - << "\n"; - } - } - - // Show strategy recommendations - stats::detail::detail::subsectionHeader("Strategy Recommendations (with confidence)"); - - stats::detail::detail::ColumnFormatter rec_formatter({12, 15, 22, 12}); - std::cout << rec_formatter.formatRow( - {"Batch Size", "Distribution", "Recommended Strategy", "Confidence"}) - << "\n"; - std::cout << rec_formatter.getSeparator() << "\n"; - - std::vector test_sizes = {100, 1000, 10000}; - std::vector rec_distributions = { - DistributionType::GAUSSIAN, DistributionType::EXPONENTIAL, DistributionType::UNIFORM, - DistributionType::DISCRETE, DistributionType::POISSON, DistributionType::GAMMA, - DistributionType::CHI_SQUARED, DistributionType::STUDENT_T, DistributionType::BETA}; - - for (auto size : test_sizes) { - for (auto dist : rec_distributions) { - auto recommendation = history.getBestStrategy(dist, size); - std::string confidence_str = - stats::detail::detail::confidenceToString(recommendation.confidence_score); - - std::cout << rec_formatter.formatRow( - {std::to_string(size), - stats::detail::detail::distributionTypeToString(dist), - stats::detail::detail::strategyToDisplayString( - recommendation.recommended_strategy), - confidence_str}) - << "\n"; - } - 
} - std::cout << "\n"; - } - - void simulatePerformanceData(PerformanceHistory& history) { - // Simulate realistic performance patterns using the same modeling as - // threshold_learning_demo - std::uniform_real_distribution noise(SIMULATION_NOISE_MIN, SIMULATION_NOISE_MAX); - - // Performance complexity factors for different distributions - std::map complexity_factors = { - {DistributionType::UNIFORM, 1.0}, // Simple - just random scaling - {DistributionType::DISCRETE, 1.5}, // Simple integer operations - {DistributionType::EXPONENTIAL, 2.5}, // Moderate - requires exp/log - {DistributionType::GAUSSIAN, 3.0}, // Moderate - Box-Muller transform - {DistributionType::POISSON, 4.0}, // Complex - iterative algorithms - {DistributionType::GAMMA, 5.0}, // Most complex - special functions - {DistributionType::CHI_SQUARED, 5.0}, // Delegates to Gamma - same complexity - {DistributionType::STUDENT_T, 3.2}, // Moderate - log-space continuous - {DistributionType::BETA, 3.4} // Moderate - bounded log-space continuous - }; - - // Distribution-specific efficiency characteristics - std::map> efficiency_characteristics = { - {DistributionType::UNIFORM, - {0.40, 0.25}}, // Good SIMD/Parallel efficiency - simple ops - {DistributionType::DISCRETE, {0.35, 0.22}}, // Decent efficiency - {DistributionType::EXPONENTIAL, {0.28, 0.18}}, // Moderate efficiency - transcendental - {DistributionType::GAUSSIAN, {0.25, 0.15}}, // Lower efficiency - complex transform - {DistributionType::POISSON, {0.22, 0.12}}, // Poor efficiency - iterative - {DistributionType::GAMMA, {0.20, 0.10}}, // Worst efficiency - special functions - {DistributionType::CHI_SQUARED, {0.20, 0.10}}, // Delegates to Gamma; same efficiency - {DistributionType::STUDENT_T, {0.24, 0.15}}, // Moderate efficiency - {DistributionType::BETA, {0.23, 0.14}} // Moderate efficiency with fixup - }; - - // More granular sizes around potential crossover points for better threshold learning - std::vector sizes = {10, 25, 50, 75, 100, 150, 
200, - 300, 500, 750, 1000, 1500, 2000, 3000, - 5000, 7500, 10000, 15000, 25000, 50000}; - - // All distribution types to simulate - std::vector distributions = { - DistributionType::UNIFORM, DistributionType::GAUSSIAN, - DistributionType::EXPONENTIAL, DistributionType::DISCRETE, - DistributionType::POISSON, DistributionType::GAMMA, - DistributionType::CHI_SQUARED, DistributionType::STUDENT_T, - DistributionType::BETA}; - - for (auto dist_type : distributions) { - double complexity = complexity_factors[dist_type]; - auto [simd_efficiency, parallel_efficiency] = efficiency_characteristics[dist_type]; - - for (auto size : sizes) { - // Record multiple samples per strategy to reach the reliable data threshold (>=5 - // samples) - for (int sample = 0; sample < SAMPLES_PER_STRATEGY / 4; - ++sample) { // Use fewer samples per size for broader coverage - // Scalar strategy - affected by computational complexity - auto scalar_time = static_cast( - static_cast(size) * timing_simulation::SCALAR_PERFORMANCE_FACTOR * - complexity * noise(rng_)); - history.recordPerformance(Strategy::SCALAR, dist_type, size, scalar_time); - - // SIMD strategy - use distribution-specific efficiency with overhead - auto simd_time = static_cast( - static_cast(size) * timing_simulation::SIMD_PERFORMANCE_FACTOR * - complexity * simd_efficiency * noise(rng_)); - if (size < timing_simulation::SIMD_OVERHEAD_THRESHOLD) { - simd_time += timing_simulation::SIMD_SMALL_OVERHEAD; // SIMD overhead for - // small sizes - } - history.recordPerformance(Strategy::VECTORIZED, dist_type, size, simd_time); - - // Parallel strategy - use distribution-specific efficiency with realistic - // overhead model - auto parallel_time = static_cast( - static_cast(size) * timing_simulation::PARALLEL_PERFORMANCE_FACTOR * - complexity * parallel_efficiency * noise(rng_)); - - // More realistic parallel overhead model - decreases with complexity and size - double complexity_factor = complexity; - double overhead_reduction = 
std::max( - 1.0, static_cast(size) / 1000.0); // Overhead reduces with size - - // Base overhead varies by complexity: - // - Simple distributions (Uniform): High overhead, needs ~10k+ elements - // - Complex distributions (Gamma): Lower overhead, benefits earlier - uint64_t base_overhead = - static_cast(timing_simulation::PARALLEL_BASE_OVERHEAD / - complexity_factor / overhead_reduction); - parallel_time += base_overhead; - history.recordPerformance(Strategy::PARALLEL, dist_type, size, parallel_time); - } - } - } - } - - void runInteractiveMode() { - using namespace stats::detail; - - stats::detail::detail::sectionHeader("Interactive Mode"); - - std::cout << "Enter batch sizes to test strategy selection (0 to exit):\n"; - - size_t batch_size; - while (std::cout << "> " && std::cin >> batch_size && batch_size != 0) { - stats::detail::detail::subsectionHeader("Testing batch size: " + - std::to_string(batch_size)); - - stats::detail::detail::ColumnFormatter formatter({15, 12, 18}); - std::cout << formatter.formatRow({"Distribution", "Complexity", "Selected Strategy"}) - << "\n"; - std::cout << formatter.getSeparator() << "\n"; - - for (auto dist : - {DistributionType::UNIFORM, DistributionType::GAUSSIAN, - DistributionType::EXPONENTIAL, DistributionType::DISCRETE, - DistributionType::POISSON, DistributionType::GAMMA, DistributionType::CHI_SQUARED, - DistributionType::STUDENT_T, DistributionType::BETA}) { - for (auto complexity : - {ComputationComplexity::SIMPLE, ComputationComplexity::COMPLEX}) { - auto strategy = - dispatcher_.selectOptimalStrategy(batch_size, dist, complexity, system_); - std::cout << formatter.formatRow( - {stats::detail::detail::distributionTypeToString(dist), - stats::detail::detail::complexityToString(complexity), - stats::detail::detail::strategyToDisplayString(strategy)}) - << "\n"; - } - } - std::cout << "\n"; - } - - std::cout << "Interactive mode ended.\n"; - } -}; - -int main() { - using namespace stats::detail; - - // Use the standard 
tool runner pattern - return stats::detail::detail::runTool("Performance Dispatcher Tool", []() { - PerformanceDispatcherTool tool; - tool.run(); - }); -} diff --git a/tools/strategy_profile.cpp b/tools/strategy_profile.cpp new file mode 100644 index 0000000..be0bf46 --- /dev/null +++ b/tools/strategy_profile.cpp @@ -0,0 +1,456 @@ +/** + * @file strategy_profile.cpp + * @brief Canonical forced-strategy profiler for dispatcher threshold tuning + * + * Profiles forced SCALAR, VECTORIZED, PARALLEL, and WORK_STEALING execution + * across all dispatcher-supported distributions, core batch operations, and a + * representative batch-size sweep. The output is intended to be the canonical + * raw dataset for tuning dispatcher thresholds. + */ + +#include "libstats/distributions/beta.h" +#include "libstats/distributions/chi_squared.h" +#include "libstats/distributions/discrete.h" +#include "libstats/distributions/exponential.h" +#include "libstats/distributions/gamma.h" +#include "libstats/distributions/gaussian.h" +#include "libstats/distributions/poisson.h" +#include "libstats/distributions/student_t.h" +#include "libstats/distributions/uniform.h" +#include "tool_utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace stats; +using namespace stats::detail; +using namespace std::chrono; + +namespace { + +constexpr int DEFAULT_RNG_SEED = 42; +constexpr int WARMUP_ITERATIONS = 3; +constexpr int TIMING_REPEATS = 7; +constexpr const char* RESULTS_CSV_FILENAME = "strategy_profile_results.csv"; + +enum class ProfileOperation { PDF, LOG_PDF, CDF }; + +struct StrategyProfileResult { + std::string distribution; + std::string operation; + std::size_t batch_size; + Strategy strategy; + double median_time_us; +}; + +double median_us(std::vector& timings) { + std::sort(timings.begin(), timings.end()); + return timings[timings.size() / 2]; +} + +std::string 
operation_to_string(ProfileOperation operation) { + switch (operation) { + case ProfileOperation::PDF: + return "PDF"; + case ProfileOperation::LOG_PDF: + return "LogPDF"; + case ProfileOperation::CDF: + return "CDF"; + default: + return "Unknown"; + } +} + +constexpr std::array OPERATIONS = { + ProfileOperation::PDF, ProfileOperation::LOG_PDF, ProfileOperation::CDF}; + +constexpr std::array STRATEGIES = {Strategy::SCALAR, Strategy::VECTORIZED, + Strategy::PARALLEL, Strategy::WORK_STEALING}; + +} // namespace + +class StrategyProfiler { + public: + explicit StrategyProfiler(bool include_large) : gen_(DEFAULT_RNG_SEED) { + initialize_batch_sizes(include_large); + } + + void run(const std::string& output_csv_path) { + stats::detail::detail::displayToolHeader( + "Strategy Profile", "Forced-strategy timing profiler for dispatcher threshold tuning"); + + std::cout << "Batch sizes:"; + for (auto size : batch_sizes_) { + std::cout << " " << size; + } + std::cout << "\n\n"; + + profile_all_distributions(); + print_summary(); + save_results(output_csv_path); + } + + private: + std::mt19937 gen_; + std::vector results_; + std::vector batch_sizes_; + + void initialize_batch_sizes(bool include_large) { + batch_sizes_ = {8, 16, 32, 64, 128, 256, 512, 1000, + 2000, 5000, 10000, 20000, 50000, 100000, 250000, 500000}; + + if (include_large) { + batch_sizes_.push_back(1000000); + batch_sizes_.push_back(2000000); + } + } + + void profile_all_distributions() { + profile_uniform_distribution(); + profile_gaussian_distribution(); + profile_exponential_distribution(); + profile_discrete_distribution(); + profile_poisson_distribution(); + profile_gamma_distribution(); + profile_student_t_distribution(); + profile_beta_distribution(); + profile_chi_squared_distribution(); + } + + template + void profile_distribution(const std::string& distribution_name, + const Distribution& distribution, Generator&& generator) { + stats::detail::detail::subsectionHeader(distribution_name + " Strategy 
Profile"); + + for (auto batch_size : batch_sizes_) { + std::cout << " Profiling batch size " << batch_size << "..." << std::flush; + + const auto input_values = generator(batch_size); + + for (auto operation : OPERATIONS) { + for (auto strategy : STRATEGIES) { + const double median_time_us = + benchmark_strategy(distribution, input_values, operation, strategy); + + results_.push_back({distribution_name, operation_to_string(operation), + batch_size, strategy, median_time_us}); + } + } + + std::cout << " βœ“\n"; + } + std::cout << "\n"; + } + + template + double benchmark_strategy(const Distribution& distribution, + const std::vector& input_values, ProfileOperation operation, + Strategy strategy) const { + std::vector output_values(input_values.size()); + std::span input_span(input_values); + std::span output_span(output_values); + + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + perform_operation(distribution, input_span, output_span, operation, strategy); + } + + std::vector timings_us; + timings_us.reserve(TIMING_REPEATS); + + for (int i = 0; i < TIMING_REPEATS; ++i) { + const auto start = high_resolution_clock::now(); + perform_operation(distribution, input_span, output_span, operation, strategy); + const auto end = high_resolution_clock::now(); + timings_us.push_back(duration(end - start).count()); + } + + return median_us(timings_us); + } + + template + void perform_operation(const Distribution& distribution, std::span input_values, + std::span output_values, ProfileOperation operation, + Strategy strategy) const { + switch (operation) { + case ProfileOperation::PDF: + distribution.getProbabilityWithStrategy(input_values, output_values, strategy); + break; + case ProfileOperation::LOG_PDF: + distribution.getLogProbabilityWithStrategy(input_values, output_values, strategy); + break; + case ProfileOperation::CDF: + distribution.getCumulativeProbabilityWithStrategy(input_values, output_values, + strategy); + break; + } + } + + void profile_uniform_distribution() 
{ + const auto uniform = stats::UniformDistribution::create(0.0, 1.0).value; + profile_distribution("Uniform", uniform, [this](std::size_t count) { + std::vector values(count); + std::uniform_real_distribution dist(-0.5, 1.5); + for (auto& value : values) { + value = dist(gen_); + } + return values; + }); + } + + void profile_gaussian_distribution() { + const auto gaussian = stats::GaussianDistribution::create(0.0, 1.0).value; + profile_distribution("Gaussian", gaussian, [](std::size_t count) { + std::vector values(count); + const double denominator = + static_cast(std::max(1, count > 0 ? count - 1 : 0)); + for (std::size_t i = 0; i < count; ++i) { + values[i] = -4.0 + 8.0 * static_cast(i) / denominator; + } + return values; + }); + } + + void profile_exponential_distribution() { + const auto exponential = stats::ExponentialDistribution::create(1.0).value; + profile_distribution("Exponential", exponential, [this](std::size_t count) { + std::vector values(count); + std::exponential_distribution dist(1.0); + for (auto& value : values) { + value = dist(gen_); + } + return values; + }); + } + + void profile_discrete_distribution() { + const auto discrete = stats::DiscreteDistribution::create(0, 10).value; + profile_distribution("Discrete", discrete, [this](std::size_t count) { + std::vector values(count); + std::uniform_int_distribution dist(0, 10); + for (auto& value : values) { + value = static_cast(dist(gen_)); + } + return values; + }); + } + + void profile_poisson_distribution() { + const auto poisson = stats::PoissonDistribution::create(3.5).value; + profile_distribution("Poisson", poisson, [this](std::size_t count) { + std::vector values(count); + std::poisson_distribution dist(3); + for (auto& value : values) { + value = static_cast(dist(gen_)); + } + return values; + }); + } + + void profile_gamma_distribution() { + const auto gamma = stats::GammaDistribution::create(2.0, 1.0).value; + profile_distribution("Gamma", gamma, [this](std::size_t count) { + 
std::vector values(count); + std::gamma_distribution dist(1.5, 2.0); + for (auto& value : values) { + value = dist(gen_); + } + return values; + }); + } + + void profile_student_t_distribution() { + const auto student_t = stats::StudentTDistribution::create(5.0).value; + profile_distribution("StudentT", student_t, [this](std::size_t count) { + std::vector values(count); + std::student_t_distribution dist(5.0); + for (auto& value : values) { + value = dist(gen_); + } + return values; + }); + } + + void profile_beta_distribution() { + const auto beta = stats::BetaDistribution::create(2.0, 5.0).value; + profile_distribution("Beta", beta, [this](std::size_t count) { + std::vector values(count); + std::uniform_real_distribution dist(-0.1, 1.1); + for (auto& value : values) { + value = dist(gen_); + } + return values; + }); + } + + void profile_chi_squared_distribution() { + const auto chi_squared = stats::ChiSquaredDistribution::create(4.0).value; + profile_distribution("ChiSquared", chi_squared, [this](std::size_t count) { + std::vector values(count); + std::chi_squared_distribution dist(4.0); + for (auto& value : values) { + value = dist(gen_); + } + return values; + }); + } + + void print_summary() const { + stats::detail::detail::sectionHeader("Best Strategy Summary"); + + using SummaryKey = std::tuple; + std::map> grouped_results; + for (const auto& result : results_) { + grouped_results[{result.distribution, result.operation, result.batch_size}].push_back( + &result); + } + + stats::detail::detail::ColumnFormatter formatter({14, 10, 10, 16, 14}); + std::cout << formatter.formatRow( + {"Distribution", "Operation", "Size", "Best Strategy", "Time (ΞΌs)"}) + << "\n"; + std::cout << formatter.getSeparator() << "\n"; + + for (const auto& [key, result_group] : grouped_results) { + const auto* best_result = *std::min_element( + result_group.begin(), result_group.end(), + [](const StrategyProfileResult* left, const StrategyProfileResult* right) { + return 
left->median_time_us < right->median_time_us; + }); + + std::cout << formatter.formatRow( + {std::get<0>(key), std::get<1>(key), std::to_string(std::get<2>(key)), + stats::detail::detail::strategyToDisplayString(best_result->strategy), + stats::detail::detail::formatDouble(best_result->median_time_us, 2)}) + << "\n"; + } + + std::cout << "\n"; + print_crossover_summary(grouped_results); + } + + void print_crossover_summary( + const std::map, + std::vector>& grouped_results) const { + stats::detail::detail::sectionHeader("Crossover Summary"); + + using GroupKey = std::pair; + std::map>> timings_by_group; + + for (const auto& [key, result_group] : grouped_results) { + const GroupKey group_key{std::get<0>(key), std::get<1>(key)}; + auto& size_timings = timings_by_group[group_key][std::get<2>(key)]; + for (const auto* result : result_group) { + size_timings[result->strategy] = result->median_time_us; + } + } + + stats::detail::detail::ColumnFormatter formatter({14, 10, 16, 16, 18}); + std::cout << formatter.formatRow( + {"Distribution", "Operation", "Sβ†’V", "Vβ†’P", "Pβ†’Work-Steal"}) + << "\n"; + std::cout << formatter.getSeparator() << "\n"; + + for (const auto& [group_key, size_map] : timings_by_group) { + const auto scalar_to_vectorized = + find_first_crossover(size_map, Strategy::SCALAR, Strategy::VECTORIZED); + const auto vectorized_to_parallel = + find_first_crossover(size_map, Strategy::VECTORIZED, Strategy::PARALLEL); + const auto parallel_to_work_stealing = + find_first_crossover(size_map, Strategy::PARALLEL, Strategy::WORK_STEALING); + + std::cout << formatter.formatRow({group_key.first, group_key.second, + crossover_to_string(scalar_to_vectorized), + crossover_to_string(vectorized_to_parallel), + crossover_to_string(parallel_to_work_stealing)}) + << "\n"; + } + + std::cout << "\n"; + } + + static std::optional find_first_crossover( + const std::map>& size_map, Strategy slower_strategy, + Strategy faster_strategy) { + for (const auto& [batch_size, timings] 
: size_map) { + const auto slower_it = timings.find(slower_strategy); + const auto faster_it = timings.find(faster_strategy); + if (slower_it == timings.end() || faster_it == timings.end()) { + continue; + } + if (faster_it->second < slower_it->second) { + return batch_size; + } + } + return std::nullopt; + } + + static std::string crossover_to_string(const std::optional& crossover) { + return crossover.has_value() ? std::to_string(*crossover) : "never"; + } + + void save_results(const std::string& output_csv_path) const { + std::ofstream csv_file(output_csv_path); + csv_file << "Distribution,Operation,BatchSize,Strategy,MedianTime_us\n"; + csv_file << std::fixed << std::setprecision(6); + + for (const auto& result : results_) { + csv_file << result.distribution << "," << result.operation << "," << result.batch_size + << "," << stats::detail::detail::strategyToString(result.strategy) << "," + << result.median_time_us << "\n"; + } + + std::cout << "Results saved to " << output_csv_path << "\n"; + } +}; + +void print_usage(const char* program_name) { + std::cout << "Usage: " << program_name << " [OPTIONS]\n"; + std::cout << "\nOptions:\n"; + std::cout << " -l, --large Include 1M and 2M batch sizes\n"; + std::cout << " -o, --output-csv PATH Write CSV results to PATH\n"; + std::cout << " -h, --help Show this help message\n"; + std::cout << "\nDefault output file: " << RESULTS_CSV_FILENAME << "\n"; +} + +int main(int argc, char* argv[]) { + bool include_large = false; + std::string output_csv_path = RESULTS_CSV_FILENAME; + + for (int i = 1; i < argc; ++i) { + const std::string arg = argv[i]; + if (arg == "-l" || arg == "--large") { + include_large = true; + } else if (arg == "-o" || arg == "--output-csv") { + if (i + 1 >= argc) { + std::cerr << "Missing value for " << arg << "\n"; + return 1; + } + output_csv_path = argv[++i]; + } else if (arg == "-h" || arg == "--help") { + print_usage(argv[0]); + return 0; + } else { + std::cerr << "Unknown option: " << arg << "\n"; + 
print_usage(argv[0]); + return 1; + } + } + + return stats::detail::detail::runTool("Strategy Profile", [include_large, &output_csv_path]() { + StrategyProfiler profiler(include_large); + profiler.run(output_csv_path); + }); +} diff --git a/tools/system_inspector.cpp b/tools/system_inspector.cpp index bc3966e..5d53549 100644 --- a/tools/system_inspector.cpp +++ b/tools/system_inspector.cpp @@ -24,6 +24,7 @@ #include #include // for std::vector (keep standard portable header) // Use consolidated header for complete library functionality +#include "libstats/core/dispatch_thresholds.h" #include "libstats/core/performance_dispatcher.h" // for SystemCapabilities, DistributionType #include "libstats/platform/platform_constants.h" // for platform constants #include "libstats/platform/simd.h" // for VectorOps @@ -35,7 +36,6 @@ using namespace std::chrono; namespace { constexpr size_t BASELINE_TEST_SIZE = 1000000; constexpr int BASELINE_ITERATIONS = 10; -constexpr int MAX_COMPLEXITY_DEMOS = 1; // Only show first complexity for brevity } // namespace // Mode enumeration @@ -460,7 +460,7 @@ class SystemInspector { std::cout << "Example Strategy Selections:\n"; stats::detail::detail::ColumnFormatter formatter({20, 15, 15, 20}); - std::cout << formatter.formatRow({"Batch Size", "Distribution", "Complexity", "Strategy"}) + std::cout << formatter.formatRow({"Batch Size", "Distribution", "Operation", "Strategy"}) << "\n"; std::cout << formatter.getSeparator() << "\n"; @@ -470,30 +470,18 @@ class SystemInspector { stats::detail::DistributionType::UNIFORM, stats::detail::DistributionType::GAUSSIAN, stats::detail::DistributionType::EXPONENTIAL, stats::detail::DistributionType::POISSON, stats::detail::DistributionType::DISCRETE}; - std::vector complexities = { - stats::detail::ComputationComplexity::SIMPLE, - stats::detail::ComputationComplexity::MODERATE, - stats::detail::ComputationComplexity::COMPLEX}; for (auto size : test_sizes) { for (auto dist : dist_types) { - int 
complexity_count = 0; - for (auto complexity : complexities) { - stats::detail::PerformanceDispatcher dispatcher; - auto strategy = - dispatcher.selectOptimalStrategy(size, dist, complexity, capabilities); - - std::cout << formatter.formatRow( - {std::to_string(size), - stats::detail::detail::distributionTypeToString(dist), - stats::detail::detail::complexityToString(complexity), - stats::detail::detail::strategyToDisplayString(strategy)}) - << "\n"; - - // Only show first complexity for brevity - if (++complexity_count >= MAX_COMPLEXITY_DEMOS) - break; - } + stats::detail::PerformanceDispatcher dispatcher; + auto strategy = dispatcher.selectStrategy( + size, dist, stats::detail::OperationType::PDF, capabilities); + + std::cout << formatter.formatRow( + {std::to_string(size), + stats::detail::detail::distributionTypeToString(dist), "PDF", + stats::detail::detail::strategyToDisplayString(strategy)}) + << "\n"; } } std::cout << "\n";