diff --git a/.clang-tidy b/.clang-tidy index 6677e5e..14e9910 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -211,7 +211,7 @@ CheckOptions: value: '' - key: readability-identifier-naming.NamespaceSuffix value: '' - + # Performance and modernization options - key: modernize-use-auto.MinTypeNameLength value: '5' @@ -223,13 +223,13 @@ CheckOptions: value: 'true' - key: performance-unnecessary-value-param.IncludeStyle value: 'llvm' - + # Certificate and security options - key: cert-dcl16-c.NewSuffixes value: 'L;LL;LU;LLU' - key: cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField value: 'false' - + # Core guidelines options - key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor value: 'true' diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 7ed0b29..b6b0143 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -4,3 +4,6 @@ # style: bulk reformat all source files with clang-format (2026-04-23) 7221753 + +# style: apply clang-format 19.1.7 to all source files (2026-05-03) +662c172 diff --git a/.gitattributes b/.gitattributes index a6df10c..50fd8dc 100644 --- a/.gitattributes +++ b/.gitattributes @@ -30,10 +30,10 @@ CMakeLists.txt text eol=lf # Scripts — always LF so they run correctly in bash/sh *.sh text eol=lf -# Windows-only scripts stay CRLF +# Windows batch/cmd scripts stay CRLF; PowerShell handles LF on all platforms *.bat text eol=crlf *.cmd text eol=crlf -*.ps1 text eol=crlf +*.ps1 text eol=lf # XML (HMM model files) *.xml text eol=lf diff --git a/CHANGELOG.md b/CHANGELOG.md index 7404884..46b4c4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,54 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [3.3.0] - 2026-05-03 + +SIMD performance phase: explicit vector kernels for transcendental +operations and two additional Tier-2 distributions. 37/37 tests pass. 
+ +### Added + +- **SIMD transcendental kernels** (`src/performance/transcendental_kernels.cpp`): + five inner-loop kernels used by `ForwardBackwardCalculator` (FB max-reduce + recurrence) and `BaumWelchTrainer` (dense-xi accumulation) now have + AVX-512 / AVX / SSE2 / NEON backends. The vector `exp` helper uses a + 13-term Horner polynomial with Cephes `ln2` range reduction and branch-free + underflow masking at `MIN_LOG_PROBABILITY`. AVX path stays AVX-1 compatible + for Ivy Bridge / Catalina. Benchmarks on Zen 4 / AVX-512 (T=1000): + FB max-reduce 5.7× faster at N=32; BW xi accumulation 1.03–1.15×. +- **LogNormal and Pareto promoted to Tier 2** (`src/distributions/`): explicit + SIMD `getBatchLogProbabilities` via a vector `log` helper (IEEE-754 exponent + extraction, 7-term Horner, split-LN2 reconstruction, ≤5 ULP). +- **`simd_kernels_internal.h`**: single source of truth for vector exp/log + primitives shared by all Tier-2 distribution TUs and the transcendental + kernels TU. +- **FB recurrence crossover retuned** (`fb_recurrence_policy.h`): threshold + moved from N≥5 to N≥4 on x86 after profiling post-SIMD (MaxReduce is 1.7× + faster at N=4). +- **New tests** (37 total, up from 33): + - `test_simd_platform`: compile-time ISA hierarchy invariants (`#error`) and + runtime contracts on `simd_platform.h` utility functions. + - `test_transcendental_kernels`: SIMD vs `std::exp` parity for all five + kernels across 11 sizes; 1e-12 rel / 1e-15 abs tolerance. + - `test_fb_mode_parity`: Pairwise vs MaxReduce FB log-likelihood agreement. + - `test_bw_parity`: BW determinism (bit-exact) and EM monotonicity. +- **New tools**: `bw_hotspot` (BW E-step phase breakdown), `hotspot_breakdown` + (FB phase-level timings), `fb_crossover_sweep` (Pairwise vs MaxReduce + timing across N), `fb_contour_sweep` (2-D N×T timing heatmap data). 
+ +### Changed + +- `fb_recurrence_policy.h` moved from `include/libhmm/calculators/` to + `include/libhmm/performance/` (cross-cutting primitive, not calculator-specific). +- Test group labels in `tests/CMakeLists.txt` changed from numeric Level N + notation to semantic names; Performance Primitives group reordered before + Distributions to reflect dependency order. +- `performance/PERFORMANCE_ARCHITECTURE.md` updated: Tier-2 coverage, + delivered recurrence-kernel SIMD, corrected `LIBHMM_SIMD_SOURCES` list. +- `*.ps1` line-ending rule in `.gitattributes` changed from `eol=crlf` to + `eol=lf` (PowerShell handles LF on all platforms; avoids CI pre-commit + mixed-line-ending failures). + ## [3.2.1] - 2026-05-02 CI hygiene fix; no functional changes. diff --git a/CMakeLists.txt b/CMakeLists.txt index 92e7bed..ded76de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,7 +58,7 @@ if(APPLE AND NOT CMAKE_CXX_COMPILER) endif() project(libhmm - VERSION 3.2.1 + VERSION 3.3.0 DESCRIPTION "Modern C++20 Hidden Markov Model Library" LANGUAGES CXX ) @@ -479,6 +479,15 @@ set(LIBHMM_SIMD_SOURCES src/distributions/weibull_distribution.cpp ) +# Additional TUs that include simd_kernels_internal.h or transcendental_kernels.h +# and therefore need LIBHMM_BEST_SIMD_FLAGS to activate the #if LIBHMM_HAS_* cascade. +# (log_normal and pareto are already in LIBHMM_SIMD_SOURCES above.) 
+list(APPEND LIBHMM_SIMD_SOURCES + src/performance/transcendental_kernels.cpp + src/calculators/forward_backward_calculator.cpp + src/training/baum_welch_trainer.cpp +) + if(LIBHMM_BEST_SIMD_FLAGS) foreach(simd_src ${LIBHMM_SIMD_SOURCES}) set_source_files_properties( @@ -499,6 +508,7 @@ set(LIBHMM_SOURCES src/common/common.cpp src/common/string_tokenizer.cpp src/common/numerical_stability.cpp + src/performance/transcendental_kernels.cpp src/distributions/distribution_base.cpp src/distributions/discrete_distribution.cpp src/distributions/gaussian_distribution.cpp diff --git a/WARP.md b/WARP.md index e78bc16..ac46e41 100644 --- a/WARP.md +++ b/WARP.md @@ -6,8 +6,8 @@ This file provides guidance to Warp (warp.dev) when working in this repository. ## Current Status -**Version**: v3.2.1 — latest tag and published release on `main`. -**Tests**: 33/33 passing on all four CI platforms (Linux/GCC, Linux/Clang, macOS/AppleClang, Windows/MSVC). +**Version**: v3.3.0 — latest tag and published release on `main`. +**Tests**: 37/37 passing on all four CI platforms (Linux/GCC, Linux/Clang, macOS/AppleClang, Windows/MSVC). **Active phase**: Complete. All phases through Post-Phase 5 (CI/tooling, benchmarks) are done. --- @@ -36,7 +36,7 @@ include/libhmm/ │ └── segmental_kmeans_trainer.h # Discrete-state initialisation └── io/ # XML I/O src/ # Implementation (mirrors include/) -tests/ # GTest suite — levels 0–7 (see tests/CMakeLists.txt) +tests/ # GTest suite — semantic groups (see tests/CMakeLists.txt) examples/ # 13 usage demonstrations (all canonical API) tools/ # Standalone diagnostic/benchmarking executables benchmarks/ # Comparative benchmarks @@ -70,7 +70,7 @@ Both are always produced regardless of `BUILD_SHARED_LIBS`. Tests link against 2. **Two canonical calculators** — `ForwardBackwardCalculator` (log-space, precomputed log-trans) and `ViterbiCalculator`. Both call `getBatchLogProbabilities()` per state per time step. -3. 
**Compile-time SIMD dispatch** — source-distributed; each machine builds for its own CPU. GCC/Clang: `-march=native`. MSVC: `check_cxx_source_runs`-verified `/arch:AVX512`/`AVX2`/`AVX`. All 15 distribution TUs in `LIBHMM_SIMD_SOURCES`. Tier 2 explicit intrinsics: Gaussian + Exponential via `detail::` free functions (extractable to separate TU for future runtime dispatch). +3. **Compile-time SIMD dispatch** — source-distributed; each machine builds for its own CPU. GCC/Clang: `-march=native`. MSVC: `check_cxx_source_runs`-verified `/arch:AVX512`/`AVX2`/`AVX`. All 15 distribution TUs plus transcendental kernels, FB calculator, and BW trainer in `LIBHMM_SIMD_SOURCES`. Tier 2 explicit intrinsics: Gaussian, Exponential, LogNormal, Pareto via `detail::` free functions; recurrence kernels (FB max-reduce, BW xi) via `TranscendentalKernels` in `src/performance/`. Shared vector exp/log helpers in `include/libhmm/performance/simd_kernels_internal.h`. 4. **Thread-safe cache** — `std::atomic cacheValid_` in `DistributionBase`. Avoids mutex; safe for concurrent const reads if the library is invoked from multiple threads (calculators and trainers themselves run single-threaded — see `performance/PERFORMANCE_ARCHITECTURE.md`). @@ -210,6 +210,7 @@ CRLF: `.gitattributes` enforces LF. CRLF warnings on `git add` are normal. - Always run `./scripts/configure_catalina.sh build` for the first configure. - The script sanitizes toolchain-related environment variables, pins AppleClang via `xcrun`, and sets `CMAKE_OSX_DEPLOYMENT_TARGET=10.15`. +- **Build type:** the script defaults to `Release` (`-O3`). This is required for correctness: at `-O0`, AppleClang inserts `VZEROUPPER` in the prologue of large-frame AVX functions before saving the `__m256d` argument, silently zeroing `x[2]` and `x[3]`. For debuggable builds use `RelWithDebInfo` (`-O2 -g`) — SIMD helpers inline at `-O2` so the issue cannot occur: `./scripts/configure_catalina.sh build -DCMAKE_BUILD_TYPE=RelWithDebInfo`. 
Pure `Debug` (`-O0`) is unsafe for any code path that passes `__m256d` through a real call boundary. - Do not point Catalina builds at Homebrew LLVM/libc++ (`/usr/local/opt/llvm`, `Cellar/llvm*`, libc++ include paths). The root `CMakeLists.txt` guard fails configure when those hints are detected. - Use `-DLIBHMM_ALLOW_UNSUPPORTED_CATALINA_HOMEBREW_LIBCXX=ON` only for explicit troubleshooting; runtime stability is not guaranteed. @@ -217,17 +218,18 @@ CRLF: `.gitattributes` enforces LF. CRLF warnings on `git add` are normal. ## Test Suite Structure -Tests in `tests/CMakeLists.txt` use `add_hmm_test()` helper organized into 8 levels: +Tests in `tests/CMakeLists.txt` use `add_hmm_test()` helper organized into semantic groups: -| Level | Content | +| Group | Content | |---|---| -| 1 | Math & Numerics | -| 2 | Linear Algebra | -| 3 | Distributions (all 15 + traits/header/type_safety) | -| 4 | Core HMM | -| 5 | Calculators (canonical + continuous + edge cases) | -| 6 | Trainers (canonical + training + edge cases + BW convergence) | -| 7 | IO + Integration (stream IO + end-to-end casino) | +| Platform Capabilities | simd_platform (compile-time ISA hierarchy invariants + runtime contracts) | +| Math & Numerics | constants, numerical stability, common types | +| Performance Primitives | transcendental kernels (SIMD parity vs `std::exp`) | +| Distributions | all 15 + traits/header/type_safety | +| Core HMM | HMM construction and state management | +| Calculators | canonical + continuous + edge cases + FB mode parity | +| Trainers | canonical + training + edge cases + BW convergence + BW parity | +| IO & Integration | stream IO + end-to-end casino | Custom targets: `check` (correctness, parallel), `check_timing` (serial). Note: named `check` not `run_tests` to avoid cmake's built-in `RUN_TESTS` on Windows. 
diff --git a/benchmark-analysis/fb_contour_sweep_adaptive_static_v1.csv b/benchmark-analysis/fb_contour_sweep_adaptive_static_v1.csv new file mode 100644 index 0000000..52de679 --- /dev/null +++ b/benchmark-analysis/fb_contour_sweep_adaptive_static_v1.csv @@ -0,0 +1,23 @@ +mode,n,t,runs,warmup,recurrence_work,emission_work,transition_ms,obs_copy_ms,emission_ms,alloc_ms,forward_ms,backward_ms,reduction_ms,total_ms +adaptive_static_v1,2,1000,5,2,3996,2000,0.0002,0.0006,0.0006,0.0005,0.0555,0.053,0.0001,0.1109 +adaptive_static_v1,2,10000,5,2,39996,20000,0.0007,0.0071,0.0045,0.043,0.3578,0.3551,0,0.7707 +adaptive_static_v1,2,100000,5,2,399996,200000,0.0026,0.1488,0.2834,0.508,3.8598,3.6578,0.0003,9.0083 +adaptive_static_v1,2,1000000,5,2,3999996,2000000,0.0031,2.0429,3.4685,3.7612,36.9812,36.2041,0.0002,82.1594 +adaptive_static_v1,4,1000,5,2,15984,4000,0.001,0.0007,0.0106,0.0154,0.2256,0.2209,0.0001,0.4701 +adaptive_static_v1,4,10000,5,2,159984,40000,0.0018,0.0104,0.014,0.0139,1.4938,1.5459,0.0005,3.0504 +adaptive_static_v1,4,100000,5,2,1599984,400000,0.0036,0.1141,0.58,0.9126,14.5554,14.3194,0.0007,30.568 +adaptive_static_v1,8,1000,5,2,63936,8000,0.0012,0.0024,0.0157,0.0294,0.3975,0.3908,0.0002,0.8399 +adaptive_static_v1,8,5000,5,2,319936,40000,0.0006,0.0022,0.007,0.0059,1.9524,1.9707,0.0002,3.9503 +adaptive_static_v1,8,10000,5,2,639936,80000,0.002,0.0087,0.019,0.2104,3.9859,4.0981,0.0006,8.434 +adaptive_static_v1,16,1000,5,2,255744,16000,0.0024,0.0036,0.0276,0.0427,1.4421,1.4556,0.0005,2.9893 +adaptive_static_v1,16,2000,5,2,511744,32000,0.0015,0.0017,0.0057,0.0056,2.8761,2.9113,0.0005,5.7923 +adaptive_static_v1,16,5000,5,2,1279744,80000,0.0029,0.005,0.0262,0.1948,7.2773,7.3363,0.0007,14.8745 +adaptive_static_v1,32,500,5,2,510976,16000,0.0102,0.0007,0.0276,0.0519,4.0494,4.2193,0.0008,8.3801 +adaptive_static_v1,32,1000,5,2,1022976,32000,0.0134,0.0031,0.044,0.0831,8.221,8.6986,0.001,17.1867 
+adaptive_static_v1,32,2000,5,2,2046976,64000,0.0158,0.0056,0.0887,0.1513,16.2641,16.9673,0.001,33.4698 +adaptive_static_v1,64,200,5,2,815104,12800,0.0268,0.0006,0.0238,0.0412,8.7132,8.7867,0.0017,17.5748 +adaptive_static_v1,64,500,5,2,2043904,32000,0.0417,0.0027,0.0657,0.1169,36.6388,36.9101,0.0019,74.5554 +adaptive_static_v1,64,1000,5,2,4091904,64000,0.0355,0.0045,0.1179,0.1798,45.2402,47.7388,0.0015,93.3553 +adaptive_static_v1,128,100,5,2,1622016,12800,0.0678,0.0005,0.0268,0.0428,21.5884,25.9046,0.0023,50.4003 +adaptive_static_v1,128,250,5,2,4079616,32000,0.0685,0.001,0.0247,0.0602,54.7442,59.1274,0.0025,111.21 +adaptive_static_v1,128,500,5,2,8175616,64000,0.0821,0.0013,0.0333,0.032,115.191,122.896,0.0026,231.18 diff --git a/benchmark-analysis/fb_contour_sweep_max_reduce.csv b/benchmark-analysis/fb_contour_sweep_max_reduce.csv new file mode 100644 index 0000000..716e04d --- /dev/null +++ b/benchmark-analysis/fb_contour_sweep_max_reduce.csv @@ -0,0 +1,23 @@ +mode,n,t,runs,warmup,recurrence_work,emission_work,transition_ms,obs_copy_ms,emission_ms,alloc_ms,forward_ms,backward_ms,reduction_ms,total_ms +max_reduce,2,1000,5,2,3996,2000,0.0001,0.0003,0.0004,0.0003,0.0541,0.0557,0,0.1112 +max_reduce,2,10000,5,2,39996,20000,0.0003,0.0033,0.0036,0.0029,0.5451,0.5607,0.0001,1.1176 +max_reduce,2,100000,5,2,399996,200000,0.0024,0.1024,0.292,0.5074,5.9164,5.8783,0.0006,12.7317 +max_reduce,2,1000000,5,2,3999996,2000000,0.0019,1.5644,3.6518,4.0798,61.6187,65.8737,0.0008,138.632 +max_reduce,4,1000,5,2,15984,4000,0.0002,0.0003,0.0072,0.0148,0.1365,0.1401,0.0001,0.3002 +max_reduce,4,10000,5,2,159984,40000,0.0005,0.0036,0.0072,0.0061,1.3655,1.4421,0.0002,2.8389 +max_reduce,4,100000,5,2,1599984,400000,0.0039,0.1803,0.544,0.8251,14.3255,14.7261,0.0007,30.5996 +max_reduce,8,1000,5,2,63936,8000,0.0005,0.0024,0.015,0.0308,0.3906,0.4051,0.0002,0.8435 +max_reduce,8,5000,5,2,319936,40000,0.0015,0.0127,0.0492,0.094,1.9496,2.0359,0.0003,4.1927 
+max_reduce,8,10000,5,2,639936,80000,0.0024,0.0097,0.0191,0.1943,3.9162,4.15,0.0005,8.2942 +max_reduce,16,1000,5,2,255744,16000,0.0012,0.0027,0.0325,0.045,1.4214,1.4575,0.0004,2.963 +max_reduce,16,2000,5,2,511744,32000,0.0018,0.0063,0.0454,0.0944,2.8557,2.9186,0.0006,6.0147 +max_reduce,16,5000,5,2,1279744,80000,0.0036,0.0147,0.1311,0.186,7.0892,7.4272,0.0006,15.147 +max_reduce,32,500,5,2,510976,16000,0.0045,0.0023,0.0257,0.0451,4.0341,4.1987,0.0008,8.3059 +max_reduce,32,1000,5,2,1022976,32000,0.0064,0.0067,0.0439,0.0748,8.1545,8.4885,0.0008,16.8164 +max_reduce,32,2000,5,2,2046976,64000,0.0069,0.0067,0.0793,0.151,16.8425,17.4785,0.0013,35.1039 +max_reduce,64,200,5,2,815104,12800,0.0297,0.0025,0.0322,0.0434,9.1157,9.1911,0.0018,18.3756 +max_reduce,64,500,5,2,2043904,32000,0.0483,0.0029,0.0804,0.1053,27.1055,28.3244,0.0024,55.0267 +max_reduce,64,1000,5,2,4091904,64000,0.0318,0.0042,0.1039,0.1689,62.8022,63.4727,0.0016,120.995 +max_reduce,128,100,5,2,1622016,12800,0.071,0.0007,0.0337,0.0426,21.6621,21.5886,0.0024,43.8249 +max_reduce,128,250,5,2,4079616,32000,0.0696,0.0008,0.0513,0.0852,77.0032,61.7649,0.0023,137.852 +max_reduce,128,500,5,2,8175616,64000,0.0756,0.0031,0.085,0.1356,128.719,119.591,0.0025,243.712 diff --git a/benchmark-analysis/fb_contour_sweep_pairwise.csv b/benchmark-analysis/fb_contour_sweep_pairwise.csv new file mode 100644 index 0000000..bbbac66 --- /dev/null +++ b/benchmark-analysis/fb_contour_sweep_pairwise.csv @@ -0,0 +1,23 @@ +mode,n,t,runs,warmup,recurrence_work,emission_work,transition_ms,obs_copy_ms,emission_ms,alloc_ms,forward_ms,backward_ms,reduction_ms,total_ms +pairwise,2,1000,5,1,3996,2000,0.0001,0.0003,0.0004,0.0003,0.0343,0.0336,0.0001,0.0693 +pairwise,2,10000,5,1,39996,20000,0.0001,0.0024,0.0047,0.0023,0.3434,0.3354,0,0.6895 +pairwise,2,100000,5,1,399996,200000,0.001,0.1048,0.2501,0.4206,3.461,3.3926,0.0001,7.6391 +pairwise,2,1000000,5,1,3999996,2000000,0.0049,1.5373,2.8471,3.7466,34.7657,34.3781,0.0004,78.5542 
+pairwise,4,1000,5,1,15984,4000,0.0003,0.0004,0.0101,0.0187,0.2189,0.2153,0.0001,0.4634 +pairwise,4,10000,5,1,159984,40000,0.0019,0.0122,0.0167,0.0218,3.4942,3.2695,0.0002,6.8535 +pairwise,4,100000,5,1,1599984,400000,0.0033,0.1415,0.6652,1.1502,29.2175,26.0248,0.0002,58.7034 +pairwise,8,1000,5,1,63936,8000,0.0005,0.0034,0.0159,0.0316,1.166,1.1765,0.0002,2.3957 +pairwise,8,5000,5,1,319936,40000,0.0016,0.0156,0.052,0.1019,5.8452,5.8658,0.0002,11.8913 +pairwise,8,10000,5,1,639936,80000,0.0022,0.0079,0.0197,0.204,11.6961,11.7406,0.0002,23.715 +pairwise,16,1000,5,1,255744,16000,0.0019,0.0042,0.0326,0.0477,5.3054,5.3313,0.0004,10.7288 +pairwise,16,2000,5,1,511744,32000,0.0033,0.0073,0.0434,0.0883,10.6612,10.8194,0.0005,21.7072 +pairwise,16,5000,5,1,1279744,80000,0.0051,0.0149,0.0966,0.2077,26.5814,26.6937,0.0005,53.6173 +pairwise,32,500,5,1,510976,16000,0.0047,0.0028,0.029,0.044,9.7704,9.8929,0.0006,19.7958 +pairwise,32,1000,5,1,1022976,32000,0.0058,0.0047,0.0453,0.0761,19.5781,19.7934,0.0007,39.505 +pairwise,32,2000,5,1,2046976,64000,0.0064,0.0065,0.0791,0.1424,39.3132,40.2802,0.0008,80.4737 +pairwise,64,200,5,1,815104,12800,0.0311,0.0022,0.0302,0.0409,14.4688,14.2692,0.0014,28.7968 +pairwise,64,500,5,1,2043904,32000,0.0293,0.002,0.0509,0.0823,37.0369,38.7809,0.0014,76.2688 +pairwise,64,1000,5,1,4091904,64000,0.0298,0.0036,0.0765,0.1626,70.9994,71.0655,0.0013,142.836 +pairwise,128,100,5,1,1622016,12800,0.0658,0.0008,0.0361,0.044,27.5451,27.7767,0.002,55.5736 +pairwise,128,250,5,1,4079616,32000,0.0637,0.0008,0.0164,0.0593,66.9222,67.2184,0.002,134.272 +pairwise,128,500,5,1,8175616,64000,0.0677,0.001,0.0482,0.0731,133.704,135.611,0.0023,269.665 diff --git a/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..3432d02 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ 
+mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.3,0.114,0.13 +max_reduce,2,1000,5,2,0.637,0.233,0.252 +max_reduce,2,2000,5,2,1.217,0.467,0.527 +max_reduce,2,5000,5,2,3.092,1.191,1.347 +max_reduce,2,10000,5,2,6.021,2.482,2.443 +max_reduce,2,100000,5,2,63.802,26.135,26.283 +max_reduce,3,500,5,2,0.589,0.234,0.258 +max_reduce,3,1000,5,2,1.107,0.455,0.501 +max_reduce,3,2000,5,2,2.289,0.94,1.034 +max_reduce,3,5000,5,2,5.686,2.326,2.592 +max_reduce,3,10000,5,2,12.027,4.796,5.664 +max_reduce,3,100000,5,2,120.989,49.523,55.446 +max_reduce,4,500,5,2,0.884,0.372,0.416 +max_reduce,4,1000,5,2,1.879,0.792,0.877 +max_reduce,4,2000,5,2,3.776,1.606,1.767 +max_reduce,4,5000,5,2,9.505,4.148,4.381 +max_reduce,4,10000,5,2,19.404,8.402,8.949 +max_reduce,4,100000,5,2,201.829,84.693,96.849 +max_reduce,5,500,5,2,1.317,0.568,0.632 +max_reduce,5,1000,5,2,2.775,1.196,1.337 +max_reduce,5,2000,5,2,5.672,2.391,2.801 +max_reduce,5,5000,5,2,13.83,5.923,6.682 +max_reduce,5,10000,5,2,29.043,12.056,14.445 +max_reduce,5,100000,5,2,291.988,124.124,142.458 +max_reduce,6,500,5,2,1.933,0.836,0.951 +max_reduce,6,1000,5,2,4.947,2.178,2.407 +max_reduce,6,2000,5,2,8.027,3.517,3.891 +max_reduce,6,5000,5,2,19.475,8.439,9.547 +max_reduce,6,10000,5,2,39.116,17.027,19.181 +max_reduce,6,100000,5,2,410.151,176.87,203.052 +max_reduce,7,500,5,2,2.623,1.146,1.304 +max_reduce,7,1000,5,2,5.839,2.317,3.179 +max_reduce,7,2000,5,2,10.765,4.824,5.204 +max_reduce,7,5000,5,2,25.732,11.46,12.566 +max_reduce,7,10000,5,2,53.622,23.214,27.048 +max_reduce,7,100000,5,2,548.109,240.248,271.739 +max_reduce,8,500,5,2,3.935,1.592,2.096 +max_reduce,8,1000,5,2,7.416,3.137,3.887 +max_reduce,8,2000,5,2,13.338,5.863,6.718 +max_reduce,8,5000,5,2,35.927,14.932,19.053 +max_reduce,8,10000,5,2,67.716,29.651,34.379 +max_reduce,8,100000,5,2,707.026,309.823,357.473 diff --git a/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_pairwise_n2_8.csv 
b/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..8096d21 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.217,0.077,0.084 +pairwise,2,1000,5,2,0.412,0.15,0.155 +pairwise,2,2000,5,2,1.195,0.399,0.506 +pairwise,2,5000,5,2,2.078,0.759,0.773 +pairwise,2,10000,5,2,4.231,1.538,1.596 +pairwise,2,100000,5,2,44.74,16.476,17.079 +pairwise,3,500,5,2,0.469,0.185,0.205 +pairwise,3,1000,5,2,0.951,0.389,0.405 +pairwise,3,2000,5,2,1.851,0.775,0.773 +pairwise,3,5000,5,2,4.812,1.993,2.038 +pairwise,3,10000,5,2,9.393,3.795,4.022 +pairwise,3,100000,5,2,97.533,39.481,42.397 +pairwise,4,500,5,2,0.746,0.318,0.332 +pairwise,4,1000,5,2,1.577,0.672,0.702 +pairwise,4,2000,5,2,3.171,1.349,1.417 +pairwise,4,5000,5,2,8.058,3.536,3.523 +pairwise,4,10000,5,2,16.258,6.922,7.335 +pairwise,4,100000,5,2,165.673,71.192,74.499 +pairwise,5,500,5,2,1.113,0.485,0.509 +pairwise,5,1000,5,2,2.436,1.062,1.103 +pairwise,5,2000,5,2,5.02,2.064,2.462 +pairwise,5,5000,5,2,11.962,5.197,5.515 +pairwise,5,10000,5,2,24.438,10.759,11.021 +pairwise,5,100000,5,2,250.178,112.919,111.994 +pairwise,6,500,5,2,1.632,0.726,0.764 +pairwise,6,1000,5,2,3.284,1.456,1.531 +pairwise,6,2000,5,2,6.833,3.051,3.183 +pairwise,6,5000,5,2,16.789,7.384,7.872 +pairwise,6,10000,5,2,34.298,15.829,15.664 +pairwise,6,100000,5,2,348.493,155.326,161.492 +pairwise,7,500,5,2,2.257,1.014,1.038 +pairwise,7,1000,5,2,4.423,1.965,2.116 +pairwise,7,2000,5,2,9.453,3.95,4.715 +pairwise,7,5000,5,2,23.992,10.022,12.256 +pairwise,7,10000,5,2,44.92,20.22,21.249 +pairwise,7,100000,5,2,461.136,210.373,214.594 +pairwise,8,500,5,2,2.928,1.274,1.454 +pairwise,8,1000,5,2,5.612,2.515,2.718 +pairwise,8,2000,5,2,11.229,5.211,5.265 +pairwise,8,5000,5,2,28.531,12.717,13.92 +pairwise,8,10000,5,2,58.541,27.524,27.201 +pairwise,8,100000,5,2,591.284,270.222,280.583 diff 
--git a/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..f65003c --- /dev/null +++ b/benchmark-analysis/focus-n2-8-catalina-ivybridge/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.217,0.3,0.7233333333333334,pairwise +2,1000,0.412,0.637,0.6467817896389324,pairwise +2,2000,1.195,1.217,0.9819227608874281,pairwise +2,5000,2.078,3.092,0.6720569210866753,pairwise +2,10000,4.231,6.021,0.7027071914964291,pairwise +2,100000,44.74,63.802,0.7012319363029372,pairwise +3,500,0.469,0.589,0.7962648556876061,pairwise +3,1000,0.951,1.107,0.8590785907859079,pairwise +3,2000,1.851,2.289,0.8086500655307994,pairwise +3,5000,4.812,5.686,0.8462891311994373,pairwise +3,10000,9.393,12.027,0.7809927662758793,pairwise +3,100000,97.533,120.989,0.8061311358883865,pairwise +4,500,0.746,0.884,0.8438914027149321,pairwise +4,1000,1.577,1.879,0.8392762107503992,pairwise +4,2000,3.171,3.776,0.8397775423728814,pairwise +4,5000,8.058,9.505,0.8477643345607574,pairwise +4,10000,16.258,19.404,0.8378684807256236,pairwise +4,100000,165.673,201.829,0.8208582512919352,pairwise +5,500,1.113,1.317,0.8451025056947609,pairwise +5,1000,2.436,2.775,0.8778378378378379,pairwise +5,2000,5.02,5.672,0.885049365303244,pairwise +5,5000,11.962,13.83,0.8649313087490962,pairwise +5,10000,24.438,29.043,0.8414419997934097,pairwise +5,100000,250.178,291.988,0.856809183939066,pairwise +6,500,1.632,1.933,0.8442834971546818,pairwise +6,1000,3.284,4.947,0.6638366686880938,pairwise +6,2000,6.833,8.027,0.8512520244175907,pairwise +6,5000,16.789,19.475,0.8620795892169448,pairwise +6,10000,34.298,39.116,0.8768278965129359,pairwise +6,100000,348.493,410.151,0.8496699995855185,pairwise +7,500,2.257,2.623,0.8604651162790697,pairwise 
+7,1000,4.423,5.839,0.7574927213563966,pairwise +7,2000,9.453,10.765,0.8781235485369251,pairwise +7,5000,23.992,25.732,0.9323799160578269,pairwise +7,10000,44.92,53.622,0.8377158628920965,pairwise +7,100000,461.136,548.109,0.8413217079084635,pairwise +8,500,2.928,3.935,0.7440914866581957,pairwise +8,1000,5.612,7.416,0.7567421790722761,pairwise +8,2000,11.229,13.338,0.8418803418803419,pairwise +8,5000,28.531,35.927,0.7941381133966098,pairwise +8,10000,58.541,67.716,0.864507649595369,pairwise +8,100000,591.284,707.026,0.836297392175111,pairwise diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..1fa2fea --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.125,0.05,0.069 +max_reduce,2,1000,5,2,0.316,0.143,0.156 +max_reduce,2,2000,5,2,0.468,0.199,0.217 +max_reduce,2,5000,5,2,1.115,0.481,0.524 +max_reduce,2,10000,5,2,2.146,0.962,1.051 +max_reduce,2,100000,5,2,22.075,9.8,10.74 +max_reduce,3,500,5,2,0.208,0.093,0.107 +max_reduce,3,1000,5,2,0.435,0.187,0.214 +max_reduce,3,2000,5,2,0.866,0.374,0.428 +max_reduce,3,5000,5,2,2.118,0.96,1.077 +max_reduce,3,10000,5,2,4.226,1.909,2.165 +max_reduce,3,100000,5,2,43.079,18.992,21.896 +max_reduce,4,500,5,2,0.34,0.153,0.178 +max_reduce,4,1000,5,2,0.706,0.311,0.356 +max_reduce,4,2000,5,2,1.408,0.617,0.711 +max_reduce,4,5000,5,2,3.501,1.552,1.793 +max_reduce,4,10000,5,2,6.805,3.084,3.568 +max_reduce,4,100000,5,2,71.122,31.764,36.614 +max_reduce,5,500,5,2,0.522,0.229,0.267 +max_reduce,5,1000,5,2,1.042,0.459,0.535 +max_reduce,5,2000,5,2,2.097,0.922,1.075 +max_reduce,5,5000,5,2,5.247,2.3,2.717 +max_reduce,5,10000,5,2,10.308,4.654,5.474 +max_reduce,5,100000,5,2,105.437,47.128,54.645 +max_reduce,6,500,5,2,0.724,0.318,0.376 
+max_reduce,6,1000,5,2,1.455,0.639,0.756 +max_reduce,6,2000,5,2,2.849,1.276,1.507 +max_reduce,6,5000,5,2,7.09,3.207,3.778 +max_reduce,6,10000,5,2,14.272,6.488,7.566 +max_reduce,6,100000,5,2,146.633,65.236,77.093 +max_reduce,7,500,5,2,0.966,0.427,0.503 +max_reduce,7,1000,5,2,1.923,0.847,1.009 +max_reduce,7,2000,5,2,3.833,1.699,2.016 +max_reduce,7,5000,5,2,9.465,4.275,5.07 +max_reduce,7,10000,5,2,19.148,8.62,10.112 +max_reduce,7,100000,5,2,191.651,86.109,101.104 +max_reduce,8,500,5,2,1.23,0.542,0.649 +max_reduce,8,1000,5,2,2.548,1.09,1.366 +max_reduce,8,2000,5,2,4.963,2.237,2.637 +max_reduce,8,5000,5,2,12.596,5.686,6.769 +max_reduce,8,10000,5,2,25.42,11.105,13.834 +max_reduce,8,100000,5,2,249.409,111.539,132.687 diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..05bb3b6 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.078,0.035,0.038 +pairwise,2,1000,5,2,0.156,0.07,0.075 +pairwise,2,2000,5,2,0.339,0.14,0.151 +pairwise,2,5000,5,2,0.827,0.351,0.377 +pairwise,2,10000,5,2,1.551,0.699,0.756 +pairwise,2,100000,5,2,16.659,7.05,8.087 +pairwise,3,500,5,2,0.204,0.095,0.101 +pairwise,3,1000,5,2,0.432,0.194,0.203 +pairwise,3,2000,5,2,0.847,0.378,0.406 +pairwise,3,5000,5,2,2.112,1.006,1.022 +pairwise,3,10000,5,2,4.093,1.909,2.03 +pairwise,3,100000,5,2,57.89,25.89,29.252 +pairwise,4,500,5,2,0.392,0.186,0.197 +pairwise,4,1000,5,2,0.95,0.439,0.455 +pairwise,4,2000,5,2,1.644,0.751,0.79 +pairwise,4,5000,5,2,4.004,1.938,1.957 +pairwise,4,10000,5,2,7.862,3.753,3.95 +pairwise,4,100000,5,2,80.511,37.969,39.852 +pairwise,5,500,5,2,0.643,0.302,0.312 +pairwise,5,1000,5,2,1.289,0.609,0.631 +pairwise,5,2000,5,2,2.574,1.216,1.26 +pairwise,5,5000,5,2,6.444,3.054,3.193 
+pairwise,5,10000,5,2,12.605,6.112,6.312 +pairwise,5,100000,5,2,130.656,62.657,64.69 +pairwise,6,500,5,2,0.945,0.452,0.464 +pairwise,6,1000,5,2,1.89,0.9,0.936 +pairwise,6,2000,5,2,3.749,1.811,1.864 +pairwise,6,5000,5,2,9.492,4.564,4.7 +pairwise,6,10000,5,2,19.026,9.206,9.39 +pairwise,6,100000,5,2,191.567,92.293,95.123 +pairwise,7,500,5,2,1.302,0.627,0.641 +pairwise,7,1000,5,2,2.604,1.258,1.293 +pairwise,7,2000,5,2,5.18,2.529,2.576 +pairwise,7,5000,5,2,13.197,6.366,6.569 +pairwise,7,10000,5,2,25.912,12.639,12.89 +pairwise,7,100000,5,2,266.082,128.95,132.57 +pairwise,8,500,5,2,1.914,0.897,0.957 +pairwise,8,1000,5,2,3.814,1.807,1.886 +pairwise,8,2000,5,2,7.895,3.715,4.004 +pairwise,8,5000,5,2,23.27,9.318,13.555 +pairwise,8,10000,5,2,34.83,16.856,17.516 +pairwise,8,100000,5,2,346.151,169.146,171.958 diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..debff59 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-clangcl-rerun/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.078,0.125,0.624,pairwise +2,1000,0.156,0.316,0.4936708860759494,pairwise +2,2000,0.339,0.468,0.7243589743589743,pairwise +2,5000,0.827,1.115,0.7417040358744394,pairwise +2,10000,1.551,2.146,0.722739981360671,pairwise +2,100000,16.659,22.075,0.7546545866364666,pairwise +3,500,0.204,0.208,0.9807692307692307,pairwise +3,1000,0.432,0.435,0.993103448275862,pairwise +3,2000,0.847,0.866,0.9780600461893765,pairwise +3,5000,2.112,2.118,0.9971671388101984,pairwise +3,10000,4.093,4.226,0.9685281590156176,pairwise +3,100000,57.89,43.079,1.3438102091506303,max_reduce +4,500,0.392,0.34,1.1529411764705881,max_reduce +4,1000,0.95,0.706,1.3456090651558075,max_reduce +4,2000,1.644,1.408,1.1676136363636365,max_reduce 
+4,5000,4.004,3.501,1.1436732362182234,max_reduce +4,10000,7.862,6.805,1.1553269654665688,max_reduce +4,100000,80.511,71.122,1.1320125980709204,max_reduce +5,500,0.643,0.522,1.2318007662835249,max_reduce +5,1000,1.289,1.042,1.2370441458733203,max_reduce +5,2000,2.574,2.097,1.2274678111587982,max_reduce +5,5000,6.444,5.247,1.228130360205832,max_reduce +5,10000,12.605,10.308,1.222836631742336,max_reduce +5,100000,130.656,105.437,1.2391854851712398,max_reduce +6,500,0.945,0.724,1.3052486187845305,max_reduce +6,1000,1.89,1.455,1.2989690721649483,max_reduce +6,2000,3.749,2.849,1.3159003159003158,max_reduce +6,5000,9.492,7.09,1.338787023977433,max_reduce +6,10000,19.026,14.272,1.3330997757847534,max_reduce +6,100000,191.567,146.633,1.3064385233883231,max_reduce +7,500,1.302,0.966,1.3478260869565217,max_reduce +7,1000,2.604,1.923,1.3541341653666146,max_reduce +7,2000,5.18,3.833,1.3514218627706756,max_reduce +7,5000,13.197,9.465,1.3942947702060222,max_reduce +7,10000,25.912,19.148,1.3532483810319615,max_reduce +7,100000,266.082,191.651,1.3883673969872319,max_reduce +8,500,1.914,1.23,1.5560975609756098,max_reduce +8,1000,3.814,2.548,1.4968602825745683,max_reduce +8,2000,7.895,4.963,1.5907717106588755,max_reduce +8,5000,23.27,12.596,1.8474118767862813,max_reduce +8,10000,34.83,25.42,1.3701809598741148,max_reduce +8,100000,346.151,249.409,1.3878849600455478,max_reduce diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..dc747ba --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.153,0.066,0.079 +max_reduce,2,1000,5,2,0.327,0.136,0.161 +max_reduce,2,2000,5,2,0.787,0.293,0.416 +max_reduce,2,5000,5,2,1.602,0.666,0.797 +max_reduce,2,10000,5,2,3.165,1.331,1.587 
+max_reduce,2,100000,5,2,29.774,13.093,14.691 +max_reduce,3,500,5,2,0.317,0.142,0.164 +max_reduce,3,1000,5,2,0.748,0.307,0.392 +max_reduce,3,2000,5,2,1.3,0.57,0.654 +max_reduce,3,5000,5,2,3.266,1.429,1.668 +max_reduce,3,10000,5,2,6.408,2.873,3.304 +max_reduce,3,100000,5,2,62.015,27.423,31.925 +max_reduce,4,500,5,2,0.556,0.264,0.28 +max_reduce,4,1000,5,2,1.142,0.528,0.565 +max_reduce,4,2000,5,2,2.288,1.07,1.123 +max_reduce,4,5000,5,2,5.714,2.681,2.822 +max_reduce,4,10000,5,2,11.323,5.32,5.628 +max_reduce,4,100000,5,2,108.579,51.289,53.479 +max_reduce,5,500,5,2,0.856,0.399,0.429 +max_reduce,5,1000,5,2,1.703,0.787,0.859 +max_reduce,5,2000,5,2,3.421,1.592,1.715 +max_reduce,5,5000,5,2,8.482,3.967,4.274 +max_reduce,5,10000,5,2,16.837,7.948,8.608 +max_reduce,5,100000,5,2,159.391,72.178,83.104 +max_reduce,6,500,5,2,1.18,0.547,0.599 +max_reduce,6,1000,5,2,2.401,1.104,1.21 +max_reduce,6,2000,5,2,4.729,2.196,2.416 +max_reduce,6,5000,5,2,11.973,5.492,6.173 +max_reduce,6,10000,5,2,23.521,11.061,12.136 +max_reduce,6,100000,5,2,218.719,97.497,116.585 +max_reduce,7,500,5,2,1.581,0.734,0.807 +max_reduce,7,1000,5,2,3.159,1.461,1.621 +max_reduce,7,2000,5,2,6.307,2.928,3.267 +max_reduce,7,5000,5,2,15.845,7.306,8.31 +max_reduce,7,10000,5,2,30.936,14.59,15.772 +max_reduce,7,100000,5,2,290.579,129.087,155.833 +max_reduce,8,500,5,2,2.022,0.931,1.044 +max_reduce,8,1000,5,2,4.077,1.876,2.11 +max_reduce,8,2000,5,2,8.136,3.744,4.21 +max_reduce,8,5000,5,2,19.676,9.409,10.057 +max_reduce,8,10000,5,2,39.402,17.655,21.184 +max_reduce,8,100000,5,2,376.802,168.902,201.718 diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..14e7274 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms 
+pairwise,2,500,5,2,0.179,0.079,0.087 +pairwise,2,1000,5,2,0.257,0.11,0.116 +pairwise,2,2000,5,2,0.501,0.21,0.232 +pairwise,2,5000,5,2,1.267,0.551,0.589 +pairwise,2,10000,5,2,2.432,1.076,1.161 +pairwise,2,100000,5,2,24.278,10.558,11.428 +pairwise,3,500,5,2,0.34,0.156,0.169 +pairwise,3,1000,5,2,0.693,0.315,0.329 +pairwise,3,2000,5,2,1.432,0.641,0.675 +pairwise,3,5000,5,2,3.512,1.62,1.696 +pairwise,3,10000,5,2,7.273,3.138,3.68 +pairwise,3,100000,5,2,101.009,45.806,50.0 +pairwise,4,500,5,2,0.784,0.366,0.398 +pairwise,4,1000,5,2,1.666,0.786,0.807 +pairwise,4,2000,5,2,2.307,1.094,1.094 +pairwise,4,5000,5,2,5.513,2.485,2.853 +pairwise,4,10000,5,2,10.479,4.846,5.254 +pairwise,4,100000,5,2,103.305,48.905,50.64 +pairwise,5,500,5,2,0.835,0.392,0.414 +pairwise,5,1000,5,2,1.721,0.823,0.841 +pairwise,5,2000,5,2,3.409,1.567,1.723 +pairwise,5,5000,5,2,8.462,3.965,4.233 +pairwise,5,10000,5,2,16.672,7.849,8.367 +pairwise,5,100000,5,2,162.323,76.356,81.557 +pairwise,6,500,5,2,1.215,0.57,0.611 +pairwise,6,1000,5,2,2.418,1.129,1.221 +pairwise,6,2000,5,2,4.971,2.337,2.494 +pairwise,6,5000,5,2,11.924,5.688,6.041 +pairwise,6,10000,5,2,24.001,11.309,12.178 +pairwise,6,100000,5,2,233.534,109.951,118.681 +pairwise,7,500,5,2,1.673,0.783,0.849 +pairwise,7,1000,5,2,3.399,1.618,1.703 +pairwise,7,2000,5,2,6.617,3.116,3.356 +pairwise,7,5000,5,2,16.757,7.873,8.514 +pairwise,7,10000,5,2,33.121,15.73,16.863 +pairwise,7,100000,5,2,330.164,157.671,167.113 +pairwise,8,500,5,2,2.195,1.031,1.119 +pairwise,8,1000,5,2,4.401,2.063,2.23 +pairwise,8,2000,5,2,8.776,4.137,4.437 +pairwise,8,5000,5,2,21.755,10.29,11.116 +pairwise,8,10000,5,2,43.354,20.653,22.13 +pairwise,8,100000,5,2,427.122,203.754,216.973 diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..de1ff34 --- /dev/null +++ 
b/benchmark-analysis/focus-n2-8-ryzen-windows-mingw-rerun/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.179,0.153,1.1699346405228759,max_reduce +2,1000,0.257,0.327,0.7859327217125383,pairwise +2,2000,0.501,0.787,0.6365946632782719,pairwise +2,5000,1.267,1.602,0.7908863920099874,pairwise +2,10000,2.432,3.165,0.7684044233807267,pairwise +2,100000,24.278,29.774,0.8154094176126822,pairwise +3,500,0.34,0.317,1.0725552050473186,max_reduce +3,1000,0.693,0.748,0.926470588235294,pairwise +3,2000,1.432,1.3,1.1015384615384614,max_reduce +3,5000,3.512,3.266,1.0753214941824862,max_reduce +3,10000,7.273,6.408,1.134987515605493,max_reduce +3,100000,101.009,62.015,1.6287833588647909,max_reduce +4,500,0.784,0.556,1.410071942446043,max_reduce +4,1000,1.666,1.142,1.458844133099825,max_reduce +4,2000,2.307,2.288,1.0083041958041958,max_reduce +4,5000,5.513,5.714,0.964823241162058,pairwise +4,10000,10.479,11.323,0.925461450145721,pairwise +4,100000,103.305,108.579,0.9514270715331695,pairwise +5,500,0.835,0.856,0.9754672897196262,pairwise +5,1000,1.721,1.703,1.010569583088667,max_reduce +5,2000,3.409,3.421,0.9964922537269804,pairwise +5,5000,8.462,8.482,0.9976420655505778,pairwise +5,10000,16.672,16.837,0.9902001544218092,pairwise +5,100000,162.323,159.391,1.0183950160297635,max_reduce +6,500,1.215,1.18,1.0296610169491527,max_reduce +6,1000,2.418,2.401,1.0070803831736779,max_reduce +6,2000,4.971,4.729,1.0511736096426305,max_reduce +6,5000,11.924,11.973,0.995907458448175,pairwise +6,10000,24.001,23.521,1.02040729560818,max_reduce +6,100000,233.534,218.719,1.067735313347263,max_reduce +7,500,1.673,1.581,1.0581910183428211,max_reduce +7,1000,3.399,3.159,1.0759734093067428,max_reduce +7,2000,6.617,6.307,1.0491517361661644,max_reduce +7,5000,16.757,15.845,1.0575575891448408,max_reduce +7,10000,33.121,30.936,1.07062968709594,max_reduce +7,100000,330.164,290.579,1.1362280137243228,max_reduce 
+8,500,2.195,2.022,1.0855588526211672,max_reduce +8,1000,4.401,4.077,1.0794701986754967,max_reduce +8,2000,8.776,8.136,1.0786627335299903,max_reduce +8,5000,21.755,19.676,1.1056617198617607,max_reduce +8,10000,43.354,39.402,1.1002994771838992,max_reduce +8,100000,427.122,376.802,1.133544938721132,max_reduce diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..8c5494e --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.118,0.054,0.058 +max_reduce,2,1000,5,2,0.234,0.108,0.116 +max_reduce,2,2000,5,2,0.495,0.216,0.232 +max_reduce,2,5000,5,2,1.292,0.583,0.597 +max_reduce,2,10000,5,2,2.353,1.093,1.173 +max_reduce,2,100000,5,2,25.272,11.953,11.912 +max_reduce,3,500,5,2,0.358,0.162,0.186 +max_reduce,3,1000,5,2,0.51,0.228,0.253 +max_reduce,3,2000,5,2,1.014,0.457,0.495 +max_reduce,3,5000,5,2,2.415,1.087,1.254 +max_reduce,3,10000,5,2,4.969,2.216,2.596 +max_reduce,3,100000,5,2,51.875,24.161,25.746 +max_reduce,4,500,5,2,0.394,0.179,0.207 +max_reduce,4,1000,5,2,1.051,0.542,0.454 +max_reduce,4,2000,5,2,1.627,0.717,0.832 +max_reduce,4,5000,5,2,4.164,1.878,2.135 +max_reduce,4,10000,5,2,8.044,3.688,4.228 +max_reduce,4,100000,5,2,83.852,39.282,42.178 +max_reduce,5,500,5,2,0.608,0.266,0.311 +max_reduce,5,1000,5,2,1.206,0.538,0.623 +max_reduce,5,2000,5,2,2.385,1.078,1.255 +max_reduce,5,5000,5,2,5.902,2.677,3.119 +max_reduce,5,10000,5,2,11.849,5.404,6.29 +max_reduce,5,100000,5,2,123.388,56.537,63.592 +max_reduce,6,500,5,2,0.847,0.371,0.448 +max_reduce,6,1000,5,2,1.643,0.749,0.876 +max_reduce,6,2000,5,2,3.311,1.484,1.768 +max_reduce,6,5000,5,2,8.231,3.724,4.422 +max_reduce,6,10000,5,2,16.484,7.51,8.799 +max_reduce,6,100000,5,2,177.269,82.83,89.799 
+max_reduce,7,500,5,2,1.106,0.492,0.581 +max_reduce,7,1000,5,2,2.283,1.041,1.176 +max_reduce,7,2000,5,2,4.423,2.035,2.327 +max_reduce,7,5000,5,2,11.124,4.986,5.9 +max_reduce,7,10000,5,2,27.072,12.387,14.291 +max_reduce,7,100000,5,2,232.871,106.143,122.576 +max_reduce,8,500,5,2,1.431,0.641,0.747 +max_reduce,8,1000,5,2,2.831,1.269,1.492 +max_reduce,8,2000,5,2,5.852,2.614,3.102 +max_reduce,8,5000,5,2,14.216,6.443,7.514 +max_reduce,8,10000,5,2,35.189,15.989,18.798 +max_reduce,8,100000,5,2,290.193,134.363,151.274 diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..92bf7fd --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.114,0.051,0.053 +pairwise,2,1000,5,2,0.161,0.075,0.077 +pairwise,2,2000,5,2,0.356,0.155,0.154 +pairwise,2,5000,5,2,0.861,0.373,0.383 +pairwise,2,10000,5,2,1.603,0.749,0.771 +pairwise,2,100000,5,2,18.276,8.452,8.228 +pairwise,3,500,5,2,0.2,0.096,0.098 +pairwise,3,1000,5,2,0.419,0.192,0.195 +pairwise,3,2000,5,2,0.844,0.385,0.395 +pairwise,3,5000,5,2,2.013,0.964,0.977 +pairwise,3,10000,5,2,4.233,2.043,2.051 +pairwise,3,100000,5,2,44.618,21.469,21.058 +pairwise,4,500,5,2,0.373,0.182,0.184 +pairwise,4,1000,5,2,0.768,0.363,0.368 +pairwise,4,2000,5,2,1.545,0.727,0.737 +pairwise,4,5000,5,2,3.823,1.825,1.846 +pairwise,4,10000,5,2,7.495,3.659,3.701 +pairwise,4,100000,5,2,79.862,38.657,38.499 +pairwise,5,500,5,2,0.622,0.297,0.3 +pairwise,5,1000,5,2,1.24,0.594,0.6 +pairwise,5,2000,5,2,2.425,1.18,1.192 +pairwise,5,5000,5,2,6.159,2.98,2.994 +pairwise,5,10000,5,2,12.094,5.935,6.006 +pairwise,5,100000,5,2,127.933,62.751,61.402 +pairwise,6,500,5,2,0.91,0.437,0.438 +pairwise,6,1000,5,2,1.985,0.909,0.999 +pairwise,6,2000,5,2,3.654,1.76,1.795 
+pairwise,6,5000,5,2,9.473,4.441,4.815 +pairwise,6,10000,5,2,18.321,9.059,9.083 +pairwise,6,100000,5,2,185.866,91.562,90.741 +pairwise,7,500,5,2,1.226,0.6,0.601 +pairwise,7,1000,5,2,2.486,1.208,1.208 +pairwise,7,2000,5,2,4.909,2.413,2.431 +pairwise,7,5000,5,2,12.285,6.08,6.103 +pairwise,7,10000,5,2,27.202,13.082,13.75 +pairwise,7,100000,5,2,254.356,125.283,125.012 +pairwise,8,500,5,2,1.7,0.852,0.811 +pairwise,8,1000,5,2,3.293,1.6,1.602 +pairwise,8,2000,5,2,6.597,3.26,3.199 +pairwise,8,5000,5,2,16.998,8.052,8.665 +pairwise,8,10000,5,2,34.379,17.394,16.564 +pairwise,8,100000,5,2,335.824,164.916,164.701 diff --git a/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..6268db1 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ryzen-windows-msvc-rerun/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.114,0.118,0.9661016949152543,pairwise +2,1000,0.161,0.234,0.688034188034188,pairwise +2,2000,0.356,0.495,0.7191919191919192,pairwise +2,5000,0.861,1.292,0.6664086687306501,pairwise +2,10000,1.603,2.353,0.6812579685507861,pairwise +2,100000,18.276,25.272,0.7231718898385565,pairwise +3,500,0.2,0.358,0.558659217877095,pairwise +3,1000,0.419,0.51,0.8215686274509804,pairwise +3,2000,0.844,1.014,0.8323471400394477,pairwise +3,5000,2.013,2.415,0.8335403726708074,pairwise +3,10000,4.233,4.969,0.8518816663312536,pairwise +3,100000,44.618,51.875,0.8601060240963856,pairwise +4,500,0.373,0.394,0.9467005076142132,pairwise +4,1000,0.768,1.051,0.7307326355851571,pairwise +4,2000,1.545,1.627,0.9496004917025199,pairwise +4,5000,3.823,4.164,0.9181075888568685,pairwise +4,10000,7.495,8.044,0.9317503729487817,pairwise +4,100000,79.862,83.852,0.9524161618089013,pairwise +5,500,0.622,0.608,1.0230263157894737,max_reduce 
+5,1000,1.24,1.206,1.0281923714759535,max_reduce +5,2000,2.425,2.385,1.0167714884696017,max_reduce +5,5000,6.159,5.902,1.0435445611657064,max_reduce +5,10000,12.094,11.849,1.0206768503671195,max_reduce +5,100000,127.933,123.388,1.036835024475638,max_reduce +6,500,0.91,0.847,1.0743801652892562,max_reduce +6,1000,1.985,1.643,1.2081558125380403,max_reduce +6,2000,3.654,3.311,1.1035940803382664,max_reduce +6,5000,9.473,8.231,1.1508929656177864,max_reduce +6,10000,18.321,16.484,1.1114413977190003,max_reduce +6,100000,185.866,177.269,1.0484969171146676,max_reduce +7,500,1.226,1.106,1.1084990958408678,max_reduce +7,1000,2.486,2.283,1.088918090232151,max_reduce +7,2000,4.909,4.423,1.1098801718290752,max_reduce +7,5000,12.285,11.124,1.104368932038835,max_reduce +7,10000,27.202,27.072,1.0048020094562649,max_reduce +7,100000,254.356,232.871,1.0922613807644574,max_reduce +8,500,1.7,1.431,1.187980433263452,max_reduce +8,1000,3.293,2.831,1.1631932179441895,max_reduce +8,2000,6.597,5.852,1.127306903622693,max_reduce +8,5000,16.998,14.216,1.195694991558807,max_reduce +8,10000,34.379,35.189,0.976981443064594,pairwise +8,100000,335.824,290.193,1.1572436275168596,max_reduce diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..adf6a25 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.157,0.06,0.065 +max_reduce,2,1000,5,2,0.306,0.118,0.127 +max_reduce,2,2000,5,2,0.607,0.232,0.253 +max_reduce,2,5000,5,2,1.51,0.581,0.629 +max_reduce,2,10000,5,2,3.032,1.167,1.264 +max_reduce,2,100000,5,2,30.532,11.674,12.775 +max_reduce,3,500,5,2,0.272,0.108,0.119 +max_reduce,3,1000,5,2,0.527,0.21,0.231 +max_reduce,3,2000,5,2,1.059,0.421,0.463 +max_reduce,3,5000,5,2,2.653,1.061,1.16 
+max_reduce,3,10000,5,2,5.277,2.101,2.318 +max_reduce,3,100000,5,2,52.996,21.124,23.223 +max_reduce,4,500,5,2,0.416,0.17,0.188 +max_reduce,4,1000,5,2,0.829,0.34,0.377 +max_reduce,4,2000,5,2,1.662,0.682,0.756 +max_reduce,4,5000,5,2,4.139,1.702,1.88 +max_reduce,4,10000,5,2,8.288,3.414,3.759 +max_reduce,4,100000,5,2,83.281,34.132,37.948 +max_reduce,5,500,5,2,0.629,0.262,0.294 +max_reduce,5,1000,5,2,1.26,0.527,0.59 +max_reduce,5,2000,5,2,2.531,1.055,1.188 +max_reduce,5,5000,5,2,6.374,2.674,2.984 +max_reduce,5,10000,5,2,12.622,5.274,5.921 +max_reduce,5,100000,5,2,121.932,51.038,57.119 +max_reduce,6,500,5,2,0.823,0.337,0.403 +max_reduce,6,1000,5,2,1.645,0.676,0.805 +max_reduce,6,2000,5,2,3.302,1.352,1.623 +max_reduce,6,5000,5,2,8.271,3.393,4.062 +max_reduce,6,10000,5,2,16.73,6.933,8.159 +max_reduce,6,100000,5,2,165.653,68.143,81.055 +max_reduce,7,500,5,2,1.085,0.463,0.526 +max_reduce,7,1000,5,2,2.201,0.957,1.054 +max_reduce,7,2000,5,2,4.341,1.853,2.11 +max_reduce,7,5000,5,2,10.845,4.63,5.271 +max_reduce,7,10000,5,2,21.804,9.311,10.598 +max_reduce,7,100000,5,2,218.178,93.182,105.963 +max_reduce,8,500,5,2,1.352,0.579,0.663 +max_reduce,8,1000,5,2,2.667,1.134,1.317 +max_reduce,8,2000,5,2,5.495,2.345,2.705 +max_reduce,8,5000,5,2,13.925,5.966,6.841 +max_reduce,8,10000,5,2,27.723,11.773,13.719 +max_reduce,8,100000,5,2,269.065,114.653,132.759 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..e9afee5 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.112,0.04,0.041 +pairwise,2,1000,5,2,0.223,0.079,0.082 +pairwise,2,2000,5,2,0.442,0.157,0.162 +pairwise,2,5000,5,2,1.098,0.389,0.402 +pairwise,2,10000,5,2,2.217,0.793,0.808 +pairwise,2,100000,5,2,22.163,7.899,8.108 
+pairwise,3,500,5,2,0.263,0.108,0.11 +pairwise,3,1000,5,2,0.587,0.274,0.224 +pairwise,3,2000,5,2,1.05,0.431,0.443 +pairwise,3,5000,5,2,2.624,1.073,1.107 +pairwise,3,10000,5,2,5.266,2.161,2.221 +pairwise,3,100000,5,2,52.696,21.633,22.205 +pairwise,4,500,5,2,0.493,0.215,0.219 +pairwise,4,1000,5,2,0.99,0.433,0.441 +pairwise,4,2000,5,2,1.962,0.855,0.876 +pairwise,4,5000,5,2,4.897,2.138,2.183 +pairwise,4,10000,5,2,10.048,4.513,4.388 +pairwise,4,100000,5,2,104.145,48.626,44.01 +pairwise,5,500,5,2,0.756,0.34,0.347 +pairwise,5,1000,5,2,1.524,0.687,0.7 +pairwise,5,2000,5,2,3.052,1.37,1.406 +pairwise,5,5000,5,2,7.549,3.399,3.466 +pairwise,5,10000,5,2,15.133,6.826,6.936 +pairwise,5,100000,5,2,152.57,68.763,69.956 +pairwise,6,500,5,2,1.084,0.494,0.508 +pairwise,6,1000,5,2,2.17,0.988,1.018 +pairwise,6,2000,5,2,4.353,1.98,2.045 +pairwise,6,5000,5,2,11.523,5.606,5.103 +pairwise,6,10000,5,2,21.77,9.907,10.229 +pairwise,6,100000,5,2,231.107,112.567,102.156 +pairwise,7,500,5,2,1.5,0.695,0.709 +pairwise,7,1000,5,2,2.99,1.389,1.411 +pairwise,7,2000,5,2,5.991,2.776,2.837 +pairwise,7,5000,5,2,15.865,7.84,7.084 +pairwise,7,10000,5,2,30.024,13.946,14.181 +pairwise,7,100000,5,2,300.698,139.566,142.122 +pairwise,8,500,5,2,1.954,0.915,0.93 +pairwise,8,1000,5,2,4.055,1.83,2.009 +pairwise,8,2000,5,2,7.838,3.671,3.737 +pairwise,8,5000,5,2,19.53,9.164,9.296 +pairwise,8,10000,5,2,39.227,18.417,18.664 +pairwise,8,100000,5,2,405.026,184.878,198.534 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..4c3f565 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-appleclang-rerun/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.112,0.157,0.713375796178344,pairwise +2,1000,0.223,0.306,0.7287581699346406,pairwise 
+2,2000,0.442,0.607,0.728171334431631,pairwise +2,5000,1.098,1.51,0.7271523178807947,pairwise +2,10000,2.217,3.032,0.7312005277044855,pairwise +2,100000,22.163,30.532,0.7258941438490764,pairwise +3,500,0.263,0.272,0.9669117647058824,pairwise +3,1000,0.587,0.527,1.113851992409867,max_reduce +3,2000,1.05,1.059,0.991501416430595,pairwise +3,5000,2.624,2.653,0.9890689785148888,pairwise +3,10000,5.266,5.277,0.9979154822815993,pairwise +3,100000,52.696,52.996,0.9943391954109744,pairwise +4,500,0.493,0.416,1.185096153846154,max_reduce +4,1000,0.99,0.829,1.1942098914354644,max_reduce +4,2000,1.962,1.662,1.180505415162455,max_reduce +4,5000,4.897,4.139,1.1831360231940082,max_reduce +4,10000,10.048,8.288,1.2123552123552124,max_reduce +4,100000,104.145,83.281,1.2505253299071817,max_reduce +5,500,0.756,0.629,1.2019077901430844,max_reduce +5,1000,1.524,1.26,1.2095238095238094,max_reduce +5,2000,3.052,2.531,1.205847491110233,max_reduce +5,5000,7.549,6.374,1.1843426419830563,max_reduce +5,10000,15.133,12.622,1.198938361590873,max_reduce +5,100000,152.57,121.932,1.2512712003411737,max_reduce +6,500,1.084,0.823,1.3171324422843258,max_reduce +6,1000,2.17,1.645,1.3191489361702127,max_reduce +6,2000,4.353,3.302,1.3182919442761962,max_reduce +6,5000,11.523,8.271,1.3931809938338773,max_reduce +6,10000,21.77,16.73,1.301255230125523,max_reduce +6,100000,231.107,165.653,1.3951271634078466,max_reduce +7,500,1.5,1.085,1.3824884792626728,max_reduce +7,1000,2.99,2.201,1.3584734211721945,max_reduce +7,2000,5.991,4.341,1.3800967519004836,max_reduce +7,5000,15.865,10.845,1.46288612263716,max_reduce +7,10000,30.024,21.804,1.3769950467804073,max_reduce +7,100000,300.698,218.178,1.378223285574164,max_reduce +8,500,1.954,1.352,1.445266272189349,max_reduce +8,1000,4.055,2.667,1.520434945631796,max_reduce +8,2000,7.838,5.495,1.4263876251137397,max_reduce +8,5000,19.53,13.925,1.4025134649910234,max_reduce +8,10000,39.227,27.723,1.4149623056667748,max_reduce 
+8,100000,405.026,269.065,1.5053091260476095,max_reduce diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..a595565 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.172,0.072,0.082 +max_reduce,2,1000,5,2,0.355,0.148,0.172 +max_reduce,2,2000,5,2,0.691,0.291,0.33 +max_reduce,2,5000,5,2,1.723,0.72,0.829 +max_reduce,2,10000,5,2,3.506,1.469,1.687 +max_reduce,2,100000,5,2,35.103,14.611,16.945 +max_reduce,3,500,5,2,0.322,0.137,0.161 +max_reduce,3,1000,5,2,0.645,0.274,0.323 +max_reduce,3,2000,5,2,1.302,0.551,0.655 +max_reduce,3,5000,5,2,3.28,1.399,1.64 +max_reduce,3,10000,5,2,6.51,2.767,3.265 +max_reduce,3,100000,5,2,65.988,28.265,32.896 +max_reduce,4,500,5,2,0.53,0.229,0.27 +max_reduce,4,1000,5,2,1.08,0.469,0.548 +max_reduce,4,2000,5,2,2.391,0.921,1.348 +max_reduce,4,5000,5,2,5.339,2.303,2.732 +max_reduce,4,10000,5,2,10.754,4.608,5.531 +max_reduce,4,100000,5,2,120.083,46.97,66.981 +max_reduce,5,500,5,2,0.809,0.346,0.426 +max_reduce,5,1000,5,2,1.61,0.687,0.849 +max_reduce,5,2000,5,2,3.581,1.388,2.041 +max_reduce,5,5000,5,2,8.11,3.443,4.287 +max_reduce,5,10000,5,2,16.216,6.904,8.566 +max_reduce,5,100000,5,2,161.294,69.191,84.669 +max_reduce,6,500,5,2,1.294,0.48,0.769 +max_reduce,6,1000,5,2,2.559,0.975,1.494 +max_reduce,6,2000,5,2,4.475,1.909,2.392 +max_reduce,6,5000,5,2,11.181,4.771,5.976 +max_reduce,6,10000,5,2,22.777,9.697,12.209 +max_reduce,6,100000,5,2,256.489,97.482,150.26 +max_reduce,7,500,5,2,1.434,0.621,0.764 +max_reduce,7,1000,5,2,2.927,1.26,1.568 +max_reduce,7,2000,5,2,5.839,2.524,3.119 +max_reduce,7,5000,5,2,16.801,6.317,9.998 +max_reduce,7,10000,5,2,29.012,12.558,15.477 +max_reduce,7,100000,5,2,291.194,126.161,155.247 +max_reduce,8,500,5,2,2.178,0.824,1.297 
+max_reduce,8,1000,5,2,3.883,1.705,2.061 +max_reduce,8,2000,5,2,7.273,3.19,3.862 +max_reduce,8,5000,5,2,18.49,8.13,9.804 +max_reduce,8,10000,5,2,43.258,16.238,25.909 +max_reduce,8,100000,5,2,369.76,162.365,196.334 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..dbada60 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.134,0.054,0.062 +pairwise,2,1000,5,2,0.263,0.108,0.121 +pairwise,2,2000,5,2,0.542,0.225,0.248 +pairwise,2,5000,5,2,1.337,0.561,0.603 +pairwise,2,10000,5,2,2.705,1.127,1.23 +pairwise,2,100000,5,2,25.302,10.294,11.537 +pairwise,3,500,5,2,0.324,0.145,0.154 +pairwise,3,1000,5,2,0.627,0.281,0.298 +pairwise,3,2000,5,2,1.374,0.614,0.662 +pairwise,3,5000,5,2,3.387,1.525,1.625 +pairwise,3,10000,5,2,6.65,3.023,3.145 +pairwise,3,100000,5,2,62.775,28.061,29.923 +pairwise,4,500,5,2,0.585,0.272,0.282 +pairwise,4,1000,5,2,1.14,0.528,0.551 +pairwise,4,2000,5,2,2.332,1.083,1.128 +pairwise,4,5000,5,2,5.76,2.676,2.781 +pairwise,4,10000,5,2,11.558,5.353,5.595 +pairwise,4,100000,5,2,118.66,55.307,57.106 +pairwise,5,500,5,2,0.878,0.413,0.425 +pairwise,5,1000,5,2,1.755,0.828,0.849 +pairwise,5,2000,5,2,3.683,1.738,1.792 +pairwise,5,5000,5,2,8.791,4.141,4.268 +pairwise,5,10000,5,2,17.636,8.321,8.541 +pairwise,5,100000,5,2,177.332,83.627,86.041 +pairwise,6,500,5,2,1.23,0.584,0.602 +pairwise,6,1000,5,2,2.553,1.215,1.249 +pairwise,6,2000,5,2,4.95,2.352,2.417 +pairwise,6,5000,5,2,12.307,5.823,6.045 +pairwise,6,10000,5,2,25.468,12.236,12.354 +pairwise,6,100000,5,2,251.998,121.44,121.839 +pairwise,7,500,5,2,1.663,0.794,0.817 +pairwise,7,1000,5,2,3.256,1.56,1.595 +pairwise,7,2000,5,2,6.807,3.267,3.338 +pairwise,7,5000,5,2,16.316,7.817,7.992 +pairwise,7,10000,5,2,34.127,16.348,16.78 
+pairwise,7,100000,5,2,329.173,156.951,162.094 +pairwise,8,500,5,2,2.104,1.013,1.033 +pairwise,8,1000,5,2,4.157,2.008,2.033 +pairwise,8,2000,5,2,8.777,4.249,4.295 +pairwise,8,5000,5,2,23.153,11.325,11.202 +pairwise,8,10000,5,2,45.271,21.771,22.223 +pairwise,8,100000,5,2,440.486,209.662,219.102 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..21d6b18 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-gcc15/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.134,0.172,0.7790697674418606,pairwise +2,1000,0.263,0.355,0.7408450704225353,pairwise +2,2000,0.542,0.691,0.7843704775687411,pairwise +2,5000,1.337,1.723,0.7759721416134648,pairwise +2,10000,2.705,3.506,0.7715345122646892,pairwise +2,100000,25.302,35.103,0.7207930946072985,pairwise +3,500,0.324,0.322,1.0062111801242235,max_reduce +3,1000,0.627,0.645,0.9720930232558139,pairwise +3,2000,1.374,1.302,1.055299539170507,max_reduce +3,5000,3.387,3.28,1.0326219512195123,max_reduce +3,10000,6.65,6.51,1.021505376344086,max_reduce +3,100000,62.775,65.988,0.9513093289689034,pairwise +4,500,0.585,0.53,1.1037735849056602,max_reduce +4,1000,1.14,1.08,1.0555555555555554,max_reduce +4,2000,2.332,2.391,0.9753241321622751,pairwise +4,5000,5.76,5.339,1.0788537179247049,max_reduce +4,10000,11.558,10.754,1.0747628789287706,max_reduce +4,100000,118.66,120.083,0.988149863011417,pairwise +5,500,0.878,0.809,1.0852904820766378,max_reduce +5,1000,1.755,1.61,1.0900621118012421,max_reduce +5,2000,3.683,3.581,1.0284836637810668,max_reduce +5,5000,8.791,8.11,1.0839704069050555,max_reduce +5,10000,17.636,16.216,1.0875678342377897,max_reduce +5,100000,177.332,161.294,1.0994333329200092,max_reduce +6,500,1.23,1.294,0.9505409582689335,pairwise 
+6,1000,2.553,2.559,0.9976553341148885,pairwise +6,2000,4.95,4.475,1.106145251396648,max_reduce +6,5000,12.307,11.181,1.100706555764243,max_reduce +6,10000,25.468,22.777,1.1181454976511394,max_reduce +6,100000,251.998,256.489,0.9824904771744598,pairwise +7,500,1.663,1.434,1.1596931659693166,max_reduce +7,1000,3.256,2.927,1.1124017765630336,max_reduce +7,2000,6.807,5.839,1.1657818119541017,max_reduce +7,5000,16.316,16.801,0.9711326706743647,pairwise +7,10000,34.127,29.012,1.1763063559906246,max_reduce +7,100000,329.173,291.194,1.1304250774397824,max_reduce +8,500,2.104,2.178,0.9660238751147843,pairwise +8,1000,4.157,3.883,1.070563996909606,max_reduce +8,2000,8.777,7.273,1.2067922452908015,max_reduce +8,5000,23.153,18.49,1.252190373174689,max_reduce +8,10000,45.271,43.258,1.0465347450182625,max_reduce +8,100000,440.486,369.76,1.1912754218952835,max_reduce diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..2a545b4 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.366,0.14,0.152 +max_reduce,2,1000,5,2,0.539,0.207,0.224 +max_reduce,2,2000,5,2,0.859,0.328,0.362 +max_reduce,2,5000,5,2,1.801,0.694,0.751 +max_reduce,2,10000,5,2,3.127,1.205,1.3 +max_reduce,2,100000,5,2,29.167,11.226,12.169 +max_reduce,3,500,5,2,0.293,0.116,0.128 +max_reduce,3,1000,5,2,0.585,0.232,0.257 +max_reduce,3,2000,5,2,1.197,0.482,0.524 +max_reduce,3,5000,5,2,2.818,1.082,1.294 +max_reduce,3,10000,5,2,5.209,2.088,2.266 +max_reduce,3,100000,5,2,51.026,20.294,22.466 +max_reduce,4,500,5,2,0.485,0.189,0.233 +max_reduce,4,1000,5,2,0.924,0.378,0.421 +max_reduce,4,2000,5,2,1.733,0.709,0.792 +max_reduce,4,5000,5,2,4.366,1.708,2.092 +max_reduce,4,10000,5,2,8.138,3.366,3.679 
+max_reduce,4,100000,5,2,84.46,32.998,40.688 +max_reduce,5,500,5,2,0.676,0.282,0.316 +max_reduce,5,1000,5,2,1.316,0.566,0.596 +max_reduce,5,2000,5,2,2.515,1.051,1.18 +max_reduce,5,5000,5,2,6.402,2.499,3.217 +max_reduce,5,10000,5,2,12.69,4.945,6.409 +max_reduce,5,100000,5,2,118.285,49.424,55.571 +max_reduce,6,500,5,2,0.918,0.376,0.45 +max_reduce,6,1000,5,2,1.837,0.754,0.9 +max_reduce,6,2000,5,2,3.412,1.401,1.674 +max_reduce,6,5000,5,2,8.67,3.337,4.541 +max_reduce,6,10000,5,2,15.991,6.585,7.831 +max_reduce,6,100000,5,2,171.928,65.649,90.501 +max_reduce,7,500,5,2,1.206,0.514,0.585 +max_reduce,7,1000,5,2,2.248,0.96,1.088 +max_reduce,7,2000,5,2,4.387,1.895,2.101 +max_reduce,7,5000,5,2,11.492,4.532,6.047 +max_reduce,7,10000,5,2,20.987,8.956,10.201 +max_reduce,7,100000,5,2,211.033,90.182,102.544 +max_reduce,8,500,5,2,1.442,0.551,0.783 +max_reduce,8,1000,5,2,2.769,1.175,1.369 +max_reduce,8,2000,5,2,5.497,2.333,2.711 +max_reduce,8,5000,5,2,13.267,5.666,6.523 +max_reduce,8,10000,5,2,26.494,11.293,13.048 +max_reduce,8,100000,5,2,258.81,110.393,127.621 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..fd6cde2 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.107,0.038,0.039 +pairwise,2,1000,5,2,0.24,0.086,0.088 +pairwise,2,2000,5,2,0.524,0.172,0.219 +pairwise,2,5000,5,2,1.2,0.429,0.439 +pairwise,2,10000,5,2,2.236,0.799,0.816 +pairwise,2,100000,5,2,20.877,7.438,7.663 +pairwise,3,500,5,2,0.287,0.119,0.12 +pairwise,3,1000,5,2,0.574,0.238,0.24 +pairwise,3,2000,5,2,1.143,0.47,0.483 +pairwise,3,5000,5,2,2.65,1.094,1.115 +pairwise,3,10000,5,2,5.094,2.102,2.13 +pairwise,3,100000,5,2,49.585,20.414,20.873 +pairwise,4,500,5,2,0.531,0.233,0.235 
+pairwise,4,1000,5,2,1.063,0.468,0.47 +pairwise,4,2000,5,2,2.078,0.931,0.899 +pairwise,4,5000,5,2,4.841,2.152,2.115 +pairwise,4,10000,5,2,9.547,4.196,4.23 +pairwise,4,100000,5,2,94.874,42.855,41.191 +pairwise,5,500,5,2,0.845,0.381,0.387 +pairwise,5,1000,5,2,1.6,0.717,0.729 +pairwise,5,2000,5,2,3.18,1.461,1.433 +pairwise,5,5000,5,2,7.481,3.401,3.395 +pairwise,5,10000,5,2,14.703,6.683,6.693 +pairwise,5,100000,5,2,149.014,67.846,67.666 +pairwise,6,500,5,2,1.212,0.552,0.568 +pairwise,6,1000,5,2,2.31,1.072,1.067 +pairwise,6,2000,5,2,4.45,2.052,2.061 +pairwise,6,5000,5,2,10.542,4.815,4.934 +pairwise,6,10000,5,2,21.06,9.612,9.871 +pairwise,6,100000,5,2,211.288,96.319,99.171 +pairwise,7,500,5,2,1.551,0.72,0.732 +pairwise,7,1000,5,2,3.13,1.452,1.479 +pairwise,7,2000,5,2,5.981,2.771,2.827 +pairwise,7,5000,5,2,14.508,6.71,6.882 +pairwise,7,10000,5,2,29.166,13.517,13.814 +pairwise,7,100000,5,2,292.368,134.58,139.464 +pairwise,8,500,5,2,2.039,0.959,0.966 +pairwise,8,1000,5,2,3.949,1.86,1.866 +pairwise,8,2000,5,2,7.685,3.592,3.667 +pairwise,8,5000,5,2,18.993,8.878,9.08 +pairwise,8,10000,5,2,37.866,17.756,18.033 +pairwise,8,100000,5,2,379.795,177.795,181.225 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..a67f5fb --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1-homebrew-llvm-rerun/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.107,0.366,0.2923497267759563,pairwise +2,1000,0.24,0.539,0.44526901669758806,pairwise +2,2000,0.524,0.859,0.610011641443539,pairwise +2,5000,1.2,1.801,0.6662965019433648,pairwise +2,10000,2.236,3.127,0.7150623600895428,pairwise +2,100000,20.877,29.167,0.7157746768608358,pairwise +3,500,0.287,0.293,0.9795221843003413,pairwise 
+3,1000,0.574,0.585,0.9811965811965812,pairwise +3,2000,1.143,1.197,0.9548872180451128,pairwise +3,5000,2.65,2.818,0.9403832505322923,pairwise +3,10000,5.094,5.209,0.9779228258782877,pairwise +3,100000,49.585,51.026,0.9717594951593305,pairwise +4,500,0.531,0.485,1.0948453608247424,max_reduce +4,1000,1.063,0.924,1.1504329004329004,max_reduce +4,2000,2.078,1.733,1.199076745527986,max_reduce +4,5000,4.841,4.366,1.1087952359138802,max_reduce +4,10000,9.547,8.138,1.17313836323421,max_reduce +4,100000,94.874,84.46,1.1233009708737864,max_reduce +5,500,0.845,0.676,1.2499999999999998,max_reduce +5,1000,1.6,1.316,1.21580547112462,max_reduce +5,2000,3.18,2.515,1.2644135188866799,max_reduce +5,5000,7.481,6.402,1.168541080912215,max_reduce +5,10000,14.703,12.69,1.158628841607565,max_reduce +5,100000,149.014,118.285,1.2597878006509702,max_reduce +6,500,1.212,0.918,1.3202614379084967,max_reduce +6,1000,2.31,1.837,1.25748502994012,max_reduce +6,2000,4.45,3.412,1.3042203985932006,max_reduce +6,5000,10.542,8.67,1.215916955017301,max_reduce +6,10000,21.06,15.991,1.3169908073291225,max_reduce +6,100000,211.288,171.928,1.2289330417384021,max_reduce +7,500,1.551,1.206,1.2860696517412935,max_reduce +7,1000,3.13,2.248,1.3923487544483983,max_reduce +7,2000,5.981,4.387,1.363346250284933,max_reduce +7,5000,14.508,11.492,1.2624434389140269,max_reduce +7,10000,29.166,20.987,1.3897174441320819,max_reduce +7,100000,292.368,211.033,1.385413655684182,max_reduce +8,500,2.039,1.442,1.4140083217753123,max_reduce +8,1000,3.949,2.769,1.4261466233297218,max_reduce +8,2000,7.685,5.497,1.3980352919774421,max_reduce +8,5000,18.993,13.267,1.4315971960503504,max_reduce +8,10000,37.866,26.494,1.429229259454971,max_reduce +8,100000,379.795,258.81,1.4674664812024265,max_reduce diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..2d6616b --- /dev/null +++ 
b/benchmark-analysis/focus-n2-8-tahoe-m1/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.153,0.058,0.064 +max_reduce,2,1000,5,2,0.313,0.12,0.131 +max_reduce,2,2000,5,2,0.603,0.232,0.252 +max_reduce,2,5000,5,2,1.506,0.579,0.63 +max_reduce,2,10000,5,2,3.016,1.159,1.262 +max_reduce,2,100000,5,2,30.249,11.622,12.676 +max_reduce,3,500,5,2,0.262,0.104,0.115 +max_reduce,3,1000,5,2,0.524,0.208,0.23 +max_reduce,3,2000,5,2,1.087,0.416,0.5 +max_reduce,3,5000,5,2,2.718,1.08,1.196 +max_reduce,3,10000,5,2,5.243,2.081,2.309 +max_reduce,3,100000,5,2,52.645,20.883,23.206 +max_reduce,4,500,5,2,0.414,0.169,0.188 +max_reduce,4,1000,5,2,0.843,0.35,0.381 +max_reduce,4,2000,5,2,1.691,0.695,0.769 +max_reduce,4,5000,5,2,4.138,1.698,1.885 +max_reduce,4,10000,5,2,8.285,3.4,3.772 +max_reduce,4,100000,5,2,83.105,34.108,37.867 +max_reduce,5,500,5,2,0.606,0.253,0.283 +max_reduce,5,1000,5,2,1.212,0.507,0.567 +max_reduce,5,2000,5,2,2.597,1.012,1.311 +max_reduce,5,5000,5,2,6.063,2.539,2.84 +max_reduce,5,10000,5,2,12.146,5.087,5.688 +max_reduce,5,100000,5,2,121.787,50.859,57.182 +max_reduce,6,500,5,2,0.822,0.336,0.403 +max_reduce,6,1000,5,2,1.643,0.673,0.805 +max_reduce,6,2000,5,2,3.294,1.352,1.616 +max_reduce,6,5000,5,2,8.313,3.427,4.067 +max_reduce,6,10000,5,2,16.464,6.742,8.081 +max_reduce,6,100000,5,2,177.789,67.649,93.821 +max_reduce,7,500,5,2,1.178,0.461,0.622 +max_reduce,7,1000,5,2,2.159,0.921,1.048 +max_reduce,7,2000,5,2,4.327,1.849,2.101 +max_reduce,7,5000,5,2,10.82,4.613,5.263 +max_reduce,7,10000,5,2,21.646,9.234,10.525 +max_reduce,7,100000,5,2,236.383,92.557,124.809 +max_reduce,8,500,5,2,1.325,0.563,0.653 +max_reduce,8,1000,5,2,2.954,1.127,1.611 +max_reduce,8,2000,5,2,5.323,2.258,2.635 +max_reduce,8,5000,5,2,13.262,5.66,6.532 +max_reduce,8,10000,5,2,26.559,11.297,13.108 +max_reduce,8,100000,5,2,295.398,113.367,160.473 diff --git a/benchmark-analysis/focus-n2-8-tahoe-m1/focused_pairwise_n2_8.csv 
b/benchmark-analysis/focus-n2-8-tahoe-m1/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..a7eae2c --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.113,0.04,0.041 +pairwise,2,1000,5,2,0.216,0.077,0.079 +pairwise,2,2000,5,2,0.429,0.154,0.157 +pairwise,2,5000,5,2,1.075,0.384,0.393 +pairwise,2,10000,5,2,2.156,0.77,0.788 +pairwise,2,100000,5,2,21.628,7.757,7.854 +pairwise,3,500,5,2,0.266,0.11,0.111 +pairwise,3,1000,5,2,0.565,0.21,0.268 +pairwise,3,2000,5,2,1.019,0.42,0.429 +pairwise,3,5000,5,2,2.578,1.076,1.076 +pairwise,3,10000,5,2,5.162,2.156,2.152 +pairwise,3,100000,5,2,51.556,21.373,21.637 +pairwise,4,500,5,2,0.475,0.208,0.21 +pairwise,4,1000,5,2,0.952,0.42,0.42 +pairwise,4,2000,5,2,1.905,0.839,0.843 +pairwise,4,5000,5,2,4.778,2.098,2.124 +pairwise,4,10000,5,2,9.604,4.197,4.296 +pairwise,4,100000,5,2,99.357,44.892,43.313 +pairwise,5,500,5,2,0.755,0.341,0.345 +pairwise,5,1000,5,2,1.563,0.703,0.717 +pairwise,5,2000,5,2,3.026,1.365,1.387 +pairwise,5,5000,5,2,7.573,3.425,3.465 +pairwise,5,10000,5,2,15.112,6.838,6.904 +pairwise,5,100000,5,2,151.847,68.742,69.363 +pairwise,6,500,5,2,1.116,0.511,0.519 +pairwise,6,1000,5,2,2.189,1.0,1.025 +pairwise,6,2000,5,2,4.356,1.988,2.043 +pairwise,6,5000,5,2,11.326,5.157,5.322 +pairwise,6,10000,5,2,21.75,9.936,10.187 +pairwise,6,100000,5,2,218.774,99.698,102.684 +pairwise,7,500,5,2,1.616,0.717,0.8 +pairwise,7,1000,5,2,3.108,1.444,1.467 +pairwise,7,2000,5,2,6.0,2.784,2.837 +pairwise,7,5000,5,2,15.009,6.961,7.105 +pairwise,7,10000,5,2,30.005,13.959,14.157 +pairwise,7,100000,5,2,300.895,139.938,141.98 +pairwise,8,500,5,2,1.973,0.926,0.938 +pairwise,8,1000,5,2,3.9,1.833,1.852 +pairwise,8,2000,5,2,7.82,3.677,3.714 +pairwise,8,5000,5,2,20.138,9.188,9.879 +pairwise,8,10000,5,2,39.09,18.394,18.547 +pairwise,8,100000,5,2,391.364,183.707,186.116 diff --git 
a/benchmark-analysis/focus-n2-8-tahoe-m1/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-tahoe-m1/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..bec5986 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-tahoe-m1/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.113,0.153,0.738562091503268,pairwise +2,1000,0.216,0.313,0.6900958466453674,pairwise +2,2000,0.429,0.603,0.7114427860696517,pairwise +2,5000,1.075,1.506,0.7138114209827356,pairwise +2,10000,2.156,3.016,0.7148541114058355,pairwise +2,100000,21.628,30.249,0.7149988429369566,pairwise +3,500,0.266,0.262,1.015267175572519,max_reduce +3,1000,0.565,0.524,1.07824427480916,max_reduce +3,2000,1.019,1.087,0.937442502299908,pairwise +3,5000,2.578,2.718,0.9484915378955113,pairwise +3,10000,5.162,5.243,0.9845508296776654,pairwise +3,100000,51.556,52.645,0.9793142748599106,pairwise +4,500,0.475,0.414,1.1473429951690821,max_reduce +4,1000,0.952,0.843,1.129300118623962,max_reduce +4,2000,1.905,1.691,1.1265523358959195,max_reduce +4,5000,4.778,4.138,1.1546640889318511,max_reduce +4,10000,9.604,8.285,1.1592033796016896,max_reduce +4,100000,99.357,83.105,1.1955598339450093,max_reduce +5,500,0.755,0.606,1.245874587458746,max_reduce +5,1000,1.563,1.212,1.2896039603960396,max_reduce +5,2000,3.026,2.597,1.1651906045437042,max_reduce +5,5000,7.573,6.063,1.24905162460828,max_reduce +5,10000,15.112,12.146,1.2441956199571875,max_reduce +5,100000,151.847,121.787,1.2468243737016267,max_reduce +6,500,1.116,0.822,1.3576642335766425,max_reduce +6,1000,2.189,1.643,1.332318928788801,max_reduce +6,2000,4.356,3.294,1.3224043715846994,max_reduce +6,5000,11.326,8.313,1.362444364248767,max_reduce +6,10000,21.75,16.464,1.3210641399416911,max_reduce +6,100000,218.774,177.789,1.2305260730416394,max_reduce +7,500,1.616,1.178,1.371816638370119,max_reduce +7,1000,3.108,2.159,1.4395553496989348,max_reduce 
+7,2000,6.0,4.327,1.386642015253062,max_reduce +7,5000,15.009,10.82,1.3871534195933457,max_reduce +7,10000,30.005,21.646,1.3861683451907973,max_reduce +7,100000,300.895,236.383,1.2729130267405016,max_reduce +8,500,1.973,1.325,1.489056603773585,max_reduce +8,1000,3.9,2.954,1.3202437373053486,max_reduce +8,2000,7.82,5.323,1.469096374225061,max_reduce +8,5000,20.138,13.262,1.5184738350173428,max_reduce +8,10000,39.09,26.559,1.4718174630068903,max_reduce +8,100000,391.364,295.398,1.3248701751535215,max_reduce diff --git a/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..d795131 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +max_reduce,2,500,5,2,0.277,0.112,0.118 +max_reduce,2,1000,5,2,0.54,0.216,0.235 +max_reduce,2,2000,5,2,1.087,0.432,0.471 +max_reduce,2,5000,5,2,2.706,1.084,1.179 +max_reduce,2,10000,5,2,5.423,2.175,2.373 +max_reduce,2,100000,5,2,56.385,22.584,24.996 +max_reduce,3,500,5,2,0.54,0.227,0.249 +max_reduce,3,1000,5,2,1.079,0.455,0.5 +max_reduce,3,2000,5,2,2.161,0.911,1.005 +max_reduce,3,5000,5,2,5.399,2.277,2.511 +max_reduce,3,10000,5,2,11.384,5.132,5.036 +max_reduce,3,100000,5,2,114.188,49.074,52.8 +max_reduce,4,500,5,2,0.904,0.394,0.427 +max_reduce,4,1000,5,2,1.823,0.791,0.872 +max_reduce,4,2000,5,2,3.616,1.585,1.715 +max_reduce,4,5000,5,2,9.206,3.862,4.558 +max_reduce,4,10000,5,2,19.364,8.678,9.123 +max_reduce,4,100000,5,2,188.534,82.793,90.003 +max_reduce,5,500,5,2,1.314,0.571,0.644 +max_reduce,5,1000,5,2,2.692,1.182,1.297 +max_reduce,5,2000,5,2,5.385,2.363,2.636 +max_reduce,5,5000,5,2,14.302,5.998,7.27 +max_reduce,5,10000,5,2,26.999,11.654,13.431 +max_reduce,5,100000,5,2,281.104,123.921,137.218 +max_reduce,6,500,5,2,1.837,0.802,0.914 +max_reduce,6,1000,5,2,3.734,1.613,1.864 
+max_reduce,6,2000,5,2,7.391,3.227,3.708 +max_reduce,6,5000,5,2,19.422,8.992,9.299 +max_reduce,6,10000,5,2,38.488,16.513,19.652 +max_reduce,6,100000,5,2,388.776,169.478,195.388 +max_reduce,7,500,5,2,2.674,1.095,1.388 +max_reduce,7,1000,5,2,5.001,2.25,2.482 +max_reduce,7,2000,5,2,9.926,4.381,5.02 +max_reduce,7,5000,5,2,25.743,11.491,12.949 +max_reduce,7,10000,5,2,51.916,23.076,26.213 +max_reduce,7,100000,5,2,520.842,230.772,262.623 +max_reduce,8,500,5,2,3.146,1.413,1.579 +max_reduce,8,1000,5,2,6.216,2.756,3.158 +max_reduce,8,2000,5,2,12.468,5.52,6.352 +max_reduce,8,5000,5,2,32.278,14.519,16.279 +max_reduce,8,10000,5,2,65.428,29.275,33.206 +max_reduce,8,100000,5,2,654.927,289.674,334.606 diff --git a/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..84e41f7 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +mode,n,t,runs,warmup,fb_total_ms,forward_ms,backward_ms +pairwise,2,500,5,2,0.221,0.087,0.089 +pairwise,2,1000,5,2,0.44,0.173,0.178 +pairwise,2,2000,5,2,0.882,0.347,0.357 +pairwise,2,5000,5,2,2.207,0.867,0.897 +pairwise,2,10000,5,2,4.447,1.774,1.798 +pairwise,2,100000,5,2,45.713,17.999,18.851 +pairwise,3,500,5,2,0.507,0.219,0.224 +pairwise,3,1000,5,2,1.012,0.438,0.449 +pairwise,3,2000,5,2,2.023,0.877,0.899 +pairwise,3,5000,5,2,5.322,2.225,2.415 +pairwise,3,10000,5,2,10.175,4.392,4.568 +pairwise,3,100000,5,2,105.112,45.665,46.981 +pairwise,4,500,5,2,0.872,0.391,0.398 +pairwise,4,1000,5,2,1.747,0.785,0.802 +pairwise,4,2000,5,2,3.558,1.629,1.611 +pairwise,4,5000,5,2,9.266,4.386,4.086 +pairwise,4,10000,5,2,18.341,8.003,8.771 +pairwise,4,100000,5,2,181.699,82.067,83.36 +pairwise,5,500,5,2,1.325,0.61,0.615 +pairwise,5,1000,5,2,2.65,1.226,1.23 +pairwise,5,2000,5,2,5.339,2.45,2.502 +pairwise,5,5000,5,2,13.325,6.161,6.206 +pairwise,5,10000,5,2,27.926,13.005,13.014 
+pairwise,5,100000,5,2,279.625,128.97,129.897 +pairwise,6,500,5,2,1.86,0.862,0.879 +pairwise,6,1000,5,2,3.774,1.726,1.79 +pairwise,6,2000,5,2,7.444,3.471,3.518 +pairwise,6,5000,5,2,19.162,8.772,9.256 +pairwise,6,10000,5,2,38.646,18.23,18.16 +pairwise,6,100000,5,2,392.471,182.592,186.207 +pairwise,7,500,5,2,2.489,1.167,1.186 +pairwise,7,1000,5,2,5.027,2.376,2.383 +pairwise,7,2000,5,2,10.012,4.727,4.758 +pairwise,7,5000,5,2,25.914,12.402,12.192 +pairwise,7,10000,5,2,52.01,24.442,24.963 +pairwise,7,100000,5,2,521.582,244.542,249.374 +pairwise,8,500,5,2,3.23,1.504,1.532 +pairwise,8,1000,5,2,6.377,3.018,3.058 +pairwise,8,2000,5,2,13.28,6.027,6.64 +pairwise,8,5000,5,2,33.573,15.734,16.363 +pairwise,8,10000,5,2,66.948,31.635,32.28 +pairwise,8,100000,5,2,670.26,316.36,322.499 diff --git a/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..1b37cf8 --- /dev/null +++ b/benchmark-analysis/focus-n2-8-ventura-kabylake/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +2,500,0.221,0.277,0.7978339350180504,pairwise +2,1000,0.44,0.54,0.8148148148148148,pairwise +2,2000,0.882,1.087,0.8114075436982521,pairwise +2,5000,2.207,2.706,0.8155949741315595,pairwise +2,10000,4.447,5.423,0.8200258159690208,pairwise +2,100000,45.713,56.385,0.8107298040258935,pairwise +3,500,0.507,0.54,0.9388888888888889,pairwise +3,1000,1.012,1.079,0.93790546802595,pairwise +3,2000,2.023,2.161,0.9361406756131421,pairwise +3,5000,5.322,5.399,0.985738099648083,pairwise +3,10000,10.175,11.384,0.8937983134223472,pairwise +3,100000,105.112,114.188,0.9205170420709706,pairwise +4,500,0.872,0.904,0.9646017699115044,pairwise +4,1000,1.747,1.823,0.9583104772353265,pairwise +4,2000,3.558,3.616,0.9839601769911503,pairwise +4,5000,9.266,9.206,1.006517488594395,max_reduce 
+4,10000,18.341,19.364,0.9471700061970667,pairwise +4,100000,181.699,188.534,0.9637465921266192,pairwise +5,500,1.325,1.314,1.0083713850837137,max_reduce +5,1000,2.65,2.692,0.9843982169390787,pairwise +5,2000,5.339,5.385,0.9914577530176417,pairwise +5,5000,13.325,14.302,0.9316878758215634,pairwise +5,10000,27.926,26.999,1.0343346049853699,max_reduce +5,100000,279.625,281.104,0.9947386020832149,pairwise +6,500,1.86,1.837,1.0125204137180186,max_reduce +6,1000,3.774,3.734,1.0107123727905731,max_reduce +6,2000,7.444,7.391,1.007170883506968,max_reduce +6,5000,19.162,19.422,0.9866131191432396,pairwise +6,10000,38.646,38.488,1.0041051756391604,max_reduce +6,100000,392.471,388.776,1.009504187501286,max_reduce +7,500,2.489,2.674,0.9308152580403889,pairwise +7,1000,5.027,5.001,1.0051989602079583,max_reduce +7,2000,10.012,9.926,1.008664114446907,max_reduce +7,5000,25.914,25.743,1.0066425824495981,max_reduce +7,10000,52.01,51.916,1.0018106171507821,max_reduce +7,100000,521.582,520.842,1.0014207763582814,max_reduce +8,500,3.23,3.146,1.0267005721551177,max_reduce +8,1000,6.377,6.216,1.0259009009009008,max_reduce +8,2000,13.28,12.468,1.065126724414501,max_reduce +8,5000,33.573,32.278,1.0401202057128696,max_reduce +8,10000,66.948,65.428,1.0232316439444886,max_reduce +8,100000,670.26,654.927,1.0234117695559963,max_reduce diff --git a/benchmark-analysis/focus-n2-8/focused_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8/focused_max_reduce_n2_8.csv new file mode 100644 index 0000000..ee91af5 --- /dev/null +++ b/benchmark-analysis/focus-n2-8/focused_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +"mode","n","t","runs","warmup","fb_total_ms","forward_ms","backward_ms" +"max_reduce","2","500","5","2","0.035","0.017","0.017" +"max_reduce","2","1000","5","2","0.105","0.051","0.052" +"max_reduce","2","2000","5","2","0.166","0.069","0.07" +"max_reduce","2","5000","5","2","0.407","0.171","0.174" +"max_reduce","2","10000","5","2","0.808","0.343","0.427" 
+"max_reduce","2","100000","5","2","7.75","3.437","3.498" +"max_reduce","3","500","5","2","0.064","0.032","0.031" +"max_reduce","3","1000","5","2","0.144","0.063","0.063" +"max_reduce","3","2000","5","2","0.405","0.211","0.136" +"max_reduce","3","5000","5","2","0.716","0.318","0.316" +"max_reduce","3","10000","5","2","1.278","0.634","0.631" +"max_reduce","3","100000","5","2","14.657","6.589","6.891" +"max_reduce","4","500","5","2","0.098","0.048","0.049" +"max_reduce","4","1000","5","2","0.221","0.095","0.099" +"max_reduce","4","2000","5","2","0.652","0.286","0.294" +"max_reduce","4","5000","5","2","1.644","0.745","0.765" +"max_reduce","4","10000","5","2","2.025","1.004","0.996" +"max_reduce","4","100000","5","2","21.311","9.707","10.107" +"max_reduce","5","500","5","2","0.169","0.074","0.076" +"max_reduce","5","1000","5","2","0.332","0.149","0.153" +"max_reduce","5","2000","5","2","0.694","0.302","0.304" +"max_reduce","5","5000","5","2","1.631","0.744","0.828" +"max_reduce","5","10000","5","2","3.227","1.632","1.558" +"max_reduce","5","100000","5","2","31.91","14.943","15.307" +"max_reduce","6","500","5","2","0.219","0.099","0.101" +"max_reduce","6","1000","5","2","0.438","0.198","0.201" +"max_reduce","6","2000","5","2","0.877","0.398","0.41" +"max_reduce","6","5000","5","2","2.195","0.995","1.059" +"max_reduce","6","10000","5","2","4.084","1.99","2.053" +"max_reduce","6","100000","5","2","44.258","20.539","21.621" +"max_reduce","7","500","5","2","0.281","0.128","0.131" +"max_reduce","7","1000","5","2","0.567","0.257","0.264" +"max_reduce","7","2000","5","2","1.127","0.518","0.532" +"max_reduce","7","5000","5","2","2.763","1.286","1.32" +"max_reduce","7","10000","5","2","5.572","2.629","2.765" +"max_reduce","7","100000","5","2","56.326","26.534","27.534" +"max_reduce","8","500","5","2","0.341","0.158","0.16" +"max_reduce","8","1000","5","2","0.687","0.316","0.32" +"max_reduce","8","2000","5","2","1.35","0.633","0.642" 
+"max_reduce","8","5000","5","2","3.369","1.588","1.619" +"max_reduce","8","10000","5","2","6.713","3.211","3.308" +"max_reduce","8","100000","5","2","67.659","32.351","32.64" diff --git a/benchmark-analysis/focus-n2-8/focused_pairwise_n2_8.csv b/benchmark-analysis/focus-n2-8/focused_pairwise_n2_8.csv new file mode 100644 index 0000000..03aeaf9 --- /dev/null +++ b/benchmark-analysis/focus-n2-8/focused_pairwise_n2_8.csv @@ -0,0 +1,43 @@ +"mode","n","t","runs","warmup","fb_total_ms","forward_ms","backward_ms" +"pairwise","2","500","5","2","0.059","0.03","0.028" +"pairwise","2","1000","5","2","0.071","0.036","0.034" +"pairwise","2","2000","5","2","0.17","0.073","0.067" +"pairwise","2","5000","5","2","0.392","0.181","0.169" +"pairwise","2","10000","5","2","0.705","0.36","0.337" +"pairwise","2","100000","5","2","7.878","3.656","3.413" +"pairwise","3","500","5","2","0.096","0.049","0.046" +"pairwise","3","1000","5","2","0.31","0.146","0.139" +"pairwise","3","2000","5","2","0.421","0.194","0.186" +"pairwise","3","5000","5","2","0.984","0.493","0.464" +"pairwise","3","10000","5","2","1.982","1.017","0.949" +"pairwise","3","100000","5","2","20.358","9.852","9.412" +"pairwise","4","500","5","2","0.216","0.108","0.107" +"pairwise","4","1000","5","2","0.456","0.217","0.214" +"pairwise","4","2000","5","2","0.992","0.468","0.47" +"pairwise","4","5000","5","2","2.281","1.092","1.074" +"pairwise","4","10000","5","2","4.358","2.184","2.153" +"pairwise","4","100000","5","2","45.618","22.369","21.551" +"pairwise","5","500","5","2","0.401","0.194","0.191" +"pairwise","5","1000","5","2","0.799","0.387","0.382" +"pairwise","5","2000","5","2","1.613","0.78","0.765" +"pairwise","5","5000","5","2","3.969","1.946","1.921" +"pairwise","5","10000","5","2","7.789","3.918","3.839" +"pairwise","5","100000","5","2","79.753","39.361","38.679" +"pairwise","6","500","5","2","0.939","0.452","0.447" +"pairwise","6","1000","5","2","1.411","0.698","0.705" 
+"pairwise","6","2000","5","2","2.9","1.411","1.45" +"pairwise","6","5000","5","2","7.532","4.059","3.241" +"pairwise","6","10000","5","2","14.834","8.057","6.694" +"pairwise","6","100000","5","2","124.796","61.695","60.95" +"pairwise","7","500","5","2","0.89","0.434","0.436" +"pairwise","7","1000","5","2","1.76","0.87","0.862" +"pairwise","7","2000","5","2","3.5","1.739","1.735" +"pairwise","7","5000","5","2","8.758","4.39","4.341" +"pairwise","7","10000","5","2","17.708","8.771","8.752" +"pairwise","7","100000","5","2","178.154","88.417","87.337" +"pairwise","8","500","5","2","1.199","0.588","0.587" +"pairwise","8","1000","5","2","2.464","1.2","1.214" +"pairwise","8","2000","5","2","4.8","2.362","2.346" +"pairwise","8","5000","5","2","11.938","5.899","5.871" +"pairwise","8","10000","5","2","23.908","11.86","11.807" +"pairwise","8","100000","5","2","241.353","119.882","118.472" diff --git a/benchmark-analysis/focus-n2-8/focused_pairwise_vs_max_reduce_n2_8.csv b/benchmark-analysis/focus-n2-8/focused_pairwise_vs_max_reduce_n2_8.csv new file mode 100644 index 0000000..f5681e4 --- /dev/null +++ b/benchmark-analysis/focus-n2-8/focused_pairwise_vs_max_reduce_n2_8.csv @@ -0,0 +1,43 @@ +"n","t","pairwise_fb_total_ms","max_reduce_fb_total_ms","speedup_max_over_pair","winner" +"2","500","0.059","0.035","1.6857142857142855","max_reduce" +"2","1000","0.071","0.105","0.6761904761904761","pairwise" +"2","2000","0.17","0.166","1.0240963855421688","max_reduce" +"2","5000","0.392","0.407","0.9631449631449632","pairwise" +"2","10000","0.705","0.808","0.8725247524752474","pairwise" +"2","100000","7.878","7.75","1.0165161290322582","max_reduce" +"3","500","0.096","0.064","1.5","max_reduce" +"3","1000","0.31","0.144","2.152777777777778","max_reduce" +"3","2000","0.421","0.405","1.039506172839506","max_reduce" +"3","5000","0.984","0.716","1.3743016759776536","max_reduce" +"3","10000","1.982","1.278","1.5508607198748043","max_reduce" 
+"3","100000","20.358","14.657","1.3889609060517158","max_reduce" +"4","500","0.216","0.098","2.204081632653061","max_reduce" +"4","1000","0.456","0.221","2.063348416289593","max_reduce" +"4","2000","0.992","0.652","1.5214723926380367","max_reduce" +"4","5000","2.281","1.644","1.387469586374696","max_reduce" +"4","10000","4.358","2.025","2.1520987654320987","max_reduce" +"4","100000","45.618","21.311","2.1405846745812025","max_reduce" +"5","500","0.401","0.169","2.3727810650887573","max_reduce" +"5","1000","0.799","0.332","2.4066265060240966","max_reduce" +"5","2000","1.613","0.694","2.3242074927953893","max_reduce" +"5","5000","3.969","1.631","2.4334763948497855","max_reduce" +"5","10000","7.789","3.227","2.41369693213511","max_reduce" +"5","100000","79.753","31.91","2.4993105609526793","max_reduce" +"6","500","0.939","0.219","4.287671232876712","max_reduce" +"6","1000","1.411","0.438","3.221461187214612","max_reduce" +"6","2000","2.9","0.877","3.30672748004561","max_reduce" +"6","5000","7.532","2.195","3.431435079726652","max_reduce" +"6","10000","14.834","4.084","3.632223310479922","max_reduce" +"6","100000","124.796","44.258","2.819738804283971","max_reduce" +"7","500","0.89","0.281","3.167259786476868","max_reduce" +"7","1000","1.76","0.567","3.104056437389771","max_reduce" +"7","2000","3.5","1.127","3.1055900621118013","max_reduce" +"7","5000","8.758","2.763","3.169743032935215","max_reduce" +"7","10000","17.708","5.572","3.1780330222541275","max_reduce" +"7","100000","178.154","56.326","3.1629087810247487","max_reduce" +"8","500","1.199","0.341","3.5161290322580645","max_reduce" +"8","1000","2.464","0.687","3.5866084425036386","max_reduce" +"8","2000","4.8","1.35","3.5555555555555554","max_reduce" +"8","5000","11.938","3.369","3.543484713564856","max_reduce" +"8","10000","23.908","6.713","3.5614479368389693","max_reduce" +"8","100000","241.353","67.659","3.5671972686560545","max_reduce" diff --git 
a/benchmark-analysis/high-n-ventura-kabylake/high_n_max_reduce.csv b/benchmark-analysis/high-n-ventura-kabylake/high_n_max_reduce.csv new file mode 100644 index 0000000..acd0131 --- /dev/null +++ b/benchmark-analysis/high-n-ventura-kabylake/high_n_max_reduce.csv @@ -0,0 +1,13 @@ +mode,n,t,runs,warmup,fb_total_ms +max_reduce,16,1000,5,2,25.934 +max_reduce,16,2000,5,2,50.027 +max_reduce,16,5000,5,2,123.155 +max_reduce,32,500,5,2,44.255 +max_reduce,32,1000,5,2,88.512 +max_reduce,32,2000,5,2,177.733 +max_reduce,64,200,5,2,64.2 +max_reduce,64,500,5,2,161.063 +max_reduce,64,1000,5,2,325.204 +max_reduce,128,100,5,2,121.246 +max_reduce,128,250,5,2,302.313 +max_reduce,128,500,5,2,612.757 diff --git a/benchmark-analysis/high-n-ventura-kabylake/high_n_pairwise.csv b/benchmark-analysis/high-n-ventura-kabylake/high_n_pairwise.csv new file mode 100644 index 0000000..e3e9dff --- /dev/null +++ b/benchmark-analysis/high-n-ventura-kabylake/high_n_pairwise.csv @@ -0,0 +1,13 @@ +mode,n,t,runs,warmup,fb_total_ms +pairwise,16,1000,5,2,25.675 +pairwise,16,2000,5,2,51.435 +pairwise,16,5000,5,2,126.996 +pairwise,32,500,5,2,43.501 +pairwise,32,1000,5,2,86.664 +pairwise,32,2000,5,2,173.84 +pairwise,64,200,5,2,56.837 +pairwise,64,500,5,2,145.821 +pairwise,64,1000,5,2,283.18 +pairwise,128,100,5,2,102.735 +pairwise,128,250,5,2,250.751 +pairwise,128,500,5,2,505.429 diff --git a/benchmark-analysis/high-n-ventura-kabylake/high_n_pairwise_vs_max_reduce.csv b/benchmark-analysis/high-n-ventura-kabylake/high_n_pairwise_vs_max_reduce.csv new file mode 100644 index 0000000..b374ec2 --- /dev/null +++ b/benchmark-analysis/high-n-ventura-kabylake/high_n_pairwise_vs_max_reduce.csv @@ -0,0 +1,13 @@ +n,t,pairwise_fb_total_ms,max_reduce_fb_total_ms,speedup_max_over_pair,winner +16,1000,25.675,25.934,0.9900131102028226,pairwise +16,2000,51.435,50.027,1.0281448018070243,max_reduce +16,5000,126.996,123.155,1.031188339896878,max_reduce +32,500,43.501,44.255,0.9829623771325273,pairwise 
+32,1000,86.664,88.512,0.97912147505423,pairwise +32,2000,173.84,177.733,0.9780963580201764,pairwise +64,200,56.837,64.2,0.8853115264797508,pairwise +64,500,145.821,161.063,0.9053662231549146,pairwise +64,1000,283.18,325.204,0.8707764972140564,pairwise +128,100,102.735,121.246,0.8473269221252661,pairwise +128,250,250.751,302.313,0.8294416713803244,pairwise +128,500,505.429,612.757,0.8248441062280807,pairwise diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun-o2/adaptive_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun-o2/adaptive_passes.csv new file mode 100644 index 0000000..8fe2aab --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun-o2/adaptive_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +clangcl_adaptive_o2,1,9979.5,30481.2,3.05 +clangcl_adaptive_o2,2,9192.6,27960.2,3.04 +clangcl_adaptive_o2,3,10620.8,30674.7,2.89 +clangcl_adaptive_o2,4,10261.2,30457.3,2.97 +clangcl_adaptive_o2,5,10377.6,30265.0,2.92 +clangcl_adaptive_o2,6,10339.4,30766.2,2.98 +clangcl_adaptive_o2,7,10430.0,30559.7,2.93 +clangcl_adaptive_o2,8,7184.8,25793.7,3.59 +clangcl_adaptive_o2,9,9890.9,30525.4,3.09 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun-o2/control_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun-o2/control_passes.csv new file mode 100644 index 0000000..a21d418 --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun-o2/control_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +clangcl_control_o2,1,8844.3,28803.3,3.26 +clangcl_control_o2,2,10440.4,30681.8,2.94 +clangcl_control_o2,3,10607.2,30760.2,2.9 +clangcl_control_o2,4,10244.6,30830.2,3.01 +clangcl_control_o2,5,10492.5,30586.3,2.92 +clangcl_control_o2,6,10371.1,30365.2,2.93 +clangcl_control_o2,7,10235.7,30156.6,2.95 +clangcl_control_o2,8,10331.6,30036.8,2.91 
+clangcl_control_o2,9,10265.7,30875.1,3.01 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun/adaptive_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun/adaptive_passes.csv new file mode 100644 index 0000000..afdb795 --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun/adaptive_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +clangcl_adaptive,1,4413.6,5817.9,1.32 +clangcl_adaptive,2,4311.0,5602.0,1.3 +clangcl_adaptive,3,4557.1,5949.2,1.31 +clangcl_adaptive,4,4674.8,5959.4,1.27 +clangcl_adaptive,5,4749.7,5995.5,1.26 +clangcl_adaptive,6,4652.7,6016.9,1.29 +clangcl_adaptive,7,4632.0,5938.7,1.28 +clangcl_adaptive,8,4641.3,6016.9,1.3 +clangcl_adaptive,9,4661.4,6073.8,1.3 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun/control_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun/control_passes.csv new file mode 100644 index 0000000..5f30d9a --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-clangcl-rerun/control_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +clangcl_control,1,4641.1,5795.5,1.25 +clangcl_control,2,4659.8,5948.7,1.28 +clangcl_control,3,4593.9,5817.9,1.27 +clangcl_control,4,4690.3,6095.3,1.3 +clangcl_control,5,4628.5,5979.6,1.29 +clangcl_control,6,4634.2,5999.1,1.29 +clangcl_control,7,4627.3,5894.7,1.27 +clangcl_control,8,4050.7,5181.0,1.28 +clangcl_control,9,4826.3,5919.6,1.23 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-mingw-rerun/adaptive_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-mingw-rerun/adaptive_passes.csv new file mode 100644 index 0000000..2f32b45 --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-mingw-rerun/adaptive_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +mingw_adaptive,1,10078.5,34151.8,3.39 
+mingw_adaptive,2,8781.1,29842.7,3.4 +mingw_adaptive,3,9702.9,33915.3,3.5 +mingw_adaptive,4,10226.5,34044.0,3.33 +mingw_adaptive,5,9529.7,32876.4,3.45 +mingw_adaptive,6,10208.4,34532.0,3.38 +mingw_adaptive,7,10291.1,34420.4,3.34 +mingw_adaptive,8,10247.6,34227.6,3.34 +mingw_adaptive,9,10227.4,34389.8,3.36 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-mingw-rerun/control_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-mingw-rerun/control_passes.csv new file mode 100644 index 0000000..f4a88a1 --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-mingw-rerun/control_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +mingw_control,1,9954.9,33594.9,3.37 +mingw_control,2,8793.8,31930.7,3.63 +mingw_control,3,9913.5,33971.1,3.43 +mingw_control,4,10019.6,33623.8,3.36 +mingw_control,5,9744.4,32670.8,3.35 +mingw_control,6,10212.6,34327.2,3.36 +mingw_control,7,10327.8,34152.9,3.31 +mingw_control,8,10298.2,34393.7,3.34 +mingw_control,9,9755.7,33453.4,3.43 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-msvc-rerun/adaptive_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-msvc-rerun/adaptive_passes.csv new file mode 100644 index 0000000..7ea41a5 --- /dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-msvc-rerun/adaptive_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +msvc_adaptive,1,7929.9,27251.2,3.44 +msvc_adaptive,2,8946.5,29649.1,3.31 +msvc_adaptive,3,9145.1,28956.1,3.17 +msvc_adaptive,4,9448.0,29762.3,3.15 +msvc_adaptive,5,9403.0,30316.3,3.22 +msvc_adaptive,6,9418.2,30474.7,3.24 +msvc_adaptive,7,9168.2,28367.2,3.09 +msvc_adaptive,8,9466.6,30332.9,3.2 +msvc_adaptive,9,9358.2,30473.8,3.26 diff --git a/benchmark-analysis/hmmlib-9pass-ryzen-windows-msvc-rerun/control_passes.csv b/benchmark-analysis/hmmlib-9pass-ryzen-windows-msvc-rerun/control_passes.csv new file mode 100644 index 0000000..2db8e05 --- 
/dev/null +++ b/benchmark-analysis/hmmlib-9pass-ryzen-windows-msvc-rerun/control_passes.csv @@ -0,0 +1,10 @@ +label,pass,libhmm_avg_obs_ms,hmmlib_avg_obs_ms,ratio_hmmlib_over_libhmm +msvc_control,1,8899.9,29202.0,3.28 +msvc_control,2,8708.8,29335.0,3.37 +msvc_control,3,8586.8,29263.8,3.41 +msvc_control,4,8847.9,28780.8,3.25 +msvc_control,5,9660.4,30483.4,3.16 +msvc_control,6,9397.2,29902.9,3.18 +msvc_control,7,9433.7,29669.7,3.15 +msvc_control,8,9497.0,30340.8,3.19 +msvc_control,9,9033.0,27398.8,3.03 diff --git a/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-appleclang-rerun/hmmlib_9pass_summary.json b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-appleclang-rerun/hmmlib_9pass_summary.json new file mode 100644 index 0000000..4362a1e --- /dev/null +++ b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-appleclang-rerun/hmmlib_9pass_summary.json @@ -0,0 +1,117 @@ +{ + "control_median_ratio_hmmlib_over_libhmm": 7.5171448054162004, + "adaptive_median_ratio_hmmlib_over_libhmm": 7.613662932294204, + "delta_percent_adaptive_vs_control": 1.283973228884206, + "control_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 4025.5, + "hmmlib_avg_throughput_obs_per_ms": 30999.6, + "ratio_hmmlib_over_libhmm": 7.700807353123835 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 4114.5, + "hmmlib_avg_throughput_obs_per_ms": 30763.8, + "ratio_hmmlib_over_libhmm": 7.476923076923077 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 4099.8, + "hmmlib_avg_throughput_obs_per_ms": 30764.1, + "ratio_hmmlib_over_libhmm": 7.503805063661641 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 4141.5, + "hmmlib_avg_throughput_obs_per_ms": 31065.0, + "ratio_hmmlib_over_libhmm": 7.5009054690329595 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 4045.7, + "hmmlib_avg_throughput_obs_per_ms": 30134.7, + "ratio_hmmlib_over_libhmm": 7.448575030279062 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 4102.6, + 
"hmmlib_avg_throughput_obs_per_ms": 31059.9, + "ratio_hmmlib_over_libhmm": 7.5707843806366695 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 4056.2, + "hmmlib_avg_throughput_obs_per_ms": 30943.8, + "ratio_hmmlib_over_libhmm": 7.628765839948721 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 4106.2, + "hmmlib_avg_throughput_obs_per_ms": 30866.9, + "ratio_hmmlib_over_libhmm": 7.5171448054162004 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 4112.7, + "hmmlib_avg_throughput_obs_per_ms": 30960.5, + "ratio_hmmlib_over_libhmm": 7.528022953291026 + } + ], + "adaptive_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 4068.7, + "hmmlib_avg_throughput_obs_per_ms": 31003.7, + "ratio_hmmlib_over_libhmm": 7.620050630422494 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 4106.0, + "hmmlib_avg_throughput_obs_per_ms": 31261.7, + "ratio_hmmlib_over_libhmm": 7.613662932294204 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 4103.9, + "hmmlib_avg_throughput_obs_per_ms": 30937.6, + "ratio_hmmlib_over_libhmm": 7.5385852481785625 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 3983.1, + "hmmlib_avg_throughput_obs_per_ms": 30418.5, + "ratio_hmmlib_over_libhmm": 7.6368908638999775 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 4001.6, + "hmmlib_avg_throughput_obs_per_ms": 30412.1, + "ratio_hmmlib_over_libhmm": 7.599985005997601 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 3996.8, + "hmmlib_avg_throughput_obs_per_ms": 30508.4, + "ratio_hmmlib_over_libhmm": 7.633206565252202 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 3995.2, + "hmmlib_avg_throughput_obs_per_ms": 30228.8, + "ratio_hmmlib_over_libhmm": 7.566279535442531 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 3982.1, + "hmmlib_avg_throughput_obs_per_ms": 30486.9, + "ratio_hmmlib_over_libhmm": 7.655985535270335 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 4001.8, + 
"hmmlib_avg_throughput_obs_per_ms": 30388.0, + "ratio_hmmlib_over_libhmm": 7.593582887700534 + } + ] +} diff --git a/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-gcc15/hmmlib_9pass_summary.json b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-gcc15/hmmlib_9pass_summary.json new file mode 100644 index 0000000..964098d --- /dev/null +++ b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-gcc15/hmmlib_9pass_summary.json @@ -0,0 +1,117 @@ +{ + "control_median_ratio_hmmlib_over_libhmm": 9.156518900955433, + "adaptive_median_ratio_hmmlib_over_libhmm": 9.1735840061973, + "delta_percent_adaptive_vs_control": 0.18637110266966933, + "control_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 3253.3, + "hmmlib_avg_throughput_obs_per_ms": 31084.7, + "ratio_hmmlib_over_libhmm": 9.554821258414533 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 3420.7, + "hmmlib_avg_throughput_obs_per_ms": 31343.0, + "ratio_hmmlib_over_libhmm": 9.162744467506652 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 3411.1, + "hmmlib_avg_throughput_obs_per_ms": 30845.9, + "ratio_hmmlib_over_libhmm": 9.042801442349976 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 3367.0, + "hmmlib_avg_throughput_obs_per_ms": 30953.7, + "ratio_hmmlib_over_libhmm": 9.193258093258093 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 3356.2, + "hmmlib_avg_throughput_obs_per_ms": 30719.0, + "ratio_hmmlib_over_libhmm": 9.152911030331923 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 3404.8, + "hmmlib_avg_throughput_obs_per_ms": 30811.7, + "ratio_hmmlib_over_libhmm": 9.049488956766917 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 3370.2, + "hmmlib_avg_throughput_obs_per_ms": 30859.3, + "ratio_hmmlib_over_libhmm": 9.156518900955433 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 3341.8, + "hmmlib_avg_throughput_obs_per_ms": 30105.2, + "ratio_hmmlib_over_libhmm": 9.008677957986713 + }, + { + "pass": 9, + 
"libhmm_avg_throughput_obs_per_ms": 3352.0, + "hmmlib_avg_throughput_obs_per_ms": 31165.2, + "ratio_hmmlib_over_libhmm": 9.297494033412889 + } + ], + "adaptive_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 3122.7, + "hmmlib_avg_throughput_obs_per_ms": 28684.6, + "ratio_hmmlib_over_libhmm": 9.18583277292087 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 3319.1, + "hmmlib_avg_throughput_obs_per_ms": 30293.0, + "ratio_hmmlib_over_libhmm": 9.126871742339791 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 3036.4, + "hmmlib_avg_throughput_obs_per_ms": 28819.5, + "ratio_hmmlib_over_libhmm": 9.491338427084706 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 3342.7, + "hmmlib_avg_throughput_obs_per_ms": 30987.7, + "ratio_hmmlib_over_libhmm": 9.27026056780447 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 3368.9, + "hmmlib_avg_throughput_obs_per_ms": 30740.6, + "ratio_hmmlib_over_libhmm": 9.12481818991362 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 3371.9, + "hmmlib_avg_throughput_obs_per_ms": 30832.6, + "ratio_hmmlib_over_libhmm": 9.143984103917672 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 3359.7, + "hmmlib_avg_throughput_obs_per_ms": 31037.3, + "ratio_hmmlib_over_libhmm": 9.23811649849689 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 3400.0, + "hmmlib_avg_throughput_obs_per_ms": 31027.1, + "ratio_hmmlib_over_libhmm": 9.125617647058823 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 3356.3, + "hmmlib_avg_throughput_obs_per_ms": 30789.3, + "ratio_hmmlib_over_libhmm": 9.1735840061973 + } + ] +} diff --git a/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-homebrew-llvm-rerun/hmmlib_9pass_summary.json b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-homebrew-llvm-rerun/hmmlib_9pass_summary.json new file mode 100644 index 0000000..b0afbf5 --- /dev/null +++ 
b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1-homebrew-llvm-rerun/hmmlib_9pass_summary.json @@ -0,0 +1,117 @@ +{ + "control_median_ratio_hmmlib_over_libhmm": 7.595913843781621, + "adaptive_median_ratio_hmmlib_over_libhmm": 7.60328317373461, + "delta_percent_adaptive_vs_control": 0.09701702921527999, + "control_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 3340.7, + "hmmlib_avg_throughput_obs_per_ms": 27567.1, + "ratio_hmmlib_over_libhmm": 8.251893315772143 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 4187.9, + "hmmlib_avg_throughput_obs_per_ms": 31588.5, + "ratio_hmmlib_over_libhmm": 7.542801881611309 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 4171.2, + "hmmlib_avg_throughput_obs_per_ms": 31666.0, + "ratio_hmmlib_over_libhmm": 7.591580360567702 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 4182.0, + "hmmlib_avg_throughput_obs_per_ms": 31595.3, + "ratio_hmmlib_over_libhmm": 7.555069344811095 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 4150.6, + "hmmlib_avg_throughput_obs_per_ms": 31527.6, + "ratio_hmmlib_over_libhmm": 7.595913843781621 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 3931.3, + "hmmlib_avg_throughput_obs_per_ms": 31573.2, + "ratio_hmmlib_over_libhmm": 8.031236486658358 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 4176.6, + "hmmlib_avg_throughput_obs_per_ms": 31611.1, + "ratio_hmmlib_over_libhmm": 7.568620408945074 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 4161.4, + "hmmlib_avg_throughput_obs_per_ms": 31685.3, + "ratio_hmmlib_over_libhmm": 7.614096217619071 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 4164.3, + "hmmlib_avg_throughput_obs_per_ms": 31757.8, + "ratio_hmmlib_over_libhmm": 7.626203683692337 + } + ], + "adaptive_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 3561.1, + "hmmlib_avg_throughput_obs_per_ms": 27534.9, + "ratio_hmmlib_over_libhmm": 7.732133329589172 + }, + { + 
"pass": 2, + "libhmm_avg_throughput_obs_per_ms": 4057.8, + "hmmlib_avg_throughput_obs_per_ms": 31262.9, + "ratio_hmmlib_over_libhmm": 7.704396470994134 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 4046.3, + "hmmlib_avg_throughput_obs_per_ms": 31150.0, + "ratio_hmmlib_over_libhmm": 7.698391122754121 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 4090.4, + "hmmlib_avg_throughput_obs_per_ms": 30688.5, + "ratio_hmmlib_over_libhmm": 7.502566986113828 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 4102.6, + "hmmlib_avg_throughput_obs_per_ms": 31126.7, + "ratio_hmmlib_over_libhmm": 7.58706673816604 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 4093.6, + "hmmlib_avg_throughput_obs_per_ms": 31124.8, + "ratio_hmmlib_over_libhmm": 7.60328317373461 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 4134.9, + "hmmlib_avg_throughput_obs_per_ms": 31160.5, + "ratio_hmmlib_over_libhmm": 7.535974267817844 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 4111.6, + "hmmlib_avg_throughput_obs_per_ms": 30903.3, + "ratio_hmmlib_over_libhmm": 7.516125109446444 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 3833.6, + "hmmlib_avg_throughput_obs_per_ms": 31131.7, + "ratio_hmmlib_over_libhmm": 8.120748121869783 + } + ] +} diff --git a/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1/hmmlib_9pass_summary.json b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1/hmmlib_9pass_summary.json new file mode 100644 index 0000000..7fa7176 --- /dev/null +++ b/benchmark-analysis/median-gate-hmmlib-9pass-tahoe-m1/hmmlib_9pass_summary.json @@ -0,0 +1,117 @@ +{ + "control_median_ratio_hmmlib_over_libhmm": 7.612772915264018, + "adaptive_median_ratio_hmmlib_over_libhmm": 7.609598545384946, + "delta_percent_adaptive_vs_control": -0.04169794520872997, + "control_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 4120.8, + "hmmlib_avg_throughput_obs_per_ms": 31708.3, + "ratio_hmmlib_over_libhmm": 
7.6946952048145985 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 4147.9, + "hmmlib_avg_throughput_obs_per_ms": 31540.3, + "ratio_hmmlib_over_libhmm": 7.603920055931918 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 4110.0, + "hmmlib_avg_throughput_obs_per_ms": 31868.2, + "ratio_hmmlib_over_libhmm": 7.753819951338199 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 4158.8, + "hmmlib_avg_throughput_obs_per_ms": 31660.0, + "ratio_hmmlib_over_libhmm": 7.612772915264018 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 4159.8, + "hmmlib_avg_throughput_obs_per_ms": 31731.5, + "ratio_hmmlib_over_libhmm": 7.62813116015193 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 4172.9, + "hmmlib_avg_throughput_obs_per_ms": 31570.4, + "ratio_hmmlib_over_libhmm": 7.5655778954683806 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 4186.4, + "hmmlib_avg_throughput_obs_per_ms": 31907.0, + "ratio_hmmlib_over_libhmm": 7.621584177336136 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 4227.0, + "hmmlib_avg_throughput_obs_per_ms": 31928.1, + "ratio_hmmlib_over_libhmm": 7.553371185237757 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 4201.7, + "hmmlib_avg_throughput_obs_per_ms": 31626.7, + "ratio_hmmlib_over_libhmm": 7.527119975248114 + } + ], + "adaptive_passes": [ + { + "pass": 1, + "libhmm_avg_throughput_obs_per_ms": 3552.5, + "hmmlib_avg_throughput_obs_per_ms": 27750.1, + "ratio_hmmlib_over_libhmm": 7.811428571428571 + }, + { + "pass": 2, + "libhmm_avg_throughput_obs_per_ms": 4179.8, + "hmmlib_avg_throughput_obs_per_ms": 31806.6, + "ratio_hmmlib_over_libhmm": 7.609598545384946 + }, + { + "pass": 3, + "libhmm_avg_throughput_obs_per_ms": 4169.8, + "hmmlib_avg_throughput_obs_per_ms": 31679.3, + "ratio_hmmlib_over_libhmm": 7.597318816250179 + }, + { + "pass": 4, + "libhmm_avg_throughput_obs_per_ms": 4172.3, + "hmmlib_avg_throughput_obs_per_ms": 31363.2, + "ratio_hmmlib_over_libhmm": 
7.517005009227524 + }, + { + "pass": 5, + "libhmm_avg_throughput_obs_per_ms": 4117.6, + "hmmlib_avg_throughput_obs_per_ms": 31708.6, + "ratio_hmmlib_over_libhmm": 7.700748008548668 + }, + { + "pass": 6, + "libhmm_avg_throughput_obs_per_ms": 4174.7, + "hmmlib_avg_throughput_obs_per_ms": 31600.1, + "ratio_hmmlib_over_libhmm": 7.569430138692601 + }, + { + "pass": 7, + "libhmm_avg_throughput_obs_per_ms": 4144.7, + "hmmlib_avg_throughput_obs_per_ms": 31582.4, + "ratio_hmmlib_over_libhmm": 7.619948367795016 + }, + { + "pass": 8, + "libhmm_avg_throughput_obs_per_ms": 4183.4, + "hmmlib_avg_throughput_obs_per_ms": 31468.3, + "ratio_hmmlib_over_libhmm": 7.522182913419707 + }, + { + "pass": 9, + "libhmm_avg_throughput_obs_per_ms": 4081.0, + "hmmlib_avg_throughput_obs_per_ms": 31163.2, + "ratio_hmmlib_over_libhmm": 7.636167605978927 + } + ] +} diff --git a/benchmark-analysis/multirun-20260426-194758/perf_vs_main_delta.csv b/benchmark-analysis/multirun-20260426-194758/perf_vs_main_delta.csv new file mode 100644 index 0000000..925c850 --- /dev/null +++ b/benchmark-analysis/multirun-20260426-194758/perf_vs_main_delta.csv @@ -0,0 +1,4 @@ +"benchmark","main_median","perf_median","perf_vs_main_median_delta_pct","main_mean","perf_mean","main_stddev","perf_stddev" +"hmmlib","9367.9","9317","-0.5433448264819184","9309.140000000001","9289.279999999999","625.8642927983668","104.81577648426794" +"stochhmm_discrete","9008.5","9217.3","2.317810956319024","8924.98","9199.380000000001","305.1292054196056","112.49971999965126" +"stochhmm_continuous","7001.3","6946.3","-0.7855683944410323","6581.640000000001","6554.539999999999","747.6081212774511","560.5696995022117" diff --git a/benchmark-analysis/multirun-20260426-194758/raw_results.csv b/benchmark-analysis/multirun-20260426-194758/raw_results.csv new file mode 100644 index 0000000..5757897 --- /dev/null +++ b/benchmark-analysis/multirun-20260426-194758/raw_results.csv @@ -0,0 +1,31 @@ 
+"branch","benchmark","run","exit_code","libhmm_obs_per_ms","comparator_obs_per_ms","reported_ratio_x","log_file" +"main","hmmlib","1","0","9810.5","30645.6","3.12","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\hmmlib-run1.log" +"main","hmmlib","2","0","8254","26242.7","3.18","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\hmmlib-run2.log" +"main","hmmlib","3","0","9361.5","29549.7","3.16","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\hmmlib-run3.log" +"main","hmmlib","4","0","9751.8","30446.6","3.12","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\hmmlib-run4.log" +"main","hmmlib","5","0","9367.9","30395","3.24","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\hmmlib-run5.log" +"main","stochhmm_discrete","1","0","8783.1","4124.6","0.47","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_discrete-run1.log" +"main","stochhmm_discrete","2","0","9235.9","4302","0.47","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_discrete-run2.log" +"main","stochhmm_discrete","3","0","9127.9","4219","0.46","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_discrete-run3.log" +"main","stochhmm_discrete","4","0","8469.5","4109.1","0.49","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_discrete-run4.log" +"main","stochhmm_discrete","5","0","9008.5","4153.6","0.46","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_discrete-run5.log" +"main","stochhmm_continuous","1","0","7177.6","6141.8","0.86","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_continuous-run1.log" 
+"main","stochhmm_continuous","2","0","7112.8","5945.1","0.84","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_continuous-run2.log" +"main","stochhmm_continuous","3","0","6144.4","5364.2","0.87","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_continuous-run3.log" +"main","stochhmm_continuous","4","0","7001.3","6195.2","0.88","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_continuous-run4.log" +"main","stochhmm_continuous","5","0","5472.1","5308.7","0.97","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\main\stochhmm_continuous-run5.log" +"perf","hmmlib","1","0","9369.8","28381","3.03","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\hmmlib-run1.log" +"perf","hmmlib","2","0","9218.2","29956.7","3.25","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\hmmlib-run2.log" +"perf","hmmlib","3","0","9395.1","29843","3.18","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\hmmlib-run3.log" +"perf","hmmlib","4","0","9146.3","29254.1","3.2","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\hmmlib-run4.log" +"perf","hmmlib","5","0","9317","30369.3","3.26","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\hmmlib-run5.log" +"perf","stochhmm_discrete","1","0","9008.2","3980","0.44","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_discrete-run1.log" +"perf","stochhmm_discrete","2","0","9207.3","4118.9","0.45","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_discrete-run2.log" +"perf","stochhmm_discrete","3","0","9217.3","4171.5","0.45","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_discrete-run3.log" 
+"perf","stochhmm_discrete","4","0","9278.7","4277.5","0.46","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_discrete-run4.log" +"perf","stochhmm_discrete","5","0","9285.4","4252.6","0.46","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_discrete-run5.log" +"perf","stochhmm_continuous","1","0","5820.3","5176.7","0.89","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_continuous-run1.log" +"perf","stochhmm_continuous","2","0","6982.3","6158.5","0.88","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_continuous-run2.log" +"perf","stochhmm_continuous","3","0","6946.3","6305.5","0.91","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_continuous-run3.log" +"perf","stochhmm_continuous","4","0","6077.3","5402.7","0.89","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_continuous-run4.log" +"perf","stochhmm_continuous","5","0","6946.5","6148.9","0.89","C:\Users\gdwol\Development\libhmm\benchmark-analysis\multirun-20260426-194758\perf\stochhmm_continuous-run5.log" diff --git a/benchmark-analysis/multirun-20260426-194758/run_manifest.json b/benchmark-analysis/multirun-20260426-194758/run_manifest.json new file mode 100644 index 0000000..9f1d4df --- /dev/null +++ b/benchmark-analysis/multirun-20260426-194758/run_manifest.json @@ -0,0 +1,7 @@ +{ + "output_root": "C:\\Users\\gdwol\\Development\\libhmm\\benchmark-analysis\\multirun-20260426-194758", + "raw_results_csv": "C:\\Users\\gdwol\\Development\\libhmm\\benchmark-analysis\\multirun-20260426-194758\\raw_results.csv", + "summary_stats_csv": "C:\\Users\\gdwol\\Development\\libhmm\\benchmark-analysis\\multirun-20260426-194758\\summary_stats.csv", + "delta_csv": "C:\\Users\\gdwol\\Development\\libhmm\\benchmark-analysis\\multirun-20260426-194758\\perf_vs_main_delta.csv", 
+ "runs_per_benchmark_per_branch": 5 +} diff --git a/benchmark-analysis/multirun-20260426-194758/summary_stats.csv b/benchmark-analysis/multirun-20260426-194758/summary_stats.csv new file mode 100644 index 0000000..2915ad8 --- /dev/null +++ b/benchmark-analysis/multirun-20260426-194758/summary_stats.csv @@ -0,0 +1,7 @@ +"branch","benchmark","n","mean_libhmm_obs_per_ms","median_libhmm_obs_per_ms","stddev_libhmm_obs_per_ms","min_libhmm_obs_per_ms","max_libhmm_obs_per_ms" +"main","hmmlib","5","9309.140000000001","9367.9","625.8642927983668","8254","9810.5" +"perf","hmmlib","5","9289.279999999999","9317","104.81577648426794","9146.3","9395.1" +"main","stochhmm_continuous","5","6581.640000000001","7001.3","747.6081212774511","5472.1","7177.6" +"perf","stochhmm_continuous","5","6554.539999999999","6946.3","560.5696995022117","5820.3","6982.3" +"main","stochhmm_discrete","5","8924.98","9008.5","305.1292054196056","8469.5","9235.9" +"perf","stochhmm_discrete","5","9199.380000000001","9217.3","112.49971999965126","9008.2","9285.4" diff --git a/benchmark-analysis/rollback-dump-20260426-201852.patch b/benchmark-analysis/rollback-dump-20260426-201852.patch new file mode 100644 index 0000000..326f733 --- /dev/null +++ b/benchmark-analysis/rollback-dump-20260426-201852.patch @@ -0,0 +1,500 @@ +diff --git a/include/libhmm/calculators/forward_backward_calculator.h b/include/libhmm/calculators/forward_backward_calculator.h +index 3efd38d..c661736 100755 +--- a/include/libhmm/calculators/forward_backward_calculator.h ++++ b/include/libhmm/calculators/forward_backward_calculator.h +@@ -89,15 +89,20 @@ private: + + // Precomputed log-transition matrix [N x N]: logTrans_(i,j) = log a_{ij} + Matrix logTrans_; ++ // Transposed transition matrix [N x N]: logTransT_(j,i) = log a_{ij} ++ // Used to improve locality in forward recursion (fixed destination state j). 
++ Matrix logTransT_; + + // Results + Matrix logAlpha_; // T x N + Matrix logBeta_; // T x N + double logProbability_{-std::numeric_limits::infinity()}; + +- // Per-state log-emission buffer reused each timestep [T x N, row-major]. +- // Allocated once; filled by getBatchLogProbabilities per state. +- mutable std::vector logEmitBuf_; ++ // Per-state log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) ++ std::vector logEmitBuf_; ++ // Time-major emission buffer: logEmitByTime_[t * N + i] = log b_i(O_t) ++ // Built once per compute() to improve locality in DP kernels. ++ std::vector logEmitByTime_; + + void precomputeLogTransitions(); + void computeLogForward(); +diff --git a/include/libhmm/calculators/viterbi_calculator.h b/include/libhmm/calculators/viterbi_calculator.h +index 7b9ae64..a341ecb 100755 +--- a/include/libhmm/calculators/viterbi_calculator.h ++++ b/include/libhmm/calculators/viterbi_calculator.h +@@ -65,19 +65,24 @@ private: + + // Precomputed log-transition matrix [N x N] + Matrix logTrans_; ++ // Transposed transition matrix [N x N]: logTransT_(j,i) = log a_{ij} ++ Matrix logTransT_; + + // Viterbi trellis: logDelta(t,i) = max log-prob path ending at state i at time t + Matrix logDelta_; + +- // Backtrack pointers: psi(t,i) = arg max_j [logDelta(t-1,j) + logTrans(j,i)] +- std::vector> psi_; ++ // Backtrack pointers in time-major contiguous storage: ++ // psi_[t * N + j] = arg max_i [logDelta(t-1,i) + logTrans(i,j)] ++ std::vector psi_; + + // Result + StateSequence sequence_; + double logProbability_{-std::numeric_limits::infinity()}; + +- // Per-state emission buffer +- mutable std::vector logEmitBuf_; ++ // Per-state log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) ++ std::vector logEmitBuf_; ++ // Time-major emission buffer: logEmitByTime_[t * N + i] = log b_i(O_t) ++ std::vector logEmitByTime_; + + void precomputeLogTransitions(); + void runViterbi(); +diff --git a/src/calculators/forward_backward_calculator.cpp 
b/src/calculators/forward_backward_calculator.cpp +index 1097acc..789e632 100755 +--- a/src/calculators/forward_backward_calculator.cpp ++++ b/src/calculators/forward_backward_calculator.cpp +@@ -50,27 +50,33 @@ void ForwardBackwardCalculator::compute() { + logAlpha_.resize(T, numStates_); + logBeta_.resize(T, numStates_); + +- // Pre-fill the log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) +- // Build observation span once; reuse across all N states. ++ // Fill per-state log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) ++ // Build observation span directly from ObservationSet storage; no copy. + logEmitBuf_.resize(T * numStates_); +- std::vector obsVec(T); +- for (std::size_t t = 0; t < T; ++t) +- obsVec[t] = observations_(t); +- const std::span obsSpan(obsVec.data(), T); ++ const std::span obsSpan(observations_.data(), T); + + const Hmm &hmm = getHmmRef(); + for (std::size_t i = 0; i < numStates_; ++i) { + hmm.getDistribution(i).getBatchLogProbabilities( + obsSpan, std::span(logEmitBuf_.data() + i * T, T)); + } ++ // Build time-major emission buffer once to improve locality in DP recurrences. 
++ logEmitByTime_.resize(T * numStates_); ++ for (std::size_t i = 0; i < numStates_; ++i) { ++ const double *stateRow = logEmitBuf_.data() + i * T; ++ for (std::size_t t = 0; t < T; ++t) { ++ logEmitByTime_[t * numStates_ + i] = stateRow[t]; ++ } ++ } + + computeLogForward(); + computeLogBackward(); + + // log P(O|λ) = log-sum-exp over states at final timestep ++ const double *finalAlphaRow = logAlpha_.data() + (T - 1) * numStates_; + double lp = LOG_ZERO; + for (std::size_t i = 0; i < numStates_; ++i) { +- lp = logSumExp(lp, logAlpha_(T - 1, i)); ++ lp = logSumExp(lp, finalAlphaRow[i]); + } + logProbability_ = lp; + } +@@ -83,10 +89,13 @@ void ForwardBackwardCalculator::precomputeLogTransitions() { + const Hmm &hmm = getHmmRef(); + const Matrix &trans = hmm.getTrans(); + logTrans_.resize(numStates_, numStates_); ++ logTransT_.resize(numStates_, numStates_); + for (std::size_t i = 0; i < numStates_; ++i) { + for (std::size_t j = 0; j < numStates_; ++j) { + const double a = trans(i, j); +- logTrans_(i, j) = (a > 0.0) ? std::log(a) : LOG_ZERO; ++ const double logA = (a > 0.0) ? std::log(a) : LOG_ZERO; ++ logTrans_(i, j) = logA; ++ logTransT_(j, i) = logA; + } + } + } +@@ -96,42 +105,57 @@ void ForwardBackwardCalculator::computeLogForward() { + const Vector &pi = hmm.getPi(); + const std::size_t T = observations_.size(); + ++ const double *logEmitByTimeData = logEmitByTime_.data(); ++ const double *logTransTData = logTransT_.data(); ++ double *logAlphaData = logAlpha_.data(); ++ const std::size_t N = numStates_; ++ + // t = 0: log alpha(0, i) = log pi_i + log b_i(O_0) ++ const double *emitRow0 = logEmitByTimeData; + for (std::size_t i = 0; i < numStates_; ++i) { + const double logPi = (pi(i) > 0.0) ? 
std::log(pi(i)) : LOG_ZERO; +- logAlpha_(0, i) = logPi + logEmitBuf_[i * T + 0]; ++ logAlphaData[i] = logPi + emitRow0[i]; + } + + // t > 0 + for (std::size_t t = 1; t < T; ++t) { ++ const double *prevAlphaRow = logAlphaData + (t - 1) * N; ++ double *alphaRow = logAlphaData + t * N; ++ const double *emitRow = logEmitByTimeData + t * N; + for (std::size_t j = 0; j < numStates_; ++j) { + double logSum = LOG_ZERO; ++ const double *transCol = logTransTData + j * N; + for (std::size_t i = 0; i < numStates_; ++i) { +- logSum = logSumExp(logSum, logAlpha_(t - 1, i) + logTrans_(i, j)); ++ logSum = logSumExp(logSum, prevAlphaRow[i] + transCol[i]); + } +- logAlpha_(t, j) = logEmitBuf_[j * T + t] + logSum; ++ alphaRow[j] = emitRow[j] + logSum; + } + } + } + + void ForwardBackwardCalculator::computeLogBackward() { + const std::size_t T = observations_.size(); ++ const double *logTransData = logTrans_.data(); ++ const double *logEmitByTimeData = logEmitByTime_.data(); ++ double *logBetaData = logBeta_.data(); ++ const std::size_t N = numStates_; + + // t = T-1: log beta(T-1, i) = log(1) = 0 +- for (std::size_t i = 0; i < numStates_; ++i) { +- logBeta_(T - 1, i) = 0.0; +- } ++ std::fill(logBetaData + (T - 1) * N, logBetaData + T * N, 0.0); + + // t < T-1, working backwards + if (T > 1) { + for (std::size_t t = T - 2;; --t) { ++ const double *nextBetaRow = logBetaData + (t + 1) * N; ++ double *betaRow = logBetaData + t * N; ++ const double *nextEmitRow = logEmitByTimeData + (t + 1) * N; + for (std::size_t i = 0; i < numStates_; ++i) { + double logSum = LOG_ZERO; ++ const double *transRow = logTransData + i * N; + for (std::size_t j = 0; j < numStates_; ++j) { +- logSum = logSumExp(logSum, logTrans_(i, j) + logEmitBuf_[j * T + (t + 1)] + +- logBeta_(t + 1, j)); ++ logSum = logSumExp(logSum, transRow[j] + nextEmitRow[j] + nextBetaRow[j]); + } +- logBeta_(t, i) = logSum; ++ betaRow[i] = logSum; + } + if (t == 0) + break; +diff --git a/src/calculators/viterbi_calculator.cpp 
b/src/calculators/viterbi_calculator.cpp +index 3ade510..1df7a3f 100755 +--- a/src/calculators/viterbi_calculator.cpp ++++ b/src/calculators/viterbi_calculator.cpp +@@ -44,16 +44,21 @@ StateSequence ViterbiCalculator::decode() { + // Fill log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) + logEmitBuf_.resize(T * numStates_); + const Hmm &hmm = getHmmRef(); +- +- std::vector obsVec(T); +- for (std::size_t t = 0; t < T; ++t) +- obsVec[t] = observations_(t); ++ const std::span obsSpan(observations_.data(), T); + + for (std::size_t i = 0; i < numStates_; ++i) { + hmm.getDistribution(i).getBatchLogProbabilities( +- std::span(obsVec.data(), T), ++ obsSpan, + std::span(logEmitBuf_.data() + i * T, T)); + } ++ // Build time-major emission buffer once for locality in dynamic programming. ++ logEmitByTime_.resize(T * numStates_); ++ for (std::size_t i = 0; i < numStates_; ++i) { ++ const double *stateRow = logEmitBuf_.data() + i * T; ++ for (std::size_t t = 0; t < T; ++t) { ++ logEmitByTime_[t * numStates_ + i] = stateRow[t]; ++ } ++ } + + runViterbi(); + backtrack(); +@@ -68,10 +73,13 @@ void ViterbiCalculator::precomputeLogTransitions() { + const Hmm &hmm = getHmmRef(); + const Matrix &trans = hmm.getTrans(); + logTrans_.resize(numStates_, numStates_); ++ logTransT_.resize(numStates_, numStates_); + for (std::size_t i = 0; i < numStates_; ++i) { + for (std::size_t j = 0; j < numStates_; ++j) { + const double a = trans(i, j); +- logTrans_(i, j) = (a > 0.0) ? std::log(a) : LOG_ZERO; ++ const double logA = (a > 0.0) ? 
std::log(a) : LOG_ZERO; ++ logTrans_(i, j) = logA; ++ logTransT_(j, i) = logA; + } + } + } +@@ -82,37 +90,48 @@ void ViterbiCalculator::runViterbi() { + const std::size_t T = observations_.size(); + + logDelta_.resize(T, numStates_); +- psi_.assign(T, std::vector(numStates_, 0)); ++ psi_.assign(T * numStates_, 0); ++ ++ const double *logTransTData = logTransT_.data(); ++ const double *logEmitByTimeData = logEmitByTime_.data(); ++ double *logDeltaData = logDelta_.data(); ++ const std::size_t N = numStates_; + + // t = 0: initialise ++ const double *emitRow0 = logEmitByTimeData; + for (std::size_t i = 0; i < numStates_; ++i) { + const double logPi = (pi(i) > 0.0) ? std::log(pi(i)) : LOG_ZERO; +- logDelta_(0, i) = logPi + logEmitBuf_[i * T + 0]; ++ logDeltaData[i] = logPi + emitRow0[i]; + } + + // t > 0: recursion + for (std::size_t t = 1; t < T; ++t) { ++ const double *prevDeltaRow = logDeltaData + (t - 1) * N; ++ double *deltaRow = logDeltaData + t * N; ++ const double *emitRow = logEmitByTimeData + t * N; + for (std::size_t j = 0; j < numStates_; ++j) { + double maxVal = LOG_ZERO; + int maxFrom = 0; ++ const double *transCol = logTransTData + j * N; + for (std::size_t i = 0; i < numStates_; ++i) { +- const double val = logDelta_(t - 1, i) + logTrans_(i, j); ++ const double val = prevDeltaRow[i] + transCol[i]; + if (val > maxVal) { + maxVal = val; + maxFrom = static_cast(i); + } + } +- logDelta_(t, j) = maxVal + logEmitBuf_[j * T + t]; +- psi_[t][j] = maxFrom; ++ deltaRow[j] = maxVal + emitRow[j]; ++ psi_[t * N + j] = maxFrom; + } + } + + // Termination: best last state + double bestVal = LOG_ZERO; + int bestLast = 0; ++ const double *finalDeltaRow = logDeltaData + (T - 1) * N; + for (std::size_t i = 0; i < numStates_; ++i) { +- if (logDelta_(T - 1, i) > bestVal) { +- bestVal = logDelta_(T - 1, i); ++ if (finalDeltaRow[i] > bestVal) { ++ bestVal = finalDeltaRow[i]; + bestLast = static_cast(i); + } + } +@@ -126,9 +145,10 @@ void ViterbiCalculator::backtrack() { + 
const std::size_t T = observations_.size(); + if (T <= 1) + return; ++ const std::size_t N = numStates_; + + for (std::size_t t = T - 2;; --t) { +- sequence_(t) = psi_[t + 1][static_cast(sequence_(t + 1))]; ++ sequence_(t) = psi_[(t + 1) * N + static_cast(sequence_(t + 1))]; + if (t == 0) + break; + } +diff --git a/src/training/baum_welch_trainer.cpp b/src/training/baum_welch_trainer.cpp +index 7ae236f..37d1b9c 100755 +--- a/src/training/baum_welch_trainer.cpp ++++ b/src/training/baum_welch_trainer.cpp +@@ -29,22 +29,40 @@ void BaumWelchTrainer::train() { + + // Accumulators (linear space, summed across all sequences) + std::vector piNum(N, 0.0); +- std::vector> transNum(N, std::vector(N, 0.0)); ++ Matrix transNum(N, N); ++ clear_matrix(transNum); + std::vector transDen(N, 0.0); + + // Per-state emission data/weights accumulated across sequences + std::vector> emisData(N); + std::vector> emisWts(N); ++ std::size_t totalObservations = 0; ++ for (const auto &obs : obsLists_) { ++ totalObservations += obs.size(); ++ } ++ const std::size_t reservePerState = (N > 0) ? (totalObservations / N + 1) : 0; ++ for (std::size_t i = 0; i < N; ++i) { ++ emisData[i].reserve(reservePerState); ++ emisWts[i].reserve(reservePerState); ++ } + + // Precompute log-transition matrix from the current model + const Matrix &curTrans = hmm.getTrans(); +- std::vector> logTrans(N, std::vector(N)); ++ Matrix logTrans(N, N); ++ std::vector> activeNextStates(N); + for (std::size_t i = 0; i < N; ++i) { ++ activeNextStates[i].reserve(N); + for (std::size_t j = 0; j < N; ++j) { + const double a = curTrans(i, j); +- logTrans[i][j] = (a > 0.0) ? 
std::log(a) : LOG_ZERO; ++ if (a > 0.0) { ++ logTrans(i, j) = std::log(a); ++ activeNextStates[i].push_back(j); ++ } else { ++ logTrans(i, j) = LOG_ZERO; ++ } + } + } ++ const double *logTransData = logTrans.data(); + + std::size_t validSeqs = 0; + +@@ -60,24 +78,29 @@ void BaumWelchTrainer::train() { + + const Matrix &logAlpha = fbc.getLogForwardVariables(); + const Matrix &logBeta = fbc.getLogBackwardVariables(); ++ const double *logAlphaData = logAlpha.data(); ++ const double *logBetaData = logBeta.data(); ++ const double *obsData = obs.data(); + + // Precompute log-emissions for this sequence: logEmit[i * T + t] +- std::vector obsVec(T); +- for (std::size_t t = 0; t < T; ++t) +- obsVec[t] = obs(t); + + std::vector logEmit(N * T); ++ const std::span obsSpan(obsData, T); + for (std::size_t i = 0; i < N; ++i) { + hmm.getDistribution(i).getBatchLogProbabilities( +- std::span(obsVec.data(), T), ++ obsSpan, + std::span(logEmit.data() + i * T, T)); + } + + // Accumulate gamma (per timestep per state) and pi/trans denominators + for (std::size_t t = 0; t < T; ++t) { ++ const double *alphaRow = logAlphaData + t * N; ++ const double *betaRow = logBetaData + t * N; ++ const double obsValue = obsData[t]; + for (std::size_t i = 0; i < N; ++i) { +- const double g = std::exp(logAlpha(t, i) + logBeta(t, i) - logP); +- emisData[i].push_back(obs(t)); ++ const double logGamma = alphaRow[i] + betaRow[i] - logP; ++ const double g = std::isfinite(logGamma) ? 
std::exp(logGamma) : 0.0; ++ emisData[i].push_back(obsValue); + emisWts[i].push_back(g); + if (t == 0) + piNum[i] += g; +@@ -88,11 +111,25 @@ void BaumWelchTrainer::train() { + + // Accumulate xi (transition counts) + for (std::size_t t = 0; t + 1 < T; ++t) { ++ const double *alphaRow = logAlphaData + t * N; ++ const double *betaNextRow = logBetaData + (t + 1) * N; + for (std::size_t i = 0; i < N; ++i) { +- for (std::size_t j = 0; j < N; ++j) { +- const double logXi = logAlpha(t, i) + logTrans[i][j] + +- logEmit[j * T + (t + 1)] + logBeta(t + 1, j) - logP; +- transNum[i][j] += std::exp(logXi); ++ const double alphaVal = alphaRow[i]; ++ if (!std::isfinite(alphaVal)) { ++ continue; ++ } ++ const double *logTransRow = logTransData + i * N; ++ for (const std::size_t j : activeNextStates[i]) { ++ const double betaNext = betaNextRow[j]; ++ const double emitNext = logEmit[j * T + (t + 1)]; ++ if (!std::isfinite(betaNext) || !std::isfinite(emitNext)) { ++ continue; ++ } ++ const double logXi = ++ alphaVal + logTransRow[j] + emitNext + betaNext - logP; ++ if (std::isfinite(logXi)) { ++ transNum(i, j) += std::exp(logXi); ++ } + } + } + } +@@ -122,7 +159,7 @@ void BaumWelchTrainer::train() { + Matrix newTrans(N, N); + for (std::size_t i = 0; i < N; ++i) { + for (std::size_t j = 0; j < N; ++j) { +- newTrans(i, j) = (transDen[i] > 0.0) ? transNum[i][j] / transDen[i] ++ newTrans(i, j) = (transDen[i] > 0.0) ? transNum(i, j) / transDen[i] + : 1.0 / static_cast(N); + } + } +diff --git a/src/training/viterbi_trainer.cpp b/src/training/viterbi_trainer.cpp +index d159bb0..8943940 100755 +--- a/src/training/viterbi_trainer.cpp ++++ b/src/training/viterbi_trainer.cpp +@@ -91,6 +91,15 @@ double ViterbiTrainer::runIteration() { + Matrix trans(N, N); + clear_matrix(trans); + std::vector> emisData(N); ++ std::size_t totalObservations = 0; ++ for (const auto &obs : obsLists_) { ++ totalObservations += obs.size(); ++ } ++ const std::size_t reservePerState = (N > 0) ? 
(totalObservations / N + 1) : 0; ++ for (std::size_t i = 0; i < N; ++i) { ++ emisData[i].reserve(reservePerState); ++ } ++ std::vector transRowSums(N, 0.0); + + double totalLogProb = 0.0; + std::size_t validSeqs = 0; +@@ -107,15 +116,18 @@ double ViterbiTrainer::runIteration() { + totalLogProb += lp; + const StateSequence &seq = vc.getStateSequence(); + const std::size_t T = obs.size(); ++ const int *seqData = seq.data(); ++ const double *obsData = obs.data(); + +- pi(static_cast(seq(0))) += 1.0; ++ pi(static_cast(seqData[0])) += 1.0; + + for (std::size_t t = 0; t < T; ++t) { +- const std::size_t s = static_cast(seq(t)); +- emisData[s].push_back(obs(t)); ++ const std::size_t s = static_cast(seqData[t]); ++ emisData[s].push_back(obsData[t]); + if (t + 1 < T) { +- const std::size_t sNext = static_cast(seq(t + 1)); ++ const std::size_t sNext = static_cast(seqData[t + 1]); + trans(s, sNext) += 1.0; ++ transRowSums[s] += 1.0; + } + } + ++validSeqs; +@@ -129,12 +141,10 @@ double ViterbiTrainer::runIteration() { + + // Normalise pi + { +- double piSum = 0.0; +- for (std::size_t i = 0; i < N; ++i) +- piSum += pi(i); +- if (piSum > 0.0) { ++ if (validSeqs > 0) { ++ const double invValidSeqs = 1.0 / static_cast(validSeqs); + for (std::size_t i = 0; i < N; ++i) +- pi(i) /= piSum; ++ pi(i) *= invValidSeqs; + } else { + for (std::size_t i = 0; i < N; ++i) + pi(i) = 1.0 / static_cast(N); +@@ -144,12 +154,11 @@ double ViterbiTrainer::runIteration() { + + // Normalise transition rows + for (std::size_t i = 0; i < N; ++i) { +- double rowSum = 0.0; +- for (std::size_t j = 0; j < N; ++j) +- rowSum += trans(i, j); ++ const double rowSum = transRowSums[i]; + if (rowSum > 0.0) { ++ const double invRowSum = 1.0 / rowSum; + for (std::size_t j = 0; j < N; ++j) +- trans(i, j) /= rowSum; ++ trans(i, j) *= invRowSum; + } else { + for (std::size_t j = 0; j < N; ++j) + trans(i, j) = 1.0 / static_cast(N); diff --git a/benchmark-analysis/run_focus_compiler_sweep.py 
b/benchmark-analysis/run_focus_compiler_sweep.py new file mode 100644 index 0000000..a356c30 --- /dev/null +++ b/benchmark-analysis/run_focus_compiler_sweep.py @@ -0,0 +1,134 @@ +import csv +import pathlib +import re +import subprocess +import statistics + +compilers = { + 'msvc': { + 'pair_exe': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\build-focus-pairwise-ryzen-msvc\tools\hotspot_breakdown.exe'), + 'max_exe': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\build-focus-max-ryzen-msvc\tools\hotspot_breakdown.exe'), + 'out_dir': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\benchmark-analysis\focus-n2-8-ryzen-windows-msvc-rerun'), + }, + 'clangcl': { + 'pair_exe': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\build-focus-pairwise-ryzen-clangcl\tools\hotspot_breakdown.exe'), + 'max_exe': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\build-focus-max-ryzen-clangcl\tools\hotspot_breakdown.exe'), + 'out_dir': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\benchmark-analysis\focus-n2-8-ryzen-windows-clangcl-rerun'), + }, + 'mingw': { + 'pair_exe': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\build-focus-pairwise-ryzen-mingw\tools\hotspot_breakdown.exe'), + 'max_exe': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\build-focus-max-ryzen-mingw\tools\hotspot_breakdown.exe'), + 'out_dir': pathlib.Path(r'C:\Users\gdwol\Development\libhmm\benchmark-analysis\focus-n2-8-ryzen-windows-mingw-rerun'), + }, +} + +n_vals = list(range(2, 9)) +t_vals = [500, 1000, 2000, 5000, 10000, 100000] +runs = 5 +warmup = 2 + +fb_block_re = re.compile(r'Forward-Backward phase breakdown:(.*?)Viterbi phase breakdown:', re.S) +num_re = re.compile(r'([0-9]+(?:\.[0-9]+)?)') + +def parse_hotspot_output(text: str): + m = fb_block_re.search(text) + if not m: + raise RuntimeError('Could not find FB breakdown block') + block = m.group(1) + + def find_metric(label: str): + for candidate in block.splitlines(): + if label in candidate: + nums = num_re.findall(candidate) + if nums: + 
return float(nums[0]) + raise RuntimeError(f'Missing metric line for {label}') + + total_line = None + for candidate in block.splitlines(): + if candidate.strip().startswith('TOTAL'): + total_line = candidate + break + if total_line is None: + raise RuntimeError('Missing TOTAL line in FB block') + + total_nums = num_re.findall(total_line) + if not total_nums: + raise RuntimeError('No TOTAL numeric value in FB block') + + return { + 'fb_total_ms': float(total_nums[0]), + 'forward_ms': find_metric('Forward recursion'), + 'backward_ms': find_metric('Backward recursion'), + } + +def run_grid(exe: pathlib.Path, mode: str): + rows = [] + for n in n_vals: + for t in t_vals: + proc = subprocess.run( + [str(exe), str(n), str(t), str(runs), str(warmup)], + capture_output=True, + text=True, + check=True, + ) + metrics = parse_hotspot_output(proc.stdout) + rows.append({ + 'mode': mode, + 'n': n, + 't': t, + 'runs': runs, + 'warmup': warmup, + 'fb_total_ms': metrics['fb_total_ms'], + 'forward_ms': metrics['forward_ms'], + 'backward_ms': metrics['backward_ms'], + }) + return rows + +for compiler, cfg in compilers.items(): + out_dir = cfg['out_dir'] + out_dir.mkdir(parents=True, exist_ok=True) + + pair_rows = run_grid(cfg['pair_exe'], 'pairwise') + max_rows = run_grid(cfg['max_exe'], 'max_reduce') + + pair_csv = out_dir / 'focused_pairwise_n2_8.csv' + max_csv = out_dir / 'focused_max_reduce_n2_8.csv' + cmp_csv = out_dir / 'focused_pairwise_vs_max_reduce_n2_8.csv' + + with pair_csv.open('w', newline='') as f: + w = csv.DictWriter(f, fieldnames=list(pair_rows[0].keys())) + w.writeheader() + w.writerows(pair_rows) + + with max_csv.open('w', newline='') as f: + w = csv.DictWriter(f, fieldnames=list(max_rows[0].keys())) + w.writeheader() + w.writerows(max_rows) + + pair_map = {(r['n'], r['t']): r for r in pair_rows} + cmp_rows = [] + for mr in max_rows: + key = (mr['n'], mr['t']) + pr = pair_map[key] + speedup = pr['fb_total_ms'] / mr['fb_total_ms'] + cmp_rows.append({ + 'n': mr['n'], 
+ 't': mr['t'], + 'pairwise_fb_total_ms': pr['fb_total_ms'], + 'max_reduce_fb_total_ms': mr['fb_total_ms'], + 'speedup_max_over_pair': speedup, + 'winner': 'max_reduce' if speedup > 1.0 else 'pairwise', + }) + + with cmp_csv.open('w', newline='') as f: + w = csv.DictWriter(f, fieldnames=list(cmp_rows[0].keys())) + w.writeheader() + w.writerows(sorted(cmp_rows, key=lambda r: (r['n'], r['t']))) + + vals = [r['speedup_max_over_pair'] for r in cmp_rows] + max_wins = sum(1 for r in cmp_rows if r['winner'] == 'max_reduce') + pair_wins = len(cmp_rows) - max_wins + print(f"{compiler}: points={len(cmp_rows)} max_wins={max_wins} pair_wins={pair_wins} median={statistics.median(vals):.6f}") + +print('DONE') diff --git a/benchmark-analysis/run_focus_single_compiler.py b/benchmark-analysis/run_focus_single_compiler.py new file mode 100644 index 0000000..ccd402b --- /dev/null +++ b/benchmark-analysis/run_focus_single_compiler.py @@ -0,0 +1,157 @@ +import argparse +import csv +import pathlib +import re +import statistics +import subprocess + + +COMPILERS = { + "msvc": { + "pair_build": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\build-focus-pairwise-ryzen-msvc"), + "max_build": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\build-focus-max-ryzen-msvc"), + "out_dir": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\benchmark-analysis\focus-n2-8-ryzen-windows-msvc-rerun"), + }, + "clangcl": { + "pair_build": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\build-focus-pairwise-ryzen-clangcl"), + "max_build": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\build-focus-max-ryzen-clangcl"), + "out_dir": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\benchmark-analysis\focus-n2-8-ryzen-windows-clangcl-rerun"), + }, + "mingw": { + "pair_build": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\build-focus-pairwise-ryzen-mingw"), + "max_build": pathlib.Path(r"C:\Users\gdwol\Development\libhmm\build-focus-max-ryzen-mingw"), + "out_dir": 
pathlib.Path(r"C:\Users\gdwol\Development\libhmm\benchmark-analysis\focus-n2-8-ryzen-windows-mingw-rerun"), + }, +} + +N_VALUES = list(range(2, 9)) +T_VALUES = [500, 1000, 2000, 5000, 10000, 100000] + +FB_BLOCK_RE = re.compile(r"Forward-Backward phase breakdown:(.*?)Viterbi phase breakdown:", re.S) +NUM_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)") + + +def parse_output(text: str) -> dict: + block_match = FB_BLOCK_RE.search(text) + if not block_match: + raise RuntimeError("Could not find Forward-Backward breakdown block") + block = block_match.group(1) + + def metric(label: str) -> float: + for line in block.splitlines(): + if label in line: + nums = NUM_RE.findall(line) + if nums: + return float(nums[0]) + raise RuntimeError(f"Missing metric line for {label}") + + total_line = None + for line in block.splitlines(): + if line.strip().startswith("TOTAL"): + total_line = line + break + if total_line is None: + raise RuntimeError("Missing TOTAL line in Forward-Backward block") + + total_nums = NUM_RE.findall(total_line) + if not total_nums: + raise RuntimeError("Missing TOTAL numeric value in Forward-Backward block") + + return { + "fb_total_ms": float(total_nums[0]), + "forward_ms": metric("Forward recursion"), + "backward_ms": metric("Backward recursion"), + } + + +def run_grid(build_dir: pathlib.Path, mode: str, runs: int, warmup: int) -> list: + exe = build_dir / "tools" / "hotspot_breakdown.exe" + if not exe.exists(): + raise FileNotFoundError(f"Missing executable: {exe}") + rows = [] + for n in N_VALUES: + for t in T_VALUES: + proc = subprocess.run( + [str(exe), str(n), str(t), str(runs), str(warmup)], + cwd=str(build_dir), + capture_output=True, + text=True, + check=True, + ) + parsed = parse_output(proc.stdout) + rows.append( + { + "mode": mode, + "n": n, + "t": t, + "runs": runs, + "warmup": warmup, + "fb_total_ms": parsed["fb_total_ms"], + "forward_ms": parsed["forward_ms"], + "backward_ms": parsed["backward_ms"], + } + ) + return rows + + +def write_csv(path: 
pathlib.Path, rows: list) -> None: + with path.open("w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=list(rows[0].keys())) + writer.writeheader() + writer.writerows(rows) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--compiler", choices=sorted(COMPILERS.keys()), required=True) + parser.add_argument("--runs", type=int, default=5) + parser.add_argument("--warmup", type=int, default=2) + args = parser.parse_args() + + cfg = COMPILERS[args.compiler] + out_dir = cfg["out_dir"] + out_dir.mkdir(parents=True, exist_ok=True) + + pair_rows = run_grid(cfg["pair_build"], "pairwise", args.runs, args.warmup) + max_rows = run_grid(cfg["max_build"], "max_reduce", args.runs, args.warmup) + + pair_csv = out_dir / "focused_pairwise_n2_8.csv" + max_csv = out_dir / "focused_max_reduce_n2_8.csv" + cmp_csv = out_dir / "focused_pairwise_vs_max_reduce_n2_8.csv" + + write_csv(pair_csv, pair_rows) + write_csv(max_csv, max_rows) + + pair_map = {(r["n"], r["t"]): r for r in pair_rows} + cmp_rows = [] + for mr in max_rows: + pr = pair_map[(mr["n"], mr["t"])] + speedup = pr["fb_total_ms"] / mr["fb_total_ms"] + cmp_rows.append( + { + "n": mr["n"], + "t": mr["t"], + "pairwise_fb_total_ms": pr["fb_total_ms"], + "max_reduce_fb_total_ms": mr["fb_total_ms"], + "speedup_max_over_pair": speedup, + "winner": "max_reduce" if speedup > 1.0 else "pairwise", + } + ) + + cmp_rows.sort(key=lambda row: (row["n"], row["t"])) + write_csv(cmp_csv, cmp_rows) + + speedups = [row["speedup_max_over_pair"] for row in cmp_rows] + max_wins = sum(1 for row in cmp_rows if row["winner"] == "max_reduce") + pair_wins = len(cmp_rows) - max_wins + print( + f"{args.compiler}: points={len(cmp_rows)} max_wins={max_wins} " + f"pair_wins={pair_wins} median={statistics.median(speedups):.6f}" + ) + print(f"wrote: {pair_csv}") + print(f"wrote: {max_csv}") + print(f"wrote: {cmp_csv}") + + +if __name__ == "__main__": + main() diff --git a/benchmark-analysis/run_hmmlib_passes.py 
b/benchmark-analysis/run_hmmlib_passes.py new file mode 100644 index 0000000..34380f8 --- /dev/null +++ b/benchmark-analysis/run_hmmlib_passes.py @@ -0,0 +1,94 @@ +import argparse +import csv +import os +import pathlib +import re +import statistics +import subprocess + + +LIBHMM_RE = re.compile(r"libhmm average throughput:\s*([0-9]+(?:\.[0-9]+)?)\s+observations/ms") +HMMLIB_RE = re.compile(r"HMMLib average throughput:\s*([0-9]+(?:\.[0-9]+)?)\s+observations/ms") +RATIO_RE = re.compile(r"Overall performance ratio:\s*([0-9]+(?:\.[0-9]+)?)x\s+\(HMMLib/libhmm\)") + + +def parse_summary(output: str) -> dict: + m_libhmm = LIBHMM_RE.search(output) + m_hmmlib = HMMLIB_RE.search(output) + m_ratio = RATIO_RE.search(output) + if not (m_libhmm and m_hmmlib and m_ratio): + raise RuntimeError("Could not parse benchmark summary lines from comparator output") + return { + "libhmm_avg_obs_ms": float(m_libhmm.group(1)), + "hmmlib_avg_obs_ms": float(m_hmmlib.group(1)), + "ratio_hmmlib_over_libhmm": float(m_ratio.group(1)), + } + + +def median(values: list[float]) -> float: + return statistics.median(values) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--exe", required=True) + parser.add_argument("--dll-dir", required=True) + parser.add_argument("--passes", type=int, default=9) + parser.add_argument("--label", required=True) + parser.add_argument("--out-csv", required=True) + args = parser.parse_args() + + exe = pathlib.Path(args.exe) + if not exe.exists(): + raise FileNotFoundError(f"Missing executable: {exe}") + dll_dir = pathlib.Path(args.dll_dir) + if not dll_dir.exists(): + raise FileNotFoundError(f"Missing DLL directory: {dll_dir}") + + env = os.environ.copy() + env["PATH"] = f"{dll_dir};{env.get('PATH', '')}" + + rows = [] + for run_idx in range(1, args.passes + 1): + proc = subprocess.run( + [str(exe)], + cwd=str(exe.parent), + env=env, + capture_output=True, + text=True, + check=True, + ) + parsed = parse_summary(proc.stdout) + row = 
{"label": args.label, "pass": run_idx} + row.update(parsed) + rows.append(row) + print( + f"{args.label} pass {run_idx}/{args.passes}: " + f"libhmm={parsed['libhmm_avg_obs_ms']:.1f} " + f"hmmlib={parsed['hmmlib_avg_obs_ms']:.1f} " + f"ratio={parsed['ratio_hmmlib_over_libhmm']:.3f}" + ) + + out_csv = pathlib.Path(args.out_csv) + out_csv.parent.mkdir(parents=True, exist_ok=True) + with out_csv.open("w", newline="") as f: + writer = csv.DictWriter( + f, + fieldnames=["label", "pass", "libhmm_avg_obs_ms", "hmmlib_avg_obs_ms", "ratio_hmmlib_over_libhmm"], + ) + writer.writeheader() + writer.writerows(rows) + + lib_vals = [row["libhmm_avg_obs_ms"] for row in rows] + hm_vals = [row["hmmlib_avg_obs_ms"] for row in rows] + ratio_vals = [row["ratio_hmmlib_over_libhmm"] for row in rows] + print( + f"{args.label} medians: " + f"libhmm={median(lib_vals):.1f} hmmlib={median(hm_vals):.1f} " + f"ratio={median(ratio_vals):.3f}" + ) + print(f"wrote: {out_csv}") + + +if __name__ == "__main__": + main() diff --git a/benchmark-analysis/summarize_windows_compiler_rerun.py b/benchmark-analysis/summarize_windows_compiler_rerun.py new file mode 100644 index 0000000..918045f --- /dev/null +++ b/benchmark-analysis/summarize_windows_compiler_rerun.py @@ -0,0 +1,94 @@ +import csv +import math +import pathlib +import statistics + + +ROOT = pathlib.Path(r"C:\Users\gdwol\Development\libhmm\benchmark-analysis") + +FOCUS = { + "msvc": ROOT / "focus-n2-8-ryzen-windows-msvc-rerun" / "focused_pairwise_vs_max_reduce_n2_8.csv", + "clangcl": ROOT / "focus-n2-8-ryzen-windows-clangcl-rerun" / "focused_pairwise_vs_max_reduce_n2_8.csv", + "mingw": ROOT / "focus-n2-8-ryzen-windows-mingw-rerun" / "focused_pairwise_vs_max_reduce_n2_8.csv", +} + +HMMLIB = { + "msvc_control": ROOT / "hmmlib-9pass-ryzen-windows-msvc-rerun" / "control_passes.csv", + "msvc_adaptive": ROOT / "hmmlib-9pass-ryzen-windows-msvc-rerun" / "adaptive_passes.csv", + "mingw_control": ROOT / "hmmlib-9pass-ryzen-windows-mingw-rerun" / 
"control_passes.csv", + "mingw_adaptive": ROOT / "hmmlib-9pass-ryzen-windows-mingw-rerun" / "adaptive_passes.csv", + "clangcl_control": ROOT / "hmmlib-9pass-ryzen-windows-clangcl-rerun-o2" / "control_passes.csv", + "clangcl_adaptive": ROOT / "hmmlib-9pass-ryzen-windows-clangcl-rerun-o2" / "adaptive_passes.csv", +} + + +def geomean(vals: list[float]) -> float: + return math.exp(sum(math.log(v) for v in vals) / len(vals)) + + +def read_csv(path: pathlib.Path) -> list[dict]: + with path.open(newline="") as f: + return list(csv.DictReader(f)) + + +def summarize_focus() -> None: + print("FOCUSED_SWEEP_SUMMARY") + for compiler, path in FOCUS.items(): + rows = read_csv(path) + speedups = [float(r["speedup_max_over_pair"]) for r in rows] + max_wins = sum(1 for r in rows if r["winner"] == "max_reduce") + pair_wins = len(rows) - max_wins + pair_vals = [float(r["pairwise_fb_total_ms"]) for r in rows] + max_vals = [float(r["max_reduce_fb_total_ms"]) for r in rows] + print( + f"{compiler}: points={len(rows)} max_wins={max_wins} pair_wins={pair_wins} " + f"median_speedup={statistics.median(speedups):.6f} " + f"geomean_pair_ms={geomean(pair_vals):.6f} geomean_max_ms={geomean(max_vals):.6f}" + ) + for n in range(2, 9): + nrows = [r for r in rows if int(r["n"]) == n] + n_max = sum(1 for r in nrows if r["winner"] == "max_reduce") + print(f" n={n}: max_wins={n_max}/{len(nrows)}") + + +def summarize_hmmlib() -> None: + print("HMMLIB_9PASS_SUMMARY") + med = {} + for label, path in HMMLIB.items(): + rows = read_csv(path) + lib_vals = [float(r["libhmm_avg_obs_ms"]) for r in rows] + hm_vals = [float(r["hmmlib_avg_obs_ms"]) for r in rows] + ratio_vals = [float(r["ratio_hmmlib_over_libhmm"]) for r in rows] + med[label] = { + "lib": statistics.median(lib_vals), + "hm": statistics.median(hm_vals), + "ratio": statistics.median(ratio_vals), + } + print( + f"{label}: passes={len(rows)} med_libhmm={med[label]['lib']:.4f} " + f"med_hmmlib={med[label]['hm']:.4f} med_ratio={med[label]['ratio']:.6f}" 
+ ) + + msvc_delta = (med["msvc_adaptive"]["lib"] / med["msvc_control"]["lib"] - 1.0) * 100.0 + mingw_delta = (med["mingw_adaptive"]["lib"] / med["mingw_control"]["lib"] - 1.0) * 100.0 + clangcl_delta = (med["clangcl_adaptive"]["lib"] / med["clangcl_control"]["lib"] - 1.0) * 100.0 + print(f"msvc adaptive_vs_control delta_libhmm_pct={msvc_delta:.6f}") + print(f"mingw adaptive_vs_control delta_libhmm_pct={mingw_delta:.6f}") + print(f"clangcl adaptive_vs_control delta_libhmm_pct={clangcl_delta:.6f}") + ctrl_mingw_vs_msvc = (med["mingw_control"]["lib"] / med["msvc_control"]["lib"] - 1.0) * 100.0 + adapt_mingw_vs_msvc = (med["mingw_adaptive"]["lib"] / med["msvc_adaptive"]["lib"] - 1.0) * 100.0 + ctrl_clangcl_vs_msvc = (med["clangcl_control"]["lib"] / med["msvc_control"]["lib"] - 1.0) * 100.0 + adapt_clangcl_vs_msvc = (med["clangcl_adaptive"]["lib"] / med["msvc_adaptive"]["lib"] - 1.0) * 100.0 + ctrl_clangcl_vs_mingw = (med["clangcl_control"]["lib"] / med["mingw_control"]["lib"] - 1.0) * 100.0 + adapt_clangcl_vs_mingw = (med["clangcl_adaptive"]["lib"] / med["mingw_adaptive"]["lib"] - 1.0) * 100.0 + print(f"mingw_vs_msvc control_libhmm_pct={ctrl_mingw_vs_msvc:.6f}") + print(f"mingw_vs_msvc adaptive_libhmm_pct={adapt_mingw_vs_msvc:.6f}") + print(f"clangcl_vs_msvc control_libhmm_pct={ctrl_clangcl_vs_msvc:.6f}") + print(f"clangcl_vs_msvc adaptive_libhmm_pct={adapt_clangcl_vs_msvc:.6f}") + print(f"clangcl_vs_mingw control_libhmm_pct={ctrl_clangcl_vs_mingw:.6f}") + print(f"clangcl_vs_mingw adaptive_libhmm_pct={adapt_clangcl_vs_mingw:.6f}") + + +if __name__ == "__main__": + summarize_focus() + summarize_hmmlib() diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 098e083..a50a9a6 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -65,14 +65,14 @@ if(EXISTS "${LAMP_DIR}/hmmFind") set(LAMP_READY ON) endif() +# HMMLib is a header-only library (SSE2/NEON intrinsics, no Boost runtime +# dependency). 
The Boost check previously here was spurious — HMMLib headers +# contain no Boost includes. Detect by checking for the canonical header. set(HMMLIB_READY OFF) -if(EXISTS "${HMMLIB_DIR}") - find_package(Boost QUIET) - if(Boost_FOUND) - set(HMMLIB_READY ON) - else() - message(WARNING "HMMLib directory found at ${HMMLIB_DIR}, but Boost was not found. HMMLib-dependent benchmarks will be skipped.") - endif() +if(EXISTS "${HMMLIB_DIR}/HMMlib/hmm.hpp") + set(HMMLIB_READY ON) +elseif(EXISTS "${HMMLIB_DIR}") + message(WARNING "HMMLib directory found at ${HMMLIB_DIR} but hmm.hpp not found. HMMLib-dependent benchmarks will be skipped.") else() message(WARNING "HMMLib directory not found at ${HMMLIB_DIR}. HMMLib-dependent benchmarks will be skipped.") endif() @@ -159,11 +159,7 @@ function(enable_hmmlib target_name) target_include_directories(${target_name} SYSTEM PRIVATE ${HMMLIB_DIR} - ${Boost_INCLUDE_DIRS} ) - if(Boost_LIBRARIES) - target_link_libraries(${target_name} PRIVATE ${Boost_LIBRARIES}) - endif() endfunction() function(enable_stochhmm target_name) diff --git a/benchmarks/docs/BENCHMARKING_RESULTS.md b/benchmarks/docs/BENCHMARKING_RESULTS.md index 9b9b900..765c7d7 100644 --- a/benchmarks/docs/BENCHMARKING_RESULTS.md +++ b/benchmarks/docs/BENCHMARKING_RESULTS.md @@ -9,7 +9,7 @@ This document summarizes benchmark results comparing libhmm against major HMM li ### Libraries Tested 1. **libhmm** - Modern C++20 implementation with zero external dependencies -2. **HMMLib** - High-performance C++ library with Boost dependencies +2. **HMMLib** - High-performance C++ library with Boost dependencies 3. **StochHMM** - Bioinformatics-focused C++ library 4. **GHMM** - General Hidden Markov Model Library (C) 5. 
**HTK** - Hidden Markov Model Toolkit (command-line based) @@ -39,7 +39,7 @@ Two classic HMM benchmark problems were used across all libraries: - **Transitions**: Fair→Fair (0.95), Fair→Loaded (0.05), Loaded→Fair (0.10), Loaded→Loaded (0.90) - **Emissions**: Fair die (uniform 1/6), Loaded die (symbol 5 favored at 0.50) -#### 2. Weather Model Problem +#### 2. Weather Model Problem - **States**: 2 (Sunny, Rainy) - **Observations**: 2 symbols (Hot, Cold) - **Transitions**: Sunny→Sunny (0.7), Sunny→Rainy (0.3), Rainy→Sunny (0.4), Rainy→Rainy (0.6) @@ -61,7 +61,7 @@ To ensure fair comparison and detect numerical issues: ### Performance Metrics - **Forward-Backward algorithm timing**: Primary performance metric -- **Viterbi algorithm timing**: Secondary performance metric +- **Viterbi algorithm timing**: Secondary performance metric - **Throughput**: Observations processed per millisecond - **Scaling behavior**: Performance across different sequence lengths @@ -101,7 +101,7 @@ libhmm shows machine-precision agreement with key reference libraries: **Example numerical comparison** (Casino Problem, 1000 observations): - libhmm: -1.815e+03 -- HMMLib: -1.815e+03 +- HMMLib: -1.815e+03 - StochHMM: -1.815e+03 - GHMM: -1.815e+03 - HTK: -2.000e+03 ← **Deliberately rounded for computational efficiency** @@ -162,7 +162,7 @@ Historical snapshot from earlier benchmark runs; use the April 2026 consolidated **Medium Sequences (1,000-10,000 observations):** - GHMM: 20-25x faster than libhmm -- HMMLib: 15-20x faster than libhmm +- HMMLib: 15-20x faster than libhmm - StochHMM: 2x faster than libhmm - HTK: Approaching libhmm performance @@ -245,7 +245,7 @@ All libraries successfully processed sequences up to 1,000,000 observations with - Numerical precision is important - You can handle complex C API -#### Choose **HMMLib** when: +#### Choose **HMMLib** when: - High performance is needed - C++ integration is required - Boost dependencies are acceptable @@ -253,7 +253,7 @@ All libraries 
successfully processed sequences up to 1,000,000 observations with #### Choose **libhmm** when: - Modern C++20 features are desired -- Zero external dependencies are required +- Zero external dependencies are required - Code maintainability is important - Moderate performance is sufficient - Cross-platform compatibility is needed @@ -278,7 +278,7 @@ libhmm's performance should be evaluated in context: - For most practical applications, this performance is more than adequate - The ~20x speed difference with top performers matters primarily for: - High-frequency real-time applications - - Massive batch processing workflows + - Massive batch processing workflows - Training on extremely large datasets ### Future Development @@ -303,7 +303,7 @@ All benchmark code and configurations are available in the `benchmarks/` directo **Note on External Libraries**: The original source code for HMMLib, StochHMM, GHMM, and HTK is not included in this repository. To reproduce these benchmarks, these libraries must be obtained from their respective developers/maintainers and built according to their official documentation: - **HMMLib**: Available from original authors/research institutions -- **StochHMM**: https://github.com/KorfLab/StochHMM +- **StochHMM**: https://github.com/KorfLab/StochHMM - **GHMM**: http://ghmm.org - **HTK**: http://htk.eng.cam.ac.uk (requires registration) @@ -312,7 +312,7 @@ The benchmark implementations in this repository provide the integration code ne ### Validation Methodology The numerical accuracy validation included: - Direct log-likelihood comparison to machine precision -- Step-by-step forward algorithm verification +- Step-by-step forward algorithm verification - Cross-validation between multiple reference implementations - Deep numerical analysis of scaling factors and intermediate values @@ -432,7 +432,7 @@ This section adds an updated snapshot without removing any prior content. 
Earlie | `libhmm_vs_jahmm_benchmark`** | JAHMM | 7161.5 | 3803.6 | 0.53x | `build-benchmarks-release/benchmark-logs/libhmm_vs_jahmm_benchmark_after_pathfix.log` | | `libhmm_vs_lamp_benchmark` | LAMP | 6016.7 | 48.2 | 0.01x | Windows x86_64 run, April 2026 (post-warmup) | -\* Uses post-PI-correction StochHMM continuous results (`after_pi_fix`). +\* Uses post-PI-correction StochHMM continuous results (`after_pi_fix`). \** JAHMM benchmark log does not emit an average throughput summary line; values above are computed from per-run forward timings in the same log. ### Updated Code Quality and Maintainability Snapshot (All Evaluated Libraries) @@ -463,17 +463,17 @@ To capture correctness signal separately from throughput, three updated diagnost Key outcomes: -- **Canonical numerical parity with HMMLib** (`deep_numerical_analysis_modernized.log`): +- **Canonical numerical parity with HMMLib** (`deep_numerical_analysis_modernized.log`): Across sequence lengths 10, 50, 100, 200, 500, 1000, and 2000, libhmm and HMMLib log-likelihoods match to near machine precision. Maximum absolute difference observed: `5.093170e-11` (length 2000), with no length-dependent drift pattern. -- **Step-level forward-pass agreement** (`deep_numerical_analysis_modernized.log`): +- **Step-level forward-pass agreement** (`deep_numerical_analysis_modernized.log`): Normalized per-step forward-variable differences are in floating-point noise range (`~1e-16`, max shown `4.163e-16`), and final log-probability difference is `0.000000e+00`. -- **Distribution-layer Gaussian agreement across libraries** (`gaussian_distribution_comparison_modernized.log`): +- **Distribution-layer Gaussian agreement across libraries** (`gaussian_distribution_comparison_modernized.log`): libhmm, GHMM, and StochHMM report `MATCH` across all tested Gaussian cases (standard, shifted mean, negative mean, high variance), indicating aligned PDF/log-PDF behavior at the distribution layer. 
-- **Constructor semantics validated for reproducibility** (`diagnostic_accuracy_test_modernized.log`): +- **Constructor semantics validated for reproducibility** (`diagnostic_accuracy_test_modernized.log`): `GaussianDistribution(mean, second_parameter)` uses **standard deviation** semantics (not variance). This check avoids silent benchmark misconfiguration when mapping model parameters. -- **Canonical calculator self-consistency checks pass** (`diagnostic_accuracy_test_modernized.log`): +- **Canonical calculator self-consistency checks pass** (`diagnostic_accuracy_test_modernized.log`): ForwardBackward pointer/reference constructors and `getLogProbability()` vs `log(probability())` are numerically identical on the test model; a manual forward calculation also matches libhmm (`probability diff 6.939e-18`, `log diff 0.000e+00`). diff --git a/benchmarks/src/diagnostic_accuracy_test.cpp b/benchmarks/src/diagnostic_accuracy_test.cpp index f70092b..b237918 100644 --- a/benchmarks/src/diagnostic_accuracy_test.cpp +++ b/benchmarks/src/diagnostic_accuracy_test.cpp @@ -16,7 +16,7 @@ using namespace std; /** * DIAGNOSTIC TEST FOR NUMERICAL ACCURACY DISCREPANCIES - * + * * This test isolates potential issues in: * 1. Distribution implementations (PDF/log-PDF calculations) * 2. HMM setup (parameter setting) diff --git a/docs/GOLD_STANDARD_CHECKLIST.md b/docs/GOLD_STANDARD_CHECKLIST.md index 9abaaa3..ce69951 100644 --- a/docs/GOLD_STANDARD_CHECKLIST.md +++ b/docs/GOLD_STANDARD_CHECKLIST.md @@ -44,28 +44,28 @@ All distributions must implement the `EmissionDistribution` abstract interface. 
- `getBatchLogProbabilities(span, span)` — concrete non-virtual batch loop (tier 1 minimum) - `reset()` — reset to default parameters - `toString()` — human-readable string representation - + - ✅ **Rule of Five:** - Copy Constructor - - Move Constructor + - Move Constructor - Copy Assignment Operator - Move Assignment Operator - Destructor (virtual, defaulted) - + - ✅ **Caching System:** - Comprehensive caching of expensive calculations - Cache validation flags - Automatic cache invalidation on parameter changes - + - ✅ **Input Validation:** - Robust parameter validation with appropriate exceptions - NaN/infinity handling - Data validation in fitting methods - + - ✅ **Constants Usage:** - All numeric literals replaced with constants from `libhmm::constants` - No hardcoded magic numbers - + - ✅ **I/O  Operators:** - `operator==` - Equality comparison with tolerance - `operator<<` - Stream output @@ -74,13 +74,13 @@ All distributions must implement the `EmissionDistribution` abstract interface. ### Test Requirements - ✅ **Core Tests:** - Basic Functionality - - Probability Calculations + - Probability Calculations - Parameter Fitting - Parameter Validation - Copy/Move Semantics - Invalid Input Handling - Reset Functionality - + - ✅ **Advanced Tests:** - Log Probability calculations - String Representation @@ -88,7 +88,7 @@ All distributions must implement the `EmissionDistribution` abstract interface. - Performance characteristics (recommended) - Mathematical Correctness (recommended) - Numerical Stability (recommended) - + - ✅ **Gold Standard Tests:** - CDF calculations (where applicable) - Equality/I-O operators @@ -154,7 +154,7 @@ All distributions must implement the `EmissionDistribution` abstract interface. 
## Legend - ✅ **Complete**: Fully implemented and tested -- ❌ **Missing**: Needs to be implemented/added +- ❌ **Missing**: Needs to be implemented/added - ❓ **Unknown**: Needs assessment - 🔄 **In Progress**: Currently being worked on @@ -170,7 +170,7 @@ complete for all 15. No outstanding action items. ## Planned Update Order 1. ✅ **Gaussian** - Reference implementation (constants applied, comprehensive tests verified) -2. ✅ **Exponential** - Reference implementation (constants applied, comprehensive tests verified) +2. ✅ **Exponential** - Reference implementation (constants applied, comprehensive tests verified) 3. ✅ **Gamma** - Updated (constants applied, comprehensive tests verified) 4. ✅ **Uniform** - Updated (constants applied, comprehensive tests verified, performance tests added) 5. ✅ **Chi-squared** - Updated to Gold standard (constants applied, comprehensive tests verified) diff --git a/docs/STYLE_GUIDE.md b/docs/STYLE_GUIDE.md index 962133c..5d34438 100644 --- a/docs/STYLE_GUIDE.md +++ b/docs/STYLE_GUIDE.md @@ -113,13 +113,13 @@ private: double mean_{0.0}; // Private member double standardDeviation_{1.0}; // Private member mutable std::atomic cacheValid_{false}; // Private member - + static constexpr double DEFAULT_MEAN = 0.0; // Constant - + public: void setMean(double mean); // Public method double getMean() const noexcept; // Public method - + private: void validateParameters(double mean, double stdDev) const; // Private method void updateCache() const noexcept; // Private method @@ -169,14 +169,14 @@ for (const auto& item : container) { double getMean() const noexcept; // Long signatures (multi-line with parameters aligned) -void setParameters(double mean, - double standardDeviation, +void setParameters(double mean, + double standardDeviation, bool validateInputs = true); // Constructor initialization lists ExponentialDistribution(double lambda = 1.0) - : lambda_{lambda}, - logLambda_{0.0}, + : lambda_{lambda}, + logLambda_{0.0}, cacheValid_{false} { 
validateParameters(lambda); updateCache(); @@ -237,7 +237,7 @@ class GaussianDistribution { public: // Explicit single-argument constructor explicit GaussianDistribution(double mean = 0.0, double stdDev = 1.0); - + // Default special members when possible ~GaussianDistribution() = default; GaussianDistribution(const GaussianDistribution&) = default; @@ -286,7 +286,7 @@ private: /** * Validates parameters for the distribution * @param param1 First parameter with constraints - * @param param2 Second parameter with constraints + * @param param2 Second parameter with constraints * @throws std::invalid_argument if parameters are invalid */ void validateParameters(double param1, double param2) const { @@ -294,16 +294,16 @@ private: throw std::invalid_argument("param1 must be positive and finite"); } if (std::isnan(param2) || std::isinf(param2) || param2 <= 0.0) { - throw std::invalid_argument("param2 must be positive and finite"); + throw std::invalid_argument("param2 must be positive and finite"); } } public: - ExampleDistribution(double param1, double param2) + ExampleDistribution(double param1, double param2) : param1_{param1}, param2_{param2} { validateParameters(param1, param2); // Validate in constructor } - + void setParam1(double param1) { validateParameters(param1, param2_); // Validate in setter param1_ = param1; @@ -332,17 +332,17 @@ Use **Doxygen-style comments** for all public interfaces: ```cpp /** * Computes the probability density function for the Gaussian distribution. 
- * + * * The PDF is computed using the formula: * f(x) = (1/σ√(2π)) * exp(-0.5*((x-μ)/σ)²) - * + * * @param value The value at which to evaluate the PDF * @return Probability density at the given value * @throws std::invalid_argument if value is NaN or infinite - * + * * @note This method is thread-safe and uses cached normalization constants * @complexity O(1) - constant time computation - * + * * @example * @code * GaussianDistribution dist(0.0, 1.0); // Standard normal @@ -356,23 +356,23 @@ double getProbability(double value) override; ```cpp /** * Modern C++20 Gaussian distribution for modeling continuous symmetric data. - * + * * The Gaussian (Normal) distribution is a continuous probability distribution * characterized by its bell-shaped curve. It's fundamental in statistics and * is used extensively in machine learning and data analysis. - * + * * PDF: f(x) = (1/σ√(2π)) * exp(-0.5*((x-μ)/σ)²) * where μ is the mean and σ is the standard deviation (σ > 0) - * + * * Properties: - * - Mean: μ + * - Mean: μ * - Variance: σ² * - Support: x ∈ (-∞, ∞) * - Symmetry: Symmetric around μ - * + * * @note Thread-safe for read operations, not thread-safe for modifications * @note Uses efficient caching for repeated probability calculations - * + * * @example Basic usage: * @code * GaussianDistribution normal(0.0, 1.0); // Standard normal distribution @@ -415,7 +415,7 @@ if (!cacheValid_) { */ void testParameterValidation() { std::cout << "Testing parameter validation..." 
<< std::endl; - + // Test invalid constructor parameters try { GaussianDistribution dist(0.0, 0.0); // Invalid stddev @@ -423,16 +423,16 @@ void testParameterValidation() { } catch (const std::invalid_argument&) { // Expected behavior } - + // Test NaN and infinity double nan_val = std::numeric_limits<double>::quiet_NaN(); try { GaussianDistribution dist(nan_val, 1.0); - assert(false); // Should not reach here + assert(false); // Should not reach here } catch (const std::invalid_argument&) { // Expected behavior } - + std::cout << "✓ Parameter validation tests passed" << std::endl; } ``` @@ -449,7 +449,7 @@ void testParameterValidation() { ### 1. Required Tools - **clang-tidy**: Static analysis and code quality - **cppcheck**: Additional static analysis -- **Address Sanitizer**: Memory error detection +- **Address Sanitizer**: Memory error detection - **Undefined Behavior Sanitizer**: UB detection ### 2. Enabled Checks @@ -484,12 +484,12 @@ class GaussianDistribution { private: mutable double normalizationConstant_{0.0}; mutable std::atomic<bool> cacheValid_{false}; - + void updateCache() const noexcept { normalizationConstant_ = 1.0 / (standardDeviation_ * std::sqrt(2.0 * M_PI)); cacheValid_ = true; } - + public: double getProbability(double value) override { if (!cacheValid_) { @@ -521,15 +521,15 @@ private: } // Additional validations...
} - + public: // Constructor MUST call validateParameters - DistributionName(ParamType1 param1, ParamType2 param2) + DistributionName(ParamType1 param1, ParamType2 param2) : param1_{param1}, param2_{param2} { validateParameters(param1, param2); } - - // Setters MUST call validateParameters + + // Setters MUST call validateParameters void setParam1(ParamType1 param1) { validateParameters(param1, param2_); param1_ = param1; diff --git a/examples/economics_hmm_example.cpp b/examples/economics_hmm_example.cpp index 58511fa..9ac6f31 100644 --- a/examples/economics_hmm_example.cpp +++ b/examples/economics_hmm_example.cpp @@ -17,15 +17,15 @@ using libhmm::ViterbiTrainer; /** * Example: Economic and Social Science Modeling with Negative Binomial and Pareto HMM - * + * * This example demonstrates modeling economic phenomena using: * - Negative Binomial distribution for overdispersed count data (customer purchases, accidents) * - Pareto distribution for power-law phenomena (income, wealth, city sizes) - * + * * Hidden States for Customer Behavior: * - State 0: "Low Activity" (few purchases, occasional high-value items) * - State 1: "High Activity" (many purchases, frequent transactions) - * + * * Hidden States for Economic Regimes: * - State 0: "Normal Economy" (typical income distribution) * - State 1: "Crisis Economy" (more extreme inequality) diff --git a/examples/financial_hmm_example.cpp b/examples/financial_hmm_example.cpp index 4fbbe27..e983d51 100644 --- a/examples/financial_hmm_example.cpp +++ b/examples/financial_hmm_example.cpp @@ -17,11 +17,11 @@ using libhmm::ViterbiTrainer; /** * Example: Financial Market Volatility Modeling with Beta and Log-Normal HMM - * + * * This example demonstrates modeling financial market states using: * - Beta distribution for volatility measures (bounded between 0 and 1) * - Log-Normal distribution for asset returns (always positive) - * + * * Hidden States: * - State 0: "Low Volatility" (stable market conditions) * - State 1: "High 
Volatility" (turbulent market conditions) diff --git a/examples/poisson_hmm_example.cpp b/examples/poisson_hmm_example.cpp index 2568db4..dc56df9 100644 --- a/examples/poisson_hmm_example.cpp +++ b/examples/poisson_hmm_example.cpp @@ -14,7 +14,7 @@ using libhmm::ViterbiTrainer; /** * Example: Modeling Website Traffic with Poisson HMM - * + * * This example demonstrates using Poisson distributions in an HMM to model * website traffic patterns. We'll model two hidden states: * - State 0: "Normal Traffic" (λ = 10 requests/minute) diff --git a/examples/quality_control_hmm_example.cpp b/examples/quality_control_hmm_example.cpp index dc27ac6..9528087 100644 --- a/examples/quality_control_hmm_example.cpp +++ b/examples/quality_control_hmm_example.cpp @@ -16,11 +16,11 @@ using libhmm::ViterbiTrainer; /** * Example: Quality Control Process Monitoring with Binomial and Uniform HMM - * + * * This example demonstrates modeling quality control processes using: * - Binomial distribution for defect counts in batches * - Uniform distribution for measurement tolerances - * + * * Hidden States: * - State 0: "In Control" (low defect rate, tight tolerances) * - State 1: "Out of Control" (high defect rate, loose tolerances) diff --git a/examples/queuing_theory_hmm_example.cpp b/examples/queuing_theory_hmm_example.cpp index e90882f..3284f2f 100644 --- a/examples/queuing_theory_hmm_example.cpp +++ b/examples/queuing_theory_hmm_example.cpp @@ -19,17 +19,17 @@ using libhmm::ViterbiTrainer; /** * Example: Queuing Theory and Service Systems with HMM - * + * * This example demonstrates modeling service systems using HMMs to represent: * - Customer arrival patterns (Poisson arrivals) * - Service time distributions (Exponential, Gamma) * - System state transitions (load levels, server availability) - * + * * Service System States: * - State 0: "Low Load" (few customers, fast service) * - State 1: "Medium Load" (moderate queue, normal service) * - State 2: "High Load" (long queue, slow service) 
- * + * * Models Demonstrated: * 1. M/M/1 Queue (Poisson arrivals, Exponential service) * 2. M/G/1 Queue (Poisson arrivals, Gamma service times) diff --git a/examples/reliability_hmm_example.cpp b/examples/reliability_hmm_example.cpp index 9ca1140..832298f 100644 --- a/examples/reliability_hmm_example.cpp +++ b/examples/reliability_hmm_example.cpp @@ -16,11 +16,11 @@ using libhmm::WeibullDistribution; /** * Example: Reliability Engineering with Weibull and Exponential HMM - * + * * This example demonstrates modeling system reliability using: * - Weibull distribution for component lifetimes (flexible hazard rates) * - Exponential distribution for memoryless failure times - * + * * Hidden States: * - State 0: "Normal Operation" (low failure rate) * - State 1: "Degraded State" (higher failure rate) diff --git a/examples/statistical_process_control_hmm_example.cpp b/examples/statistical_process_control_hmm_example.cpp index cb35666..4a8dcee 100644 --- a/examples/statistical_process_control_hmm_example.cpp +++ b/examples/statistical_process_control_hmm_example.cpp @@ -20,17 +20,17 @@ using libhmm::ViterbiTrainer; /** * Example: Statistical Process Control with Chi-squared Distribution HMM - * + * * This example demonstrates quality control monitoring using: * - Chi-squared distribution for test statistics and variance measures * - Gaussian distribution for measurement errors * - Exponential distribution for time-between-failures - * + * * Hidden States: * - State 0: "In Control" (process operating normally) * - State 1: "Warning" (process showing signs of deviation) * - State 2: "Out of Control" (process requires intervention) - * + * * Key applications of Chi-squared in quality control: * - Goodness-of-fit testing for process capability * - Variance monitoring and control charts diff --git a/examples/swarm_coordination_example.cpp b/examples/swarm_coordination_example.cpp index ae0487a..e6a9cb3 100644 --- a/examples/swarm_coordination_example.cpp +++ 
b/examples/swarm_coordination_example.cpp @@ -1,23 +1,23 @@ /** * @file swarm_coordination_example.cpp * @brief Discrete State Swarm Coordination Example using libhmm - * + * * This example demonstrates how to use Hidden Markov Models for coordinating * a drone swarm through different formation states and mission phases. - * + * * Key Features: * - Discrete state space modeling (formation types, mission phases) * - Multi-dimensional discrete observations (altitude, speed, threats) * - Automatic calculator selection with SIMD optimization * - Real-time state prediction and formation coordination * - Fault detection and recovery mechanisms - * + * * Applications: * - Autonomous drone swarm coordination * - Multi-robot formation control * - Mission state management * - System health monitoring - * + * * @author libhmm development team * @version 2.5.0 */ diff --git a/include/libhmm/calculators/forward_backward_calculator.h b/include/libhmm/calculators/forward_backward_calculator.h index 3efd38d..eb2bb2a 100755 --- a/include/libhmm/calculators/forward_backward_calculator.h +++ b/include/libhmm/calculators/forward_backward_calculator.h @@ -1,7 +1,9 @@ #pragma once #include "libhmm/calculators/calculator.h" +#include "libhmm/performance/fb_recurrence_policy.h" #include +#include #include namespace libhmm { @@ -84,24 +86,60 @@ class ForwardBackwardCalculator : public Calculator { /** Number of HMM states used by this calculator. */ [[nodiscard]] std::size_t getNumStates() const noexcept { return numStates_; } + /** + * @brief Force a specific recurrence kernel for subsequent compute() calls. + * + * Pass `std::nullopt` to clear the override and return to adaptive policy. + * The override takes precedence over the static policy bins, but is itself + * superseded by the compile-time `LIBHMM_EXPERIMENT_FB_MAX_REDUCE` and + * `LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR` forcers when those are defined. 
+ */ + void setRecurrenceModeOverride(std::optional<FbRecurrenceMode> mode) noexcept { + modeOverride_ = mode; + } + + /** Currently active recurrence-mode override, if any. */ + [[nodiscard]] std::optional<FbRecurrenceMode> getRecurrenceModeOverride() const noexcept { + return modeOverride_; + } + + /** Recurrence mode resolved on the most recent compute() call. */ + [[nodiscard]] FbRecurrenceMode getRecurrenceMode() const noexcept { return currentMode_; } + private: std::size_t numStates_{0}; // Precomputed log-transition matrix [N x N]: logTrans_(i,j) = log a_{ij} Matrix logTrans_; + // Transposed transition matrix [N x N]: logTransT_(j,i) = log a_{ij} + Matrix logTransT_; // Results Matrix logAlpha_; // T x N Matrix logBeta_; // T x N double logProbability_{-std::numeric_limits<double>::infinity()}; - // Per-state log-emission buffer reused each timestep [T x N, row-major]. - // Allocated once; filled by getBatchLogProbabilities per state. - mutable std::vector<double> logEmitBuf_; - + // State-major log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t). + // Filled directly by getBatchLogProbabilities per state. + std::vector<double> logEmitBuf_; + // Time-major emission buffer: logEmitByTime_[t * N + i] = log b_i(O_t). + // Derived from logEmitBuf_ for contiguous per-time access in recurrences. + std::vector<double> logEmitByTime_; + // Recurrence kernel resolved by the policy + override pipeline on the most + // recent compute() call. Defaults to Pairwise (the comparator-safe choice). + FbRecurrenceMode currentMode_{FbRecurrenceMode::Pairwise}; + // Optional per-instance override (Phase A4). Set via setRecurrenceModeOverride().
+ std::optional<FbRecurrenceMode> modeOverride_; + + [[nodiscard]] FbRecurrenceMode resolveRecurrenceMode(std::size_t numStates, + std::size_t sequenceLength) const noexcept; void precomputeLogTransitions(); void computeLogForward(); void computeLogBackward(); + void computeLogForwardPairwise(); + void computeLogForwardMaxReduce(); + void computeLogBackwardPairwise(); + void computeLogBackwardMaxReduce(); /** log-sum-exp of two log-space values: log(exp(a) + exp(b)). */ static double logSumExp(double a, double b) noexcept; diff --git a/include/libhmm/calculators/viterbi_calculator.h b/include/libhmm/calculators/viterbi_calculator.h index 7b9ae64..a341ecb 100755 --- a/include/libhmm/calculators/viterbi_calculator.h +++ b/include/libhmm/calculators/viterbi_calculator.h @@ -65,19 +65,24 @@ class ViterbiCalculator : public Calculator { // Precomputed log-transition matrix [N x N] Matrix logTrans_; + // Transposed transition matrix [N x N]: logTransT_(j,i) = log a_{ij} + Matrix logTransT_; // Viterbi trellis: logDelta(t,i) = max log-prob path ending at state i at time t Matrix logDelta_; - // Backtrack pointers: psi(t,i) = arg max_j [logDelta(t-1,j) + logTrans(j,i)] - std::vector> psi_; + // Backtrack pointers in time-major contiguous storage: + // psi_[t * N + j] = arg max_i [logDelta(t-1,i) + logTrans(i,j)] + std::vector psi_; // Result StateSequence sequence_; double logProbability_{-std::numeric_limits<double>::infinity()}; - // Per-state emission buffer - mutable std::vector<double> logEmitBuf_; + // Per-state log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) + std::vector<double> logEmitBuf_; + // Time-major emission buffer: logEmitByTime_[t * N + i] = log b_i(O_t) + std::vector<double> logEmitByTime_; void precomputeLogTransitions(); void runViterbi(); diff --git a/include/libhmm/distributions/beta_distribution.h b/include/libhmm/distributions/beta_distribution.h index 192ca40..b9be62d 100644 --- a/include/libhmm/distributions/beta_distribution.h +++ b/include/libhmm/distributions/beta_distribution.h @@ 
-8,14 +8,14 @@ namespace libhmm { /** * Beta distribution for modeling probabilities and proportions. - * - * The Beta distribution is a continuous probability distribution defined + * + * The Beta distribution is a continuous probability distribution defined * on the interval [0,1] and parameterized by two positive shape parameters * α (alpha) and β (beta). - * + * * PDF: f(x; α, β) = (x^(α-1) * (1-x)^(β-1)) / B(α, β) * where B(α, β) is the Beta function: B(α, β) = Γ(α)Γ(β)/Γ(α+β) - * + * * Special cases: * - α = β = 1: Uniform distribution on [0,1] * - α = β: Symmetric around 0.5 @@ -30,7 +30,7 @@ class BetaDistribution : public DistributionBase { double alpha_{1.0}; /** - * Shape parameter β (beta) - must be positive + * Shape parameter β (beta) - must be positive */ double beta_{1.0}; @@ -89,7 +89,7 @@ class BetaDistribution : public DistributionBase { public: /** * Constructs a Beta distribution with given shape parameters. - * + * * @param alpha Shape parameter α (must be positive) * @param beta Shape parameter β (must be positive) * @throws std::invalid_argument if parameters are not positive finite numbers @@ -139,7 +139,7 @@ class BetaDistribution : public DistributionBase { /** * Computes the probability density function for the Beta distribution. - * + * * @param value The value at which to evaluate the PDF (should be in [0,1]) * @return Probability density, or 0.0 if value is outside [0,1] */ @@ -154,9 +154,9 @@ class BetaDistribution : public DistributionBase { /** * Computes the cumulative distribution function for the Beta distribution. - * + * * Uses the regularized incomplete beta function I_x(α,β) - * + * * @param value The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ value) */ @@ -170,7 +170,7 @@ class BetaDistribution : public DistributionBase { /** * Vectorized batch computation of PDF for multiple values. * Optimized for processing many values efficiently with cache reuse. 
- * + * * @param values Vector of input values * @param results Output vector for results (will be resized if needed) */ @@ -179,7 +179,7 @@ class BetaDistribution : public DistributionBase { /** * Vectorized batch computation of log PDF for multiple values. * Optimized for processing many values efficiently with cache reuse. - * + * * @param values Vector of input values * @param results Output vector for results (will be resized if needed) */ @@ -194,21 +194,21 @@ class BetaDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the alpha (α) shape parameter. - * + * * @return Current alpha value */ double getAlpha() const noexcept { return alpha_; } /** * Sets the alpha (α) shape parameter. - * + * * @param alpha New alpha parameter (must be positive) * @throws std::invalid_argument if alpha <= 0 or is not finite */ @@ -220,14 +220,14 @@ class BetaDistribution : public DistributionBase { /** * Gets the beta (β) shape parameter. - * + * * @return Current beta value */ double getBeta() const noexcept { return beta_; } /** * Sets the beta (β) shape parameter. - * + * * @param beta New beta parameter (must be positive) * @throws std::invalid_argument if beta <= 0 or is not finite */ @@ -240,7 +240,7 @@ class BetaDistribution : public DistributionBase { /** * Gets the mean of the distribution. * For Beta(α, β), mean = α/(α+β) - * + * * @return Mean value */ double getMean() const noexcept { return alpha_ / (alpha_ + beta_); } @@ -248,7 +248,7 @@ class BetaDistribution : public DistributionBase { /** * Gets the variance of the distribution. * For Beta(α, β), variance = αβ/((α+β)²(α+β+1)) - * + * * @return Variance value */ double getVariance() const noexcept { @@ -258,7 +258,7 @@ class BetaDistribution : public DistributionBase { /** * Gets the standard deviation of the distribution. 
- * + * * @return Standard deviation */ double getStandardDeviation() const noexcept { return std::sqrt(getVariance()); } diff --git a/include/libhmm/distributions/chi_squared_distribution.h b/include/libhmm/distributions/chi_squared_distribution.h index 458b07f..1db1414 100644 --- a/include/libhmm/distributions/chi_squared_distribution.h +++ b/include/libhmm/distributions/chi_squared_distribution.h @@ -8,11 +8,11 @@ namespace libhmm { /** * Chi-squared distribution for modeling sums of squared standard normal variables. - * + * * The Chi-squared distribution is a continuous probability distribution with support * on non-negative real numbers. It is a special case of the Gamma distribution and * arises frequently in statistical inference, particularly in hypothesis testing. - * + * * Mathematical properties: * - PDF: f(x; k) = (1/(2^(k/2) * Γ(k/2))) * x^(k/2-1) * e^(-x/2) * - Support: x ∈ [0, ∞) @@ -20,7 +20,7 @@ namespace libhmm { * - Mean: k * - Variance: 2k * - Relation to Gamma: χ²(k) = Gamma(k/2, 2) - * + * * Applications: * - Goodness-of-fit tests * - Tests of independence in contingency tables @@ -68,7 +68,7 @@ class ChiSquaredDistribution : public DistributionBase { public: /** * Constructs a Chi-squared distribution with given degrees of freedom. - * + * * @param degrees_of_freedom Degrees of freedom k (must be positive) * @throws std::invalid_argument if degrees_of_freedom <= 0 */ @@ -116,7 +116,7 @@ class ChiSquaredDistribution : public DistributionBase { /** * Computes the probability density function for the Chi-squared distribution. - * + * * @param value The value at which to evaluate the PDF (should be non-negative) * @return Probability density f(value|k), or 0.0 if value < 0 */ @@ -141,21 +141,21 @@ class ChiSquaredDistribution : public DistributionBase { /** * Returns a string representation of the distribution. 
- * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the degrees of freedom parameter. - * + * * @return Current degrees of freedom value */ double getDegreesOfFreedom() const noexcept { return degrees_of_freedom_; } /** * Sets the degrees of freedom parameter. - * + * * @param degrees_of_freedom New degrees of freedom parameter (must be positive) * @throws std::invalid_argument if degrees_of_freedom <= 0 or is not finite */ @@ -167,28 +167,28 @@ class ChiSquaredDistribution : public DistributionBase { /** * Gets the mean of the distribution. - * + * * @return Mean (k) */ double getMean() const noexcept { return degrees_of_freedom_; } /** * Gets the variance of the distribution. - * + * * @return Variance (2k) */ double getVariance() const noexcept { return 2.0 * degrees_of_freedom_; } /** * Gets the standard deviation of the distribution. - * + * * @return Standard deviation (√(2k)) */ double getStandardDeviation() const noexcept { return std::sqrt(2.0 * degrees_of_freedom_); } /** * Gets the mode of the distribution. - * + * * @return Mode (max(0, k-2)) */ double getMode() const noexcept { return std::max(0.0, degrees_of_freedom_ - 2.0); } diff --git a/include/libhmm/distributions/discrete_distribution.h b/include/libhmm/distributions/discrete_distribution.h index bfde0e2..54f69d1 100755 --- a/include/libhmm/distributions/discrete_distribution.h +++ b/include/libhmm/distributions/discrete_distribution.h @@ -9,21 +9,21 @@ namespace libhmm { /** * Modern C++20 Discrete distribution for modeling categorical data. - * + * * The Discrete distribution (also known as Categorical distribution) is a * discrete probability distribution that generalizes the Bernoulli distribution. * It describes the possible results of a random variable that can take on * one of K possible categories, with the probability of each category separately specified. 
- * + * * PMF: P(X = k) = p_k for k ∈ {0, 1, 2, ..., K-1} * where p_k is the probability of category k and ∑p_k = 1 - * + * * Properties: * - Support: {0, 1, 2, ..., numSymbols-1} * - Probability mass function defined for each discrete symbol * - All probabilities must sum to 1.0 * - Each probability must be in [0, 1] - * + * * Applications: * - Hidden Markov Models with discrete observations * - Classification problems @@ -177,7 +177,7 @@ class DiscreteDistribution : public DistributionBase { /** * Gets the probability mass function value for a discrete observation. - * + * * @param value The discrete value (will be cast to integer index) * @return Probability mass for the given value, 0.0 if out of range */ @@ -203,7 +203,7 @@ class DiscreteDistribution : public DistributionBase { /** * Sets the probability for a specific discrete observation. - * + * * @param o The discrete observation (symbol index) * @param value The probability value (must be in [0,1]) * @throws std::invalid_argument if value is not a valid probability @@ -227,21 +227,21 @@ class DiscreteDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String showing all symbol probabilities */ std::string toString() const override; /** * Gets the number of discrete symbols in the distribution. - * + * * @return Number of symbols/categories */ std::size_t getNumSymbols() const noexcept { return numSymbols_; } /** * Gets the probability for a specific symbol. - * + * * @param index Symbol index (must be < numSymbols) * @return Probability for the symbol * @throws std::out_of_range if index is out of range @@ -255,7 +255,7 @@ class DiscreteDistribution : public DistributionBase { /** * Gets the sum of all probabilities (should be approximately 1.0). - * + * * @return Sum of all probabilities */ double getProbabilitySum() const { @@ -277,7 +277,7 @@ class DiscreteDistribution : public DistributionBase { /** * Gets the mean of the distribution. 
* For discrete distribution, mean = ∑(i * p_i) for i = 0 to numSymbols-1 - * + * * @return Mean value */ double getMean() const noexcept { @@ -291,7 +291,7 @@ class DiscreteDistribution : public DistributionBase { /** * Gets the variance of the distribution. * For discrete distribution, variance = ∑(i² * p_i) - mean² - * + * * @return Variance value */ double getVariance() const noexcept { @@ -306,7 +306,7 @@ class DiscreteDistribution : public DistributionBase { /** * Gets the standard deviation of the distribution. - * + * * @return Standard deviation value */ double getStandardDeviation() const noexcept { return std::sqrt(getVariance()); } @@ -328,7 +328,7 @@ class DiscreteDistribution : public DistributionBase { /** * Evaluates the logarithm of the probability mass function * More numerically stable for small probabilities - * + * * @param value The discrete value (will be cast to integer index) * @return Log probability mass, -infinity if out of range or probability is 0 */ @@ -342,7 +342,7 @@ class DiscreteDistribution : public DistributionBase { /** * Evaluates the CDF at k using cumulative sum approach * Formula: CDF(k) = ∑(i=0 to k) P(X = i) - * + * * @param value The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ value) */ diff --git a/include/libhmm/distributions/distributions.h b/include/libhmm/distributions/distributions.h index 73fde1a..3c26853 100644 --- a/include/libhmm/distributions/distributions.h +++ b/include/libhmm/distributions/distributions.h @@ -3,21 +3,21 @@ /** * @file distributions.h * @brief Convenience header that includes all libhmm probability distributions - * + * * This header provides a single include point for all probability distributions * available in libhmm. It follows the standard library convention of providing * umbrella headers for related functionality. 
- * + * * Usage: * @code * #include "libhmm/distributions/distributions.h" - * + * * // All distributions are now available: * GaussianDistribution gauss(0.0, 1.0); * PoissonDistribution poisson(2.5); * DiscreteDistribution discrete(6); * @endcode - * + * * @note For better compilation times, consider including only the specific * distribution headers you need in performance-critical applications. */ @@ -51,15 +51,15 @@ /** * @namespace libhmm * @brief All distributions are available in the libhmm namespace - * + * * After including this header, all distribution classes are available: - * + * * **Discrete Distributions:** * - DiscreteDistribution: General discrete distribution * - BinomialDistribution: Binomial distribution B(n,p) * - NegativeBinomialDistribution: Negative binomial distribution * - PoissonDistribution: Poisson distribution P(λ) - * + * * **Continuous Distributions:** * - GaussianDistribution: Normal distribution N(μ,σ²) * - ExponentialDistribution: Exponential distribution Exp(λ) diff --git a/include/libhmm/distributions/exponential_distribution.h b/include/libhmm/distributions/exponential_distribution.h index 27ff656..2abbe25 100755 --- a/include/libhmm/distributions/exponential_distribution.h +++ b/include/libhmm/distributions/exponential_distribution.h @@ -8,15 +8,15 @@ namespace libhmm { /** * Modern C++20 Exponential distribution for modeling waiting times and decay processes. - * + * * The Exponential distribution is a continuous probability distribution that describes * the time between events in a Poisson point process. It's commonly used to model * lifetimes, waiting times, and decay processes. 
- * + * * PDF: f(x) = λ * exp(-λx) for x ≥ 0, 0 otherwise * CDF: F(x) = 1 - exp(-λx) for x ≥ 0, 0 otherwise * where λ is the rate parameter (λ > 0) - * + * * Properties: * - Mean: 1/λ * - Variance: 1/λ² @@ -79,7 +79,7 @@ class ExponentialDistribution : public DistributionBase { public: /** * Constructs an Exponential distribution with given rate parameter. - * + * * @param lambda Rate parameter λ (must be positive) * @throws std::invalid_argument if lambda is invalid */ @@ -129,7 +129,7 @@ class ExponentialDistribution : public DistributionBase { /** * Computes the probability density function for the Exponential distribution. - * + * * @param value The value at which to evaluate the PDF * @return Probability density (or approximated probability for discrete sampling) */ @@ -163,21 +163,21 @@ class ExponentialDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the rate parameter λ. - * + * * @return Current rate parameter value */ double getLambda() const noexcept { return lambda_; } /** * Sets the rate parameter λ. - * + * * @param lambda New rate parameter (must be positive) * @throws std::invalid_argument if lambda <= 0 or is not finite */ @@ -191,7 +191,7 @@ class ExponentialDistribution : public DistributionBase { * Gets the mean of the distribution. * For Exponential distribution, mean = 1/λ * Uses cached value to eliminate division. 
- * + * * @return Mean value */ double getMean() const noexcept { @@ -218,7 +218,7 @@ class ExponentialDistribution : public DistributionBase { /** * Evaluates the CDF at x using the standard exponential CDF formula * For exponential distribution: F(x) = 1 - exp(-λx) for x ≥ 0, 0 otherwise - * + * * @param x The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ x) */ diff --git a/include/libhmm/distributions/gamma_distribution.h b/include/libhmm/distributions/gamma_distribution.h index 31381e9..3474aa1 100755 --- a/include/libhmm/distributions/gamma_distribution.h +++ b/include/libhmm/distributions/gamma_distribution.h @@ -8,19 +8,19 @@ namespace libhmm { /** * Modern C++20 Gamma distribution for modeling continuous non-negative data. - * + * * The Gamma distribution is a versatile continuous probability distribution * commonly used to model waiting times, failure rates, and size distributions. * It generalizes the exponential distribution and is the conjugate prior for * the precision of a normal distribution. - * + * * PDF: f(x) = (1/(Γ(k)θ^k)) * x^(k-1) * exp(-x/θ) for x ≥ 0 * where k is the shape parameter (k > 0) and θ is the scale parameter (θ > 0) * Γ(k) is the gamma function - * + * * Alternative parameterization uses rate parameter β = 1/θ: * PDF: f(x) = (β^k/Γ(k)) * x^(k-1) * exp(-βx) - * + * * Properties: * - Mean: k*θ (or k/β) * - Variance: k*θ² (or k/β²) @@ -94,7 +94,7 @@ class GammaDistribution : public DistributionBase { public: /** * Constructs a Gamma distribution with given parameters. - * + * * @param k Shape parameter k (must be positive) * @param theta Scale parameter θ (must be positive) * @throws std::invalid_argument if parameters are invalid @@ -140,7 +140,7 @@ class GammaDistribution : public DistributionBase { /** * Computes the probability density function for the Gamma distribution. 
- * + * * @param value The value at which to evaluate the PDF * @return Probability density (or approximated probability for discrete sampling) */ @@ -150,7 +150,7 @@ class GammaDistribution : public DistributionBase { * Evaluates the logarithm of the probability density function * Formula: log PDF(x) = (k-1)*ln(x) - x/θ - k*ln(θ) - ln(Γ(k)) * More numerically stable for small probabilities - * + * * @param x The value at which to evaluate the log PDF * @return Log probability density */ @@ -165,7 +165,7 @@ class GammaDistribution : public DistributionBase { * Evaluates the CDF at x using the incomplete gamma function * Formula: CDF(x) = P(k, x/θ) = γ(k, x/θ) / Γ(k) * where P is the regularized incomplete gamma function - * + * * @param x The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ x) */ @@ -185,28 +185,28 @@ class GammaDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ [[nodiscard]] std::string toString() const override; /** * Gets the shape parameter k. - * + * * @return Current shape parameter value */ [[nodiscard]] double getK() const noexcept { return k_; } /** * Gets the scale parameter θ. - * + * * @return Current scale parameter value */ [[nodiscard]] double getTheta() const noexcept { return theta_; } /** * Sets the shape parameter k. - * + * * @param k New shape parameter (must be positive) * @throws std::invalid_argument if k <= 0 or is not finite */ @@ -230,7 +230,7 @@ class GammaDistribution : public DistributionBase { /** * Gets the mean of the distribution. * For Gamma distribution, mean = k*θ - * + * * @return Mean value */ [[nodiscard]] double getMean() const noexcept { return k_ * theta_; } @@ -238,7 +238,7 @@ class GammaDistribution : public DistributionBase { /** * Gets the variance of the distribution. 
* For Gamma distribution, variance = k*θ² - * + * * @return Variance value */ [[nodiscard]] double getVariance() const noexcept { return k_ * theta_ * theta_; } @@ -246,7 +246,7 @@ class GammaDistribution : public DistributionBase { /** * Gets the standard deviation of the distribution. * For Gamma distribution, std_dev = θ*√k - * + * * @return Standard deviation value */ [[nodiscard]] double getStandardDeviation() const noexcept { return theta_ * std::sqrt(k_); } @@ -255,14 +255,14 @@ class GammaDistribution : public DistributionBase { * Gets the mode of the distribution. * For Gamma distribution with k > 1, mode = (k-1)*θ * For k ≤ 1, the mode is at x = 0 (but PDF may be infinite there) - * + * * @return Mode value */ [[nodiscard]] double getMode() const noexcept { return (k_ > 1.0) ? (k_ - 1.0) * theta_ : 0.0; } /** * Gets the rate parameter β = 1/θ (alternative parameterization). - * + * * @return Rate parameter (1/θ) */ [[nodiscard]] double getRate() const noexcept { return 1.0 / theta_; } diff --git a/include/libhmm/distributions/gaussian_distribution.h b/include/libhmm/distributions/gaussian_distribution.h index 40a321c..9aafafb 100755 --- a/include/libhmm/distributions/gaussian_distribution.h +++ b/include/libhmm/distributions/gaussian_distribution.h @@ -143,7 +143,7 @@ class GaussianDistribution : public DistributionBase { /** * Computes the probability density function for the Gaussian distribution. 
* Formula: PDF(x) = (1/(σ√(2π))) * exp(-½((x-μ)/σ)²) - * + * * @param x The value at which to evaluate the PDF * @return Probability density */ @@ -153,7 +153,7 @@ class GaussianDistribution : public DistributionBase { * Evaluates the logarithm of the probability density function * Formula: log PDF(x) = -½log(2π) - log(σ) - ½((x-μ)/σ)² * More numerically stable for small probabilities - * + * * @param x The value at which to evaluate the log PDF * @return Log probability density */ @@ -169,7 +169,7 @@ class GaussianDistribution : public DistributionBase { /** * Evaluates the CDF at x using the error function * Formula: CDF(x) = (1/2) * (1 + erf((x-μ)/(σ√2))) - * + * * @param x The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ x) */ @@ -194,21 +194,21 @@ class GaussianDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the mean parameter μ. - * + * * @return Current mean value */ double getMean() const noexcept { return mean_; } /** * Sets the mean parameter μ. - * + * * @param mean New mean parameter (any finite value) * @throws std::invalid_argument if mean is not finite */ @@ -220,14 +220,14 @@ class GaussianDistribution : public DistributionBase { /** * Gets the standard deviation parameter σ. - * + * * @return Current standard deviation value */ double getStandardDeviation() const noexcept { return standardDeviation_; } /** * Sets the standard deviation parameter σ. - * + * * @param stdDev New standard deviation parameter (must be positive) * @throws std::invalid_argument if stdDev <= 0 or is not finite */ @@ -240,14 +240,14 @@ class GaussianDistribution : public DistributionBase { /** * Gets the variance of the distribution. 
* For Gaussian distribution, variance = σ² - * + * * @return Variance value */ double getVariance() const noexcept { return standardDeviation_ * standardDeviation_; } /** * Sets both parameters simultaneously. - * + * * @param mean New mean parameter * @param stdDev New standard deviation parameter * @throws std::invalid_argument if parameters are invalid diff --git a/include/libhmm/distributions/log_normal_distribution.h b/include/libhmm/distributions/log_normal_distribution.h index 17cf1d6..c4a125e 100755 --- a/include/libhmm/distributions/log_normal_distribution.h +++ b/include/libhmm/distributions/log_normal_distribution.h @@ -8,20 +8,20 @@ namespace libhmm { /** * Modern C++20 Log-Normal distribution for modeling positive continuous data. - * + * * The Log-Normal distribution is a continuous probability distribution of a * random variable whose logarithm is normally distributed. It's commonly used * to model sizes, lengths, and other positive quantities that arise from * multiplicative processes. - * + * * Important note about parameterization: * This implementation uses the "log-scale" parameterization where: * - μ (mean_) is the mean of the underlying normal distribution ln(X) * - σ (standardDeviation_) is the standard deviation of ln(X) - * + * * PDF: f(x) = (1/(x·σ·√(2π))) * exp(-½((ln(x)-μ)/σ)²) for x > 0 * where μ is the mean of ln(X) and σ is the std dev of ln(X) - * + * * Properties: * - Mean: exp(μ + σ²/2) * - Variance: (exp(σ²) - 1) * exp(2μ + σ²) @@ -79,7 +79,7 @@ class LogNormalDistribution : public DistributionBase { public: /** * Constructs a Log-Normal distribution with given parameters. 
- * + * * @param mean Mean of the underlying normal distribution (μ, any finite value) * @param standardDeviation Standard deviation of the underlying normal distribution (σ, must be positive) * @throws std::invalid_argument if parameters are invalid @@ -139,7 +139,7 @@ class LogNormalDistribution : public DistributionBase { /** * Computes the probability density function for the Log-Normal distribution. - * + * * @param value The value at which to evaluate the PDF * @return Probability density (or approximated probability for discrete sampling) */ @@ -168,21 +168,21 @@ class LogNormalDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the mean parameter μ of the underlying normal distribution. - * + * * @return Current mean parameter value */ double getMean() const noexcept { return mean_; } /** * Sets the mean parameter μ of the underlying normal distribution. - * + * * @param mean New mean parameter (any finite value) * @throws std::invalid_argument if mean is not finite */ @@ -194,14 +194,14 @@ class LogNormalDistribution : public DistributionBase { /** * Gets the standard deviation parameter σ of the underlying normal distribution. - * + * * @return Current standard deviation parameter value */ double getStandardDeviation() const noexcept { return standardDeviation_; } /** * Sets the standard deviation parameter σ of the underlying normal distribution. - * + * * @param stdDev New standard deviation parameter (must be positive) * @throws std::invalid_argument if stdDev <= 0 or is not finite */ @@ -213,7 +213,7 @@ class LogNormalDistribution : public DistributionBase { /** * Sets both parameters simultaneously. 
- * + * * @param mean New mean parameter * @param stdDev New standard deviation parameter * @throws std::invalid_argument if parameters are invalid @@ -228,7 +228,7 @@ class LogNormalDistribution : public DistributionBase { /** * Gets the mean of the Log-Normal distribution (not the underlying normal). * For Log-Normal distribution, mean = exp(μ + σ²/2) - * + * * @return Mean of the Log-Normal distribution */ double getDistributionMean() const noexcept { @@ -239,7 +239,7 @@ class LogNormalDistribution : public DistributionBase { /** * Gets the variance of the Log-Normal distribution. * For Log-Normal distribution, variance = (exp(σ²) - 1) * exp(2μ + σ²) - * + * * @return Variance of the Log-Normal distribution */ double getVariance() const noexcept { @@ -249,7 +249,7 @@ class LogNormalDistribution : public DistributionBase { /** * Gets the standard deviation of the Log-Normal distribution. - * + * * @return Standard deviation of the Log-Normal distribution */ double getDistributionStandardDeviation() const noexcept { return std::sqrt(getVariance()); } @@ -257,7 +257,7 @@ class LogNormalDistribution : public DistributionBase { /** * Gets the mode of the Log-Normal distribution. * For Log-Normal distribution, mode = exp(μ - σ²) - * + * * @return Mode of the Log-Normal distribution */ double getMode() const noexcept { @@ -268,7 +268,7 @@ class LogNormalDistribution : public DistributionBase { /** * Gets the median of the Log-Normal distribution. 
* For Log-Normal distribution, median = exp(μ) - * + * * @return Median of the Log-Normal distribution */ double getMedian() const noexcept { return std::exp(mean_); } diff --git a/include/libhmm/distributions/negative_binomial_distribution.h b/include/libhmm/distributions/negative_binomial_distribution.h index beb2176..e793e4f 100644 --- a/include/libhmm/distributions/negative_binomial_distribution.h +++ b/include/libhmm/distributions/negative_binomial_distribution.h @@ -8,18 +8,18 @@ namespace libhmm { /** * Modern C++20 Negative Binomial distribution for modeling discrete count data. - * - * The Negative Binomial distribution models the number of failures before - * the r-th success in a sequence of independent Bernoulli trials, each with + * + * The Negative Binomial distribution models the number of failures before + * the r-th success in a sequence of independent Bernoulli trials, each with * success probability p. - * + * * PMF: P(X = k) = C(k+r-1, k) * p^r * (1-p)^k * where C(k+r-1, k) is the binomial coefficient - * + * * Alternative parameterization (often used in practice): * - r: number of successes (positive real number) * - p: success probability (in [0,1]) - * + * * Properties: * - Mean: r * (1-p) / p * - Variance: r * (1-p) / p² @@ -99,7 +99,7 @@ class NegativeBinomialDistribution : public DistributionBase { public: /** * Constructs a Negative Binomial distribution with given parameters. - * + * * @param r Number of successes (must be positive) * @param p Success probability (must be in (0,1]) * @throws std::invalid_argument if parameters are invalid @@ -169,7 +169,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Computes the probability mass function for the Negative Binomial distribution. 
- * + * * @param value The value at which to evaluate the PMF (will be rounded to nearest integer) * @return Probability mass */ @@ -189,21 +189,21 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the number of successes parameter r. - * + * * @return Current number of successes */ double getR() const noexcept { return r_; } /** * Sets the number of successes parameter r. - * + * * @param r New number of successes (must be positive) * @throws std::invalid_argument if r <= 0 */ @@ -215,14 +215,14 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Gets the success probability parameter p. - * + * * @return Current success probability */ double getP() const noexcept { return p_; } /** * Sets the success probability parameter p. - * + * * @param p New success probability (must be in (0,1]) * @throws std::invalid_argument if p not in (0,1] */ @@ -235,7 +235,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Gets the mean of the distribution. * For Negative Binomial distribution, mean = r * (1-p) / p - * + * * @return Mean value */ double getMean() const noexcept { return r_ * (1.0 - p_) / p_; } @@ -243,21 +243,21 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Gets the variance of the distribution. * For Negative Binomial distribution, variance = r * (1-p) / p² - * + * * @return Variance value */ double getVariance() const noexcept { return r_ * (1.0 - p_) / (p_ * p_); } /** * Gets the standard deviation of the distribution. - * + * * @return Standard deviation value */ double getStandardDeviation() const noexcept { return std::sqrt(getVariance()); } /** * Sets both parameters simultaneously. 
- * + * * @param r New number of successes * @param p New success probability * @throws std::invalid_argument if parameters are invalid @@ -272,7 +272,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Evaluates the logarithm of the probability mass function * More numerically stable for small probabilities - * + * * @param value The value at which to evaluate the log PMF * @return Log probability mass */ @@ -286,7 +286,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Evaluates the CDF at k using cumulative sum approach * Formula: CDF(k) = ∑(i=0 to k) P(X = i) - * + * * @param value The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ value) */ @@ -295,7 +295,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Gets the mode of the distribution. * For Negative Binomial distribution, mode = floor((r-1)*(1-p)/p) if r > 1, else 0 - * + * * @return Mode value */ int getMode() const noexcept { @@ -308,7 +308,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Gets the skewness of the distribution. * For Negative Binomial distribution, skewness = (2-p)/sqrt(r*(1-p)) - * + * * @return Skewness value */ double getSkewness() const noexcept { return (2.0 - p_) / std::sqrt(r_ * (1.0 - p_)); } @@ -316,7 +316,7 @@ class NegativeBinomialDistribution : public DistributionBase { /** * Gets the kurtosis of the distribution. 
* For Negative Binomial distribution, kurtosis = 3 + (6/r) + (p²/(r*(1-p))) - * + * * @return Kurtosis value */ double getKurtosis() const noexcept { return 3.0 + (6.0 / r_) + (p_ * p_) / (r_ * (1.0 - p_)); } diff --git a/include/libhmm/distributions/pareto_distribution.h b/include/libhmm/distributions/pareto_distribution.h index 32a7ed2..2baba0c 100755 --- a/include/libhmm/distributions/pareto_distribution.h +++ b/include/libhmm/distributions/pareto_distribution.h @@ -8,16 +8,16 @@ namespace libhmm { /** * Modern C++20 Pareto distribution for modeling power-law phenomena. - * + * * The Pareto distribution is a continuous probability distribution commonly * used to model income distribution, city population sizes, stock price * fluctuations, and other phenomena that follow the "80-20 rule" or * Pareto principle. - * + * * PDF: f(x) = (k * x_m^k) / x^(k+1) for x ≥ x_m, 0 otherwise * CDF: F(x) = 1 - (x_m/x)^k for x ≥ x_m, 0 otherwise * where k is the shape parameter (k > 0) and x_m is the scale parameter (x_m > 0) - * + * * Properties: * - Mean: k*x_m/(k-1) for k > 1, undefined for k ≤ 1 * - Variance: (k*x_m²)/((k-1)²*(k-2)) for k > 2, undefined for k ≤ 2 @@ -107,7 +107,7 @@ class ParetoDistribution : public DistributionBase { public: /** * Constructs a Pareto distribution with given parameters. - * + * * @param k Shape parameter k (must be positive) * @param xm Scale parameter x_m (must be positive) * @throws std::invalid_argument if parameters are invalid @@ -173,7 +173,7 @@ class ParetoDistribution : public DistributionBase { /** * Computes the probability density function for the Pareto distribution. - * + * * @param value The value at which to evaluate the PDF * @return Probability density (or approximated probability for discrete sampling) */ @@ -202,21 +202,21 @@ class ParetoDistribution : public DistributionBase { /** * Returns a string representation of the distribution. 
- * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the shape parameter k. - * + * * @return Current shape parameter value */ double getK() const noexcept { return k_; } /** * Sets the shape parameter k. - * + * * @param k New shape parameter (must be positive) * @throws std::invalid_argument if k <= 0 or is not finite */ @@ -228,14 +228,14 @@ class ParetoDistribution : public DistributionBase { /** * Gets the scale parameter x_m. - * + * * @return Current scale parameter value */ double getXm() const noexcept { return xm_; } /** * Sets the scale parameter x_m. - * + * * @param xm New scale parameter (must be positive) * @throws std::invalid_argument if xm <= 0 or is not finite */ @@ -247,7 +247,7 @@ class ParetoDistribution : public DistributionBase { /** * Sets both parameters simultaneously. - * + * * @param k New shape parameter * @param xm New scale parameter * @throws std::invalid_argument if parameters are invalid @@ -262,7 +262,7 @@ class ParetoDistribution : public DistributionBase { /** * Gets the mean of the Pareto distribution. * For Pareto distribution, mean = k*x_m/(k-1) if k > 1, undefined otherwise - * + * * @return Mean value if k > 1, otherwise returns infinity */ double getMean() const noexcept { @@ -272,7 +272,7 @@ class ParetoDistribution : public DistributionBase { /** * Gets the variance of the Pareto distribution. * For Pareto distribution, variance = (k*x_m²)/((k-1)²*(k-2)) if k > 2, undefined otherwise - * + * * @return Variance value if k > 2, otherwise returns infinity */ double getVariance() const noexcept { @@ -285,7 +285,7 @@ class ParetoDistribution : public DistributionBase { /** * Gets the standard deviation of the Pareto distribution. 
- * + * * @return Standard deviation if k > 2, otherwise returns infinity */ double getStandardDeviation() const noexcept { @@ -296,7 +296,7 @@ class ParetoDistribution : public DistributionBase { /** * Gets the mode of the Pareto distribution. * For Pareto distribution, mode = x_m (always at the scale parameter) - * + * * @return Mode value (equals x_m) */ double getMode() const noexcept { return xm_; } @@ -304,7 +304,7 @@ class ParetoDistribution : public DistributionBase { /** * Gets the median of the Pareto distribution. * For Pareto distribution, median = x_m * 2^(1/k) - * + * * @return Median value */ double getMedian() const noexcept { diff --git a/include/libhmm/distributions/poisson_distribution.h b/include/libhmm/distributions/poisson_distribution.h index a7a3c21..01436aa 100644 --- a/include/libhmm/distributions/poisson_distribution.h +++ b/include/libhmm/distributions/poisson_distribution.h @@ -9,11 +9,11 @@ namespace libhmm { /** * Modern C++20 Poisson distribution for modeling count data and rare events. - * - * The Poisson distribution models the number of events occurring in a fixed - * interval of time or space, given that these events occur with a known + * + * The Poisson distribution models the number of events occurring in a fixed + * interval of time or space, given that these events occur with a known * constant mean rate and independently of the time since the last event. - * + * * PMF: P(X = k) = (λ^k * e^(-λ)) / k! for k = 0, 1, 2, ... * where λ (lambda) is the rate parameter (mean number of events per interval) */ @@ -56,7 +56,7 @@ class PoissonDistribution : public DistributionBase { /** * Computes log(k!) using Stirling's approximation for large k, * exact computation for small k. - * + * * @param k Non-negative integer * @return log(k!) 
*/ @@ -69,7 +69,7 @@ class PoissonDistribution : public DistributionBase { /** * Validates that k is a valid count (non-negative integer) - * + * * @param k Value to validate * @return true if k is a valid count, false otherwise */ @@ -82,7 +82,7 @@ class PoissonDistribution : public DistributionBase { public: /** * Constructs a Poisson distribution with given rate parameter. - * + * * @param lambda Rate parameter (must be positive) * @throws std::invalid_argument if lambda <= 0 or is not finite */ @@ -138,7 +138,7 @@ class PoissonDistribution : public DistributionBase { /** * Computes the probability mass function P(X = k) for the Poisson distribution. - * + * * @param value The count value k (must be non-negative integer) * @return Probability P(X = k), or 0.0 if value is invalid */ @@ -160,21 +160,21 @@ class PoissonDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the rate parameter λ. - * + * * @return Current lambda value */ double getLambda() const noexcept { return lambda_; } /** * Sets the rate parameter λ. - * + * * @param lambda New rate parameter (must be positive) * @throws std::invalid_argument if lambda <= 0 or is not finite */ @@ -186,14 +186,14 @@ class PoissonDistribution : public DistributionBase { /** * Gets the mean of the distribution (equal to λ). - * + * * @return Mean value */ double getMean() const noexcept { return lambda_; } /** * Gets the variance of the distribution (equal to λ). - * + * * @return Variance value */ double getVariance() const noexcept { return lambda_; } @@ -201,7 +201,7 @@ class PoissonDistribution : public DistributionBase { /** * Gets the standard deviation of the distribution (sqrt(λ)). * Uses cached value for efficiency. 
- * + * * @return Standard deviation */ double getStandardDeviation() const noexcept { @@ -213,7 +213,7 @@ class PoissonDistribution : public DistributionBase { /** * Evaluates the logarithm of the probability mass function * More numerically stable for small probabilities - * + * * @param value The count value k at which to evaluate the log PMF * @return Log probability mass */ @@ -227,7 +227,7 @@ class PoissonDistribution : public DistributionBase { /** * Evaluates the CDF at k using cumulative sum approach * Formula: CDF(k) = ∑(i=0 to k) P(X = i) - * + * * @param k The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ k) */ diff --git a/include/libhmm/distributions/rayleigh_distribution.h b/include/libhmm/distributions/rayleigh_distribution.h index 54e8cd4..8d09308 100644 --- a/include/libhmm/distributions/rayleigh_distribution.h +++ b/include/libhmm/distributions/rayleigh_distribution.h @@ -8,24 +8,24 @@ namespace libhmm { /** * Modern C++20 Rayleigh distribution for modeling magnitudes and speeds. - * + * * The Rayleigh distribution is a continuous probability distribution that arises * when modeling the magnitude of a 2D random vector whose components are independent, * identically distributed, zero-mean Gaussian random variables. - * + * * This is a special case of the Weibull distribution with shape parameter k = 2, * but implemented as a standalone class for maximum efficiency. 
- * + * * PDF: f(x) = (x/σ²) * exp(-x²/(2σ²)) for x ≥ 0, 0 otherwise * CDF: F(x) = 1 - exp(-x²/(2σ²)) for x ≥ 0, 0 otherwise * where σ is the scale parameter (σ > 0) - * + * * Properties: * - Mean: σ * √(π/2) ≈ 1.253 * σ * - Variance: σ² * (4-π)/2 ≈ 0.429 * σ² * - Mode: σ * - Support: x ∈ [0, ∞) - * + * * Applications: * - Wind speed modeling * - Wave height analysis @@ -74,7 +74,7 @@ class RayleighDistribution : public DistributionBase { mutable double mean_{constants::math::SQRT_PI_OVER_TWO}; /** - * Cached value of σ² * (4-π)/2 for variance calculation + * Cached value of σ² * (4-π)/2 for variance calculation * Variance = σ² * (4-π)/2 ≈ 0.4292036732 * σ² */ mutable double variance_{constants::math::FOUR_MINUS_PI_OVER_TWO}; @@ -106,7 +106,7 @@ class RayleighDistribution : public DistributionBase { public: /** * Constructs a Rayleigh distribution with given scale parameter. - * + * * @param sigma Scale parameter σ (must be positive) * @throws std::invalid_argument if sigma is invalid */ @@ -196,21 +196,21 @@ class RayleighDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Gets the scale parameter σ. - * + * * @return Current scale parameter value */ double getSigma() const noexcept { return sigma_; } /** * Sets the scale parameter σ. - * + * * @param sigma New scale parameter (must be positive) * @throws std::invalid_argument if sigma is invalid */ @@ -223,7 +223,7 @@ class RayleighDistribution : public DistributionBase { /** * Gets the mean of the distribution. * Mean = σ * √(π/2) - * + * * @return Mean value */ double getMean() const noexcept { @@ -239,7 +239,7 @@ class RayleighDistribution : public DistributionBase { /** * Gets the standard deviation of the distribution. 
- * + * * @return Standard deviation (square root of variance) */ double getStandardDeviation() const noexcept { return std::sqrt(getVariance()); } @@ -247,7 +247,7 @@ class RayleighDistribution : public DistributionBase { /** * Gets the mode of the distribution. * Mode = σ - * + * * @return Mode value */ double getMode() const noexcept { return sigma_; } @@ -255,7 +255,7 @@ class RayleighDistribution : public DistributionBase { /** * Gets the median of the distribution. * Median = σ * √(2 * ln(2)) ≈ 1.177 * σ - * + * * @return Median value */ double getMedian() const noexcept { return sigma_ * constants::math::SQRT_TWO_LN_TWO; } diff --git a/include/libhmm/distributions/student_t_distribution.h b/include/libhmm/distributions/student_t_distribution.h index 6c37714..4fed5cf 100644 --- a/include/libhmm/distributions/student_t_distribution.h +++ b/include/libhmm/distributions/student_t_distribution.h @@ -8,18 +8,18 @@ namespace libhmm { /** * @brief Student's t-distribution implementation - * + * * The Student's t-distribution is a probability distribution used in statistics, * particularly for small sample sizes or when the population variance is unknown. * It approaches the normal distribution as degrees of freedom increase. - * + * * Mathematical properties: * - PDF: f(x|ν) = Γ((ν+1)/2) / (√(νπ) * Γ(ν/2)) * (1 + x²/ν)^(-(ν+1)/2) * - Support: x ∈ (-∞, +∞) * - Parameters: ν > 0 (degrees of freedom) * - Mean: 0 (for ν > 1), undefined otherwise * - Variance: ν/(ν-2) (for ν > 2), infinite for 1 < ν ≤ 2, undefined for ν ≤ 1 - * + * * Applications: * - Statistical hypothesis testing (t-tests) * - Confidence intervals for unknown variance @@ -118,7 +118,7 @@ class StudentTDistribution : public DistributionBase { /** * Computes the probability density function for the Student's t-distribution. 
- * + * * @param value The value at which to evaluate the PDF * @return Probability density f(value|ν) */ @@ -212,7 +212,7 @@ class StudentTDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; diff --git a/include/libhmm/distributions/uniform_distribution.h b/include/libhmm/distributions/uniform_distribution.h index bc22f98..84f8bec 100644 --- a/include/libhmm/distributions/uniform_distribution.h +++ b/include/libhmm/distributions/uniform_distribution.h @@ -8,17 +8,17 @@ namespace libhmm { /** * @brief Uniform Distribution - * + * * The uniform distribution is a continuous probability distribution where all values * within a specified interval [a, b] have equal probability density. - * + * * Probability Density Function: * f(x) = 1/(b-a) for a ≤ x ≤ b, 0 otherwise - * + * * Parameters: * - a: Lower bound (minimum value) * - b: Upper bound (maximum value) - * + * * Properties: * - Mean: μ = (a + b) / 2 * - Variance: σ² = (b - a)² / 12 diff --git a/include/libhmm/distributions/weibull_distribution.h b/include/libhmm/distributions/weibull_distribution.h index 6600495..3de493b 100644 --- a/include/libhmm/distributions/weibull_distribution.h +++ b/include/libhmm/distributions/weibull_distribution.h @@ -8,21 +8,21 @@ namespace libhmm { /** * Weibull distribution for reliability analysis and survival modeling. - * - * The Weibull distribution is a continuous probability distribution defined + * + * The Weibull distribution is a continuous probability distribution defined * on the interval [0,∞) and parameterized by two positive parameters: * k (shape parameter) and λ (scale parameter). 
- * + * * PDF: f(x; k, λ) = (k/λ) * (x/λ)^(k-1) * exp(-(x/λ)^k) for x ≥ 0 * CDF: F(x; k, λ) = 1 - exp(-(x/λ)^k) for x ≥ 0 - * + * * Special cases: * - k = 1: Exponential distribution with rate λ - * - k = 2: Rayleigh distribution + * - k = 2: Rayleigh distribution * - k < 1: Decreasing failure rate (infant mortality) * - k = 1: Constant failure rate (random failures) * - k > 1: Increasing failure rate (wear-out failures) - * + * * Applications: * - Reliability engineering and failure analysis * - Survival analysis and lifetime modeling @@ -38,7 +38,7 @@ class WeibullDistribution : public DistributionBase { double k_{1.0}; /** - * Scale parameter λ (lambda) - must be positive + * Scale parameter λ (lambda) - must be positive * Controls the scale/spread of the distribution */ double lambda_{1.0}; @@ -97,7 +97,7 @@ class WeibullDistribution : public DistributionBase { public: /** * Constructs a Weibull distribution with given parameters. - * + * * @param k Shape parameter (must be positive) * @param lambda Scale parameter (must be positive) * @throws std::invalid_argument if parameters are not positive finite numbers @@ -161,7 +161,7 @@ class WeibullDistribution : public DistributionBase { /** * Computes the probability density function for the Weibull distribution. - * + * * @param value The value at which to evaluate the PDF (should be ≥ 0) * @return Probability density, or 0.0 if value is negative */ @@ -189,14 +189,14 @@ class WeibullDistribution : public DistributionBase { /** * Returns a string representation of the distribution. - * + * * @return String describing the distribution parameters */ std::string toString() const override; /** * Computes the cumulative distribution function (CDF) for the Weibull distribution. 
- * + * * @param x The value at which to evaluate the CDF (should be ≥ 0) * @return Cumulative probability P(X ≤ x), or 0.0 if x is negative */ @@ -204,7 +204,7 @@ class WeibullDistribution : public DistributionBase { /** * Equality comparison operator with tolerance for floating-point comparison. - * + * * @param other Distribution to compare with * @return true if distributions have the same parameters within tolerance */ @@ -212,14 +212,14 @@ class WeibullDistribution : public DistributionBase { /** * Gets the shape parameter k. - * + * * @return Current k value */ double getK() const noexcept { return k_; } /** * Sets the shape parameter k. - * + * * @param k New shape parameter (must be positive) * @throws std::invalid_argument if k <= 0 or is not finite */ @@ -231,14 +231,14 @@ class WeibullDistribution : public DistributionBase { /** * Gets the scale parameter λ (lambda). - * + * * @return Current lambda value */ double getLambda() const noexcept { return lambda_; } /** * Sets the scale parameter λ (lambda). - * + * * @param lambda New scale parameter (must be positive) * @throws std::invalid_argument if lambda <= 0 or is not finite */ @@ -251,7 +251,7 @@ class WeibullDistribution : public DistributionBase { /** * Gets the mean of the distribution. * For Weibull(k, λ), mean = λ * Γ(1 + 1/k) - * + * * @return Mean value */ double getMean() const noexcept { return lambda_ * std::exp(std::lgamma(1.0 + 1.0 / k_)); } @@ -259,7 +259,7 @@ class WeibullDistribution : public DistributionBase { /** * Gets the variance of the distribution. * For Weibull(k, λ), variance = λ² * [Γ(1 + 2/k) - (Γ(1 + 1/k))²] - * + * * @return Variance value */ double getVariance() const noexcept { @@ -270,7 +270,7 @@ class WeibullDistribution : public DistributionBase { /** * Gets the standard deviation of the distribution. 
- * + * * @return Standard deviation */ double getStandardDeviation() const noexcept { return std::sqrt(getVariance()); } @@ -278,7 +278,7 @@ class WeibullDistribution : public DistributionBase { /** * Gets the scale parameter (alternative name for lambda). * This is sometimes called the "characteristic life" in reliability contexts. - * + * * @return Scale parameter value */ double getScale() const noexcept { return lambda_; } @@ -286,7 +286,7 @@ class WeibullDistribution : public DistributionBase { /** * Gets the shape parameter (alternative name for k). * This is sometimes called the "Weibull modulus" in reliability contexts. - * + * * @return Shape parameter value */ double getShape() const noexcept { return k_; } diff --git a/include/libhmm/io/file_io_manager.h b/include/libhmm/io/file_io_manager.h index 98f3567..fe8da0b 100644 --- a/include/libhmm/io/file_io_manager.h +++ b/include/libhmm/io/file_io_manager.h @@ -27,7 +27,7 @@ class FileIOManager { /** * Reads entire file content as a string. - * + * * @param filepath Path to the file * @return File content as string * @throws std::runtime_error if file cannot be read @@ -36,7 +36,7 @@ class FileIOManager { /** * Writes string content to a file. - * + * * @param filepath Path to the file * @param content Content to write * @param append If true, append to file; if false, overwrite @@ -47,7 +47,7 @@ class FileIOManager { /** * Reads file content as lines. - * + * * @param filepath Path to the file * @return Vector of lines * @throws std::runtime_error if file cannot be read @@ -56,7 +56,7 @@ class FileIOManager { /** * Writes lines to a file. - * + * * @param filepath Path to the file * @param lines Lines to write * @param append If true, append to file; if false, overwrite @@ -67,7 +67,7 @@ class FileIOManager { /** * Safely copies a file with error handling. 
- * + * * @param source Source file path * @param destination Destination file path * @param overwrite If true, overwrite existing file @@ -78,7 +78,7 @@ class FileIOManager { /** * Creates a backup of a file with timestamp. - * + * * @param filepath Path to the file to backup * @return Path to the backup file * @throws std::runtime_error if backup fails @@ -87,7 +87,7 @@ class FileIOManager { /** * Validates file path and permissions. - * + * * @param filepath Path to validate * @param checkRead Check read permissions * @param checkWrite Check write permissions @@ -98,7 +98,7 @@ class FileIOManager { /** * Gets file size safely. - * + * * @param filepath Path to the file * @return File size in bytes, or nullopt if file doesn't exist */ @@ -107,7 +107,7 @@ class FileIOManager { /** * Checks if file has expected extension. - * + * * @param filepath Path to check * @param expectedExtension Expected file extension (with or without dot) * @return true if file has the expected extension @@ -117,7 +117,7 @@ class FileIOManager { /** * Creates directory structure if it doesn't exist. - * + * * @param dirpath Directory path to create * @throws std::runtime_error if directory creation fails */ @@ -125,7 +125,7 @@ class FileIOManager { /** * Gets file modification time. - * + * * @param filepath Path to the file * @return File modification time, or nullopt if file doesn't exist */ diff --git a/include/libhmm/io/xml_file_reader.h b/include/libhmm/io/xml_file_reader.h index b508fd4..10a490f 100644 --- a/include/libhmm/io/xml_file_reader.h +++ b/include/libhmm/io/xml_file_reader.h @@ -30,7 +30,7 @@ class XMLFileReader { /** * Reads an HMM from an XML file with comprehensive error handling. - * + * * @param filename Path to the input XML file * @return Loaded HMM object * @throws std::invalid_argument if filename is empty @@ -40,7 +40,7 @@ class XMLFileReader { /** * Reads an HMM from an XML file with filesystem path. 
- * + * * @param filepath Path to the input XML file * @return Loaded HMM object * @throws std::invalid_argument if filepath is empty @@ -50,7 +50,7 @@ class XMLFileReader { /** * Validates that a file can be read from the given path. - * + * * @param filepath Path to validate * @return true if the file can be read, false otherwise */ @@ -58,7 +58,7 @@ class XMLFileReader { /** * Checks if a file exists and appears to be a valid XML file. - * + * * @param filepath Path to check * @return true if file exists and has XML content, false otherwise */ @@ -67,7 +67,7 @@ class XMLFileReader { private: /** * Internal implementation for reading HMM from stream. - * + * * @param stream Input stream * @return Loaded HMM object * @throws std::runtime_error if deserialization fails diff --git a/include/libhmm/io/xml_file_writer.h b/include/libhmm/io/xml_file_writer.h index 6b7f610..3efb058 100755 --- a/include/libhmm/io/xml_file_writer.h +++ b/include/libhmm/io/xml_file_writer.h @@ -30,7 +30,7 @@ class XMLFileWriter { /** * Writes an HMM to an XML file with comprehensive error handling. - * + * * @param hmm The HMM to serialize * @param filename Path to the output XML file * @throws std::invalid_argument if filename is empty @@ -40,7 +40,7 @@ class XMLFileWriter { /** * Writes an HMM to an XML file with filesystem path. - * + * * @param hmm The HMM to serialize * @param filepath Path to the output XML file * @throws std::invalid_argument if filepath is empty @@ -50,7 +50,7 @@ class XMLFileWriter { /** * Validates that a file can be written to the given path. - * + * * @param filepath Path to validate * @return true if the file can be written, false otherwise */ @@ -59,7 +59,7 @@ class XMLFileWriter { private: /** * Internal implementation for writing HMM to stream. 
- * + * * @param hmm The HMM to serialize * @param stream Output stream * @throws std::runtime_error if serialization fails diff --git a/include/libhmm/linalg/basic_matrix.h b/include/libhmm/linalg/basic_matrix.h index 1ccacec..b675c2a 100644 --- a/include/libhmm/linalg/basic_matrix.h +++ b/include/libhmm/linalg/basic_matrix.h @@ -14,10 +14,10 @@ namespace libhmm { /** * Lightweight Matrix class designed to replace boost::numeric::ublas::matrix * with better performance and SIMD-friendly memory layout. - * + * * Features: * - Contiguous memory storage for optimal cache performance - * - Row-major ordering for better CPU cache utilization + * - Row-major ordering for better CPU cache utilization * - SIMD-aligned memory allocation * - Compatible API with existing uBLAS usage patterns * - Zero external dependencies (pure C++17) diff --git a/include/libhmm/linalg/basic_vector.h b/include/libhmm/linalg/basic_vector.h index 5cf6e6e..7bf96d4 100644 --- a/include/libhmm/linalg/basic_vector.h +++ b/include/libhmm/linalg/basic_vector.h @@ -15,7 +15,7 @@ namespace libhmm { /** * Lightweight Vector class designed to replace boost::numeric::ublas::vector * with better performance and SIMD-friendly operations. - * + * * Features: * - Based on std::vector for optimal standard library integration * - SIMD-friendly contiguous memory layout diff --git a/include/libhmm/performance/fb_recurrence_policy.h b/include/libhmm/performance/fb_recurrence_policy.h new file mode 100644 index 0000000..54dae96 --- /dev/null +++ b/include/libhmm/performance/fb_recurrence_policy.h @@ -0,0 +1,63 @@ +#pragma once + +/** + * @file fb_recurrence_policy.h + * @brief Minimal ISA-aware policy for Forward-Backward recurrence selection. 
+ *
+ * The two recurrence kernels are semantically equivalent in log-space:
+ * - Pairwise: repeated two-argument log-sum-exp
+ * - MaxReduce: max-then-reduce
+ *
+ * The only policy decision retained here is an ISA-family cutoff:
+ * - arm64: switch at N>=4
+ * - x86/x64: switch at N>=4
+ *
+ * Threshold calibrated by fb_crossover_sweep on Zen 4 / MSVC / AVX-512
+ * (Ryzen 7 7745HX, T=1000, median 8 runs):
+ * N=2: MaxReduce 2.1x slower (Pairwise wins)
+ * N=3: MaxReduce 1.1x slower (Pairwise wins)
+ * N=4: MaxReduce 1.7x faster -- crossover
+ * N=8: MaxReduce 5.0x faster
+ * N=32: MaxReduce 15x faster
+ * Previous x86 threshold was N>=5; N=4 was incorrectly left on the slower
+ * Pairwise path before the TranscendentalKernels SIMD backends landed.
+ */
+
+#include <cstddef>
+
+namespace libhmm {
+
+/// Selectable recurrence kernel for Forward-Backward.
+enum class FbRecurrenceMode {
+    Pairwise,
+    MaxReduce,
+};
+
+/**
+ * @brief Static recurrence-mode selection from ISA-family evidence.
+ *
+ * @param numStates Number of HMM states (`N`).
+ * @param sequenceLength Observation length (`T`). Currently unused except for
+ *        signature stability; reserved for future T-aware bins.
+ */
+constexpr FbRecurrenceMode selectFbRecurrenceMode(std::size_t numStates,
+                                                  std::size_t sequenceLength) noexcept {
+    (void)sequenceLength;
+    if (numStates < 2) {
+        return FbRecurrenceMode::Pairwise;
+    }
+    return (numStates >= 4) ? FbRecurrenceMode::MaxReduce : FbRecurrenceMode::Pairwise;
+}
+
+/// Human-readable name for a recurrence mode.
+constexpr const char *toString(FbRecurrenceMode mode) noexcept {
+    switch (mode) {
+    case FbRecurrenceMode::Pairwise:
+        return "pairwise";
+    case FbRecurrenceMode::MaxReduce:
+        return "max-reduce";
+    }
+    return "unknown";
+}
+
+} // namespace libhmm
diff --git a/include/libhmm/performance/simd_kernels_internal.h b/include/libhmm/performance/simd_kernels_internal.h
new file mode 100644
index 0000000..da840c8
--- /dev/null
+++ b/include/libhmm/performance/simd_kernels_internal.h
@@ -0,0 +1,407 @@
+#pragma once
+// include/libhmm/performance/simd_kernels_internal.h
+//
+// Internal header — NOT part of the public API.
+//
+// Single source of truth for vector exp/log helpers shared between
+// transcendental_kernels.cpp and Tier-2 distribution TUs
+// (log_normal_distribution.cpp, pareto_distribution.cpp).
+//
+// Include only from .cpp files compiled with LIBHMM_BEST_SIMD_FLAGS.
+
+#include "libhmm/platform/simd_platform.h"
+#include "libhmm/math/constants.h"
+
+#include <cstdint>
+#include <limits>
+
+namespace libhmm {
+namespace performance {
+namespace detail {
+namespace kernels {
+
+// ---------------------------------------------------------------------------
+// Shared constants
+// ---------------------------------------------------------------------------
+static constexpr double K_LN2_HI = 6.93147180369123816490e-1;
+static constexpr double K_LN2_LO = 1.90821492927058770002e-10;
+static constexpr double K_LOG2E = 1.44269504088896338700;
+static constexpr double K_SQRT2 = 1.41421356237309504880168872420969807;
+static constexpr double K_EXP_UNDERFLOW = constants::probability::MIN_LOG_PROBABILITY; // -700.0
+static constexpr double K_EXPONENT_BIAS = 1023.0;
+
+// log polynomial: 2y*(c0 + c1*y^2 + ... + c6*y^12), c_k = 1/(2k+1)
+static constexpr double K_LOG_C0 = 1.0;
+static constexpr double K_LOG_C1 = 3.3333333333333333e-1;
+static constexpr double K_LOG_C2 = 2.0000000000000000e-1;
+static constexpr double K_LOG_C3 = 1.4285714285714285e-1;
+static constexpr double K_LOG_C4 = 1.1111111111111111e-1;
+static constexpr double K_LOG_C5 = 9.0909090909090909e-2;
+static constexpr double K_LOG_C6 = 7.6923076923076923e-2;
+
+// exp polynomial: sum(r^k/k!), k=0..12
+static constexpr double K_EXP_C0 = 1.0;
+static constexpr double K_EXP_C1 = 1.0;
+static constexpr double K_EXP_C2 = 0.5;
+static constexpr double K_EXP_C3 = 1.6666666666666666e-1;
+static constexpr double K_EXP_C4 = 4.1666666666666664e-2;
+static constexpr double K_EXP_C5 = 8.3333333333333332e-3;
+static constexpr double K_EXP_C6 = 1.3888888888888889e-3;
+static constexpr double K_EXP_C7 = 1.9841269841269841e-4;
+static constexpr double K_EXP_C8 = 2.4801587301587302e-5;
+static constexpr double K_EXP_C9 = 2.7557319223985888e-6;
+static constexpr double K_EXP_C10 = 2.7557319223985888e-7;
+static constexpr double K_EXP_C11 = 2.5052108385441720e-8;
+static constexpr double K_EXP_C12 = 2.0876756987868099e-9;
+
+// ---------------------------------------------------------------------------
+// AVX-512 helpers
+// ---------------------------------------------------------------------------
+#if defined(LIBHMM_HAS_AVX512)
+
+[[nodiscard]] static inline __m512d k_log_pd_avx512(__m512d x) noexcept {
+    const __m512d neg_inf_v = _mm512_set1_pd(-std::numeric_limits<double>::infinity());
+    const __m512d sqrt2_v = _mm512_set1_pd(K_SQRT2);
+    const __m512d one_v = _mm512_set1_pd(1.0);
+    const __m512d half_v = _mm512_set1_pd(0.5);
+    const __m512d two_v = _mm512_set1_pd(2.0);
+    const __m512d ln2hi_v = _mm512_set1_pd(K_LN2_HI);
+    const __m512d ln2lo_v = _mm512_set1_pd(K_LN2_LO);
+
+    const __mmask8 invalid = _mm512_cmp_pd_mask(x, _mm512_setzero_pd(), _CMP_LE_OS);
+
+    __m512i bits = _mm512_castpd_si512(x);
+    __m512i e_biased = _mm512_srli_epi64(bits, 52);
+    const __m512i mant_mask = _mm512_set1_epi64(0x000FFFFFFFFFFFFFLL);
+    const __m512i exp_one = _mm512_set1_epi64(0x3FF0000000000000LL);
+    __m512i mbits = _mm512_or_si512(_mm512_and_si512(bits, mant_mask), exp_one);
+    __m512d m = _mm512_castsi512_pd(mbits);
+
+    // Convert int64 exponent to double via scalar (no AVX-512 DQ needed).
+    __m512i e_ub = _mm512_sub_epi64(e_biased, _mm512_set1_epi64(1023LL));
+    alignas(64) long long e_arr[8];
+    _mm512_storeu_si512(reinterpret_cast<__m512i *>(e_arr), e_ub);
+    __m512d e = _mm512_set_pd(static_cast<double>(e_arr[7]), static_cast<double>(e_arr[6]),
+                              static_cast<double>(e_arr[5]), static_cast<double>(e_arr[4]),
+                              static_cast<double>(e_arr[3]), static_cast<double>(e_arr[2]),
+                              static_cast<double>(e_arr[1]), static_cast<double>(e_arr[0]));
+
+    __mmask8 adj = _mm512_cmp_pd_mask(m, sqrt2_v, _CMP_GT_OS);
+    e = _mm512_mask_add_pd(e, adj, e, one_v);
+    m = _mm512_mask_mul_pd(m, adj, m, half_v);
+
+    __m512d y = _mm512_div_pd(_mm512_sub_pd(m, one_v), _mm512_add_pd(m, one_v));
+    __m512d y2 = _mm512_mul_pd(y, y);
+
+    __m512d p = _mm512_set1_pd(K_LOG_C6);
+    p = _mm512_fmadd_pd(p, y2, _mm512_set1_pd(K_LOG_C5));
+    p = _mm512_fmadd_pd(p, y2, _mm512_set1_pd(K_LOG_C4));
+    p = _mm512_fmadd_pd(p, y2, _mm512_set1_pd(K_LOG_C3));
+    p = _mm512_fmadd_pd(p, y2, _mm512_set1_pd(K_LOG_C2));
+    p = _mm512_fmadd_pd(p, y2, _mm512_set1_pd(K_LOG_C1));
+    p = _mm512_fmadd_pd(p, y2, _mm512_set1_pd(K_LOG_C0));
+    __m512d log_m = _mm512_mul_pd(_mm512_mul_pd(two_v, y), p);
+
+    __m512d result = _mm512_fmadd_pd(e, ln2hi_v, _mm512_fmadd_pd(e, ln2lo_v, log_m));
+    result = _mm512_mask_blend_pd(invalid, result, neg_inf_v);
+    return result;
+}
+
+[[nodiscard]] static inline __m512d k_exp_pd_avx512(__m512d x) noexcept {
+    const __m512d uflow_v = _mm512_set1_pd(K_EXP_UNDERFLOW);
+    const __m512d log2e_v = _mm512_set1_pd(K_LOG2E);
+    const __m512d half_v = _mm512_set1_pd(0.5);
+    const __m512d ln2hi_v = _mm512_set1_pd(K_LN2_HI);
+    const __m512d ln2lo_v = _mm512_set1_pd(K_LN2_LO);
+    const __m512d zero_v = _mm512_setzero_pd();
+    const __mmask8 uflow = _mm512_cmp_pd_mask(x, uflow_v, _CMP_LE_OS);
+    x = _mm512_max_pd(x, uflow_v);
+    __m512d n = _mm512_floor_pd(_mm512_fmadd_pd(x, log2e_v, half_v));
+    __m512d r = _mm512_fnmadd_pd(n, ln2hi_v, x);
+    r = _mm512_fnmadd_pd(n, ln2lo_v, r);
+    __m512d p = _mm512_set1_pd(K_EXP_C12);
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C11));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C10));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C9));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C8));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C7));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C6));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C5));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C4));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C3));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C2));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C1));
+    p = _mm512_fmadd_pd(p, r, _mm512_set1_pd(K_EXP_C0));
+    __m256i ni = _mm512_cvtpd_epi32(n);
+    __m512i ni64 = _mm512_cvtepi32_epi64(ni);
+    ni64 = _mm512_add_epi64(ni64, _mm512_set1_epi64(static_cast<long long>(K_EXPONENT_BIAS)));
+    ni64 = _mm512_slli_epi64(ni64, 52);
+    __m512d result = _mm512_mul_pd(p, _mm512_castsi512_pd(ni64));
+    result = _mm512_mask_blend_pd(uflow, result, zero_v);
+    return result;
+}
+
+#endif // LIBHMM_HAS_AVX512
+
+// ---------------------------------------------------------------------------
+// AVX helpers (AVX-1 compatible)
+// ---------------------------------------------------------------------------
+#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2)
+
+[[nodiscard]] static inline __m256d k_log_pd_avx(__m256d x) noexcept {
+    const double neg_inf = -std::numeric_limits<double>::infinity();
+    const __m256d neg_inf_v = _mm256_set1_pd(neg_inf);
+    const __m256d sqrt2_v = _mm256_set1_pd(K_SQRT2);
+    const __m256d one_v = _mm256_set1_pd(1.0);
+    const __m256d half_v = _mm256_set1_pd(0.5);
+    const __m256d two_v = _mm256_set1_pd(2.0);
+    const __m256d ln2hi_v = _mm256_set1_pd(K_LN2_HI);
+    const __m256d ln2lo_v = _mm256_set1_pd(K_LN2_LO);
+    const __m256d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LE_OS);
+
+    auto extract_em = [](__m128d xh, __m128d &mh, __m128d &eh) {
+        __m128i bits = _mm_castpd_si128(xh);
+        __m128i eb = _mm_srli_epi64(bits, 52);
+        __m128i mm = _mm_set1_epi64x(0x000FFFFFFFFFFFFFLL);
+        __m128i eo = _mm_set1_epi64x(0x3FF0000000000000LL);
+        mh = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(bits, mm), eo));
+        __m128i eu = _mm_sub_epi64(eb, _mm_set1_epi64x(1023LL));
+        long long e0, e1;
+        _mm_storel_epi64(reinterpret_cast<__m128i *>(&e0), eu);
+        _mm_storel_epi64(reinterpret_cast<__m128i *>(&e1), _mm_unpackhi_epi64(eu, eu));
+        eh = _mm_set_pd(static_cast<double>(e1), static_cast<double>(e0));
+    };
+
+    __m128d m_lo, e_lo, m_hi, e_hi;
+    extract_em(_mm256_castpd256_pd128(x), m_lo, e_lo);
+    extract_em(_mm256_extractf128_pd(x, 1), m_hi, e_hi);
+    __m256d m = _mm256_set_m128d(m_hi, m_lo);
+    __m256d e = _mm256_set_m128d(e_hi, e_lo);
+
+    __m256d adj = _mm256_cmp_pd(m, sqrt2_v, _CMP_GT_OS);
+    e = _mm256_add_pd(e, _mm256_and_pd(adj, one_v));
+    m = _mm256_blendv_pd(m, _mm256_mul_pd(m, half_v), adj);
+
+    __m256d y = _mm256_div_pd(_mm256_sub_pd(m, one_v), _mm256_add_pd(m, one_v));
+    __m256d y2 = _mm256_mul_pd(y, y);
+
+#define K_FMA256(a_, b_, c_) _mm256_add_pd(_mm256_mul_pd((a_), (b_)), (c_))
+    __m256d p = _mm256_set1_pd(K_LOG_C6);
+    p = K_FMA256(p, y2, _mm256_set1_pd(K_LOG_C5));
+    p = K_FMA256(p, y2, _mm256_set1_pd(K_LOG_C4));
+    p = K_FMA256(p, y2, _mm256_set1_pd(K_LOG_C3));
+    p = K_FMA256(p, y2, _mm256_set1_pd(K_LOG_C2));
+    p = K_FMA256(p, y2, _mm256_set1_pd(K_LOG_C1));
+    p = K_FMA256(p, y2, _mm256_set1_pd(K_LOG_C0));
+    __m256d log_m = _mm256_mul_pd(_mm256_mul_pd(two_v, y), p);
+    __m256d result =
+        _mm256_add_pd(_mm256_mul_pd(e, ln2hi_v), _mm256_add_pd(_mm256_mul_pd(e, ln2lo_v), log_m));
+#undef K_FMA256
+    result = _mm256_blendv_pd(result, neg_inf_v, invalid_mask);
+    return result;
+}
+
+[[nodiscard]] static inline __m256d k_exp_pd_avx(__m256d x) noexcept {
+    const __m256d uflow_v = _mm256_set1_pd(K_EXP_UNDERFLOW);
+    const __m256d log2e_v = _mm256_set1_pd(K_LOG2E);
+    const __m256d half_v = _mm256_set1_pd(0.5);
+    const __m256d ln2hi_v = _mm256_set1_pd(K_LN2_HI);
+    const __m256d ln2lo_v = _mm256_set1_pd(K_LN2_LO);
+    const __m256d zero_v = _mm256_setzero_pd();
+    const __m256d ufl_mask = _mm256_cmp_pd(x, uflow_v, _CMP_LE_OS);
+    x = _mm256_max_pd(x, uflow_v);
+    __m256d n = _mm256_floor_pd(_mm256_add_pd(_mm256_mul_pd(x, log2e_v), half_v));
+    __m256d r = _mm256_sub_pd(x, _mm256_mul_pd(n, ln2hi_v));
+    r = _mm256_sub_pd(r, _mm256_mul_pd(n, ln2lo_v));
+
+#define K_MA256(a_, b_, c_) _mm256_add_pd(_mm256_mul_pd((a_), (b_)), (c_))
+    __m256d p = _mm256_set1_pd(K_EXP_C12);
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C11));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C10));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C9));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C8));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C7));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C6));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C5));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C4));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C3));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C2));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C1));
+    p = K_MA256(p, r, _mm256_set1_pd(K_EXP_C0));
+#undef K_MA256
+
+    __m128d n_lo = _mm256_castpd256_pd128(n), n_hi = _mm256_extractf128_pd(n, 1);
+    auto bp2 = [](__m128d nd) {
+        __m128i ni32 =
+            _mm_add_epi32(_mm_cvttpd_epi32(nd), _mm_set1_epi32(static_cast<int>(K_EXPONENT_BIAS)));
+        __m128i i64 = _mm_slli_epi64(_mm_unpacklo_epi32(ni32, _mm_setzero_si128()), 52);
+        return _mm_castsi128_pd(i64);
+    };
+    __m256d result = _mm256_mul_pd(p, _mm256_set_m128d(bp2(n_hi), bp2(n_lo)));
+    result = _mm256_blendv_pd(result, zero_v, ufl_mask);
+    return result;
+}
+
+#endif // LIBHMM_HAS_AVX || LIBHMM_HAS_AVX2
+
+// ---------------------------------------------------------------------------
+// SSE2 helpers
+// ---------------------------------------------------------------------------
+#if defined(LIBHMM_HAS_SSE2)
+
+[[nodiscard]] static inline __m128d k_log_pd_sse2(__m128d x) noexcept {
+    const double neg_inf = -std::numeric_limits<double>::infinity();
+    const __m128d neg_inf_v = _mm_set1_pd(neg_inf);
+    const __m128d sqrt2_v = _mm_set1_pd(K_SQRT2);
+    const __m128d one_v = _mm_set1_pd(1.0);
+    const __m128d half_v = _mm_set1_pd(0.5);
+    const __m128d two_v = _mm_set1_pd(2.0);
+    const __m128d ln2hi_v = _mm_set1_pd(K_LN2_HI);
+    const __m128d ln2lo_v = _mm_set1_pd(K_LN2_LO);
+    const __m128d invalid = _mm_cmple_pd(x, _mm_setzero_pd());
+    __m128i bits = _mm_castpd_si128(x);
+    __m128i eb = _mm_srli_epi64(bits, 52);
+    __m128i mbits = _mm_or_si128(_mm_and_si128(bits, _mm_set1_epi64x(0x000FFFFFFFFFFFFFLL)),
+                                 _mm_set1_epi64x(0x3FF0000000000000LL));
+    __m128d m = _mm_castsi128_pd(mbits);
+    __m128i eu = _mm_sub_epi64(eb, _mm_set1_epi64x(1023LL));
+    long long e0, e1;
+    _mm_storel_epi64(reinterpret_cast<__m128i *>(&e0), eu);
+    _mm_storel_epi64(reinterpret_cast<__m128i *>(&e1), _mm_unpackhi_epi64(eu, eu));
+    __m128d e = _mm_set_pd(static_cast<double>(e1), static_cast<double>(e0));
+    __m128d adj = _mm_cmpgt_pd(m, sqrt2_v);
+    e = _mm_add_pd(e, _mm_and_pd(adj, one_v));
+    m = _mm_or_pd(_mm_andnot_pd(adj, m), _mm_and_pd(adj, _mm_mul_pd(m, half_v)));
+    __m128d y = _mm_div_pd(_mm_sub_pd(m, one_v), _mm_add_pd(m, one_v));
+    __m128d y2 = _mm_mul_pd(y, y);
+#define K_FMA128(a_, b_, c_) _mm_add_pd(_mm_mul_pd((a_), (b_)), (c_))
+    __m128d p = _mm_set1_pd(K_LOG_C6);
+    p = K_FMA128(p, y2, _mm_set1_pd(K_LOG_C5));
+    p = K_FMA128(p, y2, _mm_set1_pd(K_LOG_C4));
+    p = K_FMA128(p, y2, _mm_set1_pd(K_LOG_C3));
+    p = K_FMA128(p, y2, _mm_set1_pd(K_LOG_C2));
+    p = K_FMA128(p, y2, _mm_set1_pd(K_LOG_C1));
+    p = K_FMA128(p, y2, _mm_set1_pd(K_LOG_C0));
+    __m128d log_m = _mm_mul_pd(_mm_mul_pd(two_v, y), p);
+    __m128d result = _mm_add_pd(_mm_mul_pd(e, ln2hi_v), _mm_add_pd(_mm_mul_pd(e, ln2lo_v), log_m));
+#undef K_FMA128
+    result = _mm_or_pd(_mm_andnot_pd(invalid, result), _mm_and_pd(invalid, neg_inf_v));
+    return result;
+}
+
+[[nodiscard]] static inline __m128d k_exp_pd_sse2(__m128d x) noexcept {
+    const __m128d uflow_v = _mm_set1_pd(K_EXP_UNDERFLOW);
+    const __m128d log2e_v = _mm_set1_pd(K_LOG2E);
+    const __m128d half_v = _mm_set1_pd(0.5);
+    const __m128d ln2hi_v = _mm_set1_pd(K_LN2_HI);
+    const __m128d ln2lo_v = _mm_set1_pd(K_LN2_LO);
+    const __m128d zero_v = _mm_setzero_pd();
+    const __m128d ufl = _mm_cmple_pd(x, uflow_v);
+    x = _mm_max_pd(x, uflow_v);
+    __m128d t = _mm_add_pd(_mm_mul_pd(x, log2e_v), half_v);
+    __m128i ni = _mm_cvttpd_epi32(t);
+    __m128d n = _mm_cvtepi32_pd(ni);
+    n = _mm_sub_pd(n, _mm_and_pd(_mm_cmpgt_pd(n, t), _mm_set1_pd(1.0)));
+    __m128d r = _mm_sub_pd(x, _mm_mul_pd(n, ln2hi_v));
+    r = _mm_sub_pd(r, _mm_mul_pd(n, ln2lo_v));
+#define K_MA128(a_, b_, c_) _mm_add_pd(_mm_mul_pd((a_), (b_)), (c_))
+    __m128d p = _mm_set1_pd(K_EXP_C12);
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C11));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C10));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C9));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C8));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C7));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C6));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C5));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C4));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C3));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C2));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C1));
+    p = K_MA128(p, r, _mm_set1_pd(K_EXP_C0));
+#undef K_MA128
+    __m128i ni32b =
+        _mm_add_epi32(_mm_cvttpd_epi32(n), _mm_set1_epi32(static_cast<int>(K_EXPONENT_BIAS)));
+    __m128i i64 = _mm_slli_epi64(_mm_unpacklo_epi32(ni32b, _mm_setzero_si128()), 52);
+    __m128d result = _mm_mul_pd(p, _mm_castsi128_pd(i64));
+    result = _mm_or_pd(_mm_andnot_pd(ufl, result), _mm_and_pd(ufl, zero_v));
+    return result;
+}
+
+#endif // LIBHMM_HAS_SSE2
+
+// ---------------------------------------------------------------------------
+// NEON helpers
+// ---------------------------------------------------------------------------
+#if defined(LIBHMM_HAS_NEON)
+
+[[nodiscard]] static inline float64x2_t k_log_pd_neon(float64x2_t x) noexcept {
+    const float64x2_t neg_inf_v = vdupq_n_f64(-std::numeric_limits<double>::infinity());
+    const float64x2_t sqrt2_v = vdupq_n_f64(K_SQRT2);
+    const float64x2_t one_v = vdupq_n_f64(1.0);
+    const float64x2_t half_v = vdupq_n_f64(0.5);
+    const float64x2_t two_v = vdupq_n_f64(2.0);
+    const float64x2_t ln2hi_v = vdupq_n_f64(K_LN2_HI);
+    const float64x2_t ln2lo_v = vdupq_n_f64(K_LN2_LO);
+    const uint64x2_t invalid = vcleq_f64(x, vdupq_n_f64(0.0));
+    uint64x2_t bits = vreinterpretq_u64_f64(x);
+    uint64x2_t eb = vshrq_n_u64(bits, 52);
+    uint64x2_t mbits = vorrq_u64(vandq_u64(bits, vdupq_n_u64(0x000FFFFFFFFFFFFFULL)),
+                                 vdupq_n_u64(0x3FF0000000000000ULL));
+    float64x2_t m = vreinterpretq_f64_u64(mbits);
+    float64x2_t e = vcvtq_f64_s64(vsubq_s64(vreinterpretq_s64_u64(eb), vdupq_n_s64(1023LL)));
+    uint64x2_t adj = vcgtq_f64(m, sqrt2_v);
+    e = vbslq_f64(adj, vaddq_f64(e, one_v), e);
+    m = vbslq_f64(adj, vmulq_f64(m, half_v), m);
+    float64x2_t y = vdivq_f64(vsubq_f64(m, one_v), vaddq_f64(m, one_v));
+    float64x2_t y2 = vmulq_f64(y, y);
+    float64x2_t p = vdupq_n_f64(K_LOG_C6);
+    p = vfmaq_f64(vdupq_n_f64(K_LOG_C5), p, y2);
+    p = vfmaq_f64(vdupq_n_f64(K_LOG_C4), p, y2);
+    p = vfmaq_f64(vdupq_n_f64(K_LOG_C3), p, y2);
+    p = vfmaq_f64(vdupq_n_f64(K_LOG_C2), p, y2);
+    p = vfmaq_f64(vdupq_n_f64(K_LOG_C1), p, y2);
+    p = vfmaq_f64(vdupq_n_f64(K_LOG_C0), p, y2);
+    float64x2_t log_m = vmulq_f64(vmulq_f64(two_v, y), p);
+    float64x2_t result = vfmaq_f64(vfmaq_f64(log_m, e, ln2lo_v), e, ln2hi_v);
+    result = vbslq_f64(invalid, neg_inf_v, result);
+    return result;
+}
+
+[[nodiscard]] static inline float64x2_t k_exp_pd_neon(float64x2_t x) noexcept {
+    const float64x2_t uflow_v = vdupq_n_f64(K_EXP_UNDERFLOW);
+    const float64x2_t log2e_v = vdupq_n_f64(K_LOG2E);
+    const float64x2_t half_v = vdupq_n_f64(0.5);
+    const float64x2_t ln2hi_v = vdupq_n_f64(K_LN2_HI);
+    const float64x2_t ln2lo_v = vdupq_n_f64(K_LN2_LO);
+    const float64x2_t zero_v = vdupq_n_f64(0.0);
+    const uint64x2_t valid = vcgtq_f64(x, uflow_v);
+    x = vmaxq_f64(x, uflow_v);
+    float64x2_t n = vrndmq_f64(vfmaq_f64(half_v, x, log2e_v));
+    float64x2_t r = vfmsq_f64(x, n, ln2hi_v);
+    r = vfmsq_f64(r, n, ln2lo_v);
+    float64x2_t p = vdupq_n_f64(K_EXP_C12);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C11), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C10), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C9), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C8), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C7), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C6), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C5), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C4), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C3), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C2), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C1), p, r);
+    p = vfmaq_f64(vdupq_n_f64(K_EXP_C0), p, r);
+    int64x2_t ni64 =
+        vaddq_s64(vcvtq_s64_f64(n), vdupq_n_s64(static_cast<int64_t>(K_EXPONENT_BIAS)));
+    float64x2_t result = vmulq_f64(p, vreinterpretq_f64_s64(vshlq_n_s64(ni64, 52)));
+    result = vbslq_f64(valid, result, zero_v);
+    return result;
+}
+
+#endif // LIBHMM_HAS_NEON
+
+} // namespace kernels
+} // namespace detail
+} // namespace performance
+} // namespace libhmm
diff --git a/include/libhmm/performance/transcendental_kernels.h b/include/libhmm/performance/transcendental_kernels.h
new file mode 100644
index 0000000..8cce072
--- /dev/null
+++ b/include/libhmm/performance/transcendental_kernels.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <cstddef>
+
+/**
+ * @file transcendental_kernels.h
+ * @brief SIMD-accelerated inner-loop kernels for FB max-reduce and BW xi accumulation.
+ *
+ * Declares five static methods on TranscendentalKernels.
Implementations live in + * src/performance/transcendental_kernels.cpp and are compiled with + * LIBHMM_BEST_SIMD_FLAGS, activating the appropriate #if LIBHMM_HAS_* cascade: + * AVX-512 8-wide __m512d + * AVX/AVX2 4-wide __m256d (AVX-1 compatible; AVX2 compiler fuses FMA) + * SSE2 2-wide __m128d + * NEON 2-wide float64x2_t + * scalar tail / fallback + * + * Active ISA diagnostics use libhmm::performance::simd::feature_string() and + * double_vector_width() from simd_platform.h — consistent with the rest of the library. + */ + +namespace libhmm { +namespace performance { +namespace detail { + +/** + * @brief Vectorised inner-loop kernels shared by ForwardBackwardCalculator (max-reduce + * recurrence) and BaumWelchTrainer (dense-xi accumulation). + * + * All methods are noexcept and operate on raw double pointers. Inputs are + * expected to be either finite log-probabilities or LOG_ZERO (-inf); +inf and + * NaN are not produced by any production caller and are not guarded. + */ +class TranscendentalKernels { +public: + /// Element-wise max of (a[i]+b[i]) over [0, size). No exp calls. + [[nodiscard]] static double reduce_max_sum2(const double *a, const double *b, + std::size_t size) noexcept; + + /// Sum of exp(a[i]+b[i] - maxVal) for finite terms, over [0, size). + /// Returns 0 when maxVal is not finite. + [[nodiscard]] static double sum_exp_sum2_minus_max(const double *a, const double *b, + std::size_t size, double maxVal) noexcept; + + /// Element-wise max of (a[i]+b[i]+c[i]) over [0, size). No exp calls. + [[nodiscard]] static double reduce_max_sum3(const double *a, const double *b, const double *c, + std::size_t size) noexcept; + + /// Sum of exp(a[i]+b[i]+c[i] - maxVal) for finite terms, over [0, size). + /// Returns 0 when maxVal is not finite. + [[nodiscard]] static double sum_exp_sum3_minus_max(const double *a, const double *b, + const double *c, std::size_t size, + double maxVal) noexcept; + + /// dst[i] += exp(a[i] + b[i] + bias) for i in [0, size). 
+ static void accumulate_exp_sum2_bias(double *dst, const double *a, const double *b, + std::size_t size, double bias) noexcept; +}; + +} // namespace detail +} // namespace performance +} // namespace libhmm diff --git a/include/libhmm/platform/simd_platform.h b/include/libhmm/platform/simd_platform.h index 6bdcb62..6194c1a 100644 --- a/include/libhmm/platform/simd_platform.h +++ b/include/libhmm/platform/simd_platform.h @@ -26,11 +26,10 @@ * - SINGLE RESPONSIBILITY: This header only handles SIMD platform concerns * - EXTENSIBILITY: Easy to add new SIMD instruction sets or platforms * - * FILES THAT INCLUDE THIS HEADER: - * - src/distributions/gaussian_distribution.cpp (tier-2 SIMD intrinsics) - * - src/distributions/exponential_distribution.cpp (tier-2 SIMD intrinsics) - * - tools/simd_inspection.cpp (ISA capability report + smoke tests) - * - include/libhmm/performance/transcendental_kernels.h (perf branch) + * Included by Tier-2 distribution TUs, performance kernel TUs + * (transcendental_kernels.cpp, forward_backward_calculator.cpp, + * baum_welch_trainer.cpp), and diagnostic tools (simd_inspection.cpp). + * Also included transitively via simd_kernels_internal.h. * * Features: * - Cross-platform SIMD intrinsics inclusion diff --git a/performance/PERFORMANCE_ARCHITECTURE.md b/performance/PERFORMANCE_ARCHITECTURE.md index 7caf353..085d597 100644 --- a/performance/PERFORMANCE_ARCHITECTURE.md +++ b/performance/PERFORMANCE_ARCHITECTURE.md @@ -8,12 +8,12 @@ void getBatchLogProbabilities(std::span observations, ``` The canonical calculators (`ForwardBackwardCalculator`, `ViterbiCalculator`) call this once per state per `compute()`, producing T contiguous log-emission values that the recurrences then consume from a flat row-major buffer. Two tiers of implementation: -- **Tier 2 — explicit intrinsics.** `GaussianDistribution` and `ExponentialDistribution` ship hand-written `detail::` free functions with an AVX-512 → AVX/AVX2 → SSE2 → NEON → scalar dispatch chain. 
See `src/distributions/gaussian_distribution.cpp` `detail::gaussian_logpdf_batch` for the canonical shape. The free-function pattern is deliberately extractable to a separate TU for future runtime dispatch without API changes. -- **Tier 1 — auto-vectorization-friendly loops.** The other 13 distributions implement `getBatchLogProbabilities` as concrete non-virtual loops over plain arrays, compiled with `LIBHMM_BEST_SIMD_FLAGS` (the highest CPU-verified ISA on the build machine). Whether the compiler actually emits vector instructions depends on the loop body — transcendentals like `std::exp` are not auto-vectorized by MSVC even with `/arch:AVX2`, so tier 1 is best read as "well-shaped scalar code" rather than "guaranteed SIMD." +- **Tier 2 — explicit intrinsics.** `GaussianDistribution`, `ExponentialDistribution`, `LogNormalDistribution`, and `ParetoDistribution` ship hand-written `detail::` free functions with an AVX-512 → AVX/AVX2 → SSE2 → NEON → scalar dispatch chain. See `src/distributions/gaussian_distribution.cpp` `detail::gaussian_logpdf_batch` for the canonical shape. The free-function pattern is deliberately extractable to a separate TU for future runtime dispatch without API changes. Tier-2 log-probability kernels share vector log/exp helpers from `include/libhmm/performance/simd_kernels_internal.h`. +- **Tier 1 — auto-vectorization-friendly loops.** The other 11 distributions implement `getBatchLogProbabilities` as concrete non-virtual loops over plain arrays, compiled with `LIBHMM_BEST_SIMD_FLAGS` (the highest CPU-verified ISA on the build machine). Whether the compiler actually emits vector instructions depends on the loop body — transcendentals like `std::exp` are not auto-vectorized by MSVC even with `/arch:AVX2`, so tier 1 is best read as "well-shaped scalar code" rather than "guaranteed SIMD." All 15 distribution TUs are listed in `LIBHMM_SIMD_SOURCES` in the top-level `CMakeLists.txt` and receive the SIMD compile flags. 
## Where SIMD does and doesn't live today - ✅ **Distribution batch emission evaluation** — `getBatchLogProbabilities`. Effective for emission-bound workloads (continuous distributions, large T). Tier 2 in particular delivers measurable speedups; tier 1 depends on compiler heuristics. -- ⚠️ **Recurrence kernels** — FB max-reduce, BW xi accumulation, Viterbi inner loop. These are state×state inner loops dominated by `exp` / `log1p` calls. Currently scalar. The active perf-branch work introduces an internal `TranscendentalKernels` abstraction in `include/libhmm/performance/transcendental_kernels.h` with scalar today and AVX2/NEON backends planned, so future explicit vector-math implementations can plug in without rewriting the call sites. +- ✅ **Recurrence kernels** — FB max-reduce and BW xi accumulation. Five kernels in `src/performance/transcendental_kernels.cpp` with an AVX-512 → AVX/AVX2 → SSE2 → NEON → scalar cascade, consumed by `ForwardBackwardCalculator` and `BaumWelchTrainer`. The `TranscendentalKernels` class in `include/libhmm/performance/transcendental_kernels.h` exposes the public interface; call sites in the two consumer TUs are unchanged. Viterbi inner loop remains scalar. - The runtime `Matrix`/`Vector` typedefs in `common/common.h` resolve to `BasicMatrix`/`BasicVector`. The library no longer ships separate "optimized" container variants (see Historical context). ## Threading: not currently used Production calculators and trainers run single-threaded on every workload. Specifically: @@ -29,6 +29,6 @@ The build system picks the highest CPU-verified ISA per machine and applies it a - **GCC/Clang on all platforms**: `-march=native`. Selects NEON on AArch64, the highest available x86 ISA on Intel/AMD. - **MSVC on x86_64**: probes `/arch:AVX512`, `/arch:AVX2`, `/arch:AVX` via `check_cxx_source_runs` and selects the highest one the build machine can actually execute (not just the highest the compiler accepts). 
Falls back to SSE2 baseline in cross-compilation. - **AArch64**: NEON is the mandatory ISA baseline; no flag needed. -See the `# SIMD DETECTION` block in `CMakeLists.txt` for details. The non-distribution sources (`src/common/`, `src/calculators/`, `src/training/`, `src/io/`, `src/performance/`) compile at the platform baseline ISA so that explicit intrinsics in the distribution TUs are the only place SIMD codegen is committed to. +See the `# SIMD DETECTION` block in `CMakeLists.txt` for details. Most non-distribution sources (`src/common/`, `src/io/`) compile at the platform baseline ISA. The exceptions are the three performance-critical TUs that contain explicit intrinsics: `src/performance/transcendental_kernels.cpp`, `src/calculators/forward_backward_calculator.cpp`, and `src/training/baum_welch_trainer.cpp` — these are listed in `LIBHMM_SIMD_SOURCES` alongside the distribution TUs and receive the full `LIBHMM_BEST_SIMD_FLAGS`. ## Historical context An earlier draft of this document described a four-level hierarchy in which calculators consumed `OptimizedMatrix`/`OptimizedVector` containers and a `WorkStealingPool` provided per-state parallelism. That plan was superseded by the v3.0.0-alpha (Phase 4) refactor (see `CHANGELOG.md`), which removed the per-calculator SIMD variants (`ScaledSIMD*`, `LogSIMD*`, `AdvancedLog*`) in favor of the per-distribution batch interface documented above. The Optimized\* containers, `WorkStealingPool`, the per-library `Benchmark` framework, and the parallel-execution constants/utilities they depended on were retained for several releases as "future hooks" but never wired into the canonical calculator/trainer pipeline; they were removed in a subsequent dead-code cleanup. The SIMD investment in `getBatchLogProbabilities` is the canonical and current strategy. 
diff --git a/scripts/configure_catalina.sh b/scripts/configure_catalina.sh index ee8a257..6bb3fc7 100755 --- a/scripts/configure_catalina.sh +++ b/scripts/configure_catalina.sh @@ -42,6 +42,7 @@ env -u CC \ -DCMAKE_C_COMPILER="${CC_BIN}" \ -DCMAKE_CXX_COMPILER="${CXX_BIN}" \ -DCMAKE_OSX_SYSROOT="${SYSROOT}" \ + -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15 \ "$@" diff --git a/scripts/phase_gate.ps1 b/scripts/phase_gate.ps1 new file mode 100644 index 0000000..692aa41 --- /dev/null +++ b/scripts/phase_gate.ps1 @@ -0,0 +1,125 @@ +#Requires -Version 7.0 +<# +.SYNOPSIS + Phase gate: run the required correctness suite before each phase PR. + +.DESCRIPTION + Builds and runs the seven gate tests listed in the plan (Phase D3). + Exits with code 0 on all-pass, 1 on any failure or build error. + +.PARAMETER BuildDir + Path to the CMake binary directory. Defaults to /build. + +.PARAMETER Config + CMake build configuration (Release, Debug, ...). Defaults to Release. + +.PARAMETER Rebuild + If set, rebuild all gate targets before running them. +#> +param( + [string] $BuildDir = "", + [string] $Config = "Release", + [switch] $Rebuild +) + +Set-StrictMode -Version Latest + +$scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path +$repoRoot = Split-Path -Parent $scriptDir + +if (-not $BuildDir) { + $BuildDir = Join-Path $repoRoot "build" +} + +if (-not (Test-Path $BuildDir)) { + Write-Error "Build directory not found: $BuildDir" + Write-Error "Run cmake -S . -B build first." + exit 1 +} + +# Gate tests (plan Phase D, acceptance criteria). +$gateTargets = @( + "test_canonical_calculators", + "test_calculator_continuous", + "test_calculator_edge_cases", + "test_canonical_training", + "test_baum_welch_convergence", + "test_fb_mode_parity", + "test_bw_parity" +) + +# ── Optional rebuild ────────────────────────────────────────────────────────── +if ($Rebuild) { + Write-Host "Building gate targets ($Config)..." 
-ForegroundColor Cyan + $buildArgs = @( + "--build", $BuildDir, + "--config", $Config, + "--target" + ) + $gateTargets + cmake @buildArgs + if ($LASTEXITCODE -ne 0) { + Write-Host "" + Write-Host "PHASE GATE FAILED: build error." -ForegroundColor Red + exit 1 + } +} + +# ── Locate executables ──────────────────────────────────────────────────────── +# Multi-config generators (VS, Xcode) put binaries in /tests//. +# Single-config generators (Makefiles, Ninja) put them in /tests/. +$testDir = Join-Path $BuildDir "tests" +$candidates = @( + (Join-Path $testDir $Config), + $testDir +) + +function Find-Exe { + param([string]$name) + foreach ($dir in $candidates) { + $exePath = Join-Path $dir "$name.exe" + if (Test-Path $exePath) { return $exePath } + $exePath = Join-Path $dir $name + if (Test-Path $exePath) { return $exePath } + } + return $null +} + +# ── Run each gate test ──────────────────────────────────────────────────────── +$results = [ordered]@{} +$anyFail = $false + +Write-Host "" +Write-Host "Phase gate — $Config — $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')" -ForegroundColor Cyan +Write-Host ("-" * 60) + +foreach ($target in $gateTargets) { + $exe = Find-Exe $target + if (-not $exe) { + Write-Host " SKIP $target (executable not found; run with -Rebuild)" -ForegroundColor Yellow + $results[$target] = "SKIP" + $anyFail = $true + continue + } + + & $exe --gtest_color=no 2>&1 | Out-Null + if ($LASTEXITCODE -eq 0) { + Write-Host " PASS $target" -ForegroundColor Green + $results[$target] = "PASS" + } else { + Write-Host " FAIL $target" -ForegroundColor Red + $results[$target] = "FAIL" + $anyFail = $true + # Re-run with output so the failure is visible. 
+ & $exe --gtest_color=no + } +} + +Write-Host ("-" * 60) + +if ($anyFail) { + Write-Host "PHASE GATE FAILED" -ForegroundColor Red + exit 1 +} else { + Write-Host "PHASE GATE PASSED ($($gateTargets.Count)/$($gateTargets.Count))" -ForegroundColor Green + exit 0 +} diff --git a/src/calculators/forward_backward_calculator.cpp b/src/calculators/forward_backward_calculator.cpp index 1097acc..b0ab429 100755 --- a/src/calculators/forward_backward_calculator.cpp +++ b/src/calculators/forward_backward_calculator.cpp @@ -1,15 +1,35 @@ #include "libhmm/calculators/forward_backward_calculator.h" #include "libhmm/hmm.h" -#include +#include "libhmm/performance/transcendental_kernels.h" #include #include -#include #include +#include namespace libhmm { namespace { constexpr double LOG_ZERO = -std::numeric_limits::infinity(); +} // namespace + +FbRecurrenceMode +ForwardBackwardCalculator::resolveRecurrenceMode(const std::size_t numStates, + const std::size_t sequenceLength) const noexcept { +#if defined(LIBHMM_EXPERIMENT_FB_MAX_REDUCE) + // Compile-time forcer: highest priority. Preserves benchmark-build contract. + (void)numStates; + (void)sequenceLength; + return FbRecurrenceMode::MaxReduce; +#elif defined(LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR) + // Legacy adaptive forcer: simple N>2 cutoff. Preserves benchmark-build contract. + (void)sequenceLength; + return (numStates > 2) ? FbRecurrenceMode::MaxReduce : FbRecurrenceMode::Pairwise; +#else + if (modeOverride_.has_value()) { + return *modeOverride_; + } + return selectFbRecurrenceMode(numStates, sequenceLength); +#endif } // --------------------------------------------------------------------------- @@ -46,28 +66,36 @@ void ForwardBackwardCalculator::compute() { return; } - // Allocate/resize result matrices + // Allocate/resize result matrices. 
logAlpha_.resize(T, numStates_); logBeta_.resize(T, numStates_); - // Pre-fill the log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) - // Build observation span once; reuse across all N states. + // Build state-major log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t). + // Then derive shared time-major layout: logEmitByTime_[t * N + i] = log b_i(O_t). logEmitBuf_.resize(T * numStates_); - std::vector obsVec(T); - for (std::size_t t = 0; t < T; ++t) - obsVec[t] = observations_(t); - const std::span obsSpan(obsVec.data(), T); + logEmitByTime_.resize(T * numStates_); + const std::span obsSpan(observations_.data(), T); const Hmm &hmm = getHmmRef(); for (std::size_t i = 0; i < numStates_; ++i) { hmm.getDistribution(i).getBatchLogProbabilities( obsSpan, std::span(logEmitBuf_.data() + i * T, T)); } + for (std::size_t i = 0; i < numStates_; ++i) { + const double *stateRow = logEmitBuf_.data() + i * T; + for (std::size_t t = 0; t < T; ++t) { + logEmitByTime_[t * numStates_ + i] = stateRow[t]; + } + } + + // Resolve recurrence mode per the compile-time forcer / instance override / + // static policy pipeline. + currentMode_ = resolveRecurrenceMode(numStates_, T); computeLogForward(); computeLogBackward(); - // log P(O|λ) = log-sum-exp over states at final timestep + // log P(O|lambda) = log-sum-exp over states at final timestep. double lp = LOG_ZERO; for (std::size_t i = 0; i < numStates_; ++i) { lp = logSumExp(lp, logAlpha_(T - 1, i)); @@ -83,70 +111,192 @@ void ForwardBackwardCalculator::precomputeLogTransitions() { const Hmm &hmm = getHmmRef(); const Matrix &trans = hmm.getTrans(); logTrans_.resize(numStates_, numStates_); + logTransT_.resize(numStates_, numStates_); for (std::size_t i = 0; i < numStates_; ++i) { for (std::size_t j = 0; j < numStates_; ++j) { const double a = trans(i, j); - logTrans_(i, j) = (a > 0.0) ? std::log(a) : LOG_ZERO; + const double logA = (a > 0.0) ? 
std::log(a) : LOG_ZERO; + logTrans_(i, j) = logA; + logTransT_(j, i) = logA; } } } void ForwardBackwardCalculator::computeLogForward() { + if (currentMode_ == FbRecurrenceMode::MaxReduce) { + computeLogForwardMaxReduce(); + return; + } + computeLogForwardPairwise(); +} + +void ForwardBackwardCalculator::computeLogForwardPairwise() { const Hmm &hmm = getHmmRef(); const Vector &pi = hmm.getPi(); const std::size_t T = observations_.size(); + const std::size_t N = numStates_; + const double *logTransTData = logTransT_.data(); + const double *emitByTimeData = logEmitByTime_.data(); + double *alphaData = logAlpha_.data(); - // t = 0: log alpha(0, i) = log pi_i + log b_i(O_0) - for (std::size_t i = 0; i < numStates_; ++i) { + // t = 0. + const double *emitRow0 = emitByTimeData; + for (std::size_t i = 0; i < N; ++i) { const double logPi = (pi(i) > 0.0) ? std::log(pi(i)) : LOG_ZERO; - logAlpha_(0, i) = logPi + logEmitBuf_[i * T + 0]; + alphaData[i] = logPi + emitRow0[i]; } - // t > 0 + // t > 0. 
for (std::size_t t = 1; t < T; ++t) { - for (std::size_t j = 0; j < numStates_; ++j) { + const double *prevAlphaRow = alphaData + (t - 1) * N; + double *alphaRow = alphaData + t * N; + const double *emitRow = emitByTimeData + t * N; + for (std::size_t j = 0; j < N; ++j) { + const double *transCol = logTransTData + j * N; double logSum = LOG_ZERO; - for (std::size_t i = 0; i < numStates_; ++i) { - logSum = logSumExp(logSum, logAlpha_(t - 1, i) + logTrans_(i, j)); + for (std::size_t i = 0; i < N; ++i) { + logSum = logSumExp(logSum, prevAlphaRow[i] + transCol[i]); } - logAlpha_(t, j) = logEmitBuf_[j * T + t] + logSum; + alphaRow[j] = emitRow[j] + logSum; + } + } +} + +void ForwardBackwardCalculator::computeLogForwardMaxReduce() { + const Hmm &hmm = getHmmRef(); + const Vector &pi = hmm.getPi(); + const std::size_t T = observations_.size(); + const std::size_t N = numStates_; + const double *logTransTData = logTransT_.data(); + const double *emitByTimeData = logEmitByTime_.data(); + double *alphaData = logAlpha_.data(); + + // t = 0. + const double *emitRow0 = emitByTimeData; + for (std::size_t i = 0; i < N; ++i) { + const double logPi = (pi(i) > 0.0) ? std::log(pi(i)) : LOG_ZERO; + alphaData[i] = logPi + emitRow0[i]; + } + + // t > 0. 
+ for (std::size_t t = 1; t < T; ++t) { + const double *prevAlphaRow = alphaData + (t - 1) * N; + double *alphaRow = alphaData + t * N; + const double *emitRow = emitByTimeData + t * N; + for (std::size_t j = 0; j < N; ++j) { + const double *transCol = logTransTData + j * N; + const double maxTerm = performance::detail::TranscendentalKernels::reduce_max_sum2( + prevAlphaRow, transCol, N); + + double logSum = LOG_ZERO; + if (std::isfinite(maxTerm)) { + const double scaledSum = + performance::detail::TranscendentalKernels::sum_exp_sum2_minus_max( + prevAlphaRow, transCol, N, maxTerm); + if (scaledSum > 0.0) { + logSum = maxTerm + std::log(scaledSum); + } + } + alphaRow[j] = emitRow[j] + logSum; } } } void ForwardBackwardCalculator::computeLogBackward() { + if (currentMode_ == FbRecurrenceMode::MaxReduce) { + computeLogBackwardMaxReduce(); + return; + } + computeLogBackwardPairwise(); +} + +void ForwardBackwardCalculator::computeLogBackwardPairwise() { const std::size_t T = observations_.size(); + const std::size_t N = numStates_; + const double *logTransData = logTrans_.data(); + const double *emitByTimeData = logEmitByTime_.data(); + double *betaData = logBeta_.data(); - // t = T-1: log beta(T-1, i) = log(1) = 0 - for (std::size_t i = 0; i < numStates_; ++i) { - logBeta_(T - 1, i) = 0.0; + // t = T - 1. + double *finalBetaRow = betaData + (T - 1) * N; + for (std::size_t i = 0; i < N; ++i) { + finalBetaRow[i] = 0.0; } - // t < T-1, working backwards + // t < T - 1. 
if (T > 1) { for (std::size_t t = T - 2;; --t) { - for (std::size_t i = 0; i < numStates_; ++i) { + double *betaRow = betaData + t * N; + const double *nextBetaRow = betaData + (t + 1) * N; + const double *emitNextRow = emitByTimeData + (t + 1) * N; + for (std::size_t i = 0; i < N; ++i) { + const double *transRow = logTransData + i * N; double logSum = LOG_ZERO; - for (std::size_t j = 0; j < numStates_; ++j) { - logSum = logSumExp(logSum, logTrans_(i, j) + logEmitBuf_[j * T + (t + 1)] + - logBeta_(t + 1, j)); + for (std::size_t j = 0; j < N; ++j) { + logSum = logSumExp(logSum, transRow[j] + emitNextRow[j] + nextBetaRow[j]); } - logBeta_(t, i) = logSum; + betaRow[i] = logSum; } - if (t == 0) + if (t == 0) { break; + } } } } -// Numerically stable log(exp(a) + exp(b)) +void ForwardBackwardCalculator::computeLogBackwardMaxReduce() { + const std::size_t T = observations_.size(); + const std::size_t N = numStates_; + const double *logTransData = logTrans_.data(); + const double *emitByTimeData = logEmitByTime_.data(); + double *betaData = logBeta_.data(); + + // t = T - 1. + double *finalBetaRow = betaData + (T - 1) * N; + for (std::size_t i = 0; i < N; ++i) { + finalBetaRow[i] = 0.0; + } + + // t < T - 1. 
+ if (T > 1) { + for (std::size_t t = T - 2;; --t) { + double *betaRow = betaData + t * N; + const double *nextBetaRow = betaData + (t + 1) * N; + const double *emitNextRow = emitByTimeData + (t + 1) * N; + for (std::size_t i = 0; i < N; ++i) { + const double *transRow = logTransData + i * N; + const double maxTerm = performance::detail::TranscendentalKernels::reduce_max_sum3( + transRow, emitNextRow, nextBetaRow, N); + + double logSum = LOG_ZERO; + if (std::isfinite(maxTerm)) { + const double scaledSum = + performance::detail::TranscendentalKernels::sum_exp_sum3_minus_max( + transRow, emitNextRow, nextBetaRow, N, maxTerm); + if (scaledSum > 0.0) { + logSum = maxTerm + std::log(scaledSum); + } + } + betaRow[i] = logSum; + } + if (t == 0) { + break; + } + } + } +} + +// Numerically stable log(exp(a) + exp(b)). double ForwardBackwardCalculator::logSumExp(double a, double b) noexcept { - if (a == LOG_ZERO) + if (a == LOG_ZERO) { return b; - if (b == LOG_ZERO) + } + if (b == LOG_ZERO) { return a; - if (a > b) + } + if (a > b) { return a + std::log1p(std::exp(b - a)); + } return b + std::log1p(std::exp(a - b)); } diff --git a/src/calculators/viterbi_calculator.cpp b/src/calculators/viterbi_calculator.cpp index 3ade510..ae5c18e 100755 --- a/src/calculators/viterbi_calculator.cpp +++ b/src/calculators/viterbi_calculator.cpp @@ -44,15 +44,19 @@ StateSequence ViterbiCalculator::decode() { // Fill log-emission buffer: logEmitBuf_[i * T + t] = log b_i(O_t) logEmitBuf_.resize(T * numStates_); const Hmm &hmm = getHmmRef(); - - std::vector obsVec(T); - for (std::size_t t = 0; t < T; ++t) - obsVec[t] = observations_(t); + const std::span obsSpan(observations_.data(), T); for (std::size_t i = 0; i < numStates_; ++i) { hmm.getDistribution(i).getBatchLogProbabilities( - std::span(obsVec.data(), T), - std::span(logEmitBuf_.data() + i * T, T)); + obsSpan, std::span(logEmitBuf_.data() + i * T, T)); + } + // Build time-major emission buffer once for locality in dynamic programming. 
+ logEmitByTime_.resize(T * numStates_); + for (std::size_t i = 0; i < numStates_; ++i) { + const double *stateRow = logEmitBuf_.data() + i * T; + for (std::size_t t = 0; t < T; ++t) { + logEmitByTime_[t * numStates_ + i] = stateRow[t]; + } } runViterbi(); @@ -68,10 +72,13 @@ void ViterbiCalculator::precomputeLogTransitions() { const Hmm &hmm = getHmmRef(); const Matrix &trans = hmm.getTrans(); logTrans_.resize(numStates_, numStates_); + logTransT_.resize(numStates_, numStates_); for (std::size_t i = 0; i < numStates_; ++i) { for (std::size_t j = 0; j < numStates_; ++j) { const double a = trans(i, j); - logTrans_(i, j) = (a > 0.0) ? std::log(a) : LOG_ZERO; + const double logA = (a > 0.0) ? std::log(a) : LOG_ZERO; + logTrans_(i, j) = logA; + logTransT_(j, i) = logA; } } } @@ -82,37 +89,48 @@ void ViterbiCalculator::runViterbi() { const std::size_t T = observations_.size(); logDelta_.resize(T, numStates_); - psi_.assign(T, std::vector(numStates_, 0)); + psi_.assign(T * numStates_, 0); + + const double *logTransTData = logTransT_.data(); + const double *logEmitByTimeData = logEmitByTime_.data(); + double *logDeltaData = logDelta_.data(); + const std::size_t N = numStates_; // t = 0: initialise + const double *emitRow0 = logEmitByTimeData; for (std::size_t i = 0; i < numStates_; ++i) { const double logPi = (pi(i) > 0.0) ? 
std::log(pi(i)) : LOG_ZERO; - logDelta_(0, i) = logPi + logEmitBuf_[i * T + 0]; + logDeltaData[i] = logPi + emitRow0[i]; } // t > 0: recursion for (std::size_t t = 1; t < T; ++t) { + const double *prevDeltaRow = logDeltaData + (t - 1) * N; + double *deltaRow = logDeltaData + t * N; + const double *emitRow = logEmitByTimeData + t * N; for (std::size_t j = 0; j < numStates_; ++j) { double maxVal = LOG_ZERO; int maxFrom = 0; + const double *transCol = logTransTData + j * N; for (std::size_t i = 0; i < numStates_; ++i) { - const double val = logDelta_(t - 1, i) + logTrans_(i, j); + const double val = prevDeltaRow[i] + transCol[i]; if (val > maxVal) { maxVal = val; maxFrom = static_cast(i); } } - logDelta_(t, j) = maxVal + logEmitBuf_[j * T + t]; - psi_[t][j] = maxFrom; + deltaRow[j] = maxVal + emitRow[j]; + psi_[t * N + j] = maxFrom; } } // Termination: best last state double bestVal = LOG_ZERO; int bestLast = 0; + const double *finalDeltaRow = logDeltaData + (T - 1) * N; for (std::size_t i = 0; i < numStates_; ++i) { - if (logDelta_(T - 1, i) > bestVal) { - bestVal = logDelta_(T - 1, i); + if (finalDeltaRow[i] > bestVal) { + bestVal = finalDeltaRow[i]; bestLast = static_cast(i); } } @@ -126,9 +144,10 @@ void ViterbiCalculator::backtrack() { const std::size_t T = observations_.size(); if (T <= 1) return; + const std::size_t N = numStates_; for (std::size_t t = T - 2;; --t) { - sequence_(t) = psi_[t + 1][static_cast(sequence_(t + 1))]; + sequence_(t) = psi_[(t + 1) * N + static_cast(sequence_(t + 1))]; if (t == 0) break; } diff --git a/src/distributions/beta_distribution.cpp b/src/distributions/beta_distribution.cpp index b3e7f19..5a5740d 100644 --- a/src/distributions/beta_distribution.cpp +++ b/src/distributions/beta_distribution.cpp @@ -7,7 +7,7 @@ namespace libhmm { /** * Computes the probability density function for the Beta distribution. 
- * + * * @param value The value at which to evaluate the PDF (should be in [0,1]) * @return Probability density, or 0.0 if value is outside [0,1] */ @@ -82,9 +82,9 @@ double BetaDistribution::getProbability(double value) const { /** * Computes the logarithm of the probability density function for numerical stability. - * + * * For Beta distribution: log(f(x)) = (α-1)log(x) + (β-1)log(1-x) - log(B(α,β)) - * + * * @param value The value at which to evaluate the log-PDF (should be in [0,1]) * @return Natural logarithm of the probability density, or -∞ for invalid values */ diff --git a/src/distributions/binomial_distribution.cpp b/src/distributions/binomial_distribution.cpp index a45108a..856fa53 100644 --- a/src/distributions/binomial_distribution.cpp +++ b/src/distributions/binomial_distribution.cpp @@ -9,10 +9,10 @@ namespace libhmm { /** * Computes the probability mass function for the Binomial distribution. - * + * * For discrete distributions, this returns the exact probability mass * P(X = k) = C(n,k) * p^k * (1-p)^(n-k) - * + * * @param value The value at which to evaluate the PMF (rounded to nearest integer) * @return Probability mass for the given value */ @@ -50,13 +50,13 @@ double BinomialDistribution::getProbability(double value) const { /** * Fits the distribution parameters to the given data using maximum likelihood estimation. - * + * * For Binomial distribution with known n, the MLE of p is: * p̂ = sample_mean / n - * + * * If n is unknown, we estimate it as the maximum observed value, then fit p. * This is a common approach when the number of trials is not known a priori. - * + * * @param values Vector of observed data points */ void BinomialDistribution::fit(std::span data) { @@ -131,7 +131,7 @@ void BinomialDistribution::reset() noexcept { /** * Returns a string representation of the distribution following the standardized format. 
- * + * * @return String describing the distribution parameters and statistics */ std::string BinomialDistribution::toString() const { diff --git a/src/distributions/discrete_distribution.cpp b/src/distributions/discrete_distribution.cpp index d8a0723..a26a661 100755 --- a/src/distributions/discrete_distribution.cpp +++ b/src/distributions/discrete_distribution.cpp @@ -7,7 +7,7 @@ namespace libhmm { /** * Gets the probability mass function value for a discrete observation. - * + * * @param x The discrete value (will be cast to integer index) * @return Probability mass for the given value, 0.0 if out of range */ @@ -25,7 +25,7 @@ double DiscreteDistribution::getProbability(double x) const { /** * Fits the distribution to observed data using maximum likelihood estimation. * Computes empirical probabilities: P(X = k) = count(k) / total_count - * + * * @param values Vector of observed discrete values */ void DiscreteDistribution::fit(std::span data) { @@ -90,7 +90,7 @@ void DiscreteDistribution::reset() noexcept { /** * Returns a string representation of the distribution. - * + * * @return String showing all symbol probabilities */ std::string DiscreteDistribution::toString() const { diff --git a/src/distributions/exponential_distribution.cpp b/src/distributions/exponential_distribution.cpp index 4a66d8c..4a5b052 100755 --- a/src/distributions/exponential_distribution.cpp +++ b/src/distributions/exponential_distribution.cpp @@ -10,13 +10,13 @@ namespace libhmm { /** * Computes the probability density function for the Exponential distribution. - * + * * For continuous distributions in discrete sampling contexts, we approximate * the probability as P(x - ε <= X <= x) = F(x) - F(x - ε) where ε is a small tolerance. - * + * * This provides a numerically stable approximation of the PDF scaled by the tolerance, * which is appropriate for discrete sampling of continuous distributions. 
- * + * * @param x The value at which to evaluate the probability * @return Approximated probability for discrete sampling */ @@ -40,9 +40,9 @@ double ExponentialDistribution::getProbability(double value) const { /** * Computes the logarithm of the probability density function for numerical stability. - * + * * For exponential distribution: log(f(x)) = log(λ) - λx for x ≥ 0 - * + * * @param x The value at which to evaluate the log-PDF * @return Natural logarithm of the probability density, or -∞ for invalid values */ @@ -59,9 +59,9 @@ double ExponentialDistribution::getLogProbability(double value) const noexcept { /** * Evaluates the CDF for the Exponential distribution at x. - * + * * Formula: F(x) = 1 - exp(-λx) for x ≥ 0 - * + * * @param x The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ x) */ @@ -73,7 +73,7 @@ double ExponentialDistribution::getCumulativeProbability(double x) const noexcep /** * Fits the distribution parameters to the given data using maximum likelihood estimation. - * + * * For the Exponential distribution, the MLE of the rate parameter is: * λ = 1 / sample_mean * diff --git a/src/distributions/gamma_distribution.cpp b/src/distributions/gamma_distribution.cpp index 90d23cd..76cc848 100755 --- a/src/distributions/gamma_distribution.cpp +++ b/src/distributions/gamma_distribution.cpp @@ -8,7 +8,7 @@ namespace libhmm { /** * Computes the probability density function for the Gamma distribution. * PDF: f(x) = (1/(Γ(k)θ^k)) * x^(k-1) * exp(-x/θ) for x ≥ 0 - * + * * @param x The value at which to evaluate the probability * @return Probability density */ @@ -32,7 +32,7 @@ double GammaDistribution::getProbability(double x) const { /** * Evaluates the logarithm of the probability density function for numerical stability. 
* Formula: log PDF(x) = (k-1)*ln(x) - x/θ - k*ln(θ) - ln(Γ(k)) - * + * * @param x The value at which to evaluate the log PDF * @return Log probability density */ @@ -60,7 +60,7 @@ double GammaDistribution::getLogProbability(double x) const noexcept { * Evaluates the CDF at x using the incomplete gamma function * Formula: CDF(x) = P(k, x/θ) = γ(k, x/θ) / Γ(k) * where P is the regularized incomplete gamma function - * + * * @param x The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ x) */ @@ -88,15 +88,15 @@ double GammaDistribution::ligamma(double a, double x) noexcept { /** * Fits the distribution parameters to the given data using method of moments estimation. - * + * * Method of moments uses: * sample_mean = k*θ * sample_variance = k*θ² - * + * * Solving: θ = sample_variance/sample_mean, k = sample_mean²/sample_variance - * + * * This is more numerically stable than MLE approximations for the Gamma distribution. - * + * * @param values Vector of observed data points */ void GammaDistribution::fit(std::span data) { diff --git a/src/distributions/gaussian_distribution.cpp b/src/distributions/gaussian_distribution.cpp index ff9f31b..d4c48ef 100755 --- a/src/distributions/gaussian_distribution.cpp +++ b/src/distributions/gaussian_distribution.cpp @@ -10,7 +10,7 @@ using namespace libhmm::constants; namespace libhmm { /** * Returns the probability density function value for the Gaussian distribution. - * + * * Formula: PDF(x) = (1/σ√(2π)) * exp(-½((x-μ)/σ)²) */ double GaussianDistribution::getProbability(double x) const { @@ -83,7 +83,7 @@ double GaussianDistribution::getCumulativeProbability(double x) const noexcept { /* * Fits the distribution parameters using maximum likelihood estimation with optimized algorithm. 
- * + * * Uses single-pass Welford's algorithm for numerically stable variance calculation: * - Better cache locality than two-pass algorithm * - Numerically stable for extreme values diff --git a/src/distributions/log_normal_distribution.cpp b/src/distributions/log_normal_distribution.cpp index 572fb91..e598969 100755 --- a/src/distributions/log_normal_distribution.cpp +++ b/src/distributions/log_normal_distribution.cpp @@ -1,4 +1,5 @@ #include "libhmm/distributions/log_normal_distribution.h" +#include "libhmm/performance/simd_kernels_internal.h" // Header already includes: , , , , , via common.h #include // For std::accumulate (not in common.h) #include // For std::for_each (exists in common.h, included for clarity) @@ -9,13 +10,13 @@ namespace libhmm { /** * Computes the probability density function for the Log-Normal distribution. - * + * * For continuous distributions in discrete sampling contexts, we approximate * the probability as P(x - ε <= X <= x) = F(x) - F(x - ε) where ε is a small tolerance. - * + * * This provides a numerically stable approximation of the PDF scaled by the tolerance, * which is appropriate for discrete sampling of continuous distributions. - * + * * @param x The value at which to evaluate the probability * @return Approximated probability for discrete sampling */ @@ -78,13 +79,13 @@ double LogNormalDistribution::getCumulativeProbability(double value) const noexc /** * Fits the distribution parameters to the given data using maximum likelihood estimation. - * + * * For Log-Normal distribution, the MLE estimators are: * μ = mean(ln(x_i)) for positive x_i * σ = std_dev(ln(x_i)) for positive x_i - * + * * Only positive values are used since Log-Normal distribution has support (0, ∞). 
- * + * * @param values Vector of observed data points */ void LogNormalDistribution::fit(std::span data) { @@ -210,20 +211,111 @@ std::istream &operator>>(std::istream &is, libhmm::LogNormalDistribution &distri return is; } +// ============================================================================= +// Batch log-PDF — explicit SIMD intrinsics (tier 2) +// +// Formula: log f(x) = -log(x) - logNormConst + negHalfInvSigma2*(log(x)-mu)^2 +// Per element: log_x = log(x); then result = -log_x - C + S*(log_x - mu)^2 +// where C = logNormalizationConstant_, S = negHalfSigmaSquaredInv_. +// +// x <= 0 lanes: log(x) is -inf; guard produces -inf output. +// Pattern mirrors gaussian_logpdf_batch (gaussian_distribution.cpp). +// ============================================================================= +namespace detail { + +void lognormal_logpdf_batch(const double *obs, double *out, std::size_t n, double mu, double S, + double C) noexcept { + using namespace performance::detail::kernels; + std::size_t i = 0; + const double neg_inf = -std::numeric_limits::infinity(); + +#if defined(LIBHMM_HAS_AVX512) + { + const __m512d vmu = _mm512_set1_pd(mu); + const __m512d vS = _mm512_set1_pd(S); + const __m512d vC = _mm512_set1_pd(C); + for (; i + 8 <= n; i += 8) { + __m512d x = _mm512_loadu_pd(obs + i); + __m512d lx = k_log_pd_avx512(x); // -inf where x<=0 + __m512d d = _mm512_sub_pd(lx, vmu); // log(x) - mu + __m512d res = _mm512_fmadd_pd( + d, _mm512_mul_pd(d, vS), + _mm512_sub_pd(_mm512_setzero_pd(), _mm512_add_pd(lx, vC))); // -lx - C + S*d^2 + _mm512_storeu_pd(out + i, res); + } + } +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + const __m256d vmu = _mm256_set1_pd(mu); + const __m256d vS = _mm256_set1_pd(S); + const __m256d vC = _mm256_set1_pd(C); + for (; i + 4 <= n; i += 4) { + __m256d x = _mm256_loadu_pd(obs + i); + __m256d lx = k_log_pd_avx(x); + __m256d d = _mm256_sub_pd(lx, vmu); + __m256d res = _mm256_add_pd(_mm256_mul_pd(_mm256_mul_pd(d, 
d), vS), + _mm256_sub_pd(_mm256_setzero_pd(), _mm256_add_pd(lx, vC))); + _mm256_storeu_pd(out + i, res); + } + } +#endif + +#if defined(LIBHMM_HAS_SSE2) + { + const __m128d vmu = _mm_set1_pd(mu); + const __m128d vS = _mm_set1_pd(S); + const __m128d vC = _mm_set1_pd(C); + for (; i + 2 <= n; i += 2) { + __m128d x = _mm_loadu_pd(obs + i); + __m128d lx = k_log_pd_sse2(x); + __m128d d = _mm_sub_pd(lx, vmu); + __m128d res = _mm_add_pd(_mm_mul_pd(_mm_mul_pd(d, d), vS), + _mm_sub_pd(_mm_setzero_pd(), _mm_add_pd(lx, vC))); + _mm_storeu_pd(out + i, res); + } + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + const float64x2_t vmu = vdupq_n_f64(mu); + const float64x2_t vS = vdupq_n_f64(S); + const float64x2_t vC = vdupq_n_f64(C); + for (; i + 2 <= n; i += 2) { + float64x2_t x = vld1q_f64(obs + i); + float64x2_t lx = k_log_pd_neon(x); + float64x2_t d = vsubq_f64(lx, vmu); + // res = S*d^2 + (-lx - C) = S*d^2 - lx - C + float64x2_t res = vfmaq_f64(vsubq_f64(vnegq_f64(lx), vC), vmulq_f64(d, d), vS); + vst1q_f64(out + i, res); + } + } +#endif + + // Scalar tail. + for (; i < n; ++i) { + const double x = obs[i]; + if (x <= 0.0 || std::isnan(x) || std::isinf(x)) { + out[i] = neg_inf; + } else { + const double lx = std::log(x); + const double d = lx - mu; + out[i] = -lx - C + S * d * d; + } + } +} + +} // namespace detail + void LogNormalDistribution::getBatchLogProbabilities(std::span observations, std::span out) const { - // Tier 1 — concrete non-virtual loop; compiler auto-vectorizes the arithmetic - // terms under -march=native / /arch:AVX512. - // Tier 2 upgrade requires vectorised log(x): the inner loop is essentially - // Gaussian on log(x), so once a vectorised log is available the pattern is - // identical to GaussianDistribution tier 2 but with an extra log-transform - // step. Available via Intel SVML, GNU libmvec, or Apple Accelerate vvlog, - // but not portably without a math-library dependency. 
+ // Tier 2 — explicit SIMD via simd_kernels_internal.h if (!isCacheValid()) updateCache(); - for (std::size_t i = 0; i < observations.size(); ++i) { - out[i] = LogNormalDistribution::getLogProbability(observations[i]); - } + detail::lognormal_logpdf_batch(observations.data(), out.data(), observations.size(), mean_, + negHalfSigmaSquaredInv_, logNormalizationConstant_); } } // namespace libhmm diff --git a/src/distributions/negative_binomial_distribution.cpp b/src/distributions/negative_binomial_distribution.cpp index 24e8e0f..2836602 100644 --- a/src/distributions/negative_binomial_distribution.cpp +++ b/src/distributions/negative_binomial_distribution.cpp @@ -9,10 +9,10 @@ namespace libhmm { /** * Computes the probability mass function for the Negative Binomial distribution. - * + * * For discrete distributions, this returns the exact probability mass * P(X = k) = C(k+r-1, k) * p^r * (1-p)^k - * + * * @param value The value at which to evaluate the PMF (rounded to nearest integer) * @return Probability mass for the given value */ @@ -45,14 +45,14 @@ double NegativeBinomialDistribution::getProbability(double value) const { /** * Fits the distribution parameters to the given data using method of moments. - * + * * For Negative Binomial distribution, the method of moments estimators are: * p̂ = mean / variance (if variance > mean) * r̂ = mean² / (variance - mean) (if variance > mean) - * - * If variance ≤ mean, the negative binomial model is not appropriate + * + * If variance ≤ mean, the negative binomial model is not appropriate * (indicates under-dispersion), so we fall back to default parameters. - * + * * @param values Vector of observed data points */ void NegativeBinomialDistribution::fit(std::span data) { @@ -139,7 +139,7 @@ void NegativeBinomialDistribution::reset() noexcept { /** * Returns a string representation of the distribution following the standardized format. 
- * + * * @return String describing the distribution parameters and statistics */ std::string NegativeBinomialDistribution::toString() const { diff --git a/src/distributions/pareto_distribution.cpp b/src/distributions/pareto_distribution.cpp index aae3b3b..a6e5968 100755 --- a/src/distributions/pareto_distribution.cpp +++ b/src/distributions/pareto_distribution.cpp @@ -1,4 +1,5 @@ #include "libhmm/distributions/pareto_distribution.h" +#include "libhmm/performance/simd_kernels_internal.h" // Header already includes: , , , , , via common.h #include // For std::accumulate (not in common.h) #include // For std::min_element (exists in common.h, included for clarity) @@ -10,11 +11,11 @@ namespace libhmm { /** * Computes the probability density function for the Pareto distribution. - * + * * For Pareto distribution: f(x) = (k * x_m^k) / x^(k+1) for x ≥ x_m - * + * * Uses direct PDF calculation for optimal performance, avoiding expensive CDF differences. - * + * * @param x The value at which to evaluate the probability density * @return Probability density for the given value */ @@ -38,9 +39,9 @@ double ParetoDistribution::getProbability(double x) const { /** * Computes the logarithm of the probability density function for numerical stability. - * + * * For Pareto distribution: log(f(x)) = log(k) + k*log(x_m) - (k+1)*log(x) for x ≥ x_m - * + * * @param value The value at which to evaluate the log-PDF * @return Natural logarithm of the probability density, or -∞ for invalid values */ @@ -65,9 +66,9 @@ double ParetoDistribution::getCumulativeProbability(double value) const noexcept /** * Evaluates the CDF for the Pareto distribution at x. - * + * * Formula: F(x) = 1 - (x_m/x)^k for x ≥ x_m - * + * * @param x The value at which to evaluate the CDF * @return Cumulative probability P(X ≤ x) */ @@ -77,11 +78,11 @@ double ParetoDistribution::CDF(double x) const noexcept { /** * Fits the distribution parameters to the given data using maximum likelihood estimation. 
- * + * * For Pareto distribution, the MLE estimators are: * x_m = min(x_i) for all i * k = n / Σ(ln(x_i) - ln(x_m)) for i = 1 to n - * + * * @param values Vector of observed data */ void ParetoDistribution::fit(std::span data) { @@ -196,19 +197,108 @@ std::istream &operator>>(std::istream &is, libhmm::ParetoDistribution &distribut return is; } +// ============================================================================= +// Batch log-PDF — explicit SIMD intrinsics (tier 2) +// +// Formula: log f(x) = logK + kLogXm - kPlus1 * log(x) for x >= xm +// = -inf for x < xm +// ============================================================================= +namespace detail { + +void pareto_logpdf_batch(const double *obs, double *out, std::size_t n, double xm, + double logK_plus_kLogXm, double kPlus1) noexcept { + using namespace performance::detail::kernels; + std::size_t i = 0; + const double neg_inf = -std::numeric_limits::infinity(); + +#if defined(LIBHMM_HAS_AVX512) + { + const __m512d vxm = _mm512_set1_pd(xm); + const __m512d vconst = _mm512_set1_pd(logK_plus_kLogXm); + const __m512d vkp1 = _mm512_set1_pd(kPlus1); + const __m512d vneg_inf = _mm512_set1_pd(neg_inf); + for (; i + 8 <= n; i += 8) { + __m512d x = _mm512_loadu_pd(obs + i); + // x < xm: -inf + __mmask8 invalid = _mm512_cmp_pd_mask(x, vxm, _CMP_LT_OS); + __m512d lx = k_log_pd_avx512(x); + __m512d res = _mm512_fnmadd_pd(vkp1, lx, vconst); // const - kp1*log(x) + res = _mm512_mask_blend_pd(invalid, res, vneg_inf); + _mm512_storeu_pd(out + i, res); + } + } +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + const __m256d vxm = _mm256_set1_pd(xm); + const __m256d vconst = _mm256_set1_pd(logK_plus_kLogXm); + const __m256d vkp1 = _mm256_set1_pd(kPlus1); + const __m256d vneg_inf = _mm256_set1_pd(neg_inf); + for (; i + 4 <= n; i += 4) { + __m256d x = _mm256_loadu_pd(obs + i); + __m256d inv = _mm256_cmp_pd(x, vxm, _CMP_LT_OS); // all-1s where x < xm + __m256d lx = k_log_pd_avx(x); + __m256d 
res = _mm256_sub_pd(vconst, _mm256_mul_pd(vkp1, lx)); + res = _mm256_blendv_pd(res, vneg_inf, inv); + _mm256_storeu_pd(out + i, res); + } + } +#endif + +#if defined(LIBHMM_HAS_SSE2) + { + const __m128d vxm = _mm_set1_pd(xm); + const __m128d vconst = _mm_set1_pd(logK_plus_kLogXm); + const __m128d vkp1 = _mm_set1_pd(kPlus1); + const __m128d vneg_inf = _mm_set1_pd(neg_inf); + for (; i + 2 <= n; i += 2) { + __m128d x = _mm_loadu_pd(obs + i); + __m128d inv = _mm_cmplt_pd(x, vxm); + __m128d lx = k_log_pd_sse2(x); + __m128d res = _mm_sub_pd(vconst, _mm_mul_pd(vkp1, lx)); + res = _mm_or_pd(_mm_andnot_pd(inv, res), _mm_and_pd(inv, vneg_inf)); + _mm_storeu_pd(out + i, res); + } + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + const float64x2_t vxm = vdupq_n_f64(xm); + const float64x2_t vconst = vdupq_n_f64(logK_plus_kLogXm); + const float64x2_t vkp1 = vdupq_n_f64(kPlus1); + const float64x2_t vneg_inf = vdupq_n_f64(neg_inf); + for (; i + 2 <= n; i += 2) { + float64x2_t x = vld1q_f64(obs + i); + uint64x2_t inv = vcltq_f64(x, vxm); // x < xm + float64x2_t lx = k_log_pd_neon(x); + float64x2_t res = vsubq_f64(vconst, vmulq_f64(vkp1, lx)); + res = vbslq_f64(inv, vneg_inf, res); + vst1q_f64(out + i, res); + } + } +#endif + + // Scalar tail. + for (; i < n; ++i) { + const double x = obs[i]; + out[i] = (std::isnan(x) || std::isinf(x) || x < xm) + ? neg_inf + : logK_plus_kLogXm - kPlus1 * std::log(x); + } +} + +} // namespace detail + void ParetoDistribution::getBatchLogProbabilities(std::span observations, std::span out) const { - // Tier 1 — concrete non-virtual loop; compiler auto-vectorizes the arithmetic - // terms under -march=native / /arch:AVX512. - // Tier 2 upgrade requires vectorised log(x): inner loop is - // log(α) + α*log(x_m) - (α+1)*log(x), so a vectorised log is needed. - // Available via Intel SVML, GNU libmvec, or Apple Accelerate vvlog, but - // not portably without a math-library dependency. 
+ // Tier 2 — explicit SIMD via simd_kernels_internal.h if (!isCacheValid()) updateCache(); - for (std::size_t i = 0; i < observations.size(); ++i) { - out[i] = ParetoDistribution::getLogProbability(observations[i]); - } + // logK_ + kLogXm_ is a single scalar constant — compute once. + detail::pareto_logpdf_batch(observations.data(), out.data(), observations.size(), xm_, + logK_ + kLogXm_, kPlus1_); } } // namespace libhmm diff --git a/src/distributions/rayleigh_distribution.cpp b/src/distributions/rayleigh_distribution.cpp index fa8a4c0..8ab4218 100644 --- a/src/distributions/rayleigh_distribution.cpp +++ b/src/distributions/rayleigh_distribution.cpp @@ -6,9 +6,9 @@ namespace libhmm { /** * Computes the probability density function for the Rayleigh distribution. - * + * * PDF: f(x) = (x/σ²) * exp(-x²/(2σ²)) for x ≥ 0 - * + * * @param value The value at which to evaluate the PDF * @return Probability density */ @@ -24,9 +24,9 @@ double RayleighDistribution::getProbability(double value) const { /** * Computes the logarithm of the probability density function for numerical stability. - * + * * For Rayleigh distribution: log(f(x)) = log(x) - 2*log(σ) - x²/(2σ²) for x > 0 - * + * * @param value The value at which to evaluate the log-PDF * @return Natural logarithm of the probability density, or -∞ for invalid values */ @@ -54,7 +54,7 @@ double RayleighDistribution::getCumulativeProbability(double value) const noexce * Fits the distribution parameters to the given data using maximum likelihood estimation. * This method is efficient as it requires only a single pass through the data * to compute the sum of squares. 
- * + * * @param values Vector of observed data */ void RayleighDistribution::fit(std::span data) { diff --git a/src/distributions/student_t_distribution.cpp b/src/distributions/student_t_distribution.cpp index c463d6e..f7944d2 100644 --- a/src/distributions/student_t_distribution.cpp +++ b/src/distributions/student_t_distribution.cpp @@ -121,7 +121,7 @@ double StudentTDistribution::getLogProbability(double value) const noexcept { /** * Computes the cumulative distribution function for the Student's t-distribution. - * + * * Uses the relationship with the incomplete beta function for numerical accuracy. */ double StudentTDistribution::getCumulativeProbability(double value) const noexcept { diff --git a/src/performance/transcendental_kernels.cpp b/src/performance/transcendental_kernels.cpp new file mode 100644 index 0000000..7d61803 --- /dev/null +++ b/src/performance/transcendental_kernels.cpp @@ -0,0 +1,436 @@ +// src/performance/transcendental_kernels.cpp +// +// SIMD implementations of TranscendentalKernels methods. +// +// Compiled with LIBHMM_BEST_SIMD_FLAGS, activating the ISA cascade: +// AVX-512 8-wide __m512d +// AVX/AVX2 4-wide __m256d (AVX-1 compatible; compiler fuses FMA under AVX2) +// SSE2 2-wide __m128d +// NEON 2-wide float64x2_t +// scalar tail and portable fallback +// +// Vector exp helpers (k_exp_pd_*) and log helpers (k_log_pd_*) are defined +// in simd_kernels_internal.h -- the single source of truth shared with +// Tier-2 distribution TUs (log_normal_distribution.cpp, pareto_distribution.cpp). 
+ +#include "libhmm/performance/transcendental_kernels.h" +#include "libhmm/performance/simd_kernels_internal.h" +#include "libhmm/math/constants.h" +#include "libhmm/platform/simd_platform.h" + +#include +#include +#include +#include + +namespace libhmm { +namespace performance { +namespace detail { + +namespace { + +// --------------------------------------------------------------------------- +// Horizontal reduction helpers +// --------------------------------------------------------------------------- + +// SSE2: horizontal max of 2-lane vector. +#if defined(LIBHMM_HAS_SSE2) +static inline double hmax_pd_sse2(__m128d v) noexcept { + __m128d shuf = _mm_shuffle_pd(v, v, 1); + return _mm_cvtsd_f64(_mm_max_pd(v, shuf)); +} +static inline double hadd_pd_sse2(__m128d v) noexcept { + __m128d shuf = _mm_shuffle_pd(v, v, 1); + return _mm_cvtsd_f64(_mm_add_pd(v, shuf)); +} +#endif + +// AVX: horizontal max/sum of 4-lane vector. +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) +static inline double hmax_pd_avx(__m256d v) noexcept { + __m128d lo = _mm256_castpd256_pd128(v); + __m128d hi = _mm256_extractf128_pd(v, 1); + __m128d m = _mm_max_pd(lo, hi); + return hmax_pd_sse2(m); +} +static inline double hadd_pd_avx(__m256d v) noexcept { + __m128d lo = _mm256_castpd256_pd128(v); + __m128d hi = _mm256_extractf128_pd(v, 1); + __m128d s = _mm_add_pd(lo, hi); + return hadd_pd_sse2(s); +} +#endif + +} // anonymous namespace + +// ============================================================================= +// TranscendentalKernels method implementations +// ============================================================================= + +// ----------------------------------------------------------------------------- +// reduce_max_sum2: max of (a[i] + b[i]) +// ----------------------------------------------------------------------------- +double TranscendentalKernels::reduce_max_sum2(const double *a, const double *b, + std::size_t size) noexcept { + std::size_t i = 0; + 
const double neg_inf = -std::numeric_limits::infinity(); + // maxVal accumulates across ISA blocks; each lower-tier block seeds its + // vector accumulator from the value set by the highest active tier. + double maxVal; +#if defined(LIBHMM_HAS_AVX512) + { + __m512d vmax = _mm512_set1_pd(neg_inf); + for (; i + 8 <= size; i += 8) { + __m512d va = _mm512_loadu_pd(a + i); + __m512d vb = _mm512_loadu_pd(b + i); + vmax = _mm512_max_pd(vmax, _mm512_add_pd(va, vb)); + } + maxVal = _mm512_reduce_max_pd(vmax); + } +#else + maxVal = neg_inf; +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + __m256d vmax = _mm256_set1_pd(maxVal); + for (; i + 4 <= size; i += 4) { + __m256d va = _mm256_loadu_pd(a + i); + __m256d vb = _mm256_loadu_pd(b + i); + vmax = _mm256_max_pd(vmax, _mm256_add_pd(va, vb)); + } + maxVal = hmax_pd_avx(vmax); + } +#endif + +#if defined(LIBHMM_HAS_SSE2) + { + __m128d vmax = _mm_set1_pd(maxVal); + for (; i + 2 <= size; i += 2) { + __m128d va = _mm_loadu_pd(a + i); + __m128d vb = _mm_loadu_pd(b + i); + vmax = _mm_max_pd(vmax, _mm_add_pd(va, vb)); + } + maxVal = hmax_pd_sse2(vmax); + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + float64x2_t vmax = vdupq_n_f64(maxVal); + for (; i + 2 <= size; i += 2) { + float64x2_t va = vld1q_f64(a + i); + float64x2_t vb = vld1q_f64(b + i); + vmax = vmaxq_f64(vmax, vaddq_f64(va, vb)); + } + maxVal = vmaxvq_f64(vmax); + } +#endif + + // Scalar tail. 
+ for (; i < size; ++i) { + const double t = a[i] + b[i]; + if (t > maxVal) + maxVal = t; + } + return maxVal; +} + +// ----------------------------------------------------------------------------- +// sum_exp_sum2_minus_max +// ----------------------------------------------------------------------------- +double TranscendentalKernels::sum_exp_sum2_minus_max(const double *a, const double *b, + std::size_t size, double maxVal) noexcept { + if (!std::isfinite(maxVal)) + return 0.0; + std::size_t i = 0; + double sum = 0.0; + +#if defined(LIBHMM_HAS_AVX512) + { + const __m512d vmaxv = _mm512_set1_pd(maxVal); + __m512d vsum = _mm512_setzero_pd(); + for (; i + 8 <= size; i += 8) { + __m512d va = _mm512_loadu_pd(a + i); + __m512d vb = _mm512_loadu_pd(b + i); + __m512d term = _mm512_sub_pd(_mm512_add_pd(va, vb), vmaxv); + vsum = _mm512_add_pd(vsum, kernels::k_exp_pd_avx512(term)); + } + sum += _mm512_reduce_add_pd(vsum); + } +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + const __m256d vmaxv = _mm256_set1_pd(maxVal); + __m256d vsum = _mm256_setzero_pd(); + for (; i + 4 <= size; i += 4) { + __m256d va = _mm256_loadu_pd(a + i); + __m256d vb = _mm256_loadu_pd(b + i); + __m256d term = _mm256_sub_pd(_mm256_add_pd(va, vb), vmaxv); + vsum = _mm256_add_pd(vsum, kernels::k_exp_pd_avx(term)); + } + sum += hadd_pd_avx(vsum); + } +#endif + +#if defined(LIBHMM_HAS_SSE2) + { + const __m128d vmaxv = _mm_set1_pd(maxVal); + __m128d vsum = _mm_setzero_pd(); + for (; i + 2 <= size; i += 2) { + __m128d va = _mm_loadu_pd(a + i); + __m128d vb = _mm_loadu_pd(b + i); + __m128d term = _mm_sub_pd(_mm_add_pd(va, vb), vmaxv); + vsum = _mm_add_pd(vsum, kernels::k_exp_pd_sse2(term)); + } + sum += hadd_pd_sse2(vsum); + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + const float64x2_t vmaxv = vdupq_n_f64(maxVal); + float64x2_t vsum = vdupq_n_f64(0.0); + for (; i + 2 <= size; i += 2) { + float64x2_t va = vld1q_f64(a + i); + float64x2_t vb = vld1q_f64(b + i); + float64x2_t term = 
vsubq_f64(vaddq_f64(va, vb), vmaxv); + vsum = vaddq_f64(vsum, kernels::k_exp_pd_neon(term)); + } + sum += vaddvq_f64(vsum); + } +#endif + + // Scalar tail. + for (; i < size; ++i) { + const double t = a[i] + b[i]; + if (std::isfinite(t)) + sum += std::exp(t - maxVal); + } + return sum; +} + +// ----------------------------------------------------------------------------- +// reduce_max_sum3: max of (a[i] + b[i] + c[i]) +// ----------------------------------------------------------------------------- +double TranscendentalKernels::reduce_max_sum3(const double *a, const double *b, const double *c, + std::size_t size) noexcept { + std::size_t i = 0; + const double neg_inf = -std::numeric_limits::infinity(); + double maxVal; +#if defined(LIBHMM_HAS_AVX512) + { + __m512d vmax = _mm512_set1_pd(neg_inf); + for (; i + 8 <= size; i += 8) { + __m512d va = _mm512_loadu_pd(a + i); + __m512d vb = _mm512_loadu_pd(b + i); + __m512d vc = _mm512_loadu_pd(c + i); + vmax = _mm512_max_pd(vmax, _mm512_add_pd(_mm512_add_pd(va, vb), vc)); + } + maxVal = _mm512_reduce_max_pd(vmax); + } +#else + maxVal = neg_inf; +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + __m256d vmax = _mm256_set1_pd(maxVal); + for (; i + 4 <= size; i += 4) { + __m256d va = _mm256_loadu_pd(a + i); + __m256d vb = _mm256_loadu_pd(b + i); + __m256d vc = _mm256_loadu_pd(c + i); + vmax = _mm256_max_pd(vmax, _mm256_add_pd(_mm256_add_pd(va, vb), vc)); + } + maxVal = hmax_pd_avx(vmax); + } +#endif + +#if defined(LIBHMM_HAS_SSE2) + { + __m128d vmax = _mm_set1_pd(maxVal); + for (; i + 2 <= size; i += 2) { + __m128d va = _mm_loadu_pd(a + i); + __m128d vb = _mm_loadu_pd(b + i); + __m128d vc = _mm_loadu_pd(c + i); + vmax = _mm_max_pd(vmax, _mm_add_pd(_mm_add_pd(va, vb), vc)); + } + maxVal = hmax_pd_sse2(vmax); + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + float64x2_t vmax = vdupq_n_f64(maxVal); + for (; i + 2 <= size; i += 2) { + float64x2_t va = vld1q_f64(a + i); + float64x2_t vb = vld1q_f64(b + i); 
+ float64x2_t vc = vld1q_f64(c + i); + vmax = vmaxq_f64(vmax, vaddq_f64(vaddq_f64(va, vb), vc)); + } + maxVal = vmaxvq_f64(vmax); + } +#endif + + // Scalar tail. + for (; i < size; ++i) { + const double t = a[i] + b[i] + c[i]; + if (t > maxVal) + maxVal = t; + } + return maxVal; +} + +// ----------------------------------------------------------------------------- +// sum_exp_sum3_minus_max: sum of exp(a[i]+b[i]+c[i] - maxVal) +// ----------------------------------------------------------------------------- +double TranscendentalKernels::sum_exp_sum3_minus_max(const double *a, const double *b, + const double *c, std::size_t size, + double maxVal) noexcept { + if (!std::isfinite(maxVal)) + return 0.0; + std::size_t i = 0; + double sum = 0.0; + +#if defined(LIBHMM_HAS_AVX512) + { + const __m512d vmaxv = _mm512_set1_pd(maxVal); + __m512d vsum = _mm512_setzero_pd(); + for (; i + 8 <= size; i += 8) { + __m512d va = _mm512_loadu_pd(a + i); + __m512d vb = _mm512_loadu_pd(b + i); + __m512d vc = _mm512_loadu_pd(c + i); + __m512d term = _mm512_sub_pd(_mm512_add_pd(_mm512_add_pd(va, vb), vc), vmaxv); + vsum = _mm512_add_pd(vsum, kernels::k_exp_pd_avx512(term)); + } + sum += _mm512_reduce_add_pd(vsum); + } +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + const __m256d vmaxv = _mm256_set1_pd(maxVal); + __m256d vsum = _mm256_setzero_pd(); + for (; i + 4 <= size; i += 4) { + __m256d va = _mm256_loadu_pd(a + i); + __m256d vb = _mm256_loadu_pd(b + i); + __m256d vc = _mm256_loadu_pd(c + i); + __m256d term = _mm256_sub_pd(_mm256_add_pd(_mm256_add_pd(va, vb), vc), vmaxv); + vsum = _mm256_add_pd(vsum, kernels::k_exp_pd_avx(term)); + } + sum += hadd_pd_avx(vsum); + } +#endif + +#if defined(LIBHMM_HAS_SSE2) + { + const __m128d vmaxv = _mm_set1_pd(maxVal); + __m128d vsum = _mm_setzero_pd(); + for (; i + 2 <= size; i += 2) { + __m128d va = _mm_loadu_pd(a + i); + __m128d vb = _mm_loadu_pd(b + i); + __m128d vc = _mm_loadu_pd(c + i); + __m128d term = 
_mm_sub_pd(_mm_add_pd(_mm_add_pd(va, vb), vc), vmaxv); + vsum = _mm_add_pd(vsum, kernels::k_exp_pd_sse2(term)); + } + sum += hadd_pd_sse2(vsum); + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + const float64x2_t vmaxv = vdupq_n_f64(maxVal); + float64x2_t vsum = vdupq_n_f64(0.0); + for (; i + 2 <= size; i += 2) { + float64x2_t va = vld1q_f64(a + i); + float64x2_t vb = vld1q_f64(b + i); + float64x2_t vc = vld1q_f64(c + i); + float64x2_t term = vsubq_f64(vaddq_f64(vaddq_f64(va, vb), vc), vmaxv); + vsum = vaddq_f64(vsum, kernels::k_exp_pd_neon(term)); + } + sum += vaddvq_f64(vsum); + } +#endif + + // Scalar tail. + for (; i < size; ++i) { + const double t = a[i] + b[i] + c[i]; + if (std::isfinite(t)) + sum += std::exp(t - maxVal); + } + return sum; +} + +// ----------------------------------------------------------------------------- +// accumulate_exp_sum2_bias: dst[i] += exp(a[i] + b[i] + bias) +// ----------------------------------------------------------------------------- +void TranscendentalKernels::accumulate_exp_sum2_bias(double *dst, const double *a, const double *b, + std::size_t size, double bias) noexcept { + std::size_t i = 0; + +#if defined(LIBHMM_HAS_AVX512) + { + const __m512d vbias = _mm512_set1_pd(bias); + for (; i + 8 <= size; i += 8) { + __m512d vd = _mm512_loadu_pd(dst + i); + __m512d va = _mm512_loadu_pd(a + i); + __m512d vb = _mm512_loadu_pd(b + i); + __m512d arg = _mm512_add_pd(_mm512_add_pd(va, vb), vbias); + vd = _mm512_add_pd(vd, kernels::k_exp_pd_avx512(arg)); + _mm512_storeu_pd(dst + i, vd); + } + } +#endif + +#if defined(LIBHMM_HAS_AVX) || defined(LIBHMM_HAS_AVX2) + { + const __m256d vbias = _mm256_set1_pd(bias); + for (; i + 4 <= size; i += 4) { + __m256d vd = _mm256_loadu_pd(dst + i); + __m256d va = _mm256_loadu_pd(a + i); + __m256d vb = _mm256_loadu_pd(b + i); + __m256d arg = _mm256_add_pd(_mm256_add_pd(va, vb), vbias); + vd = _mm256_add_pd(vd, kernels::k_exp_pd_avx(arg)); + _mm256_storeu_pd(dst + i, vd); + } + } +#endif + +#if 
defined(LIBHMM_HAS_SSE2) + { + const __m128d vbias = _mm_set1_pd(bias); + for (; i + 2 <= size; i += 2) { + __m128d vd = _mm_loadu_pd(dst + i); + __m128d va = _mm_loadu_pd(a + i); + __m128d vb = _mm_loadu_pd(b + i); + __m128d arg = _mm_add_pd(_mm_add_pd(va, vb), vbias); + vd = _mm_add_pd(vd, kernels::k_exp_pd_sse2(arg)); + _mm_storeu_pd(dst + i, vd); + } + } +#endif + +#if defined(LIBHMM_HAS_NEON) + { + const float64x2_t vbias = vdupq_n_f64(bias); + for (; i + 2 <= size; i += 2) { + float64x2_t vd = vld1q_f64(dst + i); + float64x2_t va = vld1q_f64(a + i); + float64x2_t vb = vld1q_f64(b + i); + float64x2_t arg = vaddq_f64(vaddq_f64(va, vb), vbias); + vd = vaddq_f64(vd, kernels::k_exp_pd_neon(arg)); + vst1q_f64(dst + i, vd); + } + } +#endif + + // Scalar tail. + for (; i < size; ++i) { + dst[i] += std::exp(a[i] + b[i] + bias); + } +} + +} // namespace detail +} // namespace performance +} // namespace libhmm diff --git a/src/training/baum_welch_trainer.cpp b/src/training/baum_welch_trainer.cpp index 7ae236f..96b251a 100755 --- a/src/training/baum_welch_trainer.cpp +++ b/src/training/baum_welch_trainer.cpp @@ -1,6 +1,7 @@ #include "libhmm/training/baum_welch_trainer.h" #include "libhmm/calculators/forward_backward_calculator.h" #include "libhmm/hmm.h" +#include "libhmm/performance/transcendental_kernels.h" #include #include #include @@ -26,23 +27,41 @@ BaumWelchTrainer::BaumWelchTrainer(Hmm *hmm, const ObservationLists &obsLists) void BaumWelchTrainer::train() { Hmm &hmm = hmm_ref_.get(); const std::size_t N = static_cast(hmm.getNumStates()); + std::size_t totalExpectedLength = 0; + for (const auto &obs : obsLists_) { + totalExpectedLength += obs.size(); + } // Accumulators (linear space, summed across all sequences) std::vector piNum(N, 0.0); - std::vector> transNum(N, std::vector(N, 0.0)); std::vector transDen(N, 0.0); + // Column-major accumulation: transNumT[j * N + i] stores the expected count + // for transition i->j. 
This matches the t/j/i xi loop for contiguous reads + // from the transposed log-transition matrix. + std::vector transNumT(N * N, 0.0); // Per-state emission data/weights accumulated across sequences std::vector> emisData(N); std::vector> emisWts(N); + for (std::size_t i = 0; i < N; ++i) { + emisData[i].reserve(totalExpectedLength); + emisWts[i].reserve(totalExpectedLength); + } - // Precompute log-transition matrix from the current model + // Precompute transposed log-transition matrix from the current model: + // logTransT[j * N + i] = log a_{ij} const Matrix &curTrans = hmm.getTrans(); - std::vector> logTrans(N, std::vector(N)); + std::vector logTransT(N * N); + bool hasZeroTransitions = false; for (std::size_t i = 0; i < N; ++i) { for (std::size_t j = 0; j < N; ++j) { const double a = curTrans(i, j); - logTrans[i][j] = (a > 0.0) ? std::log(a) : LOG_ZERO; + if (a > 0.0) { + logTransT[j * N + i] = std::log(a); + } else { + logTransT[j * N + i] = LOG_ZERO; + hasZeroTransitions = true; + } } } @@ -60,24 +79,33 @@ void BaumWelchTrainer::train() { const Matrix &logAlpha = fbc.getLogForwardVariables(); const Matrix &logBeta = fbc.getLogBackwardVariables(); - - // Precompute log-emissions for this sequence: logEmit[i * T + t] - std::vector obsVec(T); - for (std::size_t t = 0; t < T; ++t) - obsVec[t] = obs(t); - - std::vector logEmit(N * T); + const double *logAlphaData = logAlpha.data(); + const double *logBetaData = logBeta.data(); + + // Precompute log-emissions for this sequence, then relayout to time-major: + // logEmitByTime[t * N + j] = log b_j(O_t) + std::vector logEmitStateMajor(N * T); + std::vector logEmitByTime(N * T); + const std::span obsSpan(obs.data(), T); for (std::size_t i = 0; i < N; ++i) { hmm.getDistribution(i).getBatchLogProbabilities( - std::span(obsVec.data(), T), - std::span(logEmit.data() + i * T, T)); + obsSpan, std::span(logEmitStateMajor.data() + i * T, T)); + } + for (std::size_t i = 0; i < N; ++i) { + const double *stateRow = 
logEmitStateMajor.data() + i * T; + for (std::size_t t = 0; t < T; ++t) { + logEmitByTime[t * N + i] = stateRow[t]; + } } // Accumulate gamma (per timestep per state) and pi/trans denominators for (std::size_t t = 0; t < T; ++t) { + const double *alphaRow = logAlphaData + t * N; + const double *betaRow = logBetaData + t * N; + const double obsVal = obs(t); for (std::size_t i = 0; i < N; ++i) { - const double g = std::exp(logAlpha(t, i) + logBeta(t, i) - logP); - emisData[i].push_back(obs(t)); + const double g = std::exp(alphaRow[i] + betaRow[i] - logP); + emisData[i].push_back(obsVal); emisWts[i].push_back(g); if (t == 0) piNum[i] += g; @@ -86,13 +114,40 @@ void BaumWelchTrainer::train() { } } - // Accumulate xi (transition counts) - for (std::size_t t = 0; t + 1 < T; ++t) { - for (std::size_t i = 0; i < N; ++i) { + // Accumulate xi (transition counts). Dense models take a branch-free + // path; sparse models keep the zero-transition skip. + // Sparse path is intentionally scalar: masking non-zero transitions in + // a SIMD loop costs more than it saves for the typically small fraction + // of non-zero entries in a sparse model. 
+ if (hasZeroTransitions) { + for (std::size_t t = 0; t + 1 < T; ++t) { + const double *alphaRow = logAlphaData + t * N; + const double *betaNextRow = logBetaData + (t + 1) * N; + const double *emitNextRow = logEmitByTime.data() + (t + 1) * N; + for (std::size_t j = 0; j < N; ++j) { + const double emitBetaNext = emitNextRow[j] + betaNextRow[j] - logP; + const double *transCol = logTransT.data() + j * N; + double *transNumCol = transNumT.data() + j * N; + for (std::size_t i = 0; i < N; ++i) { + if (transCol[i] == LOG_ZERO) { + continue; + } + const double logXi = alphaRow[i] + transCol[i] + emitBetaNext; + transNumCol[i] += std::exp(logXi); + } + } + } + } else { + for (std::size_t t = 0; t + 1 < T; ++t) { + const double *alphaRow = logAlphaData + t * N; + const double *betaNextRow = logBetaData + (t + 1) * N; + const double *emitNextRow = logEmitByTime.data() + (t + 1) * N; for (std::size_t j = 0; j < N; ++j) { - const double logXi = logAlpha(t, i) + logTrans[i][j] + - logEmit[j * T + (t + 1)] + logBeta(t + 1, j) - logP; - transNum[i][j] += std::exp(logXi); + const double emitBetaNext = emitNextRow[j] + betaNextRow[j] - logP; + const double *transCol = logTransT.data() + j * N; + double *transNumCol = transNumT.data() + j * N; + performance::detail::TranscendentalKernels::accumulate_exp_sum2_bias( + transNumCol, alphaRow, transCol, N, emitBetaNext); } } } @@ -122,7 +177,7 @@ void BaumWelchTrainer::train() { Matrix newTrans(N, N); for (std::size_t i = 0; i < N; ++i) { for (std::size_t j = 0; j < N; ++j) { - newTrans(i, j) = (transDen[i] > 0.0) ? transNum[i][j] / transDen[i] + newTrans(i, j) = (transDen[i] > 0.0) ? 
transNumT[j * N + i] / transDen[i] : 1.0 / static_cast(N); } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 29f55ef..1fb97fb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -142,20 +142,32 @@ if(GTest_FOUND OR TARGET gtest) set(ALL_TEST_TARGETS "") # ========================================================================= - # Level 0: Platform - # No tests yet. test_simd_platform will be added in Phase 4.5.2 (tools) - # and referenced here when a portable SIMD-capability test is available. + # Platform Capabilities # ========================================================================= + add_hmm_test(test_simd_platform platform/test_simd_platform.cpp) # ========================================================================= - # Level 1: Math & Numerics + # Math & Numerics # ========================================================================= add_hmm_test(test_modern_constants common/test_modern_constants.cpp) add_hmm_test(test_numerical_stability common/test_numerical_stability.cpp) add_hmm_test(test_common common/test_common.cpp) # ========================================================================= - # Level 3: Distributions + # Performance Primitives + # Cross-cutting SIMD kernels consumed by both calculators and trainers. + # Compiled with LIBHMM_BEST_SIMD_FLAGS so the active SIMD path matches + # the production library -- parity is checked against std::exp. 
+ # ========================================================================= + add_hmm_test(test_transcendental_kernels performance/test_transcendental_kernels.cpp) + if(LIBHMM_BEST_SIMD_FLAGS) + set_source_files_properties( + performance/test_transcendental_kernels.cpp + PROPERTIES COMPILE_FLAGS "${LIBHMM_BEST_SIMD_FLAGS}") + endif() + + # ========================================================================= + # Distributions # ========================================================================= add_hmm_test(test_distribution_traits distributions/test_distribution_traits.cpp) add_hmm_test(test_distributions_header distributions/test_distributions_header.cpp) @@ -178,27 +190,29 @@ if(GTest_FOUND OR TARGET gtest) add_hmm_test(test_weibull_distribution distributions/test_weibull_distribution.cpp) # ========================================================================= - # Level 4: Core HMM + # Core HMM # ========================================================================= add_hmm_test(test_hmm_core test_hmm_core.cpp) # ========================================================================= - # Level 5: Calculators + # Calculators # ========================================================================= add_hmm_test(test_canonical_calculators calculators/test_canonical_calculators.cpp) add_hmm_test(test_calculator_continuous calculators/test_calculator_continuous.cpp) add_hmm_test(test_calculator_edge_cases calculators/test_calculator_edge_cases.cpp) + add_hmm_test(test_fb_mode_parity calculators/test_fb_mode_parity.cpp) # ========================================================================= - # Level 6: Trainers + # Trainers # ========================================================================= add_hmm_test(test_canonical_training training/test_canonical_training.cpp) add_hmm_test(test_training training/test_training.cpp) add_hmm_test(test_training_edge_cases training/test_training_edge_cases.cpp) 
add_hmm_test(test_baum_welch_convergence training/test_baum_welch_convergence.cpp) + add_hmm_test(test_bw_parity training/test_bw_parity.cpp) # ========================================================================= - # Level 7: IO & Integration + # IO & Integration # ========================================================================= add_hmm_test(test_xml_file_io io/test_xml_file_io.cpp) add_hmm_test(test_hmm_stream_io io/test_hmm_stream_io.cpp) diff --git a/tests/calculators/test_fb_mode_parity.cpp b/tests/calculators/test_fb_mode_parity.cpp new file mode 100644 index 0000000..32ce496 --- /dev/null +++ b/tests/calculators/test_fb_mode_parity.cpp @@ -0,0 +1,216 @@ +#include + +#include "libhmm/performance/fb_recurrence_policy.h" +#include "libhmm/calculators/forward_backward_calculator.h" +#include "libhmm/distributions/discrete_distribution.h" +#include "libhmm/distributions/gaussian_distribution.h" + +#include +#include +#include +#include +#include + +using namespace libhmm; + +namespace { + +constexpr double kAbsTol = 1e-9; +constexpr double kRelTol = 1e-12; + +void expectClose(double a, double b, double absTol = kAbsTol, double relTol = kRelTol) { + if (std::isnan(a) || std::isnan(b)) { + FAIL() << "Unexpected NaN: a=" << a << " b=" << b; + } + if (a == b) { + return; + } + const double diff = std::abs(a - b); + if (diff <= absTol) { + return; + } + const double largest = std::max(std::abs(a), std::abs(b)); + EXPECT_LE(diff, relTol * largest) + << "values differ beyond tolerance: a=" << a << " b=" << b << " diff=" << diff; +} + +void expectMatricesClose(const Matrix &a, const Matrix &b) { + ASSERT_EQ(a.size1(), b.size1()); + ASSERT_EQ(a.size2(), b.size2()); + for (std::size_t i = 0; i < a.size1(); ++i) { + for (std::size_t j = 0; j < a.size2(); ++j) { + const double av = a(i, j); + const double bv = b(i, j); + // -inf is a valid log-space value; require an exact match in that + // case so the kernels do not silently disagree on which transitions + // 
are infeasible. + if (std::isinf(av) || std::isinf(bv)) { + EXPECT_EQ(av, bv) << "log-zero mismatch at (" << i << "," << j << ")"; + continue; + } + expectClose(av, bv); + } + } +} + +std::unique_ptr makeDiscreteCasinoHmm(std::size_t numStates) { + auto hmm = std::make_unique(static_cast(numStates)); + + Matrix trans(numStates, numStates); + for (std::size_t i = 0; i < numStates; ++i) { + double rowSum = 0.0; + for (std::size_t j = 0; j < numStates; ++j) { + const double w = 0.1 + 0.5 * static_cast((i + j + 1) % 7); + trans(i, j) = w; + rowSum += w; + } + for (std::size_t j = 0; j < numStates; ++j) { + trans(i, j) /= rowSum; + } + } + hmm->setTrans(trans); + + Vector pi(numStates); + for (std::size_t i = 0; i < numStates; ++i) { + pi(i) = 1.0 / static_cast(numStates); + } + hmm->setPi(pi); + + constexpr std::size_t kAlphabet = 6; + for (std::size_t i = 0; i < numStates; ++i) { + auto dist = std::make_unique(kAlphabet); + std::array weights{}; + double sum = 0.0; + for (std::size_t s = 0; s < kAlphabet; ++s) { + const double w = 0.05 + 0.2 * static_cast((i * 11 + s * 3 + 1) % 5); + weights[s] = w; + sum += w; + } + for (std::size_t s = 0; s < kAlphabet; ++s) { + dist->setProbability(static_cast(s), weights[s] / sum); + } + hmm->setDistribution(i, std::move(dist)); + } + return hmm; +} + +ObservationSet makeDeterministicObs(std::size_t length, std::size_t alphabet) { + ObservationSet obs(length); + for (std::size_t t = 0; t < length; ++t) { + obs(t) = static_cast((t * 7 + 3) % alphabet); + } + return obs; +} + +std::unique_ptr makeContinuousGaussianHmm(std::size_t numStates) { + auto hmm = std::make_unique(static_cast(numStates)); + + Matrix trans(numStates, numStates); + for (std::size_t i = 0; i < numStates; ++i) { + double rowSum = 0.0; + for (std::size_t j = 0; j < numStates; ++j) { + const double w = + 0.1 + 0.4 * std::sin(0.7 * static_cast(i) + 1.3 * static_cast(j)); + const double clamped = std::max(w, 0.05); + trans(i, j) = clamped; + rowSum += clamped; + } + 
for (std::size_t j = 0; j < numStates; ++j) { + trans(i, j) /= rowSum; + } + } + hmm->setTrans(trans); + + Vector pi(numStates); + for (std::size_t i = 0; i < numStates; ++i) { + pi(i) = 1.0 / static_cast(numStates); + } + hmm->setPi(pi); + + for (std::size_t i = 0; i < numStates; ++i) { + const double mean = 2.0 * static_cast(i); + const double sigma = 1.0; + hmm->setDistribution(i, std::make_unique(mean, sigma)); + } + return hmm; +} + +ObservationSet makeContinuousObs(std::size_t length, std::size_t numStates) { + ObservationSet obs(length); + for (std::size_t t = 0; t < length; ++t) { + obs(t) = std::sin(0.1 * static_cast(t)) * static_cast(numStates); + } + return obs; +} + +void runParityCheck(const Hmm &hmm, const ObservationSet &obs) { + ForwardBackwardCalculator pair(hmm, obs); + pair.setRecurrenceModeOverride(FbRecurrenceMode::Pairwise); + pair.compute(); + + ForwardBackwardCalculator maxr(hmm, obs); + maxr.setRecurrenceModeOverride(FbRecurrenceMode::MaxReduce); + maxr.compute(); + + ASSERT_EQ(pair.getRecurrenceMode(), FbRecurrenceMode::Pairwise); + ASSERT_EQ(maxr.getRecurrenceMode(), FbRecurrenceMode::MaxReduce); + + expectClose(pair.getLogProbability(), maxr.getLogProbability()); + expectMatricesClose(pair.getLogForwardVariables(), maxr.getLogForwardVariables()); + expectMatricesClose(pair.getLogBackwardVariables(), maxr.getLogBackwardVariables()); +} + +} // namespace + +// --------------------------------------------------------------------------- +// Discrete coverage across N=2..8 with a fixed-length sequence +// --------------------------------------------------------------------------- + +class FbModeParityDiscreteTest : public ::testing::TestWithParam {}; + +TEST_P(FbModeParityDiscreteTest, KernelsAgreeOnDiscreteHmm) { + const std::size_t numStates = GetParam(); + auto hmm = makeDiscreteCasinoHmm(numStates); + const ObservationSet obs = makeDeterministicObs(200, 6); + runParityCheck(*hmm, obs); +} + +INSTANTIATE_TEST_SUITE_P(N2to8, 
FbModeParityDiscreteTest, + ::testing::Values(2, 3, 4, 5, 6, 7, 8)); + +// --------------------------------------------------------------------------- +// Continuous (Gaussian) coverage at the medium-N regime +// --------------------------------------------------------------------------- + +class FbModeParityContinuousTest : public ::testing::TestWithParam {}; + +TEST_P(FbModeParityContinuousTest, KernelsAgreeOnContinuousHmm) { + const std::size_t numStates = GetParam(); + auto hmm = makeContinuousGaussianHmm(numStates); + const ObservationSet obs = makeContinuousObs(500, numStates); + runParityCheck(*hmm, obs); +} + +INSTANTIATE_TEST_SUITE_P(N4_8_16, FbModeParityContinuousTest, + ::testing::Values(4, 8, 16)); + +// --------------------------------------------------------------------------- +// Override accessor sanity +// --------------------------------------------------------------------------- + +TEST(FbModeParityOverride, OverrideSurfacesViaGetter) { + auto hmm = makeDiscreteCasinoHmm(4); + const ObservationSet obs = makeDeterministicObs(50, 6); + + ForwardBackwardCalculator fbc(*hmm, obs); + EXPECT_FALSE(fbc.getRecurrenceModeOverride().has_value()); + + fbc.setRecurrenceModeOverride(FbRecurrenceMode::MaxReduce); + ASSERT_TRUE(fbc.getRecurrenceModeOverride().has_value()); + EXPECT_EQ(*fbc.getRecurrenceModeOverride(), FbRecurrenceMode::MaxReduce); + fbc.compute(); + EXPECT_EQ(fbc.getRecurrenceMode(), FbRecurrenceMode::MaxReduce); + + fbc.setRecurrenceModeOverride(std::nullopt); + EXPECT_FALSE(fbc.getRecurrenceModeOverride().has_value()); +} diff --git a/tests/performance/test_transcendental_kernels.cpp b/tests/performance/test_transcendental_kernels.cpp new file mode 100644 index 0000000..c7e3546 --- /dev/null +++ b/tests/performance/test_transcendental_kernels.cpp @@ -0,0 +1,363 @@ +// tests/performance/test_transcendental_kernels.cpp +// +// Parity tests for TranscendentalKernels: verify that each of the five +// kernel methods agrees with a std::exp-based 
scalar reference to within +// 1e-12 relative / 1e-15 absolute tolerance. +// +// Ground truth is always computed inline here using std::exp directly — NOT +// by calling the kernel's internal scalar variant — so the test is +// independent of any internal refactor. +// +// The test binary is compiled with LIBHMM_BEST_SIMD_FLAGS (see CMakeLists.txt +// Performance Primitives section), so the active SIMD path matches the production library. + +#include "libhmm/performance/transcendental_kernels.h" +#include "libhmm/math/constants.h" + +#include + +#include +#include +#include +#include + +namespace { + +using TK = libhmm::performance::detail::TranscendentalKernels; + +constexpr double LOG_ZERO = -std::numeric_limits::infinity(); +constexpr double REL_TOL = 1e-12; +constexpr double ABS_TOL = 1e-15; + +// Sizes chosen to cover: scalar-only (1), below SSE2 width (1,3), single +// SSE2 block (2), single AVX block (4), non-multiple-of-4 (7,15,31), +// exact AVX-512 block (8), exact double-block (16,32), and large (64). +const std::vector TEST_SIZES = {1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 64}; + +// ------------------------------------------------------------------------- +// Helper: build test input vectors +// ------------------------------------------------------------------------- + +// "Normal" log-probabilities in the range (-50, 0). +static std::vector make_log_probs(std::size_t n, double offset = 0.0) { + std::vector v(n); + for (std::size_t i = 0; i < n; ++i) { + v[i] = -1.0 - static_cast(i % 20) * 2.3 + offset; + } + return v; +} + +// Mix of normal log-probs and LOG_ZERO sentinels (every 5th element). +static std::vector make_mixed(std::size_t n, double offset = 0.0) { + std::vector v = make_log_probs(n, offset); + for (std::size_t i = 4; i < n; i += 5) { + v[i] = LOG_ZERO; + } + return v; +} + +// Comparison helpers. 
+static void check_scalar(double got, double ref, const char *label) { + if (std::isinf(ref) && std::isinf(got)) + return; // both -inf is fine + const double diff = std::abs(got - ref); + if (ref != 0.0) { + EXPECT_LE(diff / std::abs(ref), REL_TOL) + << label << ": relative error too large got=" << got << " ref=" << ref; + } else { + EXPECT_LE(diff, ABS_TOL) << label << ": absolute error too large got=" << got + << " ref=" << ref; + } +} + +static void check_array(const std::vector &got, const std::vector &ref, + const char *label) { + ASSERT_EQ(got.size(), ref.size()); + for (std::size_t i = 0; i < got.size(); ++i) { + check_scalar(got[i], ref[i], label); + } +} + +// ========================================================================= +// 1. reduce_max_sum2 +// ========================================================================= + +static double ref_reduce_max_sum2(const std::vector &a, const std::vector &b) { + double m = -std::numeric_limits::infinity(); + for (std::size_t i = 0; i < a.size(); ++i) { + double t = a[i] + b[i]; + if (t > m) + m = t; + } + return m; +} + +TEST(TranscendentalKernels, ReduceMaxSum2_NormalInputs) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, -3.7); + double got = TK::reduce_max_sum2(a.data(), b.data(), n); + double ref = ref_reduce_max_sum2(a, b); + check_scalar(got, ref, "reduce_max_sum2/normal"); + } +} + +TEST(TranscendentalKernels, ReduceMaxSum2_WithLogZero) { + for (std::size_t n : TEST_SIZES) { + auto a = make_mixed(n, 0.0); + auto b = make_mixed(n, -1.5); + double got = TK::reduce_max_sum2(a.data(), b.data(), n); + double ref = ref_reduce_max_sum2(a, b); + // -inf + anything is -inf; max may be -inf if all are LOG_ZERO pairs. 
+ if (std::isinf(ref) && std::isinf(got)) { + EXPECT_EQ(std::signbit(ref), std::signbit(got)); + } else { + check_scalar(got, ref, "reduce_max_sum2/mixed"); + } + } +} + +// ========================================================================= +// 2. sum_exp_sum2_minus_max +// ========================================================================= + +static double ref_sum_exp_sum2_minus_max(const std::vector &a, const std::vector &b, + double maxVal) { + if (!std::isfinite(maxVal)) + return 0.0; + double s = 0.0; + for (std::size_t i = 0; i < a.size(); ++i) { + double t = a[i] + b[i]; + if (std::isfinite(t)) + s += std::exp(t - maxVal); + } + return s; +} + +TEST(TranscendentalKernels, SumExpSum2MinusMax_NormalInputs) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, -3.7); + double maxVal = ref_reduce_max_sum2(a, b); + double got = TK::sum_exp_sum2_minus_max(a.data(), b.data(), n, maxVal); + double ref = ref_sum_exp_sum2_minus_max(a, b, maxVal); + check_scalar(got, ref, "sum_exp_sum2_minus_max/normal"); + } +} + +TEST(TranscendentalKernels, SumExpSum2MinusMax_WithLogZero) { + for (std::size_t n : TEST_SIZES) { + auto a = make_mixed(n, 0.0); + auto b = make_mixed(n, -1.5); + double maxVal = ref_reduce_max_sum2(a, b); + double got = TK::sum_exp_sum2_minus_max(a.data(), b.data(), n, maxVal); + double ref = ref_sum_exp_sum2_minus_max(a, b, maxVal); + check_scalar(got, ref, "sum_exp_sum2_minus_max/mixed"); + } +} + +TEST(TranscendentalKernels, SumExpSum2MinusMax_InfiniteMax) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n); + auto b = make_log_probs(n); + double got = TK::sum_exp_sum2_minus_max(a.data(), b.data(), n, + -std::numeric_limits::infinity()); + EXPECT_EQ(got, 0.0) << "should return 0 when maxVal is -inf"; + } +} + +// ========================================================================= +// 3. 
reduce_max_sum3 +// ========================================================================= + +static double ref_reduce_max_sum3(const std::vector &a, const std::vector &b, + const std::vector &c) { + double m = -std::numeric_limits::infinity(); + for (std::size_t i = 0; i < a.size(); ++i) { + double t = a[i] + b[i] + c[i]; + if (t > m) + m = t; + } + return m; +} + +TEST(TranscendentalKernels, ReduceMaxSum3_NormalInputs) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, -2.1); + auto c = make_log_probs(n, -5.3); + double got = TK::reduce_max_sum3(a.data(), b.data(), c.data(), n); + double ref = ref_reduce_max_sum3(a, b, c); + check_scalar(got, ref, "reduce_max_sum3/normal"); + } +} + +TEST(TranscendentalKernels, ReduceMaxSum3_WithLogZero) { + for (std::size_t n : TEST_SIZES) { + auto a = make_mixed(n, 0.0); + auto b = make_mixed(n, -2.1); + auto c = make_mixed(n, -5.3); + double got = TK::reduce_max_sum3(a.data(), b.data(), c.data(), n); + double ref = ref_reduce_max_sum3(a, b, c); + if (std::isinf(ref) && std::isinf(got)) { + EXPECT_EQ(std::signbit(ref), std::signbit(got)); + } else { + check_scalar(got, ref, "reduce_max_sum3/mixed"); + } + } +} + +// ========================================================================= +// 4. 
sum_exp_sum3_minus_max +// ========================================================================= + +static double ref_sum_exp_sum3_minus_max(const std::vector &a, const std::vector &b, + const std::vector &c, double maxVal) { + if (!std::isfinite(maxVal)) + return 0.0; + double s = 0.0; + for (std::size_t i = 0; i < a.size(); ++i) { + double t = a[i] + b[i] + c[i]; + if (std::isfinite(t)) + s += std::exp(t - maxVal); + } + return s; +} + +TEST(TranscendentalKernels, SumExpSum3MinusMax_NormalInputs) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, -2.1); + auto c = make_log_probs(n, -5.3); + double maxVal = ref_reduce_max_sum3(a, b, c); + double got = TK::sum_exp_sum3_minus_max(a.data(), b.data(), c.data(), n, maxVal); + double ref = ref_sum_exp_sum3_minus_max(a, b, c, maxVal); + check_scalar(got, ref, "sum_exp_sum3_minus_max/normal"); + } +} + +TEST(TranscendentalKernels, SumExpSum3MinusMax_WithLogZero) { + for (std::size_t n : TEST_SIZES) { + auto a = make_mixed(n, 0.0); + auto b = make_mixed(n, -2.1); + auto c = make_mixed(n, -5.3); + double maxVal = ref_reduce_max_sum3(a, b, c); + double got = TK::sum_exp_sum3_minus_max(a.data(), b.data(), c.data(), n, maxVal); + double ref = ref_sum_exp_sum3_minus_max(a, b, c, maxVal); + check_scalar(got, ref, "sum_exp_sum3_minus_max/mixed"); + } +} + +TEST(TranscendentalKernels, SumExpSum3MinusMax_InfiniteMax) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n); + auto b = make_log_probs(n); + auto c = make_log_probs(n); + double got = TK::sum_exp_sum3_minus_max(a.data(), b.data(), c.data(), n, + -std::numeric_limits::infinity()); + EXPECT_EQ(got, 0.0) << "should return 0 when maxVal is -inf"; + } +} + +// ========================================================================= +// 5. 
accumulate_exp_sum2_bias +// ========================================================================= + +static void ref_accumulate_exp_sum2_bias(std::vector &dst, const std::vector &a, + const std::vector &b, double bias) { + for (std::size_t i = 0; i < dst.size(); ++i) { + dst[i] += std::exp(a[i] + b[i] + bias); + } +} + +TEST(TranscendentalKernels, AccumulateExpSum2Bias_NormalInputs) { + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, -3.7); + const double bias = -12.5; + + std::vector got_dst(n, 0.5); + std::vector ref_dst(n, 0.5); + + TK::accumulate_exp_sum2_bias(got_dst.data(), a.data(), b.data(), n, bias); + ref_accumulate_exp_sum2_bias(ref_dst, a, b, bias); + + check_array(got_dst, ref_dst, "accumulate_exp_sum2_bias/normal"); + } +} + +TEST(TranscendentalKernels, AccumulateExpSum2Bias_LogZeroInputs) { + // LOG_ZERO inputs: exp(-inf + ...) = 0; dst[i] should be unchanged. + for (std::size_t n : TEST_SIZES) { + std::vector a(n, LOG_ZERO); + std::vector b(n, 0.0); + const double bias = 0.0; + + std::vector got_dst(n, 1.0); + std::vector ref_dst(n, 1.0); + + TK::accumulate_exp_sum2_bias(got_dst.data(), a.data(), b.data(), n, bias); + ref_accumulate_exp_sum2_bias(ref_dst, a, b, bias); + + check_array(got_dst, ref_dst, "accumulate_exp_sum2_bias/log_zero"); + } +} + +TEST(TranscendentalKernels, AccumulateExpSum2Bias_SmallBias) { + // Verify behaviour near the underflow threshold. + // The SIMD kernel intentionally returns 0 for arg <= MIN_LOG_PROBABILITY + // (branch-free mask). std::exp does not underflow to 0 until ~-708.4, so + // inputs in the range (-708.4, -700] produce a discrepancy between raw + // std::exp and the SIMD. The reference must apply the same underflow + // contract as the kernel so the comparison is against the specified + // behaviour, not against an unclamped std::exp. 
+ constexpr double EXP_UNDERFLOW = libhmm::constants::probability::MIN_LOG_PROBABILITY; + for (std::size_t n : TEST_SIZES) { + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, 0.0); + const double bias = EXP_UNDERFLOW + 5.0; // -695 + + std::vector got_dst(n, 0.0); + std::vector ref_dst(n, 0.0); + + TK::accumulate_exp_sum2_bias(got_dst.data(), a.data(), b.data(), n, bias); + + // Reference: zero for arg <= EXP_UNDERFLOW, std::exp otherwise. + for (std::size_t k = 0; k < n; ++k) { + const double arg = a[k] + b[k] + bias; + if (arg > EXP_UNDERFLOW) + ref_dst[k] += std::exp(arg); + } + + check_array(got_dst, ref_dst, "accumulate_exp_sum2_bias/small_bias"); + } +} + +// ========================================================================= +// 6. Consistency: max-reduce round-trip +// reduce_max then sum_exp should reproduce log-sum-exp. +// ========================================================================= + +TEST(TranscendentalKernels, RoundTrip_LogSumExp2) { + // For finite inputs: log(sum_exp(a+b - max)) + max == log_sum_exp(a, b). + // Just check the intermediate values are consistent with each other. 
+ for (std::size_t n : TEST_SIZES) { + if (n == 0) + continue; + auto a = make_log_probs(n, 0.0); + auto b = make_log_probs(n, -2.0); + + double maxVal = TK::reduce_max_sum2(a.data(), b.data(), n); + double scaledSum = TK::sum_exp_sum2_minus_max(a.data(), b.data(), n, maxVal); + + EXPECT_TRUE(std::isfinite(maxVal)) + << "reduce_max_sum2 should return finite max for normal inputs (n=" << n << ")"; + EXPECT_GT(scaledSum, 0.0) << "scaled sum should be positive (n=" << n << ")"; + + double logSumExp = maxVal + std::log(scaledSum); + EXPECT_TRUE(std::isfinite(logSumExp)) + << "reconstructed log-sum-exp should be finite (n=" << n << ")"; + } +} + +} // anonymous namespace diff --git a/tests/platform/test_simd_platform.cpp b/tests/platform/test_simd_platform.cpp new file mode 100644 index 0000000..457f5de --- /dev/null +++ b/tests/platform/test_simd_platform.cpp @@ -0,0 +1,169 @@ +// tests/platform/test_simd_platform.cpp +// +// Consistency checks for libhmm/platform/simd_platform.h. +// +// Two layers of verification: +// +// 1. Compile-time (#error) — ISA hierarchy invariants that can only fail if +// simd_platform.h emits a broken macro combination. A violation here is +// a build error, not a test failure. +// +// 2. Runtime (GTest) — contracts on the utility functions: +// feature_string() non-null, non-empty, agrees with active macros +// double_vector_width() power-of-two >= 1 +// float_vector_width() == 2 * double_vector_width() +// optimal_alignment() power-of-two >= 8, covers one SIMD register +// has_simd_support() consistent with double_vector_width() +// supports_vectorization()consistent with has_simd_support() +// compile-time constants DOUBLE_SIMD_WIDTH / FLOAT_SIMD_WIDTH / +// SIMD_ALIGNMENT each agree with their function +// +// Not compiled with LIBHMM_BEST_SIMD_FLAGS — tests the detection +// infrastructure, not the intrinsics. 
+ +#include +#include "libhmm/platform/simd_platform.h" + +#include + +// ============================================================================ +// Compile-time ISA hierarchy invariants +// A #error here means simd_platform.h has emitted a broken macro combination. +// ============================================================================ + +#if defined(LIBHMM_HAS_AVX512) && !defined(LIBHMM_HAS_AVX) +#error "LIBHMM_HAS_AVX512 requires LIBHMM_HAS_AVX" +#endif +#if defined(LIBHMM_HAS_AVX512) && !defined(LIBHMM_HAS_SSE2) +#error "LIBHMM_HAS_AVX512 requires LIBHMM_HAS_SSE2" +#endif +#if defined(LIBHMM_HAS_AVX2) && !defined(LIBHMM_HAS_AVX) +#error "LIBHMM_HAS_AVX2 requires LIBHMM_HAS_AVX" +#endif +#if defined(LIBHMM_HAS_AVX2) && !defined(LIBHMM_HAS_SSE2) +#error "LIBHMM_HAS_AVX2 requires LIBHMM_HAS_SSE2" +#endif +#if defined(LIBHMM_HAS_AVX) && !defined(LIBHMM_HAS_SSE2) +#error "LIBHMM_HAS_AVX requires LIBHMM_HAS_SSE2" +#endif +#if defined(LIBHMM_HAS_SSE4_1) && !defined(LIBHMM_HAS_SSE2) +#error "LIBHMM_HAS_SSE4_1 requires LIBHMM_HAS_SSE2" +#endif +#if defined(LIBHMM_HAS_NEON) && defined(LIBHMM_HAS_SSE2) +#error "LIBHMM_HAS_NEON and x86 SIMD macros are mutually exclusive" +#endif + +// ============================================================================ +// Helpers +// ============================================================================ + +using namespace libhmm::performance::simd; + +namespace { + +constexpr bool is_power_of_two(std::size_t n) noexcept { + return n >= 1 && (n & (n - 1)) == 0; +} + +} // namespace + +// ============================================================================ +// feature_string +// ============================================================================ + +TEST(SimdPlatformFeatureString, NonNull) { + EXPECT_NE(feature_string(), nullptr); +} + +TEST(SimdPlatformFeatureString, NonEmpty) { + EXPECT_GT(std::strlen(feature_string()), 0u); +} + +// The reported string must match the highest active ISA macro. 
+TEST(SimdPlatformFeatureString, ConsistentWithMacros) { +#if defined(LIBHMM_HAS_AVX512) + EXPECT_STREQ(feature_string(), "AVX-512"); +#elif defined(LIBHMM_HAS_AVX2) + EXPECT_STREQ(feature_string(), "AVX2"); +#elif defined(LIBHMM_HAS_AVX) + EXPECT_STREQ(feature_string(), "AVX"); +#elif defined(LIBHMM_HAS_SSE4_1) + EXPECT_STREQ(feature_string(), "SSE4.1"); +#elif defined(LIBHMM_HAS_SSE2) + EXPECT_STREQ(feature_string(), "SSE2"); +#elif defined(LIBHMM_HAS_NEON) + // Accepts both "ARM NEON" and "ARM NEON (Apple Silicon)". + EXPECT_EQ(std::strncmp(feature_string(), "ARM NEON", 8), 0); +#else + EXPECT_STREQ(feature_string(), "Scalar (No SIMD)"); +#endif +} + +// ============================================================================ +// double_vector_width / float_vector_width +// ============================================================================ + +TEST(SimdPlatformVectorWidth, DoubleWidthAtLeastOne) { + EXPECT_GE(double_vector_width(), 1u); +} + +TEST(SimdPlatformVectorWidth, DoubleWidthIsPowerOfTwo) { + EXPECT_TRUE(is_power_of_two(double_vector_width())); +} + +// float is 32-bit, double is 64-bit: a register holds twice as many floats. +TEST(SimdPlatformVectorWidth, FloatWidthIsTwiceDoubleWidth) { + EXPECT_EQ(float_vector_width(), 2u * double_vector_width()); +} + +// ============================================================================ +// optimal_alignment +// ============================================================================ + +TEST(SimdPlatformAlignment, AtLeastEightBytes) { + EXPECT_GE(optimal_alignment(), 8u); +} + +TEST(SimdPlatformAlignment, IsPowerOfTwo) { + EXPECT_TRUE(is_power_of_two(optimal_alignment())); +} + +// Alignment must be at least enough to hold one full SIMD register of doubles. 
+TEST(SimdPlatformAlignment, CoversOneSimdRegister) { + EXPECT_GE(optimal_alignment(), double_vector_width() * sizeof(double)); +} + +// ============================================================================ +// has_simd_support / supports_vectorization +// ============================================================================ + +TEST(SimdPlatformSupport, HasSimdConsistentWithWidth) { + if (has_simd_support()) { + EXPECT_GE(double_vector_width(), 2u); + } else { + EXPECT_EQ(double_vector_width(), 1u); + } +} + +TEST(SimdPlatformSupport, SupportsVectorizationRequiresHasSimd) { + if (supports_vectorization()) { + EXPECT_TRUE(has_simd_support()); + EXPECT_GE(double_vector_width(), 2u); + } +} + +// ============================================================================ +// Compile-time constants agree with their corresponding functions +// ============================================================================ + +TEST(SimdPlatformConstants, DoubleSimdWidthMatchesFunction) { + EXPECT_EQ(DOUBLE_SIMD_WIDTH, double_vector_width()); +} + +TEST(SimdPlatformConstants, FloatSimdWidthMatchesFunction) { + EXPECT_EQ(FLOAT_SIMD_WIDTH, float_vector_width()); +} + +TEST(SimdPlatformConstants, SimdAlignmentMatchesFunction) { + EXPECT_EQ(SIMD_ALIGNMENT, optimal_alignment()); +} diff --git a/tests/training/test_bw_parity.cpp b/tests/training/test_bw_parity.cpp new file mode 100644 index 0000000..9bc390c --- /dev/null +++ b/tests/training/test_bw_parity.cpp @@ -0,0 +1,232 @@ +#include + +#include "libhmm/calculators/forward_backward_calculator.h" +#include "libhmm/distributions/discrete_distribution.h" +#include "libhmm/distributions/gaussian_distribution.h" +#include "libhmm/training/baum_welch_trainer.h" + +#include +#include +#include +#include + +using namespace libhmm; + +namespace { + +constexpr double kBitExactTol = 0.0; +constexpr double kRelTol = 1e-12; +constexpr double kAbsTol = 1e-14; + +void expectClose(double a, double b, double absTol = kAbsTol, 
double relTol = kRelTol) { + if (std::isnan(a) || std::isnan(b)) { + FAIL() << "Unexpected NaN: a=" << a << " b=" << b; + } + if (a == b) { + return; + } + const double diff = std::abs(a - b); + if (diff <= absTol) { + return; + } + const double largest = std::max(std::abs(a), std::abs(b)); + EXPECT_LE(diff, relTol * largest) + << "values differ beyond tolerance: a=" << a << " b=" << b << " diff=" << diff; +} + +void expectMatricesEqual(const Matrix &a, const Matrix &b, double absTol) { + ASSERT_EQ(a.size1(), b.size1()); + ASSERT_EQ(a.size2(), b.size2()); + for (std::size_t i = 0; i < a.size1(); ++i) { + for (std::size_t j = 0; j < a.size2(); ++j) { + if (absTol == kBitExactTol) { + EXPECT_EQ(a(i, j), b(i, j)) << "mismatch at (" << i << "," << j << ")"; + } else { + expectClose(a(i, j), b(i, j), absTol); + } + } + } +} + +void expectVectorsEqual(const Vector &a, const Vector &b, double absTol) { + ASSERT_EQ(a.size(), b.size()); + for (std::size_t i = 0; i < a.size(); ++i) { + if (absTol == kBitExactTol) { + EXPECT_EQ(a(i), b(i)) << "mismatch at (" << i << ")"; + } else { + expectClose(a(i), b(i), absTol); + } + } +} + +std::unique_ptr makeDiscreteCasino(std::size_t numStates, std::size_t alphabet) { + auto hmm = std::make_unique(static_cast(numStates)); + + Matrix trans(numStates, numStates); + for (std::size_t i = 0; i < numStates; ++i) { + double rowSum = 0.0; + for (std::size_t j = 0; j < numStates; ++j) { + const double w = 0.1 + 0.4 * static_cast((i + j + 1) % 5); + trans(i, j) = w; + rowSum += w; + } + for (std::size_t j = 0; j < numStates; ++j) { + trans(i, j) /= rowSum; + } + } + hmm->setTrans(trans); + + Vector pi(numStates); + for (std::size_t i = 0; i < numStates; ++i) { + pi(i) = 1.0 / static_cast(numStates); + } + hmm->setPi(pi); + + for (std::size_t i = 0; i < numStates; ++i) { + auto dist = std::make_unique(static_cast(alphabet)); + std::vector weights(alphabet); + double sum = 0.0; + for (std::size_t s = 0; s < alphabet; ++s) { + const double w = 
0.05 + 0.2 * static_cast((i * 11 + s * 3 + 1) % 5); + weights[s] = w; + sum += w; + } + for (std::size_t s = 0; s < alphabet; ++s) { + dist->setProbability(static_cast(s), weights[s] / sum); + } + hmm->setDistribution(i, std::move(dist)); + } + return hmm; +} + +ObservationLists makeDiscreteSequences() { + ObservationLists out; + constexpr std::size_t kAlphabet = 6; + constexpr std::array kLengths{50, 75, 30, 100}; + for (std::size_t s = 0; s < kLengths.size(); ++s) { + ObservationSet seq(kLengths[s]); + for (std::size_t t = 0; t < kLengths[s]; ++t) { + seq(t) = static_cast((t * 7 + s * 13 + 3) % kAlphabet); + } + out.push_back(seq); + } + return out; +} + +double scoreSequencesUnderModel(const Hmm &hmm, const ObservationLists &seqs) { + double total = 0.0; + for (const auto &seq : seqs) { + if (seq.size() == 0) { + continue; + } + ForwardBackwardCalculator fbc(hmm, seq); + const double lp = fbc.getLogProbability(); + if (std::isfinite(lp)) { + total += lp; + } + } + return total; +} + +} // namespace + +// --------------------------------------------------------------------------- +// Determinism: two independent BW runs from the same starting point on the +// same input must produce bit-exact identical updated parameters. 
+// --------------------------------------------------------------------------- + +TEST(BaumWelchParity, OneStepDeterministic_DiscreteN3) { + auto hmmA = makeDiscreteCasino(3, 6); + auto hmmB = makeDiscreteCasino(3, 6); + const ObservationLists seqs = makeDiscreteSequences(); + + BaumWelchTrainer trainerA(*hmmA, seqs); + BaumWelchTrainer trainerB(*hmmB, seqs); + trainerA.train(); + trainerB.train(); + + expectVectorsEqual(hmmA->getPi(), hmmB->getPi(), kBitExactTol); + expectMatricesEqual(hmmA->getTrans(), hmmB->getTrans(), kBitExactTol); + for (int i = 0; i < hmmA->getNumStates(); ++i) { + const auto *distA = dynamic_cast(&hmmA->getDistribution(i)); + const auto *distB = dynamic_cast(&hmmB->getDistribution(i)); + ASSERT_NE(distA, nullptr); + ASSERT_NE(distB, nullptr); + ASSERT_EQ(distA->getNumSymbols(), distB->getNumSymbols()); + for (std::size_t s = 0; s < distA->getNumSymbols(); ++s) { + EXPECT_EQ(distA->getSymbolProbability(s), distB->getSymbolProbability(s)) + << "state " << i << " symbol " << s; + } + } +} + +TEST(BaumWelchParity, OneStepDeterministic_DiscreteN5) { + auto hmmA = makeDiscreteCasino(5, 6); + auto hmmB = makeDiscreteCasino(5, 6); + const ObservationLists seqs = makeDiscreteSequences(); + + BaumWelchTrainer trainerA(*hmmA, seqs); + BaumWelchTrainer trainerB(*hmmB, seqs); + trainerA.train(); + trainerB.train(); + + expectVectorsEqual(hmmA->getPi(), hmmB->getPi(), kBitExactTol); + expectMatricesEqual(hmmA->getTrans(), hmmB->getTrans(), kBitExactTol); +} + +// --------------------------------------------------------------------------- +// EM monotonicity: a single train() step on the supplied sequences must not +// reduce the total observation log-probability under the model. 
+// --------------------------------------------------------------------------- + +TEST(BaumWelchParity, OneStepMonotonic_Discrete) { + auto hmm = makeDiscreteCasino(3, 6); + const ObservationLists seqs = makeDiscreteSequences(); + + const double scoreBefore = scoreSequencesUnderModel(*hmm, seqs); + BaumWelchTrainer trainer(*hmm, seqs); + trainer.train(); + const double scoreAfter = scoreSequencesUnderModel(*hmm, seqs); + + EXPECT_TRUE(std::isfinite(scoreBefore)); + EXPECT_TRUE(std::isfinite(scoreAfter)); + // Allow a small tolerance for floating-point noise around stationary points. + EXPECT_GE(scoreAfter, scoreBefore - 1e-9) + << "BW step should not decrease log-likelihood: before=" << scoreBefore + << " after=" << scoreAfter; +} + +// --------------------------------------------------------------------------- +// Invariants: post-step pi sums to 1, transition rows sum to 1, no NaN/inf. +// --------------------------------------------------------------------------- + +TEST(BaumWelchParity, OneStepInvariants_Discrete) { + auto hmm = makeDiscreteCasino(4, 6); + const ObservationLists seqs = makeDiscreteSequences(); + + BaumWelchTrainer trainer(*hmm, seqs); + trainer.train(); + + const Vector &pi = hmm->getPi(); + double piSum = 0.0; + for (std::size_t i = 0; i < pi.size(); ++i) { + EXPECT_TRUE(std::isfinite(pi(i))); + EXPECT_GE(pi(i), 0.0); + EXPECT_LE(pi(i), 1.0); + piSum += pi(i); + } + EXPECT_NEAR(piSum, 1.0, 1e-12); + + const Matrix &trans = hmm->getTrans(); + for (std::size_t i = 0; i < trans.size1(); ++i) { + double rowSum = 0.0; + for (std::size_t j = 0; j < trans.size2(); ++j) { + const double v = trans(i, j); + EXPECT_TRUE(std::isfinite(v)); + EXPECT_GE(v, 0.0); + EXPECT_LE(v, 1.0); + rowSum += v; + } + EXPECT_NEAR(rowSum, 1.0, 1e-12); + } +} diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 041773c..57d0692 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -36,6 +36,18 @@ add_hmm_tool(debug_parallel debug_parallel.cpp 
${LIBHMM_TOOL_THREADPOOL}) add_hmm_tool(simd_inspection simd_inspection.cpp) add_hmm_tool(batch_performance batch_performance.cpp) add_hmm_tool(hmm_validator hmm_validator.cpp) +add_hmm_tool(hotspot_breakdown hotspot_breakdown.cpp) +add_hmm_tool(fb_contour_sweep fb_contour_sweep.cpp) +add_hmm_tool(bw_hotspot bw_hotspot.cpp) +add_hmm_tool(fb_crossover_sweep fb_crossover_sweep.cpp) +if(LIBHMM_EXPERIMENT_FB_MAX_REDUCE) + target_compile_definitions(hotspot_breakdown PRIVATE LIBHMM_EXPERIMENT_FB_MAX_REDUCE=1) + target_compile_definitions(fb_contour_sweep PRIVATE LIBHMM_EXPERIMENT_FB_MAX_REDUCE=1) +endif() +if(LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR) + target_compile_definitions(hotspot_breakdown PRIVATE LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR=1) + target_compile_definitions(fb_contour_sweep PRIVATE LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR=1) +endif() # simd_inspection must be compiled with the same SIMD flags as the distribution # TUs so that LIBHMM_HAS_AVX512 / AVX2 / NEON are correctly defined and the @@ -51,8 +63,11 @@ install(TARGETS simd_inspection batch_performance hmm_validator + hotspot_breakdown + fb_contour_sweep + bw_hotspot RUNTIME DESTINATION bin/tools COMPONENT tools ) -message(STATUS "Tools: analyze_overhead debug_parallel simd_inspection batch_performance hmm_validator") +message(STATUS "Tools: analyze_overhead debug_parallel simd_inspection batch_performance hmm_validator hotspot_breakdown fb_contour_sweep bw_hotspot") diff --git a/tools/bw_hotspot.cpp b/tools/bw_hotspot.cpp new file mode 100644 index 0000000..7109b2a --- /dev/null +++ b/tools/bw_hotspot.cpp @@ -0,0 +1,322 @@ +/** + * @file bw_hotspot.cpp + * @brief Baum-Welch inner-loop cost breakdown. + * + * Profiles the three separable cost centres of one BW E-step: + * 1. FB computation (delegated to ForwardBackwardCalculator) + * 2. Gamma accumulation — N*T exp() calls + * 3. 
Xi accumulation — N^2*(T-1) exp() calls (dominant for N>1)
+ *
+ * Implemented inline here (not through BaumWelchTrainer) so each phase
+ * can be timed independently without modifying the library.
+ *
+ * Usage:
+ *   bw_hotspot                      (default configs)
+ *   bw_hotspot N T [runs [warmup]]  (single config)
+ */
+
+#include "libhmm/calculators/forward_backward_calculator.h"
+#include "libhmm/hmm.h"
+#include "libhmm/distributions/discrete_distribution.h"
+#include "libhmm/distributions/gaussian_distribution.h"
+#include "libhmm/performance/transcendental_kernels.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <span>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+using namespace libhmm;
+using Clock = std::chrono::high_resolution_clock;
+using Millis = std::chrono::duration<double, std::milli>;
+
+namespace {
+
+constexpr double LOG_ZERO = -std::numeric_limits<double>::infinity();
+
+// Prevent dead-code elimination on accumulated values.
+volatile double g_sink = 0.0;
+
+// ---------------------------------------------------------------------------
+
+double elapsed_ms(const Clock::time_point start) {
+    return Millis(Clock::now() - start).count();
+}
+
+template <typename T>
+double median(std::vector<T> v) {
+    if (v.empty())
+        return 0.0;
+    std::sort(v.begin(), v.end());
+    return static_cast<double>(v[v.size() / 2]);
+}
+
+// ---------------------------------------------------------------------------
+
+std::unique_ptr<Hmm> make_hmm(int n) {
+    auto hmm = std::make_unique<Hmm>(n);
+    Matrix trans(n, n);
+    for (int i = 0; i < n; ++i) {
+        double sum = 0.0;
+        for (int j = 0; j < n; ++j) {
+            trans(i, j) = 0.1 + 0.8 * (0.5 + 0.5 * std::sin(i * 0.7 + j * 1.3));
+            sum += trans(i, j);
+        }
+        for (int j = 0; j < n; ++j)
+            trans(i, j) /= sum;
+    }
+    hmm->setTrans(trans);
+
+    Vector pi(n);
+    for (int i = 0; i < n; ++i)
+        pi(i) = 1.0 / static_cast<double>(n);
+    hmm->setPi(pi);
+
+    for (int i = 0; i < n; ++i)
+        hmm->setDistribution(i, std::make_unique<GaussianDistribution>(i * 2.0, 1.0));
+    return hmm;
+}
+
+ObservationSet make_obs(int t, int n) {
+    ObservationSet obs(t);
+
for (int i = 0; i < t; ++i) + obs(i) = std::sin(i * 0.1) * static_cast(n); + return obs; +} + +// --------------------------------------------------------------------------- +// One E-step with independent phase timers. +// --------------------------------------------------------------------------- + +struct BwBreakdown { + double fb_ms = 0.0; // ForwardBackwardCalculator (construct + compute) + double gamma_ms = 0.0; // gamma accumulation: N*T exp() calls + double xi_ms = 0.0; // xi accumulation: N^2*(T-1) exp() calls + std::uint64_t gamma_exp_calls = 0; + std::uint64_t xi_exp_calls = 0; +}; + +BwBreakdown profile_bw(const Hmm &hmm, const ObservationSet &obs, int warmup, int runs) { + const std::size_t N = static_cast(hmm.getNumStates()); + const std::size_t T = obs.size(); + + // Precompute flat log-transition (row-major N×N) once — same as trainer would do. + std::vector logTrans(N * N); + bool hasZeroTransitions = false; + { + const Matrix &t = hmm.getTrans(); + for (std::size_t i = 0; i < N; ++i) + for (std::size_t j = 0; j < N; ++j) { + const double a = t(i, j); + if (a > 0.0) { + logTrans[i * N + j] = std::log(a); + } else { + logTrans[i * N + j] = LOG_ZERO; + hasZeroTransitions = true; + } + } + } + + // Log-emission: time-major logEmitByTime[t*N+j] = log b_j(O_t). + std::vector logEmitByTime(T * N); + { + std::vector stateMajor(N * T); + const std::span obsSpan(obs.data(), T); + for (std::size_t i = 0; i < N; ++i) + hmm.getDistribution(i).getBatchLogProbabilities( + obsSpan, std::span(stateMajor.data() + i * T, T)); + for (std::size_t i = 0; i < N; ++i) + for (std::size_t t2 = 0; t2 < T; ++t2) + logEmitByTime[t2 * N + i] = stateMajor[i * T + t2]; + } + + std::vector fb_ms_v, gamma_ms_v, xi_ms_v; + fb_ms_v.reserve(static_cast(runs)); + gamma_ms_v.reserve(static_cast(runs)); + xi_ms_v.reserve(static_cast(runs)); + + // Accumulators (reset per run to prevent dead-code elim). 
+ std::vector piNum(N); + std::vector transDen(N); + std::vector transNum(N * N); + std::vector emisWts(N * T); + + for (int iter = 0; iter < warmup + runs; ++iter) { + // Phase 1: FB + auto t0 = Clock::now(); + ForwardBackwardCalculator fbc(hmm, obs); + const double logP = fbc.getLogProbability(); + const double fb_time = elapsed_ms(t0); + + if (!std::isfinite(logP)) + continue; + + const Matrix &logAlpha = fbc.getLogForwardVariables(); + const Matrix &logBeta = fbc.getLogBackwardVariables(); + + // Phase 2: gamma accumulation (N*T exp() calls) + std::fill(piNum.begin(), piNum.end(), 0.0); + std::fill(transDen.begin(), transDen.end(), 0.0); + + t0 = Clock::now(); + for (std::size_t t2 = 0; t2 < T; ++t2) { + for (std::size_t i = 0; i < N; ++i) { + const double g = std::exp(logAlpha(t2, i) + logBeta(t2, i) - logP); + emisWts[t2 * N + i] = g; + if (t2 == 0) + piNum[i] += g; + if (t2 < T - 1) + transDen[i] += g; + } + } + const double gamma_time = elapsed_ms(t0); + + // Phase 3: xi accumulation (N^2*(T-1) exp() calls) + std::fill(transNum.begin(), transNum.end(), 0.0); + + t0 = Clock::now(); + if (hasZeroTransitions) { + for (std::size_t t2 = 0; t2 + 1 < T; ++t2) { + const double *emitNext = logEmitByTime.data() + (t2 + 1) * N; + for (std::size_t i = 0; i < N; ++i) { + const double logAlphaI = logAlpha(t2, i); + const double *logTransRow = logTrans.data() + i * N; + for (std::size_t j = 0; j < N; ++j) { + if (logTransRow[j] == LOG_ZERO) { + continue; + } + const double logXi = + logAlphaI + logTransRow[j] + emitNext[j] + logBeta(t2 + 1, j) - logP; + transNum[i * N + j] += std::exp(logXi); + } + } + } + } else { + for (std::size_t t2 = 0; t2 + 1 < T; ++t2) { + const double *emitNext = logEmitByTime.data() + (t2 + 1) * N; + for (std::size_t i = 0; i < N; ++i) { + const double logAlphaI = logAlpha(t2, i); + const double *logTransRow = logTrans.data() + i * N; + const double bias = -logP; + // The hotspot tool keeps the same dense-xi shape as the trainer: + // 
exp(alpha[i] + trans[i,j] + (emitNext[j] + betaNext[j] - logP)). + // Since this tool stores row-major transNum, keep the scalar loop + // here rather than inventing a second helper shape prematurely. + for (std::size_t j = 0; j < N; ++j) { + const double logXi = + logAlphaI + logTransRow[j] + emitNext[j] + logBeta(t2 + 1, j) + bias; + transNum[i * N + j] += std::exp(logXi); + } + } + } + } + const double xi_time = elapsed_ms(t0); + + // Sink to prevent elision. + g_sink += piNum[0] + transDen[0] + transNum[0] + emisWts[0]; + + if (iter >= warmup) { + fb_ms_v.push_back(fb_time); + gamma_ms_v.push_back(gamma_time); + xi_ms_v.push_back(xi_time); + } + } + + BwBreakdown r; + r.fb_ms = median(fb_ms_v); + r.gamma_ms = median(gamma_ms_v); + r.xi_ms = median(xi_ms_v); + r.gamma_exp_calls = static_cast(N) * T; + r.xi_exp_calls = static_cast(N) * N * (T > 0 ? T - 1 : 0); + return r; +} + +int parse_pos(const char *v, const char *name) { + try { + const int x = std::stoi(v); + if (x <= 0) + throw std::invalid_argument("non-positive"); + return x; + } catch (...) 
{ + throw std::invalid_argument(std::string("Invalid ") + name + ": " + v); + } +} + +} // namespace + +int main(int argc, char *argv[]) { + struct Config { + int n; + int t; + }; + std::vector configs = {{4, 500}, {8, 1000}, {16, 500}, {32, 2000}}; + int warmup = 2, runs = 8; + + if (argc == 3 || argc == 4 || argc == 5) { + configs = {{parse_pos(argv[1], "N"), parse_pos(argv[2], "T")}}; + if (argc >= 4) + runs = parse_pos(argv[3], "runs"); + if (argc == 5) + warmup = parse_pos(argv[4], "warmup"); + } else if (argc != 1) { + std::cerr << "Usage: bw_hotspot [N T [runs [warmup]]]\n"; + return 1; + } + + std::cout << "libhmm BW Hotspot Breakdown (median of " << runs << " runs, " << warmup + << " warmup)\n"; + std::cout << std::string(66, '=') << "\n\n"; + std::cout << std::fixed << std::setprecision(3); + + for (const auto &cfg : configs) { + auto hmm = make_hmm(cfg.n); + auto obs = make_obs(cfg.t, cfg.n); + const auto bw = profile_bw(*hmm, obs, warmup, runs); + + const double total = bw.fb_ms + bw.gamma_ms + bw.xi_ms; + auto pct = [&](double v) { + return (total > 0.0) ? 100.0 * v / total : 0.0; + }; + + std::cout << "N=" << cfg.n << " T=" << cfg.t << "\n"; + std::cout << " exp() call volume: gamma=" << static_cast(bw.gamma_exp_calls) / 1e3 + << "K" + << " xi=" << static_cast(bw.xi_exp_calls) / 1e6 << "M" + << " ratio xi/gamma=" + << (bw.gamma_exp_calls > 0 ? 
static_cast(bw.xi_exp_calls) / + static_cast(bw.gamma_exp_calls) + : 0.0) + << "x\n"; + + auto row = [&](const char *label, double ms, std::uint64_t calls) { + std::cout << " " << std::left << std::setw(24) << label << std::right << std::setw(8) + << ms << " ms" + << " " << std::setw(6) << std::setprecision(1) << pct(ms) << "%"; + if (calls > 0) { + const double ns_per = (ms * 1e6) / static_cast(calls); + std::cout << " " << std::setprecision(1) << ns_per << " ns/exp()"; + } + std::cout << "\n"; + std::cout << std::setprecision(3); + }; + + row("FB (fwd+bwd)", bw.fb_ms, 0); + row("Gamma accum", bw.gamma_ms, bw.gamma_exp_calls); + row("Xi accum", bw.xi_ms, bw.xi_exp_calls); + std::cout << " " << std::left << std::setw(24) << "TOTAL (1 BW iter)" << std::right + << std::setw(8) << total << " ms\n"; + std::cout << "\n"; + } + + if (g_sink == 1.23456789) + std::cout << "sink=" << g_sink << "\n"; + return 0; +} diff --git a/tools/fb_contour_sweep.cpp b/tools/fb_contour_sweep.cpp new file mode 100644 index 0000000..5b10626 --- /dev/null +++ b/tools/fb_contour_sweep.cpp @@ -0,0 +1,415 @@ +#include "libhmm/hmm.h" +#include "libhmm/distributions/gaussian_distribution.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace libhmm; +using Clock = std::chrono::high_resolution_clock; +using Millis = std::chrono::duration; +namespace fs = std::filesystem; + +namespace { + +constexpr double LOG_ZERO = -std::numeric_limits::infinity(); +constexpr std::size_t FB_MAX_REDUCE_FORCE_PAIRWISE_MAX_STATES = 2; +volatile double g_sink_double = 0.0; + +struct Config { + int n; + int t; +}; + +struct Timings { + double transition_ms = 0.0; + double obs_copy_ms = 0.0; + double emission_ms = 0.0; + double alloc_ms = 0.0; + double forward_ms = 0.0; + double backward_ms = 0.0; + double reduction_ms = 0.0; + double total_ms = 0.0; +}; + +double elapsed_ms(const Clock::time_point start) { + 
return Millis(Clock::now() - start).count(); +} + +bool should_use_max_reduce(const std::size_t n, const std::size_t t) noexcept { +#if defined(LIBHMM_EXPERIMENT_FB_MAX_REDUCE) + (void)n; + (void)t; + return true; +#elif defined(LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR) + (void)t; + return n > FB_MAX_REDUCE_FORCE_PAIRWISE_MAX_STATES; +#else + (void)n; + (void)t; + return false; +#endif +} + +double log_sum_exp_pairwise(const double a, const double b) noexcept { + if (a == LOG_ZERO) { + return b; + } + if (b == LOG_ZERO) { + return a; + } + if (a > b) { + return a + std::log1p(std::exp(b - a)); + } + return b + std::log1p(std::exp(a - b)); +} + +template +double median(std::vector values) { + if (values.empty()) { + return 0.0; + } + std::sort(values.begin(), values.end()); + return static_cast(values[values.size() / 2]); +} + +std::unique_ptr make_hmm(const int n) { + auto hmm = std::make_unique(n); + Matrix trans(n, n); + for (int i = 0; i < n; ++i) { + double sum = 0.0; + for (int j = 0; j < n; ++j) { + trans(i, j) = 0.1 + 0.8 * (0.5 + 0.5 * std::sin(i * 0.7 + j * 1.3)); + sum += trans(i, j); + } + for (int j = 0; j < n; ++j) { + trans(i, j) /= sum; + } + } + hmm->setTrans(trans); + + Vector pi(n); + for (int i = 0; i < n; ++i) { + pi(i) = 1.0 / static_cast(n); + } + hmm->setPi(pi); + + for (int i = 0; i < n; ++i) { + hmm->setDistribution(i, std::make_unique(i * 2.0, 1.0)); + } + return hmm; +} + +ObservationSet make_obs(const int t, const int n) { + ObservationSet obs(t); + for (int i = 0; i < t; ++i) { + obs(i) = std::sin(i * 0.1) * static_cast(n); + } + return obs; +} + +Timings run_once(const Hmm &hmm, const ObservationSet &obs) { + Timings out; + const std::size_t n = static_cast(hmm.getNumStates()); + const std::size_t t = obs.size(); + + auto total_start = Clock::now(); + + auto stage_start = Clock::now(); + Matrix log_trans(n, n); + for (std::size_t i = 0; i < n; ++i) { + for (std::size_t j = 0; j < n; ++j) { + const double a = hmm.getTrans()(i, j); + 
log_trans(i, j) = (a > 0.0) ? std::log(a) : LOG_ZERO; + } + } + out.transition_ms = elapsed_ms(stage_start); + + stage_start = Clock::now(); + std::vector obs_copy(t); + for (std::size_t i = 0; i < t; ++i) { + obs_copy[i] = obs(i); + } + const std::span obs_span(obs_copy.data(), t); + out.obs_copy_ms = elapsed_ms(stage_start); + + stage_start = Clock::now(); + std::vector log_emit_buf(n * t); + for (std::size_t i = 0; i < n; ++i) { + hmm.getDistribution(i).getBatchLogProbabilities( + obs_span, std::span(log_emit_buf.data() + i * t, t)); + } + out.emission_ms = elapsed_ms(stage_start); + + stage_start = Clock::now(); + Matrix log_alpha(t, n); + Matrix log_beta(t, n); + out.alloc_ms = elapsed_ms(stage_start); + + stage_start = Clock::now(); + for (std::size_t i = 0; i < n; ++i) { + const double pi = hmm.getPi()(i); + const double log_pi = (pi > 0.0) ? std::log(pi) : LOG_ZERO; + log_alpha(0, i) = log_pi + log_emit_buf[i * t]; + } + const bool use_max_reduce = should_use_max_reduce(n, t); + if (use_max_reduce) { + for (std::size_t ti = 1; ti < t; ++ti) { + for (std::size_t j = 0; j < n; ++j) { + double max_term = LOG_ZERO; + for (std::size_t i = 0; i < n; ++i) { + const double term = log_alpha(ti - 1, i) + log_trans(i, j); + if (term > max_term) { + max_term = term; + } + } + double log_sum = LOG_ZERO; + if (std::isfinite(max_term)) { + double scaled_sum = 0.0; + for (std::size_t i = 0; i < n; ++i) { + const double term = log_alpha(ti - 1, i) + log_trans(i, j); + if (std::isfinite(term)) { + scaled_sum += std::exp(term - max_term); + } + } + if (scaled_sum > 0.0) { + log_sum = max_term + std::log(scaled_sum); + } + } + log_alpha(ti, j) = log_emit_buf[j * t + ti] + log_sum; + } + } + } else { + for (std::size_t ti = 1; ti < t; ++ti) { + for (std::size_t j = 0; j < n; ++j) { + double log_sum = LOG_ZERO; + for (std::size_t i = 0; i < n; ++i) { + log_sum = log_sum_exp_pairwise(log_sum, log_alpha(ti - 1, i) + log_trans(i, j)); + } + log_alpha(ti, j) = log_emit_buf[j * t + 
ti] + log_sum; + } + } + } + out.forward_ms = elapsed_ms(stage_start); + + stage_start = Clock::now(); + for (std::size_t i = 0; i < n; ++i) { + log_beta(t - 1, i) = 0.0; + } + if (t > 1) { + if (use_max_reduce) { + for (std::size_t ti = t - 2;; --ti) { + for (std::size_t i = 0; i < n; ++i) { + double max_term = LOG_ZERO; + for (std::size_t j = 0; j < n; ++j) { + const double term = + log_trans(i, j) + log_emit_buf[j * t + (ti + 1)] + log_beta(ti + 1, j); + if (term > max_term) { + max_term = term; + } + } + double log_sum = LOG_ZERO; + if (std::isfinite(max_term)) { + double scaled_sum = 0.0; + for (std::size_t j = 0; j < n; ++j) { + const double term = log_trans(i, j) + log_emit_buf[j * t + (ti + 1)] + + log_beta(ti + 1, j); + if (std::isfinite(term)) { + scaled_sum += std::exp(term - max_term); + } + } + if (scaled_sum > 0.0) { + log_sum = max_term + std::log(scaled_sum); + } + } + log_beta(ti, i) = log_sum; + } + if (ti == 0) { + break; + } + } + } else { + for (std::size_t ti = t - 2;; --ti) { + for (std::size_t i = 0; i < n; ++i) { + double log_sum = LOG_ZERO; + for (std::size_t j = 0; j < n; ++j) { + log_sum = log_sum_exp_pairwise(log_sum, log_trans(i, j) + + log_emit_buf[j * t + (ti + 1)] + + log_beta(ti + 1, j)); + } + log_beta(ti, i) = log_sum; + } + if (ti == 0) { + break; + } + } + } + } + out.backward_ms = elapsed_ms(stage_start); + + stage_start = Clock::now(); + double log_probability = LOG_ZERO; + for (std::size_t i = 0; i < n; ++i) { + log_probability = log_sum_exp_pairwise(log_probability, log_alpha(t - 1, i)); + } + out.reduction_ms = elapsed_ms(stage_start); + g_sink_double += log_probability; + + out.total_ms = elapsed_ms(total_start); + return out; +} + +Timings profile_config(const Hmm &hmm, const ObservationSet &obs, const int runs, + const int warmup) { + std::vector transition_ms; + std::vector obs_copy_ms; + std::vector emission_ms; + std::vector alloc_ms; + std::vector forward_ms; + std::vector backward_ms; + std::vector reduction_ms; + 
std::vector total_ms; + + transition_ms.reserve(static_cast(runs)); + obs_copy_ms.reserve(static_cast(runs)); + emission_ms.reserve(static_cast(runs)); + alloc_ms.reserve(static_cast(runs)); + forward_ms.reserve(static_cast(runs)); + backward_ms.reserve(static_cast(runs)); + reduction_ms.reserve(static_cast(runs)); + total_ms.reserve(static_cast(runs)); + + for (int iter = 0; iter < warmup + runs; ++iter) { + const Timings t = run_once(hmm, obs); + if (iter >= warmup) { + transition_ms.push_back(t.transition_ms); + obs_copy_ms.push_back(t.obs_copy_ms); + emission_ms.push_back(t.emission_ms); + alloc_ms.push_back(t.alloc_ms); + forward_ms.push_back(t.forward_ms); + backward_ms.push_back(t.backward_ms); + reduction_ms.push_back(t.reduction_ms); + total_ms.push_back(t.total_ms); + } + } + + return { + median(transition_ms), median(obs_copy_ms), median(emission_ms), median(alloc_ms), + median(forward_ms), median(backward_ms), median(reduction_ms), median(total_ms), + }; +} + +int parse_positive_int(const char *value, const char *name) { + try { + const int parsed = std::stoi(value); + if (parsed <= 0) { + throw std::invalid_argument("non-positive"); + } + return parsed; + } catch (...) 
{ + throw std::invalid_argument(std::string("Invalid ") + name + ": " + value); + } +} + +std::string mode_name() { +#if defined(LIBHMM_EXPERIMENT_FB_MAX_REDUCE) + return "max_reduce"; +#elif defined(LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR) + return "adaptive_static_v1"; +#else + return "pairwise"; +#endif +} + +} // namespace + +int main(int argc, char *argv[]) { + int runs = 5; + int warmup = 1; + + fs::path output_path = + fs::path("benchmark-analysis") / ("fb_contour_sweep_" + mode_name() + ".csv"); + + if (argc >= 2) { + output_path = argv[1]; + } + if (argc >= 3) { + runs = parse_positive_int(argv[2], "runs"); + } + if (argc >= 4) { + warmup = parse_positive_int(argv[3], "warmup"); + } + if (argc > 4) { + std::cerr << "Usage:\n"; + std::cerr << " fb_contour_sweep [output_csv] [runs] [warmup]\n"; + return 1; + } + + const std::vector configs = { + {2, 1000}, {2, 10000}, {2, 100000}, {2, 1000000}, {4, 1000}, {4, 10000}, + {4, 100000}, {8, 1000}, {8, 5000}, {8, 10000}, {16, 1000}, {16, 2000}, + {16, 5000}, {32, 500}, {32, 1000}, {32, 2000}, {64, 200}, {64, 500}, + {64, 1000}, {128, 100}, {128, 250}, {128, 500}, + }; + + const fs::path output_dir = output_path.parent_path(); + if (!output_dir.empty()) { + fs::create_directories(output_dir); + } + std::ofstream csv(output_path); + if (!csv) { + std::cerr << "Failed to open output file: " << output_path << "\n"; + return 1; + } + + csv << "mode,n,t,runs,warmup,recurrence_work,emission_work,transition_ms,obs_copy_ms," + "emission_ms,alloc_ms,forward_ms,backward_ms,reduction_ms,total_ms\n"; + + std::cout << "libhmm FB contour sweep\n"; + std::cout << "Mode: " << mode_name() << "\n"; + std::cout << "Runs: " << runs << " (warmup " << warmup << ")\n"; + std::cout << "Output: " << output_path << "\n\n"; + std::cout << std::fixed << std::setprecision(3); + + for (const auto &cfg : configs) { + auto hmm = make_hmm(cfg.n); + auto obs = make_obs(cfg.t, cfg.n); + const Timings timed = profile_config(*hmm, obs, runs, warmup); + 
+ const std::uint64_t recurrence_work = + static_cast(cfg.n) * cfg.n * static_cast(cfg.t - 1); + const std::uint64_t emission_work = + static_cast(cfg.n) * static_cast(cfg.t); + + csv << mode_name() << "," << cfg.n << "," << cfg.t << "," << runs << "," << warmup << "," + << recurrence_work << "," << emission_work << "," << timed.transition_ms << "," + << timed.obs_copy_ms << "," << timed.emission_ms << "," << timed.alloc_ms << "," + << timed.forward_ms << "," << timed.backward_ms << "," << timed.reduction_ms << "," + << timed.total_ms << "\n"; + + const double recurrence_pct = + (timed.total_ms > 0.0) + ? ((timed.forward_ms + timed.backward_ms) * 100.0 / timed.total_ms) + : 0.0; + std::cout << "N=" << std::setw(3) << cfg.n << " T=" << std::setw(8) << cfg.t + << " total=" << std::setw(9) << timed.total_ms << " ms" + << " recur=" << std::setw(6) << recurrence_pct << "%\n"; + } + + csv.close(); + if (g_sink_double == 42.0) { + std::cout << "sink=" << g_sink_double << "\n"; + } + std::cout << "\nDone.\n"; + return 0; +} diff --git a/tools/fb_crossover_sweep.cpp b/tools/fb_crossover_sweep.cpp new file mode 100644 index 0000000..10d6e14 --- /dev/null +++ b/tools/fb_crossover_sweep.cpp @@ -0,0 +1,119 @@ +// tools/fb_crossover_sweep.cpp +// +// Measures ForwardBackwardCalculator runtime for Pairwise vs MaxReduce modes +// at a range of N values using the production calculator (which has SIMD +// transcendental kernels active in the MaxReduce path). +// +// Output: tab-separated table of N, pairwise_ms, maxreduce_ms, ratio. 
+
+#include "libhmm/performance/fb_recurrence_policy.h"
+#include "libhmm/calculators/forward_backward_calculator.h"
+#include "libhmm/distributions/gaussian_distribution.h"
+#include "libhmm/hmm.h"
+#include "libhmm/platform/simd_platform.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+using namespace libhmm;
+using Clock = std::chrono::high_resolution_clock;
+using Millis = std::chrono::duration<double, std::milli>;
+
+namespace {
+
+constexpr int WARMUP_RUNS = 2;
+constexpr int TIMED_RUNS = 8;
+// T large enough that measurement is stable; small enough to finish quickly.
+constexpr int T_DEFAULT = 1000;
+
+std::unique_ptr<Hmm> make_hmm(int n) {
+    auto hmm = std::make_unique<Hmm>(n);
+    Matrix trans(n, n);
+    for (int i = 0; i < n; ++i) {
+        double s = 0.0;
+        for (int j = 0; j < n; ++j) {
+            trans(i, j) = 0.1 + 0.8 * (0.5 + 0.5 * std::sin(i * 0.7 + j * 1.3));
+            s += trans(i, j);
+        }
+        for (int j = 0; j < n; ++j)
+            trans(i, j) /= s;
+    }
+    hmm->setTrans(trans);
+    Vector pi(n);
+    for (int i = 0; i < n; ++i)
+        pi(i) = 1.0 / n;
+    hmm->setPi(pi);
+    for (int i = 0; i < n; ++i)
+        hmm->setDistribution(i, std::make_unique<GaussianDistribution>(i * 2.0, 1.0));
+    return hmm;
+}
+
+ObservationSet make_obs(int t, int n) {
+    ObservationSet obs(t);
+    for (int i = 0; i < t; ++i)
+        obs(i) = std::sin(i * 0.1) * n;
+    return obs;
+}
+
+double time_mode(Hmm &hmm, const ObservationSet &obs, FbRecurrenceMode mode) {
+    ForwardBackwardCalculator fbc(hmm, obs);
+    fbc.setRecurrenceModeOverride(mode);
+
+    // Warmup.
+    for (int r = 0; r < WARMUP_RUNS; ++r)
+        fbc.compute();
+
+    // Timed runs.
+ std::vector samples; + samples.reserve(TIMED_RUNS); + for (int r = 0; r < TIMED_RUNS; ++r) { + auto t0 = Clock::now(); + fbc.compute(); + samples.push_back(Millis(Clock::now() - t0).count()); + } + + std::sort(samples.begin(), samples.end()); + return samples[samples.size() / 2]; // median +} + +} // anonymous namespace + +int main() { + const std::vector N_VALUES = {2, 3, 4, 5, 6, 7, 8, 10, 12, 16, 24, 32, 48, 64}; + const int T = T_DEFAULT; + + std::cout << "FB mode crossover sweep (T=" << T << ", median of " << TIMED_RUNS << " runs, " + << WARMUP_RUNS << " warmup)\n"; + std::cout << "Active ISA: " << libhmm::performance::simd::feature_string() << "\n\n"; + + std::cout << std::setw(6) << "N" << std::setw(14) << "Pairwise(ms)" << std::setw(14) + << "MaxReduce(ms)" << std::setw(10) << "MR/PW" << std::setw(12) << "Winner" + << "\n"; + std::cout << std::string(56, '-') << "\n"; + + for (int n : N_VALUES) { + auto hmm = make_hmm(n); + auto obs = make_obs(T, n); + + const double pw = time_mode(*hmm, obs, FbRecurrenceMode::Pairwise); + const double mr = time_mode(*hmm, obs, FbRecurrenceMode::MaxReduce); + const double ratio = mr / pw; + const char *winner = (mr < pw) ? "MaxReduce" : "Pairwise"; + const char *current = + (selectFbRecurrenceMode(n, T) == FbRecurrenceMode::MaxReduce) ? 
" [current]" : ""; + + std::cout << std::setw(6) << n << std::setw(14) << std::fixed << std::setprecision(3) << pw + << std::setw(14) << std::fixed << std::setprecision(3) << mr << std::setw(10) + << std::fixed << std::setprecision(3) << ratio << " " << winner << current + << "\n"; + } + + std::cout << "\n(ratio < 1 = MaxReduce faster; > 1 = Pairwise faster)\n"; + std::cout << "[current] = what selectFbRecurrenceMode() currently picks for this N\n"; + return 0; +} diff --git a/tools/hotspot_breakdown.cpp b/tools/hotspot_breakdown.cpp new file mode 100644 index 0000000..7e59c40 --- /dev/null +++ b/tools/hotspot_breakdown.cpp @@ -0,0 +1,559 @@ +#include "libhmm/hmm.h" +#include "libhmm/distributions/gaussian_distribution.h" +#include "libhmm/math/constants.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace libhmm; +using Clock = std::chrono::high_resolution_clock; +using Millis = std::chrono::duration; + +namespace { + +constexpr double LOG_ZERO = -std::numeric_limits::infinity(); +constexpr std::size_t FB_MAX_REDUCE_FORCE_PAIRWISE_MAX_STATES = 2; +volatile double g_sink_double = 0.0; +volatile int g_sink_int = 0; + +struct Config { + int num_states; + int sequence_length; +}; + +struct ForwardBreakdown { + double transition_ms = 0.0; + double obs_copy_ms = 0.0; + double emission_ms = 0.0; + double buffer_alloc_ms = 0.0; + double forward_ms = 0.0; + double backward_ms = 0.0; + double reduction_ms = 0.0; +}; + +struct ViterbiBreakdown { + double transition_ms = 0.0; + double emission_ms = 0.0; + double emission_relayout_ms = 0.0; + double buffer_alloc_ms = 0.0; + double recursion_ms = 0.0; + double backtrack_ms = 0.0; +}; + +template +double median(std::vector values) { + if (values.empty()) { + return 0.0; + } + std::sort(values.begin(), values.end()); + return static_cast(values[values.size() / 2]); +} + +bool should_use_max_reduce(const std::size_t n, const std::size_t t) 
noexcept { +#if defined(LIBHMM_EXPERIMENT_FB_MAX_REDUCE) + (void)n; + (void)t; + return true; +#elif defined(LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR) + (void)t; + return n > FB_MAX_REDUCE_FORCE_PAIRWISE_MAX_STATES; +#else + (void)n; + (void)t; + return false; +#endif +} + +double elapsed_ms(const Clock::time_point start) { + return Millis(Clock::now() - start).count(); +} + +double log_sum_exp(const double a, const double b) noexcept { + if (a == LOG_ZERO) { + return b; + } + if (b == LOG_ZERO) { + return a; + } + if (a > b) { + return a + std::log1p(std::exp(b - a)); + } + return b + std::log1p(std::exp(a - b)); +} + +std::unique_ptr make_hmm(const int n) { + auto hmm = std::make_unique(n); + Matrix trans(n, n); + for (int i = 0; i < n; ++i) { + double sum = 0.0; + for (int j = 0; j < n; ++j) { + trans(i, j) = 0.1 + 0.8 * (0.5 + 0.5 * std::sin(i * 0.7 + j * 1.3)); + sum += trans(i, j); + } + for (int j = 0; j < n; ++j) { + trans(i, j) /= sum; + } + } + hmm->setTrans(trans); + + Vector pi(n); + for (int i = 0; i < n; ++i) { + pi(i) = 1.0 / static_cast(n); + } + hmm->setPi(pi); + + for (int i = 0; i < n; ++i) { + hmm->setDistribution(i, std::make_unique(i * 2.0, 1.0)); + } + + return hmm; +} + +ObservationSet make_obs(const int t, const int n) { + ObservationSet obs(t); + for (int i = 0; i < t; ++i) { + obs(i) = std::sin(i * 0.1) * static_cast(n); + } + return obs; +} + +ForwardBreakdown profile_forward_backward(const Hmm &hmm, const ObservationSet &obs, + const int warmup, const int runs) { + const std::size_t n = static_cast(hmm.getNumStates()); + const std::size_t t = obs.size(); + + std::vector transition_ms; + std::vector obs_copy_ms; + std::vector emission_ms; + std::vector buffer_alloc_ms; + std::vector forward_ms; + std::vector backward_ms; + std::vector reduction_ms; + + transition_ms.reserve(static_cast(runs)); + obs_copy_ms.reserve(static_cast(runs)); + emission_ms.reserve(static_cast(runs)); + buffer_alloc_ms.reserve(static_cast(runs)); + 
forward_ms.reserve(static_cast(runs)); + backward_ms.reserve(static_cast(runs)); + reduction_ms.reserve(static_cast(runs)); + + for (int iter = 0; iter < warmup + runs; ++iter) { + auto stage_start = Clock::now(); + Matrix log_trans(n, n); + for (std::size_t i = 0; i < n; ++i) { + for (std::size_t j = 0; j < n; ++j) { + const double a = hmm.getTrans()(i, j); + log_trans(i, j) = (a > 0.0) ? std::log(a) : LOG_ZERO; + } + } + const double trans_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + std::vector obs_copy(t); + for (std::size_t i = 0; i < t; ++i) { + obs_copy[i] = obs(i); + } + const std::span obs_span(obs_copy.data(), t); + const double obs_copy_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + std::vector log_emit_buf(n * t); + for (std::size_t i = 0; i < n; ++i) { + hmm.getDistribution(i).getBatchLogProbabilities( + obs_span, std::span(log_emit_buf.data() + i * t, t)); + } + const double emission_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + Matrix log_alpha(t, n); + Matrix log_beta(t, n); + const double buffer_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + for (std::size_t i = 0; i < n; ++i) { + const double pi = hmm.getPi()(i); + const double log_pi = (pi > 0.0) ? 
std::log(pi) : LOG_ZERO; + log_alpha(0, i) = log_pi + log_emit_buf[i * t]; + } + const bool use_max_reduce = should_use_max_reduce(n, t); + for (std::size_t ti = 1; ti < t; ++ti) { + for (std::size_t j = 0; j < n; ++j) { + double log_sum = LOG_ZERO; + if (use_max_reduce) { + double max_term = LOG_ZERO; + for (std::size_t i = 0; i < n; ++i) { + const double term = log_alpha(ti - 1, i) + log_trans(i, j); + if (term > max_term) { + max_term = term; + } + } + if (std::isfinite(max_term)) { + double scaled_sum = 0.0; + for (std::size_t i = 0; i < n; ++i) { + const double term = log_alpha(ti - 1, i) + log_trans(i, j); + if (std::isfinite(term)) { + scaled_sum += std::exp(term - max_term); + } + } + if (scaled_sum > 0.0) { + log_sum = max_term + std::log(scaled_sum); + } + } + } else { + for (std::size_t i = 0; i < n; ++i) { + log_sum = log_sum_exp(log_sum, log_alpha(ti - 1, i) + log_trans(i, j)); + } + } + log_alpha(ti, j) = log_emit_buf[j * t + ti] + log_sum; + } + } + const double forward_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + for (std::size_t i = 0; i < n; ++i) { + log_beta(t - 1, i) = 0.0; + } + if (t > 1) { + for (std::size_t ti = t - 2;; --ti) { + for (std::size_t i = 0; i < n; ++i) { + double log_sum = LOG_ZERO; + if (use_max_reduce) { + double max_term = LOG_ZERO; + for (std::size_t j = 0; j < n; ++j) { + const double term = log_trans(i, j) + log_emit_buf[j * t + (ti + 1)] + + log_beta(ti + 1, j); + if (term > max_term) { + max_term = term; + } + } + if (std::isfinite(max_term)) { + double scaled_sum = 0.0; + for (std::size_t j = 0; j < n; ++j) { + const double term = log_trans(i, j) + + log_emit_buf[j * t + (ti + 1)] + + log_beta(ti + 1, j); + if (std::isfinite(term)) { + scaled_sum += std::exp(term - max_term); + } + } + if (scaled_sum > 0.0) { + log_sum = max_term + std::log(scaled_sum); + } + } + } else { + for (std::size_t j = 0; j < n; ++j) { + log_sum = log_sum_exp(log_sum, log_trans(i, j) + + log_emit_buf[j * t + (ti + 1)] + + 
log_beta(ti + 1, j)); + } + } + log_beta(ti, i) = log_sum; + } + if (ti == 0) { + break; + } + } + } + const double backward_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + double log_probability = LOG_ZERO; + for (std::size_t i = 0; i < n; ++i) { + log_probability = log_sum_exp(log_probability, log_alpha(t - 1, i)); + } + const double reduction_time = elapsed_ms(stage_start); + g_sink_double += log_probability; + + if (iter >= warmup) { + transition_ms.push_back(trans_time); + obs_copy_ms.push_back(obs_copy_time); + emission_ms.push_back(emission_time); + buffer_alloc_ms.push_back(buffer_time); + forward_ms.push_back(forward_time); + backward_ms.push_back(backward_time); + reduction_ms.push_back(reduction_time); + } + } + + return { + median(transition_ms), median(obs_copy_ms), median(emission_ms), median(buffer_alloc_ms), + median(forward_ms), median(backward_ms), median(reduction_ms), + }; +} + +ViterbiBreakdown profile_viterbi(const Hmm &hmm, const ObservationSet &obs, const int warmup, + const int runs) { + const std::size_t n = static_cast(hmm.getNumStates()); + const std::size_t t = obs.size(); + + std::vector transition_ms; + std::vector emission_ms; + std::vector emission_relayout_ms; + std::vector buffer_alloc_ms; + std::vector recursion_ms; + std::vector backtrack_ms; + + transition_ms.reserve(static_cast(runs)); + emission_ms.reserve(static_cast(runs)); + emission_relayout_ms.reserve(static_cast(runs)); + buffer_alloc_ms.reserve(static_cast(runs)); + recursion_ms.reserve(static_cast(runs)); + backtrack_ms.reserve(static_cast(runs)); + + for (int iter = 0; iter < warmup + runs; ++iter) { + auto stage_start = Clock::now(); + Matrix log_trans(n, n); + Matrix log_trans_t(n, n); + for (std::size_t i = 0; i < n; ++i) { + for (std::size_t j = 0; j < n; ++j) { + const double a = hmm.getTrans()(i, j); + const double log_a = (a > 0.0) ? 
std::log(a) : LOG_ZERO; + log_trans(i, j) = log_a; + log_trans_t(j, i) = log_a; + } + } + const double trans_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + std::vector log_emit_buf(n * t); + const std::span obs_span(obs.data(), t); + for (std::size_t i = 0; i < n; ++i) { + hmm.getDistribution(i).getBatchLogProbabilities( + obs_span, std::span(log_emit_buf.data() + i * t, t)); + } + const double emission_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + std::vector log_emit_by_time(n * t); + for (std::size_t i = 0; i < n; ++i) { + const double *state_row = log_emit_buf.data() + i * t; + for (std::size_t ti = 0; ti < t; ++ti) { + log_emit_by_time[ti * n + i] = state_row[ti]; + } + } + const double relayout_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + Matrix log_delta(t, n); + std::vector psi(t * n, 0); + std::vector sequence(t, 0); + const double buffer_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + const double *log_trans_t_data = log_trans_t.data(); + const double *log_emit_by_time_data = log_emit_by_time.data(); + double *log_delta_data = log_delta.data(); + + const double *emit_row_0 = log_emit_by_time_data; + for (std::size_t i = 0; i < n; ++i) { + const double pi = hmm.getPi()(i); + const double log_pi = (pi > 0.0) ? 
std::log(pi) : LOG_ZERO; + log_delta_data[i] = log_pi + emit_row_0[i]; + } + + for (std::size_t ti = 1; ti < t; ++ti) { + const double *prev_delta_row = log_delta_data + (ti - 1) * n; + double *delta_row = log_delta_data + ti * n; + const double *emit_row = log_emit_by_time_data + ti * n; + for (std::size_t j = 0; j < n; ++j) { + double max_val = LOG_ZERO; + int max_from = 0; + const double *trans_col = log_trans_t_data + j * n; + for (std::size_t i = 0; i < n; ++i) { + const double value = prev_delta_row[i] + trans_col[i]; + if (value > max_val) { + max_val = value; + max_from = static_cast(i); + } + } + delta_row[j] = max_val + emit_row[j]; + psi[ti * n + j] = max_from; + } + } + + double best_val = LOG_ZERO; + int best_last = 0; + const double *final_delta_row = log_delta_data + (t - 1) * n; + for (std::size_t i = 0; i < n; ++i) { + if (final_delta_row[i] > best_val) { + best_val = final_delta_row[i]; + best_last = static_cast(i); + } + } + sequence[t - 1] = best_last; + const double recursion_time = elapsed_ms(stage_start); + + stage_start = Clock::now(); + if (t > 1) { + for (std::size_t ti = t - 2;; --ti) { + sequence[ti] = psi[(ti + 1) * n + static_cast(sequence[ti + 1])]; + if (ti == 0) { + break; + } + } + } + const double backtrack_time = elapsed_ms(stage_start); + g_sink_double += best_val; + g_sink_int += sequence[0]; + + if (iter >= warmup) { + transition_ms.push_back(trans_time); + emission_ms.push_back(emission_time); + emission_relayout_ms.push_back(relayout_time); + buffer_alloc_ms.push_back(buffer_time); + recursion_ms.push_back(recursion_time); + backtrack_ms.push_back(backtrack_time); + } + } + + return { + median(transition_ms), median(emission_ms), median(emission_relayout_ms), + median(buffer_alloc_ms), median(recursion_ms), median(backtrack_ms), + }; +} + +std::size_t estimate_forward_working_set_bytes(const std::size_t n, const std::size_t t) { + const std::size_t doubles = (n * n) + (3 * n * t) + t; + return doubles * sizeof(double); +} + 
+std::size_t estimate_viterbi_working_set_bytes(const std::size_t n, const std::size_t t) { + const std::size_t double_count = (2 * n * n) + (3 * n * t); + const std::size_t int_count = (2 * n * t); + return double_count * sizeof(double) + int_count * sizeof(int); +} + +double bytes_to_mib(const std::size_t bytes) { + return static_cast(bytes) / (1024.0 * 1024.0); +} + +void print_phase(const std::string &label, const double value_ms, const double total_ms) { + const double pct = (total_ms > 0.0) ? (100.0 * value_ms / total_ms) : 0.0; + std::cout << " " << std::left << std::setw(28) << label << std::right << std::setw(10) + << value_ms << " ms " << std::setw(6) << pct << "%\n"; +} + +int parse_positive_int(const char *value, const char *arg_name) { + try { + const int parsed = std::stoi(value); + if (parsed <= 0) { + throw std::invalid_argument("non-positive"); + } + return parsed; + } catch (...) { + throw std::invalid_argument(std::string("Invalid ") + arg_name + ": " + value); + } +} + +} // namespace + +int main(int argc, char *argv[]) { + std::vector configs = { + {8, 1000}, + {32, 2000}, + {64, 1000}, + }; + + int warmup = 2; + int runs = 8; + + if (argc == 3 || argc == 4 || argc == 5) { + const int n = parse_positive_int(argv[1], "N"); + const int t = parse_positive_int(argv[2], "T"); + configs = {{n, t}}; + if (argc >= 4) { + runs = parse_positive_int(argv[3], "runs"); + } + if (argc == 5) { + warmup = parse_positive_int(argv[4], "warmup"); + } + } else if (argc != 1) { + std::cerr << "Usage:\n"; + std::cerr << " hotspot_breakdown\n"; + std::cerr << " hotspot_breakdown [runs] [warmup]\n"; + return 1; + } + + std::cout << "libhmm Hotspot Breakdown Tool\n"; + std::cout << "============================\n"; + std::cout << "Median of " << runs << " timed runs (" << warmup << " warmup).\n\n"; +#if defined(LIBHMM_EXPERIMENT_FB_MAX_REDUCE) + std::cout << "Forward-Backward accumulation mode: max-then-reduce (experimental)\n\n"; +#elif 
defined(LIBHMM_EXPERIMENT_FB_ADAPTIVE_SELECTOR) + std::cout << "Forward-Backward accumulation mode: static adaptive selector (stage-1)\n\n"; +#else + std::cout << "Forward-Backward accumulation mode: pairwise logSumExp (control)\n\n"; +#endif + + std::cout << std::fixed << std::setprecision(3); + + for (const auto &cfg : configs) { + auto hmm = make_hmm(cfg.num_states); + auto obs = make_obs(cfg.sequence_length, cfg.num_states); + + const auto fb = profile_forward_backward(*hmm, obs, warmup, runs); + const auto vt = profile_viterbi(*hmm, obs, warmup, runs); + + const double fb_total = fb.transition_ms + fb.obs_copy_ms + fb.emission_ms + + fb.buffer_alloc_ms + fb.forward_ms + fb.backward_ms + + fb.reduction_ms; + const double vt_total = vt.transition_ms + vt.emission_ms + vt.emission_relayout_ms + + vt.buffer_alloc_ms + vt.recursion_ms + vt.backtrack_ms; + + const std::size_t n = static_cast(cfg.num_states); + const std::size_t t = static_cast(cfg.sequence_length); + const std::uint64_t emission_work = static_cast(n) * t; + const std::uint64_t recurrence_work = + (t > 0) ? 
static_cast(n) * n * (t - 1) : 0ULL; + + std::cout << "Config: N=" << cfg.num_states << ", T=" << cfg.sequence_length << "\n"; + std::cout << " Estimated recurrence work per pass: " + << static_cast(recurrence_work) / 1.0e6 << " M (N^2*(T-1))\n"; + std::cout << " Emission evaluations per pass: " + << static_cast(emission_work) / 1.0e6 << " M (N*T)\n"; + + std::cout << "\nForward-Backward phase breakdown:\n"; + print_phase("Transition log precompute", fb.transition_ms, fb_total); + print_phase("Observation copy", fb.obs_copy_ms, fb_total); + print_phase("Emission batch eval", fb.emission_ms, fb_total); + print_phase("Alpha/Beta buffer alloc", fb.buffer_alloc_ms, fb_total); + print_phase("Forward recursion", fb.forward_ms, fb_total); + print_phase("Backward recursion", fb.backward_ms, fb_total); + print_phase("Final log-sum-exp reduce", fb.reduction_ms, fb_total); + std::cout << " " << std::left << std::setw(28) << "TOTAL" << std::right << std::setw(10) + << fb_total << " ms\n"; + + std::cout << " Estimated FB working set: " + << bytes_to_mib(estimate_forward_working_set_bytes(n, t)) << " MiB\n"; + + std::cout << "\nViterbi phase breakdown:\n"; + print_phase("Transition log precompute", vt.transition_ms, vt_total); + print_phase("Emission batch eval", vt.emission_ms, vt_total); + print_phase("Emission relayout (T-major)", vt.emission_relayout_ms, vt_total); + print_phase("Delta/Psi buffer alloc", vt.buffer_alloc_ms, vt_total); + print_phase("Viterbi recursion", vt.recursion_ms, vt_total); + print_phase("Backtrack", vt.backtrack_ms, vt_total); + std::cout << " " << std::left << std::setw(28) << "TOTAL" << std::right << std::setw(10) + << vt_total << " ms\n"; + + std::cout << " Estimated Viterbi working set: " + << bytes_to_mib(estimate_viterbi_working_set_bytes(n, t)) << " MiB\n"; + std::cout << "\n------------------------------------------------------------\n\n"; + } + + if (g_sink_int == 42) { + std::cout << "sink=" << g_sink_double << "\n"; + } + + return 0; +}